forked from mindspore-Ecosystem/mindspore
!46092 PyNative dynamic shape
Merge pull request !46092 from caifubi/r2.0.0-alpha
This commit is contained in: commit 0d03bdec89
@@ -28,7 +28,7 @@
 #include "backend/common/pass/convert_attr_to_unify_mindir.h"
 #include "backend/common/pass/optimize_updatestate.h"
 #include "backend/common/pass/conv_transpose_to_conv_bp.h"
-#include "backend/common/pass/reduce_sum_optimizer.h"
+#include "backend/common/pass/reduce_optimizer.h"
 #include "backend/common/pass/add_dynamic_shape_attr.h"
 #include "backend/common/pass/add_akg_kernel_attrs.h"
 #include "backend/common/pass/inplace_assign_for_custom_op.h"
@@ -62,7 +62,7 @@ void BackendCommonOptimization(const std::shared_ptr<session::KernelGraph> &kern
   auto common_pm = std::make_shared<PassManager>("common_pm");
   common_pm->AddPass(std::make_shared<AddDynamicShapeAttr>());
   common_pm->AddPass(std::make_shared<ConvertDynamicBroadcastTo>());
-  common_pm->AddPass(std::make_shared<ReduceSumOptimizer>());
+  common_pm->AddPass(std::make_shared<ReduceOptimizer>());
   common_pm->AddPass(std::make_shared<ConvertConstInputToAttr>());
   common_pm->AddPass(std::make_shared<CustomOpConstInputToAttr>());
   common_pm->AddPass(std::make_shared<ConvertConstInputToTensorInput>());
@@ -91,7 +91,7 @@ void OpBackendCommonOptimization(const std::shared_ptr<session::KernelGraph> &ke
   MS_EXCEPTION_IF_NULL(kernel_graph);
   auto optimizer = std::make_shared<GraphOptimizer>();
   auto common_pm = std::make_shared<PassManager>("op_common_pm");
-  common_pm->AddPass(std::make_shared<ReduceSumOptimizer>());
+  common_pm->AddPass(std::make_shared<ReduceOptimizer>());
   common_pm->AddPass(std::make_shared<ConvertConstInputToTensorInput>());
   optimizer->AddPassManager(common_pm);
   (void)optimizer->Optimize(kernel_graph);
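The three hunks above rename ReduceSumOptimizer to ReduceOptimizer and re-register it in both pass managers, preparing the same axis handling to cover ReduceMean as well (see the Process hunk below). A minimal standalone sketch of this append-and-run pass-manager pattern (mock Graph/Pass types, not MindSpore's PassManager API):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Graph { std::string ir; };
using Pass = std::function<bool(Graph *)>;  // returns true if it changed the graph

int main() {
  std::vector<Pass> common_pm;
  // The renamed pass now rewrites both ReduceSum and ReduceMean nodes.
  common_pm.push_back([](Graph *g) {
    if (g->ir == "ReduceSum" || g->ir == "ReduceMean") { g->ir += "+axis_assist"; return true; }
    return false;
  });
  Graph g{"ReduceMean"};
  bool changed = true;
  while (changed) {  // run all passes to a fixed point, like PassManager::Run
    changed = false;
    for (auto &pass : common_pm) changed = pass(&g) || changed;
  }
  std::cout << g.ir << '\n';  // prints: ReduceMean+axis_assist
}
```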
@@ -14,7 +14,7 @@
  * limitations under the License.
  */

-#include "backend/common/pass/reduce_sum_optimizer.h"
+#include "backend/common/pass/reduce_optimizer.h"
 #include <vector>
 #include "include/common/utils/anfalgo.h"
 #include "utils/ms_context.h"
@@ -25,7 +25,7 @@ namespace {
 constexpr int axis_input_index = 2;
 }  // namespace

-AnfNodePtr ReduceSumOptimizer::NewRankOp(const AnfNodePtr &cnode, const KernelGraphPtr &kernel_graph) const {
+AnfNodePtr ReduceOptimizer::NewRankOp(const AnfNodePtr &cnode, const KernelGraphPtr &kernel_graph) const {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(kernel_graph);
   std::vector<AnfNodePtr> rank_inputs;
@@ -39,7 +39,7 @@ AnfNodePtr ReduceSumOptimizer::NewRankOp(const AnfNodePtr &cnode, const KernelGr
   return rank_op;
 }

-AnfNodePtr ReduceSumOptimizer::NewRangeOp(const AnfNodePtr &rank_op, const KernelGraphPtr &kernel_graph) const {
+AnfNodePtr ReduceOptimizer::NewRangeOp(const AnfNodePtr &rank_op, const KernelGraphPtr &kernel_graph) const {
   MS_EXCEPTION_IF_NULL(rank_op);
   MS_EXCEPTION_IF_NULL(kernel_graph);
   std::vector<AnfNodePtr> range_inputs;
@@ -68,15 +68,15 @@ AnfNodePtr ReduceSumOptimizer::NewRangeOp(const AnfNodePtr &rank_op, const Kerne
   return range_op;
 }

-AnfNodePtr ReduceSumOptimizer::InsertAssistNode(const CNodePtr &cnode, const KernelGraphPtr &) const {
+AnfNodePtr ReduceOptimizer::InsertAssistNode(const CNodePtr &cnode, const KernelGraphPtr &) const {
   // the input dim is unknown, need rank + range, don't supported now;
   MS_LOG(EXCEPTION)
     << "Can not support the case that input is dim unknown and axis is empty or axis contain value less 0. node: "
     << trace::DumpSourceLines(cnode);
 }

-AnfNodePtr ReduceSumOptimizer::CreateValueNodeWithVector(const CNodePtr &cnode, const KernelGraphPtr &kernel_graph,
-                                                         const std::vector<int64_t> &axis) const {
+AnfNodePtr ReduceOptimizer::CreateValueNodeWithVector(const CNodePtr &cnode, const KernelGraphPtr &kernel_graph,
+                                                      const std::vector<int64_t> &axis) const {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(kernel_graph);
   auto new_value_node = NewValueNode(MakeValue<std::vector<int64_t>>(axis));
@@ -92,8 +92,8 @@ AnfNodePtr ReduceSumOptimizer::CreateValueNodeWithVector(const CNodePtr &cnode,
   return new_node;
 }

-AnfNodePtr ReduceSumOptimizer::HandleAxisWithEmptyTensor(const CNodePtr &cnode, const KernelGraphPtr &kernel_graph,
-                                                         const AnfNodePtr &axis_input) const {
+AnfNodePtr ReduceOptimizer::HandleAxisWithEmptyTensor(const CNodePtr &cnode, const KernelGraphPtr &kernel_graph,
+                                                      const AnfNodePtr &axis_input) const {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(axis_input);
   MS_EXCEPTION_IF_NULL(kernel_graph);
@@ -125,7 +125,7 @@ AnfNodePtr ReduceSumOptimizer::HandleAxisWithEmptyTensor(const CNodePtr &cnode,
 // 2: the value of axis_input contain the value less 0,
 // the new tensor of the new value node should be "shape.size() + the_old_value_less_0",
 // the shape is the first input'shape of ReduceSum;
-AnfNodePtr ReduceSumOptimizer::NewAssistValueNode(const CNodePtr &cnode, const KernelGraphPtr &kernel_graph) const {
+AnfNodePtr ReduceOptimizer::NewAssistValueNode(const CNodePtr &cnode, const KernelGraphPtr &kernel_graph) const {
   // axis is a tuple ,maybe empty or contain a value less 0;
   if (cnode->inputs().size() <= axis_input_index) {
     return nullptr;
@@ -170,18 +170,18 @@ AnfNodePtr ReduceSumOptimizer::NewAssistValueNode(const CNodePtr &cnode, const K
   return nullptr;
 }

-const AnfNodePtr ReduceSumOptimizer::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
-                                             const EquivPtr &) const {
+const AnfNodePtr ReduceOptimizer::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
+                                          const EquivPtr &) const {
   MS_EXCEPTION_IF_NULL(func_graph);
   MS_EXCEPTION_IF_NULL(node);
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
   auto op_name = common::AnfAlgo::GetCNodeName(cnode);
-  if (op_name != kReduceSumOpName) {
+  if (op_name != kReduceSumOpName && op_name != kReduceMeanOpName) {
     MS_LOG(DEBUG) << "Current node is not: " << kReduceSumOpName << ", skip!";
     return nullptr;
   }
-  if (!common::AnfAlgo::IsDynamicShape(cnode)) {
+  if (!common::AnfAlgo::IsDynamicShape(cnode) && !common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, cnode)) {
     MS_LOG(DEBUG) << "Current node is not dynamic shape, skip!";
     return nullptr;
   }
@@ -191,7 +191,7 @@ const AnfNodePtr ReduceSumOptimizer::Process(const FuncGraphPtr &func_graph, con
   return NewAssistValueNode(cnode, kernel_graph);
 }

-const BaseRef ReduceSumOptimizer::DefinePattern() const {
+const BaseRef ReduceOptimizer::DefinePattern() const {
   std::shared_ptr<Var> V = std::make_shared<CondVar>(UnVisited);
   std::shared_ptr<Var> Xs = std::make_shared<SeqVar>();
   return VectorRef({V, Xs});
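The pass materializes the reduce axis as an assist value node; per the comments around NewAssistValueNode, an empty axis tuple means reducing every dimension and a negative entry is wrapped by adding the input rank. A standalone illustration of that assumed normalization (not the MindSpore implementation):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> NormalizeAxis(const std::vector<int64_t> &axis, int64_t rank) {
  std::vector<int64_t> result;
  if (axis.empty()) {
    for (int64_t i = 0; i < rank; ++i) result.push_back(i);  // reduce every dimension
    return result;
  }
  for (auto a : axis) result.push_back(a < 0 ? a + rank : a);  // wrap negative entries
  return result;
}

int main() {
  for (auto a : NormalizeAxis({}, 3)) std::cout << a << ' ';       // 0 1 2
  std::cout << '\n';
  for (auto a : NormalizeAxis({-1, 0}, 3)) std::cout << a << ' ';  // 2 0
  std::cout << '\n';
}
```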
@@ -13,18 +13,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_REDUCE_SUM_OPTIMIZER_H_
-#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_REDUCE_SUM_OPTIMIZER_H_
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_REDUCE_OPTIMIZER_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_REDUCE_OPTIMIZER_H_
 #include <memory>
 #include <vector>
 #include "backend/common/optimizer/optimizer.h"

 namespace mindspore {
 namespace opt {
-class ReduceSumOptimizer : public PatternProcessPass {
+class ReduceOptimizer : public PatternProcessPass {
  public:
-  explicit ReduceSumOptimizer(bool multigraph = true) : PatternProcessPass("reduce_sum_optimizer", multigraph) {}
-  ~ReduceSumOptimizer() override = default;
+  explicit ReduceOptimizer(bool multigraph = true) : PatternProcessPass("reduce_optimizer", multigraph) {}
+  ~ReduceOptimizer() override = default;
   const BaseRef DefinePattern() const override;
   const AnfNodePtr Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const override;

@@ -41,4 +41,4 @@ class ReduceSumOptimizer : public PatternProcessPass {
 }  // namespace opt
 }  // namespace mindspore

-#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_REDUCE_SUM_OPTIMIZER_H_
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_REDUCE_OPTIMIZER_H_
@@ -444,9 +444,8 @@ void SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel, const InputTenso
   std::ostringstream buf;
   auto prim = common::AnfAlgo::GetCNodePrimitive(kernel);
   MS_EXCEPTION_IF_NULL(prim);
-  buf << GetOpRunDeviceTarget(prim) << "_";
-  buf << prim->id() << "_";
-  bool has_const_input = false;
+  buf << GetOpRunDeviceTarget(prim) << "_dynamic" << op_run_info->base_op_run_info.use_dynamic_shape_process << "_";
+  buf << prim->name() << "_";
   for (size_t i = 0; i < input_tensors.size(); ++i) {
     auto &tensor = input_tensors[i];
     MS_EXCEPTION_IF_NULL(tensor);
@@ -472,7 +471,6 @@ void SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel, const InputTenso
     }
     // For constant input
     if (input_tensors_mask[i] == kValueNodeTensorMask) {
-      has_const_input = true;
       buf << common::AnfAlgo::GetTensorValueString(tensor);
     }
     buf << "_";
@@ -483,20 +481,6 @@ void SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel, const InputTenso
   (void)std::for_each(attr_map.begin(), attr_map.end(),
                       [&buf](const auto &element) { buf << element.second->ToString(); });

-  // Generally, different inputs can have different output; but different constant inputs may lead to different output
-  if (has_const_input) {
-    buf << "_";
-    const AbstractBasePtr &abstract = kernel->abstract();
-    MS_EXCEPTION_IF_NULL(abstract);
-    auto build_shape = abstract->BuildShape();
-    MS_EXCEPTION_IF_NULL(build_shape);
-    auto build_type = abstract->BuildType();
-    MS_EXCEPTION_IF_NULL(build_type);
-    // Get output shape
-    buf << build_shape->ToString();
-    // Get output dtype
-    buf << build_type->type_id();
-  }
   *graph_info = buf.str();
 }

@@ -840,6 +824,10 @@ void SessionBasic::GetOpInputTensors(const CNodePtr &cnode,
                                      InputTensorInfo *input_tensor_info) const {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(input_tensor_info);
+  auto context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context);
+  std::string device_target = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  auto is_mutable = common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, cnode);
   std::vector<size_t> const_input_attr_index = {};
   GetConstValueDepend(cnode, &const_input_attr_index);
   const auto input_tensor_num = common::AnfAlgo::GetInputTensorNum(cnode);
@@ -866,8 +854,13 @@ void SessionBasic::GetOpInputTensors(const CNodePtr &cnode,
           is_forward_output = true;
         }
       }
-      input_tensor_info->input_tensors_mask.emplace_back(
-        (is_value_node || !is_forward_output) ? kValueNodeTensorMask : kParameterDataTensorMask);
+      if (is_mutable && device_target == kAscendDevice) {
+        input_tensor_info->input_tensors_mask.emplace_back(
+          (is_value_node && !is_forward_output) ? kValueNodeTensorMask : kParameterDataTensorMask);
+      } else {
+        input_tensor_info->input_tensors_mask.emplace_back(
+          (is_value_node || !is_forward_output) ? kValueNodeTensorMask : kParameterDataTensorMask);
+      }
     } else if (real_input->isa<Parameter>()) {
       tensor = GetParameterOutputTensor(real_input, parameter_index, graph_inputs);
       input_tensor_info->input_tensors_mask.emplace_back(tensor->is_parameter() ? kParameterWeightTensorMask
@@ -1276,6 +1269,11 @@ std::shared_ptr<KernelGraph> SessionBasic::ConstructSingleOpGraph(const BackendO
   // set execution order
   auto cnode = graph->NewCNode(inputs);
   MS_EXCEPTION_IF_NULL(cnode);
+  auto is_mutable_kernel = common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, cnode) &&
+                           common::AnfAlgo::GetNodeAttr<bool>(cnode, kAttrMutableKernel);
+  if (is_mutable_kernel) {
+    graph->set_flag(kAttrMutableKernel, true);
+  }
   // set abstract,which include inferred shapes and types
   cnode->set_abstract(op_run_info->base_op_run_info.abstract);
   common::AnfAlgo::SetNodeAttr(kAttrOutputIsDynamicShape, MakeValue(op_run_info->base_op_run_info.has_dynamic_output),
@@ -1288,7 +1286,7 @@ std::shared_ptr<KernelGraph> SessionBasic::ConstructSingleOpGraph(const BackendO
   // set execution order
   std::vector<CNodePtr> exe_order = {cnode};
   graph->set_execution_order(exe_order);
-  if (is_ascend) {
+  if (is_ascend && !is_mutable_kernel) {
     graph->set_output(cnode);
   } else {
     CreateOutputNode(cnode, graph);
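Both GetSingleOpGraphInfo variants (here and in the PyNative frontend below) now key the single-op graph cache on the device target plus a `_dynamic<flag>_` segment and the primitive name rather than the per-instance `prim->id()`, and they drop the output shape/type suffix that used to be appended for constant inputs. A simplified standalone sketch of the key construction (real keys also fold in tensor shapes, formats, masks, and attribute values):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string MakeGraphInfoKey(const std::string &target, bool use_dynamic_shape_process,
                             const std::string &op_name, const std::vector<std::string> &inputs) {
  std::ostringstream buf;
  // Embedding the dynamic flag keys static and dynamic compilations separately,
  // so toggling dynamic-shape execution cannot hit a stale cached graph.
  buf << target << "_dynamic" << use_dynamic_shape_process << "_";
  buf << op_name << "_";
  for (const auto &in : inputs) buf << in << "_";
  return buf.str();
}

int main() {
  std::cout << MakeGraphInfoKey("Ascend", true, "Add", {"f32[2,3]", "f32[2,3]"}) << '\n';
  // prints: Ascend_dynamic1_Add_f32[2,3]_f32[2,3]_
}
```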
@@ -682,9 +682,13 @@ void MindRTBackend::RunGraphBySingleOp(const GraphCompilerInfo &graph_compiler_i
     graph_compiler_->CalculateForwardOpOutputCount(graph, inputs[graph_index], &forward_op_output_tensor_id_);
   }

+  auto is_mutable = graph->has_flag(kAttrMutableKernel);
   bool use_dynamic_shape_process = root_graph_->has_flag(kFlagUseDynamicShapeProcess);
   py::gil_scoped_release release;
   for (const auto &kernel : graph->execution_order()) {
+    if (is_mutable) {
+      common::AnfAlgo::SetNodeAttr(kAttrMutableKernel, MakeValue(true), kernel);
+    }
     InputTensorInfo input_tensor_info;
     VectorRef op_outputs;
     if (common::AnfAlgo::IsControlOpExecInBackend(kernel)) {
@@ -712,6 +716,9 @@ void MindRTBackend::RunGraphBySingleOp(const GraphCompilerInfo &graph_compiler_i
       graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, use_dynamic_shape_process,
                                                       &op_run_info, &graph_info, &graph_output_info);
       if (use_dynamic_shape_process) {
+        op_run_info->op_prim->AddAttr(kAttrMutableKernel, MakeValue(true));
+        op_run_info->op_prim->AddAttr(kAttrInputIsDynamicShape, MakeValue(true));
+        op_run_info->op_prim->AddAttr(kAttrOutputIsDynamicShape, MakeValue(true));
         RunOpDynamic(op_run_info, &op_outputs);
       } else {
         RunOp(op_run_info, &op_outputs);
@@ -725,6 +732,9 @@ void MindRTBackend::RunGraphBySingleOp(const GraphCompilerInfo &graph_compiler_i
     }
     WaitTaskFinish();
   }
+  if (is_dynamic_ || root_graph_->has_flag(kFlagUseDynamicShapeProcess)) {
+    ClearResource();
+  }
 }

 void MindRTBackend::RunGraphByCondition(const ActorInfo &actor_info, const GraphCompilerInfo &graph_compiler_info,
@@ -1398,5 +1408,15 @@ void MindRTBackend::UpdateOutput(const std::vector<session::KernelWithIndex> &ou
     outputs->emplace_back(output_tensor);
   }
 }
+
+void MindRTBackend::ClearResource() {
+  graph_compiler_ = std::make_shared<GraphCompiler>();
+  graph_id_to_device_context_.clear();
+  func_graph_to_kernel_graph_ids_.clear();
+  graph_info_to_device_context_.clear();
+  control_nodes_.clear();
+  actor_to_graph_compiler_info_.clear();
+  cnode_ref_counts_.clear();
+}
 }  // namespace compile
 }  // namespace mindspore
@@ -130,6 +130,9 @@ class BACKEND_EXPORT MindRTBackend : public MindRTBackendBase {

   void OpRunCallback(const std::shared_ptr<pynative::OpTaskContext> &context);

+  // Clean the compilation cache to avoid memory leakage in dynamic shape scenarios.
+  void ClearResource();
+
   // Cache output tensor ref count of kernels for back propagation graph in PyNative mode.
   std::map<GraphId, std::map<KernelWithIndex, size_t>> cnode_ref_counts_;
@@ -285,9 +285,9 @@ const ActorInfo &MindRTBackendBase::CompileGraphs(const FuncGraphPtr &func_graph
   }

   AnfUtils::CloseAbstractLock();
-  bool is_dynamic = IsFuncGraphDynamicShapeOrStruct(func_graph, func_graph_cell_id);
+  is_dynamic_ = IsFuncGraphDynamicShapeOrStruct(func_graph, func_graph_cell_id);
   AnfUtils::OpenAbstractLock();
-  if (!is_dynamic) {
+  if (!is_dynamic_) {
     auto iter = graph_actor_infos_.find(func_graph_cell_id);
     if (iter != graph_actor_infos_.end()) {
       return iter->second;
@@ -405,9 +405,13 @@ void MindRTBackendBase::CompileGraph(const GraphSegmentPtr &segment, device::Run

   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
-  // Compile graph.
-  auto graph_id =
-    graph_compiler_->CompileGraph(segment, outputs, device_context, run_mode, ms_execution_mode_ == kPynativeMode);
+  GraphId graph_id;
+  if (is_dynamic_ || root_graph_->has_flag(kFlagUseDynamicShapeProcess)) {
+    graph_id = graph_compiler_->CompileDynamicGraph(segment, outputs, device_context);
+  } else {
+    graph_id =
+      graph_compiler_->CompileGraph(segment, outputs, device_context, run_mode, ms_execution_mode_ == kPynativeMode);
+  }

   graph_id_to_device_context_[graph_id] = device_context;
@@ -150,6 +150,7 @@ class BACKEND_EXPORT MindRTBackendBase : public Backend {
   // Save the mapping between cell id and actor info.
   mindspore::HashMap<std::string, ActorInfo> graph_actor_infos_;
   bool enable_backend_dynamic_detect_{false};
+  bool is_dynamic_{false};
   FuncGraphPtr root_graph_;
   GraphPartitionPtr graph_partition_;
   std::shared_ptr<GraphCompiler> graph_compiler_;
@@ -608,6 +608,7 @@ constexpr auto kAttrInputIsDynamicShape = "input_is_dynamic_shape";
 constexpr auto kAttrOutputIsDynamicShape = "output_is_dynamic_shape";
 constexpr auto kAttrPynativeNextOpName = "next_op";
 constexpr auto kAttrPynativeNextIndex = "next_index";
+constexpr auto kAttrMutableKernel = "mutable_kernel";
 constexpr auto kAttrCompileInfo = "compile_info";
 constexpr auto kAttrFusionType = "fusion_type";
 constexpr auto kAttrStride = "stride";
@@ -89,9 +89,8 @@ void GetSingleOpGraphInfo(const FrontendOpRunInfoPtr &op_run_info, const std::st
                       << tensors_mask.size();
   }
   std::ostringstream buf;
-  buf << cur_target << "_";
+  buf << cur_target << "_dynamic" << op_run_info->base_op_run_info.use_dynamic_shape_process << "_";
   buf << op_run_info->base_op_run_info.op_name << "_";
-  bool has_const_input = false;
   const auto &op_prim = op_run_info->op_prim;
   MS_EXCEPTION_IF_NULL(op_prim);
   bool has_hidden_side_effect = op_prim->HasAttr(GRAPH_FLAG_SIDE_EFFECT_HIDDEN);
@@ -120,7 +119,6 @@ void GetSingleOpGraphInfo(const FrontendOpRunInfoPtr &op_run_info, const std::st
     }
     // For constant input
     if (tensors_mask[index] == kValueNodeTensorMask) {
-      has_const_input = true;
       buf << common::AnfAlgo::GetTensorValueString(input_tensor);
     }
     buf << "_";
@@ -130,20 +128,6 @@ void GetSingleOpGraphInfo(const FrontendOpRunInfoPtr &op_run_info, const std::st
   (void)std::for_each(attr_map.begin(), attr_map.end(),
                       [&buf](const auto &element) { buf << element.second->ToString(); });

-  // Constant input affects output, operators like DropoutGenMask whose output is related to values of input when input
-  // shapes are the same but values are different
-  if (has_const_input) {
-    buf << "_";
-    auto abstr = op_run_info->base_op_run_info.abstract;
-    MS_EXCEPTION_IF_NULL(abstr);
-    auto build_shape = abstr->BuildShape();
-    MS_EXCEPTION_IF_NULL(build_shape);
-    buf << build_shape->ToString();
-    auto build_type = abstr->BuildType();
-    MS_EXCEPTION_IF_NULL(build_type);
-    buf << build_type->type_id();
-  }

   // Operator with hidden side effect.
   if (has_hidden_side_effect) {
     buf << "_" << std::to_string(op_prim->id());
@@ -210,8 +194,7 @@ FrontendOpRunInfoPtr ForwardExecutor::GenerateOpRunInfo(const py::args &args) co
   // Used for async run
   op_run_info->grad_flag = grad()->grad_flag();
   op_run_info->custom_bprop_cell_count = grad()->custom_bprop_cell_count();
-  op_run_info->base_op_run_info.use_dynamic_shape_process =
-    (device_target_ == kAscendDevice ? false : grad()->use_dynamic_shape_process());
+  op_run_info->base_op_run_info.use_dynamic_shape_process = grad()->use_dynamic_shape_process();
   op_run_info->base_op_run_info.op_name = args[static_cast<size_t>(RunOpArgsEnum::PY_NAME)].cast<std::string>();
   op_run_info->base_op_run_info.lazy_build = lazy_build_;
   PyNativeAlgo::PyParser::SetPrim(op_run_info, args[static_cast<size_t>(RunOpArgsEnum::PY_PRIM)]);
@@ -454,6 +437,9 @@ ValuePtr ForwardExecutor::RunOpInMs(const FrontendOpRunInfoPtr &op_run_info) {
   MS_EXCEPTION_IF_NULL(cur_mind_rt_backend);
   bool use_dynamic_shape_process = op_run_info->base_op_run_info.use_dynamic_shape_process;
   if (use_dynamic_shape_process) {
+    backend_op_run_info->op_prim->AddAttr(kAttrMutableKernel, MakeValue(true));
+    backend_op_run_info->op_prim->AddAttr(kAttrInputIsDynamicShape, MakeValue(true));
+    backend_op_run_info->op_prim->AddAttr(kAttrOutputIsDynamicShape, MakeValue(true));
     cur_mind_rt_backend->RunOpDynamic(backend_op_run_info, &outputs);
   } else {
     cur_mind_rt_backend->RunOp(backend_op_run_info, &outputs);
@@ -674,8 +674,7 @@ void GradExecutor::GetGradGraph(const ad::GradAttr &grad_attr, const std::vector
   auto bprop_graph = GetBpropGraph(grad_attr, w_args, p_args);
   MS_EXCEPTION_IF_NULL(bprop_graph);
   bprop_graph->set_flag(kFlagIsPynativeBpropGraph, true);
-  bool use_dynamic_shape_process = (forward()->device_target() == kAscendDevice ? false : use_dynamic_shape_process_);
-  bprop_graph->set_flag(kFlagUseDynamicShapeProcess, use_dynamic_shape_process);
+  bprop_graph->set_flag(kFlagUseDynamicShapeProcess, use_dynamic_shape_process_);
   MS_EXCEPTION_IF_NULL(top_input_args_info_);
   bprop_graph->set_attr(kAttrFuncGraphCellId, MakeValue(top_input_args_info_->obj_id));
   auto resource = top_cell()->resource();
@@ -1750,12 +1749,12 @@ bool GradExecutor::IsDynamicDetectNodeInfoChange(const DynamicDetectNodeInfoPtr
   MS_EXCEPTION_IF_NULL(old_node_info);

   // 1.Detect ms_function phase
-  if (is_ms_function_node != old_node_info->is_graph_node ||
-      (is_ms_function_node && graph_phase != old_node_info->graph_phase)) {
-    MS_LOG(DEBUG) << "graph is dynamic, old is_graph_node:" << old_node_info->is_graph_node
+  if (is_ms_function_node) {
+    MS_LOG(DEBUG) << "Graph info, old is_graph_node:" << old_node_info->is_graph_node
                   << " new is_graph_node:" << is_ms_function_node << " old graph_phase" << old_node_info->graph_phase
                   << " new graph_phase:" << graph_phase;
-    return true;
+    return is_ms_function_node != old_node_info->is_graph_node ||
+           (is_ms_function_node && graph_phase != old_node_info->graph_phase);
   }

   // 2.Detect cnode prim
@@ -50,7 +50,7 @@ ge::DataType GeTypesConvert::TransTypeIdToGeDataType(TypeId type_id) {
     {TypeId::kNumberTypeInt, ge::DataType::DT_INT32},     {TypeId::kNumberTypeInt64, ge::DataType::DT_INT64},
     {TypeId::kNumberTypeUInt32, ge::DataType::DT_UINT32}, {TypeId::kNumberTypeUInt, ge::DataType::DT_UINT32},
     {TypeId::kNumberTypeUInt64, ge::DataType::DT_UINT64}, {TypeId::kNumberTypeBool, ge::DataType::DT_BOOL},
-    {TypeId::kNumberTypeInt64, ge::DataType::DT_DOUBLE},  {TypeId::kTypeUnknown, ge::DataType::DT_UNDEFINED}};
+    {TypeId::kNumberTypeFloat64, ge::DataType::DT_DOUBLE}, {TypeId::kTypeUnknown, ge::DataType::DT_UNDEFINED}};
   auto iter = data_type_map.find(type_id);
   if (iter == data_type_map.end()) {
     MS_LOG(EXCEPTION) << "Invalid data type:" << type_id << ": " << TypeIdLabel(type_id);
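The old entry reused the key TypeId::kNumberTypeInt64, which already maps to DT_INT64 two entries earlier; since a std::map built from an initializer list keeps only the first occurrence of a key, the DT_DOUBLE mapping was silently dead and kNumberTypeFloat64 had no entry at all, so double tensors hit the "Invalid data type" exception. A standalone demonstration of that pitfall:

```cpp
#include <iostream>
#include <map>

int main() {
  const std::map<int, const char *> m = {
    {64, "DT_INT64"},   // first entry wins
    {64, "DT_DOUBLE"},  // duplicate key: silently dropped during construction
  };
  std::cout << m.size() << " entry, 64 -> " << m.at(64) << '\n';
  // prints: 1 entry, 64 -> DT_INT64
}
```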
@@ -35,6 +35,10 @@
 #include "utils/trace_base.h"
 #include "mindspore/core/ops/op_name.h"

+#include "kernel/common_utils.h"
+#include "kernel/kernel.h"
+#include "kernel/kernel_build_info.h"
+
 namespace mindspore {
 namespace device {
 namespace ascend {
@@ -704,6 +708,45 @@ void ResetPreFixedFormat(const CNodePtr &kernel_node, kernel::KernelBuildInfoPtr
   common::AnfAlgo::EraseNodeAttr(kAttrFixedInputFormat, kernel_node);
   common::AnfAlgo::EraseNodeAttr(kAttrFixedOutputFormat, kernel_node);
 }
+
+void SetKernelWithDefaultInfo(const CNodePtr &kernel_node) {
+  auto builder = kernel::KernelBuildInfo::KernelBuildInfoBuilder();
+  builder.SetProcessor(kernel::AICORE);
+  builder.SetFusionType(kernel::UNKNOWN_FUSION_TYPE);
+  builder.SetOpPattern(kernel::kCommonPattern);
+  builder.SetKernelType(TBE_KERNEL);
+
+  auto input_size = common::AnfAlgo::GetInputTensorNum(kernel_node);
+  std::vector<std::string> inputs_format;
+  std::vector<TypeId> inputs_device_type;
+  std::vector<std::string> inputs_reshape_type(input_size, "");
+  for (size_t i = 0; i < input_size; ++i) {
+    auto type_id = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, i);
+    if (type_id == kTypeUnknown) {
+      type_id = common::AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, i);
+    }
+    auto format = AnfAlgo::GetPrevNodeOutputFormat(kernel_node, i);
+    inputs_device_type.emplace_back(type_id);
+    inputs_format.emplace_back(format);
+  }
+  builder.SetInputsDeviceType(inputs_device_type);
+  builder.SetInputsFormat(inputs_format);
+  builder.SetInputsReshapeType(inputs_reshape_type);
+
+  auto output_size = common::AnfAlgo::GetOutputTensorNum(kernel_node);
+  std::vector<std::string> outputs_format;
+  std::vector<TypeId> outputs_device_type;
+  std::vector<std::string> outputs_reshape_type(output_size, "");
+  for (size_t i = 0; i < output_size; ++i) {
+    auto type_id = common::AnfAlgo::GetOutputInferDataType(kernel_node, i);
+    outputs_device_type.emplace_back(type_id);
+    outputs_format.emplace_back(kOpFormat_DEFAULT);
+  }
+  builder.SetOutputsDeviceType(outputs_device_type);
+  builder.SetOutputsFormat(outputs_format);
+  builder.SetOutputsReshapeType(outputs_reshape_type);
+  AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), kernel_node.get());
+}
 }  // namespace

 void SetTensorDeviceInfo(const CNodePtr &kernel_node) {
@@ -890,6 +933,11 @@ void SetRaiseOrReduceFlag(const CNodePtr &kernel_node, KernelSelectStatus status

 void SetAclKernelInfo(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
+  if (!common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, kernel_node)) {
+    MS_LOG(INFO) << "No is_dynamic_kernel attr found, cannot set ACL KERNEL for " << kernel_node->DebugString();
+    return;
+  }
+
   KernelType kernel_type = AnfAlgo::GetKernelType(kernel_node);
   if (kernel_type != AICPU_KERNEL && kernel_type != TBE_KERNEL) {
     MS_LOG(INFO) << "Current node don't support acl kernel launch! Node info:" << kernel_node->DebugString();
@@ -903,22 +951,11 @@ void SetAclKernelInfo(const CNodePtr &kernel_node) {
     MS_LOG(INFO) << "Current mode or device don't support acl kernel launch! Node info:" << kernel_node->DebugString();
     return;
   }
-  if (!common::AnfAlgo::IsDynamicShape(kernel_node)) {
-    return;
-  }
-
-  if (common::AnfAlgo::IsGraphKernel(kernel_node) || IsPrimitiveCNode(kernel_node, prim::kPrimCustom)) {
-    MS_LOG(INFO) << "Current node is graph kernel or custom io! Node info:" << kernel_node->DebugString();
-    return;
-  }
   auto op_type = common::AnfAlgo::GetCNodeName(kernel_node);
   if (kAclBlackList.count(op_type) != 0) {
     MS_LOG(INFO) << "Current node in acl black list! Node info:" << kernel_node->DebugString();
     return;
   }
-  if (kAclKernelSet.count(op_type) == 0) {
-    MS_LOG(INFO) << "Current node in acl black list! Node info:" << kernel_node->DebugString();
-    return;
-  }

   // Update node's kernel type to acl.
   auto new_builder =
@@ -988,15 +1025,24 @@ std::tuple<KernelSelectStatus, std::string, ExceptionType> SelectKernelInfoWithM
   }
   // The kernel info can not find in ai_cpu kernel lists and ai_core kernel lists
   if (select_status == kNoMatched) {
-    GatherInputAndOutputInferType(aicpu_in_out_info, kernel_node);
-    std::get<0>(result) = select_status;
-    auto [msg, etype] = CollectNotMatchMessage(kernel_info_list, aicpu_kernel_info_list, aicore_in_out_info,
-                                               aicpu_in_out_info, kernel_node);
-    constexpr int one = 1;
-    constexpr int two = 2;
-    std::get<one>(result) = msg;
-    std::get<two>(result) = etype;
-    return result;
+    if (common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, kernel_node)) {
+      MS_LOG(WARNING) << "NOT FOUND KernelBuildInfo for " << kernel_node->fullname_with_scope()
+                      << ". Set default KernelBuildInfo.";
+      SetKernelWithDefaultInfo(kernel_node);
+      SetTensorDeviceInfo(kernel_node);
+      select_status = kStatusAllMatched;
+      SetAclKernelInfo(kernel_node);
+    } else {
+      GatherInputAndOutputInferType(aicpu_in_out_info, kernel_node);
+      std::get<0>(result) = select_status;
+      auto [msg, etype] = CollectNotMatchMessage(kernel_info_list, aicpu_kernel_info_list, aicore_in_out_info,
+                                                 aicpu_in_out_info, kernel_node);
+      constexpr int one = 1;
+      constexpr int two = 2;
+      std::get<one>(result) = msg;
+      std::get<two>(result) = etype;
+      return result;
+    }
  }
   SetRaiseOrReduceFlag(kernel_node, select_status);
   std::get<0>(result) = select_status;
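With the mutable-kernel attribute present, a node whose kernel info matches neither the AI Core nor the AI CPU candidate lists no longer aborts selection; it falls back to a synthesized default build info and may then be routed to ACL. A standalone sketch of that fallback shape, under mock types (SelectRegistered and BuildInfo are illustrative, not MindSpore APIs):

```cpp
#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>

struct BuildInfo { std::string format; std::string dtype; };

std::optional<BuildInfo> SelectRegistered(const std::string &op) {
  (void)op;
  return std::nullopt;  // pretend nothing in the registered kernel lists matched
}

BuildInfo SelectWithFallback(const std::string &op, bool is_mutable_kernel) {
  if (auto info = SelectRegistered(op)) return *info;
  if (!is_mutable_kernel) throw std::runtime_error("no kernel matched for " + op);
  // Fallback: default format plus the inferred dtype, mirroring SetKernelWithDefaultInfo.
  return BuildInfo{"DefaultFormat", "inferred"};
}

int main() {
  auto info = SelectWithFallback("ReduceMean", /*is_mutable_kernel=*/true);
  std::cout << info.format << ' ' << info.dtype << '\n';  // DefaultFormat inferred
}
```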
@@ -16,6 +16,8 @@

 #include "plugin/device/ascend/hal/hardware/ascend_graph_optimization.h"
+#include <set>
+#include <unordered_set>
 #include <memory>
 #include <string>
 #include "backend/common/optimizer/common_backend_optimization.h"
 #include "plugin/device/ascend/optimizer/ascend_backend_optimization.h"
@@ -53,6 +55,8 @@ void RemoveUnusedValueNode(const KernelGraphPtr &graph) {
     graph->RemoveNodeFromGraph(value_node);
   }
 }
+
+const std::unordered_set<std::string> kDefaultFormatAclOps = {kAddNOpName};
 }  // namespace

 void AscendGraphOptimization::Reset() {
@@ -87,9 +91,14 @@ void AscendGraphOptimization::OptimizeGraph(const KernelGraphPtr &graph) {

 void AscendGraphOptimization::OptimizeSingleOpGraph(const KernelGraphPtr &graph) {
   MS_EXCEPTION_IF_NULL(graph);
-  opt::RunOpAscendBackendIRFusionOptimization(graph);
-  SelectKernel(graph);
-  opt::RunOpAscendBackendOptimization(graph);
+
+  if (graph->has_flag(kAttrMutableKernel)) {
+    AclOpOptimize(graph);
+  } else {
+    opt::RunOpAscendBackendIRFusionOptimization(graph);
+    SelectKernel(graph);
+    opt::RunOpAscendBackendOptimization(graph);
+  }

   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
@@ -102,6 +111,55 @@ void AscendGraphOptimization::OptimizeSingleOpGraph(const KernelGraphPtr &graph)
   memo_.clear();
 }

+void AscendGraphOptimization::AclOpOptimize(const KernelGraphPtr &graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  opt::RunOpIRFissionForAcl(graph);
+
+  auto nodes = graph->execution_order();
+  for (auto &node : nodes) {
+    common::AnfAlgo::SetNodeAttr(kAttrMutableKernel, MakeValue(true), node);
+  }
+  SelectKernel(graph);
+
+  // Change format to DefaultFormat.
+  bool need_change_format = false;
+  for (auto &node : nodes) {
+    if (kDefaultFormatAclOps.count(common::AnfAlgo::GetCNodeName(node))) {
+      need_change_format = true;
+      auto new_builder =
+        std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(AnfAlgo::GetSelectKernelBuildInfo(node));
+      MS_EXCEPTION_IF_NULL(new_builder);
+      auto inputs_format = AnfAlgo::GetAllInputFormats(node);
+      auto outputs_format = AnfAlgo::GetAllOutputFormats(node);
+      new_builder->SetInputsFormat(std::vector<std::string>(inputs_format.size(), kOpFormat_DEFAULT));
+      new_builder->SetOutputsFormat(std::vector<std::string>(outputs_format.size(), kOpFormat_DEFAULT));
+      AnfAlgo::SetSelectKernelBuildInfo(new_builder->Build(), node.get());
+    }
+  }
+
+  bool has_aicpu = std::any_of(nodes.begin(), nodes.end(),
+                               [](const CNodePtr &node) { return AnfAlgo::GetKernelType(node) == AICPU_KERNEL; });
+  if (has_aicpu || need_change_format) {
+    // Insert Cast and TransData.
+    opt::RunOpAscendBackendOptimization(graph);
+  } else {
+    // Only insert Cast.
+    opt::AscendMixPrecision(graph);
+  }
+
+  // Replace all TBE_KERNEL with ACL_KERNEL.
+  for (const auto &node : graph->execution_order()) {
+    if (AnfAlgo::GetKernelType(node) == TBE_KERNEL) {
+      auto new_builder =
+        std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(AnfAlgo::GetSelectKernelBuildInfo(node));
+      MS_EXCEPTION_IF_NULL(new_builder);
+      new_builder->SetKernelType(ACL_KERNEL);
+      MS_LOG(INFO) << "SUCCESS SET ACL KERNEL FOR" << node->DebugString();
+      AnfAlgo::SetSelectKernelBuildInfo(new_builder->Build(), node.get());
+    }
+  }
+}
+
 void AscendGraphOptimization::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) {
   MS_EXCEPTION_IF_NULL(graph);
   CheckControlFlowDynamicShape(graph);
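AclOpOptimize ends by deciding how heavy the follow-up optimization must be: the full RunOpAscendBackendOptimization pass (which inserts both Cast and TransData) is only needed when an AICPU kernel is present or some op (currently only AddN, per kDefaultFormatAclOps) forced its format back to DefaultFormat; otherwise the cheaper Cast-only mix-precision pass suffices. A standalone sketch of that decision (mock enum, not MindSpore APIs):

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

enum KernelType { TBE_KERNEL, AICPU_KERNEL, ACL_KERNEL };

int main() {
  std::vector<KernelType> nodes = {TBE_KERNEL, TBE_KERNEL};
  bool need_change_format = false;  // would be set when an op such as AddN needs DefaultFormat
  bool has_aicpu =
    std::any_of(nodes.begin(), nodes.end(), [](KernelType t) { return t == AICPU_KERNEL; });
  std::cout << (has_aicpu || need_change_format ? "Cast + TransData" : "Cast only") << '\n';
}
```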
@@ -66,6 +66,7 @@ class AscendGraphOptimization {

   void GetAllGraphs(const KernelGraphPtr &root_graph);
   void CheckControlFlowDynamicShape(const KernelGraphPtr &root_graph);
+  void AclOpOptimize(const KernelGraphPtr &graph);

   // Manager for the optimized graphs
   FuncGraphManagerPtr graph_manager_;
@@ -306,9 +306,8 @@ bool AscendKernelExecutor::MemoryCopyAsync(const CNodePtr &node, const vector<Ad
                                            const vector<AddressPtr> &outputs) const {
   MS_LOG(DEBUG) << "Launch MemoryCopyAsync instead for kernel " << node->fullname_with_scope();
   if (inputs.size() != 1 || outputs.size() != 1) {
-    MS_LOG(ERROR) << "Kernel " << node->fullname_with_scope() << " input output size should be 1 but"
-                  << " input size is:" << inputs.size() << " output size is:" << outputs.size();
-    return false;
+    MS_LOG(WARNING) << "Kernel " << node->fullname_with_scope() << " input output size should be 1 but"
+                    << " input size is:" << inputs.size() << " output size is:" << outputs.size();
   }

   const auto stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
@@ -28,13 +28,13 @@ namespace mindspore {
 namespace kernel {
 namespace {
 static const std::unordered_set<std::string> kAclStaticList = {kPackOpName,
                                                                kAddNOpName,
                                                                kTensorMoveOpName,
                                                                kConcatDOpName,
                                                                kCheckValidOpName,
                                                                kBiasAddOpName,
                                                                kBiasAddGradOpName,
                                                                kConv3DTransposeOpName,
                                                                kTileOpName,
                                                                kROIAlignName,
                                                                kDynamicGRUV2OpName,
                                                                kSoftmaxCrossEntropyWithLogitsOpName};
@@ -17,6 +17,7 @@

 #include <vector>
+#include <map>
 #include "runtime/rt.h"
 #include "ir/tensor.h"
 #include "include/common/utils/anfalgo.h"
 #include "kernel/common_utils.h"
@@ -36,20 +37,18 @@ int AclKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);

-  if (!common::AnfAlgo::IsDynamicShape(cnode)) {
-    MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope();
-  }
-
   size_t input_num = common::AnfAlgo::GetInputTensorNum(cnode);
   std::vector<size_t> useless_input_lists;
   // Update input size list
-  for (size_t i = 0; i < input_size_list_.size(); ++i) {
+  for (size_t i = 0; i < input_num; ++i) {
     auto index = AnfAlgo::GetInputGraphIdxByKernelIdx(node, i);
     if (index >= input_size_list_.size()) {
       MS_LOG(EXCEPTION) << "Error real index:" << index;
     }
-    TypeId type_id = AnfAlgo::GetInputDeviceDataType(node, index);
+    auto [input, idx] = common::AnfAlgo::GetPrevNodeOutput(node, index);
+    auto type_id = AnfAlgo::GetOutputDeviceDataType(input, idx);
     auto type_size = GetTypeByte(TypeIdToType(type_id));
-    auto shape = AnfAlgo::GetInputDeviceShape(node, index);
+    auto shape = AnfAlgo::GetOutputDeviceShape(input, idx);
     if (IsDynamic(shape)) {
       MS_LOG(ERROR) << "Please check infer op shape before resize, error input index is:" << i;
       return 1;
@@ -65,10 +64,24 @@ int AclKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector
       (void)useless_input_lists.emplace_back(i);
     }
   }
+
+  auto acl_input_size = GeOpConvertor::GetAclInputSize(cnode);
+  if (acl_input_size > input_num) {
+    for (size_t i = input_num; i < acl_input_size; i++) {
+      input_size_list_[i] = SIZE_MAX;
+    }
+  }
   common::AnfAlgo::SetNodeAttr(kAttrUselessInput, MakeValue(useless_input_lists), node);

   // Update output size list
+  size_t output_num = common::AnfAlgo::GetOutputTensorNum(cnode);
   AscendKernelMod::UpdateOutputSizeList();
+  auto acl_output_size = GeOpConvertor::GetAclOutputSize(cnode);
+  if (acl_output_size > output_num) {
+    for (size_t i = output_num; i < acl_output_size; i++) {
+      output_size_list_[i] = SIZE_MAX;
+    }
+  }

   if (!AclUtils::UpdateTensorDesc(node, &input_desc_list_, &output_desc_list_)) {
     MS_LOG(EXCEPTION) << "Fail to update op desc: " << node->fullname_with_scope();
@@ -159,6 +172,10 @@ bool AclKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
     return false;
   }

+  if (rtStreamSynchronize(stream_ptr) != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "aclopCompileAndExecute sync failed";
+  }
+
   MS_LOG(INFO) << "Success launch of node: " << op_type_;
   return true;
 }
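GetAclInputSize/GetAclOutputSize can report more slots than the graph node actually carries (optional ACL anchors); the loops above mark the surplus entries with SIZE_MAX instead of leaving them uninitialized. A standalone sketch of that padding (PadToAclSize is an illustrative name, not a MindSpore API):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

void PadToAclSize(std::vector<size_t> *size_list, size_t real_num, size_t acl_num) {
  size_list->resize(std::max(size_list->size(), acl_num));
  for (size_t i = real_num; i < acl_num; ++i) {
    (*size_list)[i] = SIZE_MAX;  // sentinel: optional ACL slot with no backing tensor
  }
}

int main() {
  std::vector<size_t> sizes = {16, 32};
  PadToAclSize(&sizes, 2, 4);
  for (auto s : sizes) std::cout << s << ' ';
  // prints: 16 32 18446744073709551615 18446744073709551615
}
```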
@@ -25,6 +25,8 @@
 #include "kernel/common_utils.h"
 #include "backend/common/session/anf_runtime_algorithm.h"

+#include "plugin/device/ascend/hal/device/ge_types_convert.h"
+
 namespace mindspore {
 namespace kernel {
 namespace {
@@ -537,10 +539,11 @@ std::vector<GeTensorDescPtr> AclUtils::GetInputTensorDesc(const AnfNodePtr &anf_
       continue;
     }
     (void)already_add_index.insert(index + 1);
-    auto ori_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(anf_node, index);
-    auto input_shape = AnfAlgo::GetInputDeviceShape(anf_node, index);
-    auto input_type = AnfAlgo::GetInputDeviceDataType(anf_node, index);
-    auto input_format = AnfAlgo::GetInputFormat(anf_node, index);
+    auto [input, idx] = common::AnfAlgo::GetPrevNodeOutput(anf_node, index);
+    auto ori_shape = common::AnfAlgo::GetOutputInferShape(input, idx);
+    auto input_shape = AnfAlgo::GetOutputDeviceShape(input, idx);
+    auto input_type = AnfAlgo::GetOutputDeviceDataType(input, idx);
+    auto input_format = AnfAlgo::GetOutputFormat(input, idx);
     auto ori_format = IsOneOf3DFormat(input_format) ? kOpFormat_NCDHW : kOpFormat_DEFAULT;
     auto input_desc = GeOpConvertor::GetTensorDesc(input_shape, input_type, input_format, ori_shape, ori_format);
     MS_EXCEPTION_IF_NULL(input_desc);
@@ -591,6 +594,7 @@ std::set<std::string> AclUtils::GetUselessOutputs(const AnfNodePtr &node) {

 std::vector<GeTensorDescPtr> AclUtils::GetOutputTensorDesc(const AnfNodePtr &anf_node) {
   MS_EXCEPTION_IF_NULL(anf_node);
+
   size_t output_num = common::AnfAlgo::GetOutputTensorNum(anf_node);
   std::vector<GeTensorDescPtr> res;
   auto useless_outputs = GetUselessOutputs(anf_node);
@@ -630,6 +634,9 @@ std::shared_ptr<OpInfo> AclUtils::GetKernelOpInfo(const AnfNodePtr &node) {
   auto node_name = common::AnfAlgo::GetCNodeName(node);
   auto is_dynamic_shape = common::AnfAlgo::IsDynamicShape(node);
   auto op_info_ptr = kernel::OpLib::FindOp(node_name, kernel::kImplyTBE, is_dynamic_shape);
+  if (op_info_ptr == nullptr) {
+    return kernel::OpLib::FindOp(node_name, kernel::kImplyAICPU);
+  }
   return op_info_ptr;
 }

@@ -666,6 +673,7 @@ std::vector<std::string> AclUtils::GetOpInputAnchorNames(const AnfNodePtr &node)

 std::vector<std::string> AclUtils::GetOpOutputAnchorNames(const AnfNodePtr &node) {
   auto op_info_ptr = GetKernelOpInfo(node);
+  MS_EXCEPTION_IF_NULL(op_info_ptr);
   auto outputs_ptr = op_info_ptr->outputs_ptr();
   std::vector<std::string> output_names;
   for (const auto &out_item : outputs_ptr) {
@@ -432,7 +432,8 @@ void CreateExtInfo(const std::shared_ptr<AnfNode> &anf_node, const std::shared_p
     return;
   }

-  if (!common::AnfAlgo::IsDynamicShape(anf_node)) {
+  if (!common::AnfAlgo::IsDynamicShape(anf_node) &&
+      !common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, anf_node->cast<CNodePtr>())) {
     return;
   }

@@ -476,7 +477,8 @@ KernelModPtr AicpuOpBuild(const std::shared_ptr<AnfNode> &anf_node) {
     op_name = kInitData;
   }
   std::shared_ptr<AicpuOpKernelMod> kernel_mod_ptr;
-  if (common::AnfAlgo::IsDynamicShape(anf_node)) {
+  if (common::AnfAlgo::IsDynamicShape(anf_node) ||
+      common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, anf_node->cast<CNodePtr>())) {
     kernel_mod_ptr = std::make_shared<DynamicAicpuOpKernelMod>(anf_node);
   } else {
     kernel_mod_ptr = std::make_shared<AicpuOpKernelMod>(anf_node);
@@ -61,9 +61,6 @@ int DynamicAicpuOpKernelMod::Resize(const BaseOperatorPtr &base_operator, const
   MS_EXCEPTION_IF_NULL(node);
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
-  if (!common::AnfAlgo::IsDynamicShape(cnode)) {
-    MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope();
-  }
   if (common::AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
     auto wingman_queue = device::GetTdtWingManQueue(cnode);
     std::vector<device::DataQueueItem> data;
@@ -142,11 +139,6 @@ bool DynamicAicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, cons
   MS_EXCEPTION_IF_NULL(cnode);
   MS_LOG(INFO) << "Start launch of node: " << cnode->fullname_with_scope();

-  // is dynamic shape
-  if (!common::AnfAlgo::IsDynamicShape(cnode)) {
-    MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope();
-  }
-
   // copy extinfo to device
   AllocateExtInfoDeviceAddr(cnode);
   MS_EXCEPTION_IF_NULL(ext_info_handler_);
@@ -186,10 +178,6 @@ void DynamicAicpuOpKernelMod::SyncData() {
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
   MS_LOG(INFO) << "Aicpu " << cnode->fullname_with_scope() << " PostExecute";
-  // is dynamic shape
-  if (!common::AnfAlgo::IsDynamicShape(cnode)) {
-    MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope();
-  }

   if (unknow_type_ != device::ascend::UnknowShapeOpType::DEPEND_COMPUTE ||
       common::AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
@@ -84,9 +84,6 @@ int HostKernelMod::Resize(const BaseOperatorPtr &, const std::vector<KernelTenso
   MS_EXCEPTION_IF_NULL(node);
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
-  if (!common::AnfAlgo::IsDynamicShape(cnode)) {
-    MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope();
-  }

   if (!Init(cnode)) {
     MS_LOG(EXCEPTION) << "Init failed, node:" << cnode->fullname_with_scope();
@@ -107,7 +107,7 @@ std::shared_ptr<OpInfo> TbeDynamicShapeUtil::FindOp(const std::string &op_name,

 std::shared_ptr<OpInfo> TbeDynamicShapeUtil::FindOp(const std::string &op_name, const CNodePtr &cnode) {
   MS_EXCEPTION_IF_NULL(cnode);
-  auto is_dynamic_shape = GetDynamicShapeAttr(cnode);
+  auto is_dynamic_shape = GetDynamicShapeAttr(cnode) || common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, cnode);
   auto op_info = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kImplyTBE, is_dynamic_shape);
   // If have no dynamic shape op, get static shape op
   if (op_info != nullptr && !op_info->dynamic_shape() && is_dynamic_shape) {
@@ -495,6 +495,46 @@ void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::Kerne
 #endif
 }

+void RunOpIRFissionForAcl(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  if (!context_ptr->get_param<bool>(MS_CTX_IR_FUSION_FLAG)) {
+    MS_LOG(INFO) << "IRFusion is not enable, skip";
+    return;
+  }
+#ifdef ENABLE_DUMP_IR
+  bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
+  if (save_graphs) {
+    DumpIR("hwopt_d_ir_fusion_before.ir", kernel_graph);
+  }
+#endif
+  auto optimizer = std::make_shared<GraphOptimizer>();
+  auto ir_fusion_pm = std::make_shared<PassManager>("ir_fission_pm");
+  ir_fusion_pm->AddPass(std::make_shared<ClipByNormFission>());
+  ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
+  ir_fusion_pm->AddPass(std::make_shared<TensorScatterUpdateFission>());
+  ir_fusion_pm->AddPass(std::make_shared<TensorScatterAddFission>());
+  ir_fusion_pm->AddPass(std::make_shared<TensorScatterSubFission>());
+  ir_fusion_pm->AddPass(std::make_shared<TensorScatterMaxFission>());
+  ir_fusion_pm->AddPass(std::make_shared<TensorScatterMinFission>());
+  ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
+  const auto &pass_creators =
+    opt::Factory<PatternProcessPass>::Instance().GetPassCreatorsByType(kPassType::kIRFusionFissionPass);
+  for (const auto &pass_creator : pass_creators) {
+    ir_fusion_pm->AddPass(pass_creator.second());
+  }
+
+  optimizer->AddPassManager(ir_fusion_pm);
+  (void)optimizer->Optimize(kernel_graph);
+  kernel_graph->SetExecOrderByDefault();
+#ifdef ENABLE_DUMP_IR
+  if (save_graphs) {
+    DumpIR("hwopt_d_ir_fusion_after.ir", kernel_graph);
+  }
+#endif
+}
+
 void RunOpAscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   // data layout optimization
@@ -21,6 +21,7 @@ namespace mindspore {
 namespace opt {
 void RunOpAscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph);
 void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph);
+void RunOpIRFissionForAcl(const std::shared_ptr<session::KernelGraph> &kernel_graph);
 void RunOpAscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph);
 void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph);
 void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_graph);
@@ -470,7 +470,7 @@ CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &
     common::AnfAlgo::SetNodeAttr(kAttrInputIsDynamicShape, MakeValue(true), cast);
     common::AnfAlgo::SetNodeAttr(kAttrOutputIsDynamicShape, MakeValue(true), cast);
   }
-  common::AnfAlgo::SetNodeAttr("dst_type", TypeIdToType(origin_type), cast);
+  common::AnfAlgo::SetNodeAttr("dst_type", TypeIdToType(output_type), cast);
   AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), cast.get());
   common::AnfAlgo::SetOutputTypeAndDetailShape({origin_type}, {origin_shape}, cast.get());
   common::AnfAlgo::SetNodeAttr(kIsBackendCast, MakeValue(true), cast);
@@ -28,9 +28,13 @@ const BaseRef ConvertUnSupportNodeToAICPU::DefinePattern() const {
   return VectorRef({X, Xs});
 }

-const AnfNodePtr ConvertUnSupportNodeToAICPU::Process(const mindspore::FuncGraphPtr &,
+const AnfNodePtr ConvertUnSupportNodeToAICPU::Process(const mindspore::FuncGraphPtr &graph,
                                                       const mindspore::AnfNodePtr &node,
                                                       const mindspore::EquivPtr &) const {
+  MS_EXCEPTION_IF_NULL(graph);
+  if (graph->has_flag(kAttrMutableKernel)) {
+    return nullptr;
+  }
   if (node == nullptr || !node->isa<CNode>()) {
     return nullptr;
   }
@@ -49,9 +49,9 @@ const AnfNodePtr ConvertDataTypeForCNodeInput(const AnfNodePtr &node, size_t inp
   } else if (infer_type == src_type) {
     // Create cast primitive.
     PrimitivePtr cast_prim = std::make_shared<Primitive>(prim::kPrimCast->name());
-    (void)cast_prim->AddAttr("dst_type", MakeValue(static_cast<size_t>(dest_type)));
-    (void)cast_prim->AddAttr("DstT", MakeValue(static_cast<size_t>(dest_type)));
-    (void)cast_prim->AddAttr("SrcT", MakeValue(static_cast<size_t>(src_type)));
+    (void)cast_prim->AddAttr("dst_type", TypeIdToType(dest_type));
+    (void)cast_prim->AddAttr("DstT", TypeIdToType(dest_type));
+    (void)cast_prim->AddAttr("SrcT", TypeIdToType(src_type));
     // Create dest type node.
     auto dest_type_ptr = TypeIdToType(dest_type);
     auto dest_type_node = NewValueNode(dest_type_ptr);
@@ -27,12 +27,12 @@
 #include "plugin/device/ascend/kernel/tbe/tbe_dynamic_shape_util.h"

 namespace mindspore::opt {
-const AnfNodePtr AscendVmOpAdapter::Process(const FuncGraphPtr &, const AnfNodePtr &node, const EquivPtr &) const {
+const AnfNodePtr AscendVmOpAdapter::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &) const {
   if (node == nullptr || !AnfUtils::IsRealCNodeKernel(node)) {
     return nullptr;
   }
   auto op_name = common::AnfAlgo::GetCNodeName(node);
-  auto is_dynamic = common::AnfAlgo::IsDynamicShape(node);
+  auto is_dynamic = common::AnfAlgo::IsDynamicShape(node) || graph->has_flag(kAttrMutableKernel);
   auto op_adaptation_info =
     OpAdaptationInfoRegister::GetInstance().GetOpAdaptationInfo(op_name, kAscendDevice, is_dynamic);
   if (op_adaptation_info == nullptr) {
@@ -181,7 +181,7 @@ CNodePtr CreateDropoutGenMaskCNode(const FuncGraphPtr &func_graph, const CNodePt
   std::vector<AnfNodePtr> dropout_gen_mask_inputs =
     use_v3 ? std::vector<AnfNodePtr>{NewValueNode(std::make_shared<Primitive>(kDropoutGenMaskV3OpName))}
            : std::vector<AnfNodePtr>{NewValueNode(std::make_shared<Primitive>(kDropoutGenMaskOpName))};
-  if (input_shape->IsDynamic()) {
+  if (input_shape->IsDynamic() || common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, dropout)) {
     CNodePtr dynamic_shape = CreateDynamicShapeCNode(func_graph, dropout->input(kIndex1), input_shape);
     dynamic_shape->set_scope(dropout->scope());
     dropout_gen_mask_inputs.push_back(dynamic_shape);
@@ -198,7 +198,7 @@ CNodePtr CreateDropoutGenMaskCNode(const FuncGraphPtr &func_graph, const CNodePt
   }

   std::shared_ptr<abstract::AbstractTensor> gen_mask_abstract;
-  if (input_shape->IsDynamic()) {
+  if (input_shape->IsDynamic() || common::AnfAlgo::HasNodeAttr(kAttrMutableKernel, dropout)) {
     ShapeVector mask_shp = {abstract::Shape::kShapeDimAny};
     ShapeVector mask_min_shp = CalGenMaskOutputShape(input_shape->min_shape());
     ShapeVector mask_max_shp = CalGenMaskOutputShape(input_shape->max_shape());
@@ -404,7 +404,6 @@ const AnfNodePtr DropoutUnifyMindIR1::Process(const FuncGraphPtr &func_graph, co
   MS_EXCEPTION_IF_NULL(dropout_cnode);

   auto inputx_type_id = GetInputXDataType(dropout_cnode);
-  auto keep_prob_value = CreateKeepPorbValueNode(func_graph, dropout_cnode, inputx_type_id);

   CheckCNodeInputSize(dropout_cnode, kDropoutInputTensorNum);
   auto dropout_input = dropout_cnode->input(kIndex1);
@@ -416,12 +415,16 @@ const AnfNodePtr DropoutUnifyMindIR1::Process(const FuncGraphPtr &func_graph, co
     dropout_gen_mask = GetRecomputeDropoutGenMask(func_graph, dropout_cnode);
   }
   if (dropout_gen_mask == nullptr) {
-    dropout_gen_mask = CreateDropoutGenMaskCNode(func_graph, dropout_cnode, keep_prob_value, input_shape, use_v3);
+    dropout_gen_mask = CreateDropoutGenMaskCNode(func_graph, dropout_cnode,
+                                                 CreateKeepPorbValueNode(func_graph, dropout_cnode, inputx_type_id),
+                                                 input_shape, use_v3);
   }
   // CreateDropoutDoMask
   auto do_mask_abstract = std::make_shared<abstract::AbstractTensor>(TypeIdToType(inputx_type_id), input_shape);
   auto dropout_do_mask = CreateDropoutDoMaskCNode(
-    func_graph, dropout_cnode, {dropout_input, dropout_gen_mask, keep_prob_value}, do_mask_abstract, use_v3);
+    func_graph, dropout_cnode,
+    {dropout_input, dropout_gen_mask, CreateKeepPorbValueNode(func_graph, dropout_cnode, inputx_type_id)},
+    do_mask_abstract, use_v3);

   std::vector<AnfNodePtr> make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple), dropout_do_mask, dropout_gen_mask};
   auto make_tuple = func_graph->NewCNode(make_tuple_inputs);
@ -306,6 +306,7 @@ GraphId GraphCompiler::CompileGraph(const GraphSegmentPtr &segment, const AnfNod
|
|||
KernelGraphPtr graph =
|
||||
session_->ConstructKernelGraph(nodes, outputs, device_terget, true, IsEnableZeroCopy(run_in_pynative));
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
|
||||
opt::EliminateIllegalDataTypePass(graph);
|
||||
SetGraphDependency(graph, segment);
|
||||
|
||||
|
@ -371,6 +372,54 @@ GraphId GraphCompiler::CompileGraph(const GraphSegmentPtr &segment, const AnfNod
return graph_id;
}

GraphId GraphCompiler::CompileDynamicGraph(const GraphSegmentPtr &segment, const AnfNodePtrList &outputs,
const DeviceContext *device_context) {
MS_EXCEPTION_IF_NULL(session_);
MS_EXCEPTION_IF_NULL(segment);
MS_EXCEPTION_IF_NULL(device_context);
MS_LOG(INFO) << "Status record: start compile graph.";
auto nodes = segment->nodes_;
auto device_terget = device_context->GetDeviceType();
// Generate kernel graph.
KernelGraphPtr graph = session_->ConstructKernelGraph(nodes, outputs, device_terget, true, false);
MS_EXCEPTION_IF_NULL(graph);

graph->set_flag(kAttrMutableKernel, true);

opt::EliminateIllegalDataTypePass(graph);
// Unify the MindIR; this must run before the graph optimization.
auto deprecated_kernel_executor =
dynamic_cast<device::DeprecatedKernelExecutor *>(device_context->kernel_executor_.get());
if (deprecated_kernel_executor != nullptr) {
deprecated_kernel_executor->UnifyMindIR(graph);
} else {
opt::CommonUnifyMindIR(graph);
}

// The graph common optimization.
graph->UpdateGraphAquireGilAttr();
opt::BackendCommonOptimization(graph);
graph->SetInputNodes();
auto manager = MakeManager({graph});
if (manager) {
manager->AddFuncGraph(graph);
graph->set_manager(manager);
}
session_->SetInputNodeUsage(graph, manager);
graph->SetOptimizerFlag();
graph->set_run_mode(device::RunMode::kKernelMode);

// Graph kernel does not support PyNative mode yet; print a warning here.
graphkernel::GraphKernelFlags::GetInstance().CheckSupport();

GraphId graph_id = graph->graph_id();
graph->set_root_graph_id(graph_id);
session_->DumpGraphs({graph});

MS_LOG(INFO) << "Status record: end compile graph. graph id: " << graph_id;
return graph_id;
}

GraphId GraphCompiler::CompileWholeGraphForGraphRunMode(const FuncGraphPtr &func_graph,
const DeviceContext *device_context) {
MS_EXCEPTION_IF_NULL(session_);

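A minimal sketch of the dispatch this new entry point enables, assuming the caller routes segments with any unknown dimension (conventionally -1) to the dynamic path, where kernels stay mutable so shapes can be re-inferred per launch. The names below are illustrative, not MindSpore APIs:

#include <cstdint>
#include <cstdio>
#include <vector>

// A shape is dynamic if any dimension is unknown (-1 by convention here).
bool IsDynamic(const std::vector<std::vector<int64_t>> &input_shapes) {
  for (const auto &shape : input_shapes) {
    for (int64_t dim : shape) {
      if (dim < 0) return true;
    }
  }
  return false;
}

int main() {
  std::vector<std::vector<int64_t>> shapes = {{-1, 128}, {128, 256}};
  // Static graphs take the regular CompileGraph path; graphs with unknown
  // dimensions take the dynamic path that marks kernels mutable.
  std::printf(IsDynamic(shapes) ? "CompileDynamicGraph path\n" : "CompileGraph path\n");
  return 0;
}
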
@ -607,10 +656,10 @@ void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const
MS_EXCEPTION_IF_NULL(session_);
MS_EXCEPTION_IF_NULL(graph_info);
*op_run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info, graph_output_info);
(*op_run_info)->base_op_run_info.use_dynamic_shape_process = use_dynamic_shape_process;
session_->GetSingleOpGraphInfo(kernel, tensor_info, graph_info, *op_run_info);
MS_EXCEPTION_IF_NULL(*op_run_info);
(*op_run_info)->base_op_run_info.graph_info = *graph_info;
(*op_run_info)->base_op_run_info.use_dynamic_shape_process = use_dynamic_shape_process;
}

void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const {

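The reordering above suggests graph_info serves as the single-op cache key and is now computed after the run info carries the dynamic-shape flag. A hedged sketch of such a key, assuming (the diff does not confirm this) that a dynamic-shape process keys on rank rather than concrete dims so one compiled kernel is reused across shapes:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Hypothetical cache key: op name plus per-input shape. Under a dynamic-shape
// process the concrete dims are replaced by rank only.
std::string MakeGraphInfo(const std::string &op, const std::vector<std::vector<int64_t>> &shapes, bool dynamic) {
  std::string key = op;
  for (const auto &s : shapes) {
    key += dynamic ? "|rank" + std::to_string(s.size()) : "|";
    if (!dynamic) {
      for (int64_t d : s) key += std::to_string(d) + "x";
    }
  }
  return key;
}

int main() {
  std::printf("%s\n", MakeGraphInfo("Add", {{2, 3}, {2, 3}}, false).c_str());  // Add|2x3x|2x3x
  std::printf("%s\n", MakeGraphInfo("Add", {{2, 3}, {2, 3}}, true).c_str());   // Add|rank2|rank2
  return 0;
}
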
@ -101,6 +101,9 @@ class GraphCompiler {
GraphId CompileGraph(const GraphSegmentPtr &segment, const AnfNodePtrList &outputs,
const DeviceContext *device_context, device::RunMode run_mode, bool run_in_pynative = false);

GraphId CompileDynamicGraph(const GraphSegmentPtr &segment, const AnfNodePtrList &outputs,
const DeviceContext *device_context);

// Construct kernel graph from function graph and compile kernel graph in Graph mode,
// the detailed implementation of compiling graph is in 'CompileGraphImpl'.
GraphId CompileWholeGraphForGraphRunMode(const FuncGraphPtr &func_graph, const DeviceContext *device_context);

@ -127,12 +127,13 @@ void UpdateRefNodeOutputDeviceAddress(const KernelGraphPtr &graph) {
auto output_index = output_pair.second;
auto &input_node = input_pair.first;
auto input_node_output_index = input_pair.second;

auto input_addr = AnfAlgo::GetMutableOutputAddr(input_node, input_node_output_index, false);
auto ref_node_output_addr = AnfAlgo::GetMutableOutputAddr(ref_node, output_index, false);
if (input_addr != ref_node_output_addr) {
AnfAlgo::SetOutputAddr(input_addr, output_index, ref_node.get());
if (!AnfAlgo::OutputAddrExist(input_node, input_node_output_index, false)) {
MS_LOG(WARNING) << "Output address not exist, node " << input_node->fullname_with_scope() << " index "
<< input_node_output_index;
continue;
}
auto input_addr = AnfAlgo::GetMutableOutputAddr(input_node, input_node_output_index, false);
AnfAlgo::SetOutputAddr(input_addr, output_index, ref_node.get());
}
}

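A standalone sketch of the aliasing idea behind UpdateRefNodeOutputDeviceAddress: a ref output shares its input's device address instead of owning one, and an input without an address is skipped with a warning rather than dereferenced. The types here are stand-ins, not the real DeviceAddress class:

#include <cstdio>
#include <map>
#include <memory>
#include <string>

struct DeviceAddress { void *ptr = nullptr; };
using AddrPtr = std::shared_ptr<DeviceAddress>;

int main() {
  std::map<std::string, AddrPtr> output_addr;  // node name -> device address
  output_addr["input0"] = std::make_shared<DeviceAddress>();

  const std::string input_node = "input0", ref_node = "assign0";
  auto it = output_addr.find(input_node);
  if (it == output_addr.end()) {
    std::printf("output address not exist for %s, skip\n", input_node.c_str());
    return 0;
  }
  output_addr[ref_node] = it->second;  // share the address, do not copy it
  std::printf("aliased: %d\n", output_addr[ref_node] == output_addr[input_node]);
  return 0;
}
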
@ -503,7 +504,7 @@ void LaunchKernelsDynamic(const KernelGraphPtr &graph, const device::DeviceConte
auto workspaces = CreateKernelWorkspaceAddressDynamic(runtime_info, device_context, node);

if (!MallocForKernelOutput(runtime_info, node, device_context)) {
MS_LOG(EXCEPTION) << "Malloc for kernel output failed, Memory isn't enough, node:" << node->fullname_with_scope();
MS_LOG(EXCEPTION) << "Malloc for kernel output failed, node:" << node->fullname_with_scope();
}
auto outputs = CreateKernelOutputAddress(runtime_info);
const size_t stream_id = AnfAlgo::GetStreamId(node);

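A minimal sketch of the per-launch sequence LaunchKernelsDynamic follows for dynamic shapes: re-infer output sizes for the current inputs, allocate for this shape, then launch. The Kernel type below is hypothetical:

#include <cstdio>
#include <vector>

// Sizes are recomputed every call because they can change between launches.
struct Kernel {
  std::vector<size_t> InferOutputSizes(const std::vector<size_t> &in_elems) const {
    return {in_elems[0]};  // identity-like op, just for the sketch
  }
};

int main() {
  Kernel k;
  std::vector<size_t> input_elems = {1024};
  auto out_sizes = k.InferOutputSizes(input_elems);
  std::vector<std::vector<float>> outputs;
  for (size_t n : out_sizes) outputs.emplace_back(n);  // "malloc for kernel output"
  std::printf("launch with output of %zu elements\n", outputs[0].size());
  return 0;
}
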
@ -96,6 +96,7 @@ REG_ADPT_DESC(Expand, "Expand", ADPT_DESC(Expand))

// ExpandDims
INPUT_MAP(ExpandDims) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(axis)}};
ATTR_INPUT_MAP(ExpandDims) = {{"axis", 2}};
ATTR_MAP(ExpandDims) = EMPTY_ATTR_MAP;
OUTPUT_MAP(ExpandDims) = {{0, OUTPUT_DESC(y)}};
REG_ADPT_DESC(ExpandDims, kNameExpandDims, ADPT_DESC(ExpandDims))

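The ATTR_INPUT_MAP entries added in this and the following hunks let one GE adapter accept an op in both its attribute form and its input form, which matters once values like axis or depth become runtime inputs under dynamic shape. A sketch of that fallback, using illustrative container types rather than the adapter framework's:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // If "axis" arrives as an attribute (static form) rather than as input #2
  // (dynamic form), materialize the attribute as the missing input so one
  // adapter handles both forms.
  std::map<std::string, int> attr_input_map = {{"axis", 2}};
  std::map<std::string, long> attrs = {{"axis", 1}};
  std::vector<std::string> inputs = {"x"};  // input #2 (axis) is absent

  for (const auto &[name, index] : attr_input_map) {
    if (static_cast<int>(inputs.size()) < index && attrs.count(name)) {
      inputs.push_back("const(" + std::to_string(attrs[name]) + ")");
    }
  }
  std::printf("inputs: %s, %s\n", inputs[0].c_str(), inputs[1].c_str());
  return 0;
}
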
@ -424,7 +424,7 @@ REG_ADPT_DESC(RealDiv, kNameRealDiv, ADPT_DESC(RealDiv))
// Cast
INPUT_MAP(Cast) = {{1, INPUT_DESC(x)}};
INPUT_ATTR_MAP(Cast) = {{2, ATTR_DESC(dst_type, AnyTraits<GEType>())}};
ATTR_MAP(Cast) = EMPTY_ATTR_MAP;
ATTR_MAP(Cast) = {{"dst_type", ATTR_DESC(dst_type, AnyTraits<GEType>())}};
OUTPUT_MAP(Cast) = {{0, OUTPUT_DESC(y)}};
REG_ADPT_DESC(Cast, prim::kPrimCast->name(), ADPT_DESC(Cast))

@ -59,6 +59,7 @@ REG_ADPT_DESC(InTopKD, kNameInTopKD, ADPT_DESC(InTopKD))

// OneHot
INPUT_MAP(OneHot) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(depth)}, {3, INPUT_DESC(on_value)}, {4, INPUT_DESC(off_value)}};
ATTR_INPUT_MAP(OneHot) = {{"depth", 2}};
ATTR_MAP(OneHot) = {{"axis", ATTR_DESC(axis, AnyTraits<int64_t>())}};
OUTPUT_MAP(OneHot) = {{0, OUTPUT_DESC(y)}};
REG_ADPT_DESC(OneHot, prim::kPrimOneHot->name(), ADPT_DESC(OneHot))

@ -68,7 +69,7 @@ INPUT_MAP(GatherV2) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(indices)}, {3, INPUT_D
ATTR_INPUT_MAP(GatherV2) = {{"axis", 3}};
ATTR_MAP(GatherV2) = EMPTY_ATTR_MAP;
OUTPUT_MAP(GatherV2) = {{0, OUTPUT_DESC(y)}};
REG_ADPT_DESC(GatherV2, prim::kPrimGather->name(), ADPT_DESC(GatherV2))
REG_ADPT_DESC(GatherV2, prim::kPrimGatherV2->name(), ADPT_DESC(GatherV2))
REG_ADPT_DESC(Gather, prim::kPrimGather->name(), ADPT_DESC(GatherV2))

// ScatterNd

@ -30,7 +30,8 @@ INPUT_MAP(Pack) = EMPTY_INPUT_MAP;
DYN_INPUT_MAP(Pack) = {{1, DYN_INPUT_DESC(x)}};
ATTR_MAP(Pack) = {{"num", ATTR_DESC(N, AnyTraits<int64_t>())}, {"axis", ATTR_DESC(axis, AnyTraits<int64_t>())}};
OUTPUT_MAP(Pack) = {{0, OUTPUT_DESC(y)}};
REG_ADPT_DESC(Pack, prim::kStack, ADPT_DESC(Pack))
REG_ADPT_DESC(Pack1, prim::kStack, ADPT_DESC(Pack))
REG_ADPT_DESC(Pack2, prim::kPack, ADPT_DESC(Pack))

// ParallelConcat
INPUT_MAP(ParallelConcat) = EMPTY_INPUT_MAP;

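A sketch of why the Pack registration splits into Pack1/Pack2: the registry needs a unique key per entry, but both op names can resolve to the same adapter, so only the keys differ. Types here are stand-ins for the REG_ADPT_DESC machinery:

#include <cstdio>
#include <map>
#include <string>

struct Adapter { const char *ge_op; };

int main() {
  // Two front-end names ("Stack" and "Pack") mapped onto one GE adapter.
  static Adapter pack_adapter{"Pack"};
  std::map<std::string, Adapter *> registry;
  registry["Stack"] = &pack_adapter;
  registry["Pack"] = &pack_adapter;
  std::printf("Stack -> %s, Pack -> %s\n", registry["Stack"]->ge_op, registry["Pack"]->ge_op);
  return 0;
}
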
@ -1703,7 +1703,7 @@ std::string AnfAlgo::GetTensorValueString(const tensor::TensorPtr &tensor) {
std::ostringstream buf;
auto fn = [&buf, data_size](auto addr) {
for (size_t i = 0; i < data_size; ++i) {
buf << *(addr + i);
buf << *(addr + i) << ",";
}
};

@ -1713,6 +1713,8 @@ std::string AnfAlgo::GetTensorValueString(const tensor::TensorPtr &tensor) {
fn(reinterpret_cast<int *>(tensor->data_c()));
} else if (dtype->type_id() == kNumberTypeInt8) {
fn(reinterpret_cast<int8_t *>(tensor->data_c()));
} else if (dtype->type_id() == kNumberTypeUInt8) {
fn(reinterpret_cast<uint8_t *>(tensor->data_c()));
} else if (dtype->type_id() == kNumberTypeInt16) {
fn(reinterpret_cast<int16_t *>(tensor->data_c()));
} else if (dtype->type_id() == kNumberTypeInt32) {

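The two hunks above extend GetTensorValueString: a comma after each element keeps values from running together, and the generic lambda lets one body serve every element type reached by the type-id dispatch. A standalone sketch of the same pattern (the unary + that makes int8_t print numerically rather than as a char is an assumption about the desired output, not taken from the diff):

#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // One generic lambda body; the caller only picks the pointer type.
  auto dump = [](auto *addr, size_t n) {
    std::ostringstream buf;
    for (size_t i = 0; i < n; ++i) buf << +addr[i] << ",";  // unary + promotes int8_t to int
    return buf.str();
  };

  std::vector<int8_t> i8 = {1, -2, 3};
  std::vector<float> f32 = {0.5f, 1.5f};
  std::printf("%s %s\n", dump(i8.data(), i8.size()).c_str(), dump(f32.data(), f32.size()).c_str());
  return 0;
}
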
@ -166,6 +166,7 @@ constexpr auto kDynamicShape = "DynamicShape";
constexpr auto kTensorShape = "TensorShape";
constexpr auto kCheckNumerics = "CheckNumerics";
constexpr auto kStack = "Stack";
constexpr auto kPack = "Pack";
constexpr auto kLogNormalReverse = "LogNormalReverse";
constexpr auto kUnstack = "Unstack";
constexpr auto kTupleGetItem = "TupleGetItem";

@ -31,7 +30,6 @@ batch_matmul_op_info = TBERegOp("BatchMatMul") \
.input(2, "bias", False, "optional", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \
.dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F32_Default, DataType.F32_FracNZ) \
.get_op_info()

@ -36,8 +36,6 @@ matmul_op_info = TBERegOp("MatMul") \
DataType.F16_FracNZ) \
.dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F32_Default, DataType.I8_Default,
DataType.F16_FracNZ) \
.dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F32_Default, DataType.I8_Default,
DataType.F32_FracNZ) \
.get_op_info()

@ -296,7 +296,7 @@ class CompareMultiNet2(nn.Cell):
return x


@pytest.mark.level0
@pytest.mark.level2
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training

@ -353,7 +353,7 @@ def test_pynative_forward_hook():
assert np.allclose(grad[1][0].asnumpy(), expect_grad[1][0].asnumpy(), 0.000001, 0.000001)


@pytest.mark.level0
@pytest.mark.level2
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training