diff --git a/graphengine b/graphengine
index 622af6c1c50..2dbfefcdd0d 160000
--- a/graphengine
+++ b/graphengine
@@ -1 +1 @@
-Subproject commit 622af6c1c50034bea5a08bd409c5a410782bfe53
+Subproject commit 2dbfefcdd0d4b958801403dbaf9efe46447dccd2
diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py b/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py
index a241bf9e104..01c79970429 100755
--- a/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py
+++ b/mindspore/_extends/parallel_compile/tbe_compiler/compiler.py
@@ -17,8 +17,6 @@ import json
 import os
 import sys
 from te.platform.cce_conf import te_set_version
-from te.platform.fusion_manager import op_build_cfg_dis, op_build_cfg_en, set_current_op_name, \
-    init_op_pattern, set_op_params, set_op_build_type, get_op_pattern, set_current_op_func_name
 from te.platform.fusion_util import fusion_op
 from common import check_kernel_info, get_args, get_build_in_impl_path, get_ddk_version
 
@@ -27,7 +25,6 @@ build_in_impl_path = get_build_in_impl_path()
 
 # op function list
 op_build = "compile"
-op_pre_build = "pre_build"
 fusion_pattern_start_flag = "fusion_pattern_start"
 fusion_pattern_end_flag = "fusion_pattern_end"
 
@@ -83,19 +80,7 @@ def build_op(build_type, json_str):
         else:
             op_module = __import__("impl."+op_name, globals(), locals(), [op_name], 0)
         # get function
-        if build_type == op_pre_build:
-            # set op parameter
-            op_build_cfg_dis()
-            set_current_op_func_name(op_name)
-            set_current_op_name(kernel_name)
-            init_op_pattern()
-            set_op_params(*outputs_args, *attrs_args, kernel_name=kernel_name)
-            set_op_build_type('prebuild')
-            if custom_flag:
-                py_fn_name = kernel_info['op_info']['name']
-            else:
-                py_fn_name = op_name
-        elif build_type == op_build:
+        if build_type == op_build:
             if custom_flag:
                 py_fn_name = kernel_info['op_info']['name']
             else:
@@ -106,13 +91,6 @@ def build_op(build_type, json_str):
         if op_func is None:
             raise ValueError("Op:{} function {} is not supported by Tbe.".format(op_name, build_type))
 
-        # pre build
-        if build_type == op_pre_build:
-            op_func(*inputs_args, *outputs_args, *attrs_args, kernel_name=kernel_name)
-            # disable only pattern configuration
-            op_build_cfg_en()
-            return get_op_pattern()
-
         # call function
         if kernel_name[0:19] == "bounding_box_encode":
             return op_func(*inputs_args, *outputs_args, *attrs_args, kernel_name_val=kernel_name)
@@ -120,8 +98,6 @@ def build_op(build_type, json_str):
         return op_func(*inputs_args, *outputs_args, *attrs_args, kernel_name=kernel_name)
 
     except Exception as e:
-        if build_type == op_pre_build:
-            op_build_cfg_en()
         raise RuntimeError(e)
 
 
@@ -136,14 +112,9 @@ def compile_fusion_op(json_str):
         Exception: If specific keyword is not found.
     """
     args = json.loads(json_str)
+    te_set_version(ddk_version)
     if 'fusion_op' not in args or not args['fusion_op']:
         raise ValueError("Json string Errors, key:fusion_op not found.")
-    if 'prebuild_ops' not in args or not args['prebuild_ops']:
-        raise ValueError("Json string Errors, key:prebuild_ops not found.")
-
-    pre_build_op_list = args['prebuild_ops']
-    for op in pre_build_op_list:
-        build_op(op_pre_build, json.dumps(op))
     fusion_op_arg = args['fusion_op']
     return fusion_op(json.dumps(fusion_op_arg))
 
@@ -159,8 +130,6 @@ def compile_with_json(json_str):
     json_info = json.loads(json_str)
     if "fusion_op" in json_info:
         ret = compile_fusion_op(json_str)
-    elif "compile_type" in json_info:
-        ret = build_op(op_pre_build, json_str)
     else:
         ret = build_op(op_build, json_str)
     return ret
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc
index cdb2fe10477..f7c02236c52 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc
@@ -20,6 +20,8 @@
 #include <vector>
 #include <memory>
 #include <algorithm>
+#include <map>
+#include <climits>
 #include "runtime/device/kernel_runtime.h"
 #include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h"
 #include "backend/kernel_compiler/akg/akg_kernel_build.h"
@@ -218,7 +220,7 @@ void SetNodeInputs(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef
       mindspore::TensorShape_Dim *dim = tensorShape->add_dim();
       dim->set_size((::google::protobuf::int64)item);
     }
-    node_inputs->set_tensor_type((mindspore::DataType)input_data_type);
+    node_inputs->set_tensor_type(input_data_type);
     node_inputs->set_mem_device("HBM");
   }
 }
@@ -245,7 +247,7 @@ void SetNodeOutputs(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef
     }
     TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index);
     int32_t output_data_type = AicpuOpUtil::MsTypeToProtoType(output_type);
-    node_outputs->set_tensor_type((mindspore::DataType)output_data_type);
+    node_outputs->set_tensor_type(output_data_type);
     node_outputs->set_mem_device("HBM");
   }
 }
@@ -287,6 +289,109 @@ bool CreateNodeDefBytes(const std::shared_ptr<AnfNode> &anf_node,
   return true;
 }
 
+/**
+ * @brief Write one ShapeAndType record of the ext info blob.
+ * @param shape Device shape to store; a rank above kMaxShapeDims raises an exception instead of overflowing dims.
+ * @param data_type Proto data type value stored in the record.
+ * @param shape_and_type Destination record; a rank below kMaxShapeDims is terminated with LLONG_MIN.
+ */
+static void FillShapeAndType(const std::vector<size_t> &shape, int32_t data_type, ShapeAndType *shape_and_type) {
+  MS_EXCEPTION_IF_NULL(shape_and_type);
+  if (shape.size() > kMaxShapeDims) {
+    MS_LOG(EXCEPTION) << "Shape rank " << shape.size() << " exceeds the ext info limit " << kMaxShapeDims;
+  }
+
+  shape_and_type->type = data_type;
+  for (size_t i = 0; i < shape.size(); i++) {
+    shape_and_type->dims[i] = SizeToLong(shape[i]);
+  }
+  if (shape.size() < kMaxShapeDims) {
+    shape_and_type->dims[shape.size()] = LLONG_MIN;
+  }
+}
+
+/**
+ * @brief Build the ext info blob of a dynamic-shape AICPU node and store it on the kernel mod.
+ *
+ * Nodes that are not CNodes or are not dynamic shape are left untouched. The blob holds three records,
+ * each an ExtInfo head (kExtInfoHeadSize bytes) followed by its payload:
+ *   1. FWK_ADPT_EXT_SHAPE_TYPE   - one int32 holding UnknowShapeOpType::DEPEND_COMPUTE
+ *   2. FWK_ADPT_EXT_INPUT_SHAPE  - one ShapeAndType per input tensor
+ *   3. FWK_ADPT_EXT_OUTPUT_SHAPE - one ShapeAndType per output tensor
+ *
+ * @param anf_node Node to describe.
+ * @param kernel_mod_ptr Kernel mod that receives the blob through SetExtInfo.
+ * @return Always true; failures are reported by exception.
+ */
+bool CreateExtInfo(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<AicpuOpKernelMod> &kernel_mod_ptr) {
+  MS_EXCEPTION_IF_NULL(anf_node);
+  MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
+  // Only dynamic-shape CNodes carry ext info.
+  if (!anf_node->isa<CNode>() || !AnfAlgo::IsDynamicShape(anf_node)) {
+    return true;
+  }
+
+  MS_LOG(INFO) << "CreateExtInfo start, " << anf_node->fullname_with_scope();
+
+  const size_t input_num = AnfAlgo::GetInputTensorNum(anf_node);
+  const size_t output_num = AnfAlgo::GetOutputTensorNum(anf_node);
+  const size_t inputs_len = input_num * sizeof(ShapeAndType);
+  const size_t outputs_len = output_num * sizeof(ShapeAndType);
+
+  // Allocate the whole zero-filled blob up front so the record pointers taken below stay valid.
+  std::string ext_info(3 * kExtInfoHeadSize + sizeof(int32_t) + inputs_len + outputs_len, '\0');
+  char *ext_info_buf = ext_info.data();
+  size_t ext_info_offset = 0;
+
+  // record 1: unknown shape type
+  ExtInfo *info = reinterpret_cast<ExtInfo *>(ext_info_buf + ext_info_offset);
+  info->infoType = FWK_ADPT_EXT_SHAPE_TYPE;
+  info->infoLen = sizeof(int32_t);
+  ext_info_offset += kExtInfoHeadSize;
+  int32_t *shape_type = reinterpret_cast<int32_t *>(ext_info_buf + ext_info_offset);
+  *shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
+  ext_info_offset += info->infoLen;
+
+  // record 2: input ShapeAndType array
+  info = reinterpret_cast<ExtInfo *>(ext_info_buf + ext_info_offset);
+  info->infoType = FWK_ADPT_EXT_INPUT_SHAPE;
+  info->infoLen = static_cast<uint32_t>(inputs_len);
+  ext_info_offset += kExtInfoHeadSize;
+  ShapeAndType *inputs = reinterpret_cast<ShapeAndType *>(ext_info_buf + ext_info_offset);
+  for (size_t input_index = 0; input_index < input_num; input_index++) {
+    TypeId input_type = AnfAlgo::GetInputDeviceDataType(anf_node, input_index);
+    if (input_type == kObjectTypeString) {
+      // A string input is described as shape {1, length} with the proto type mapped from kTypeUnknown.
+      auto cnode = anf_node->cast<CNodePtr>();
+      MS_EXCEPTION_IF_NULL(cnode);
+      auto value_ptr = GetValueNode(cnode->inputs()[input_index + 1]);
+      MS_EXCEPTION_IF_NULL(value_ptr);
+      auto value = GetValue<std::string>(value_ptr);
+      FillShapeAndType({1, value.size()}, AicpuOpUtil::MsTypeToProtoType(kTypeUnknown), &inputs[input_index]);
+    } else {
+      FillShapeAndType(AnfAlgo::GetInputDeviceShape(anf_node, input_index),
+                       AicpuOpUtil::MsTypeToProtoType(input_type), &inputs[input_index]);
+    }
+  }
+  ext_info_offset += info->infoLen;
+
+  // record 3: output ShapeAndType array
+  info = reinterpret_cast<ExtInfo *>(ext_info_buf + ext_info_offset);
+  info->infoType = FWK_ADPT_EXT_OUTPUT_SHAPE;
+  info->infoLen = static_cast<uint32_t>(outputs_len);
+  ext_info_offset += kExtInfoHeadSize;
+  ShapeAndType *outputs = reinterpret_cast<ShapeAndType *>(ext_info_buf + ext_info_offset);
+  for (size_t output_index = 0; output_index < output_num; output_index++) {
+    TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index);
+    FillShapeAndType(AnfAlgo::GetOutputDeviceShape(anf_node, output_index),
+                     AicpuOpUtil::MsTypeToProtoType(output_type), &outputs[output_index]);
+  }
+
+  // set ext info
+  kernel_mod_ptr->SetExtInfo(ext_info);
+  return true;
+}
+
 KernelModPtr AicpuOpBuild(const std::shared_ptr<AnfNode> &anf_node) {
   MS_EXCEPTION_IF_NULL(anf_node);
   std::string op_name = AnfAlgo::GetCNodeName(anf_node);
@@ -300,6 +405,11 @@ KernelModPtr AicpuOpBuild(const std::shared_ptr<AnfNode> &anf_node) {
   if (!CreateNodeDefBytes(anf_node, kernel_mod_ptr)) {
     MS_LOG(EXCEPTION) << "Create nodeDefBytes faild!";
   }
+
+  // Dynamic-shape nodes additionally need ext info; CreateExtInfo leaves static-shape nodes untouched.
+  if (!CreateExtInfo(anf_node, kernel_mod_ptr)) {
+    MS_LOG(EXCEPTION) << "Create extInfo failed!";
+  }
   if (!SetIOSize(anf_node, kernel_mod_ptr)) {
     MS_LOG(EXCEPTION) << "Set input output size list failed.";
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc
index d00fab381ec..c7d7a3f1a2a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc
@@ -43,6 +43,7 @@ AicpuOpKernelMod::~AicpuOpKernelMod() {
   input_size_list_.clear();
   output_size_list_.clear();
   workspace_size_list_.clear();
+  ext_info_.clear();
 }
 
 void AicpuOpKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; }
@@ -54,6 +55,7 @@ const std::vector<size_t> &AicpuOpKernelMod::GetWorkspaceSizeList() const { retu
 void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &inputList) { inputList_ = inputList; }
 void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &outputList) { outputList_ = outputList; }
 void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); }
+void AicpuOpKernelMod::SetExtInfo(const std::string &ext_info) { ext_info_ = ext_info; }
 void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; }
 void AicpuOpKernelMod::SetAnfNode(const mindspore::AnfNodePtr &anf_node) {
   MS_EXCEPTION_IF_NULL(anf_node);
@@ -84,16 +86,30 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs
 
   auto node_def_len = node_def_str_.length();
   param_len += node_def_len;
+  param_len += sizeof(uint32_t);  // room for the 32-bit node_def length prefix appended below
+
+  // Value-initialize so extInfoLength/extInfoAddr are zero rather than indeterminate when serialized.
+  AicpuParamHead aicpu_param_head{};
+  aicpu_param_head.length = static_cast<uint32_t>(param_len);
+  aicpu_param_head.ioAddrNum = static_cast<uint32_t>(io_addrs_num);
+
+  if (ext_info_.empty()) {
+    MS_LOG(INFO) << "Static Shape Kernel";
+  } else {
+    MS_LOG(INFO) << "Dynamic Kernel Ext Info size:" << ext_info_.size();
+  }
 
-  // Create taskArgs: AicpuParamHead + ioAddrs + notifyId + customizedAttr
-  AicpuParamHead paramHead = {static_cast<uint32_t>(param_len), static_cast<uint32_t>(io_addrs_num)};
   args_.clear();
-  (void)args_.append(reinterpret_cast<const char *>(&paramHead), sizeof(AicpuParamHead));
+  (void)args_.append(reinterpret_cast<const char *>(&aicpu_param_head), sizeof(AicpuParamHead));
   // TaskArgs append ioAddrs
   if (io_addrs_size != 0) {
     (void)args_.append(reinterpret_cast<const char *>(io_addrs.data()), io_addrs_size);
   }
 
+  // node_def length prefix: serialize a real uint32_t instead of the first bytes of a size_t.
+  const auto node_def_size = static_cast<uint32_t>(node_def_len);
+  (void)args_.append(reinterpret_cast<const char *>(&node_def_size), sizeof(node_def_size));
+
   // When it's aicpu customized ops, taskArgs should append customized attr
   if (node_def_len != 0) {
     (void)args_.append(reinterpret_cast<const char *>(node_def_str_.data()), node_def_len);
@@ -145,8 +161,9 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr>
     node_name_ = kTopKV2;
   }
 
-  AicpuTaskInfoPtr task_info_ptr = make_shared<ge::model_runner::AicpuTaskInfo>(
-    kernel_name_, stream_id, node_so_, node_name_, node_def_str_, input_data_addrs, output_data_addrs, NeedDump());
+  AicpuTaskInfoPtr task_info_ptr =
+    make_shared<ge::model_runner::AicpuTaskInfo>(kernel_name_, stream_id, node_so_, node_name_, node_def_str_,
+                                                 ext_info_, input_data_addrs, output_data_addrs, NeedDump());
 
   MS_LOG(INFO) << "AicpuOpKernelMod GenTask end";
   return {task_info_ptr};
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h
index 9bc75d11101..7d006cc67dd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h
@@ -36,6 +36,7 @@ class AicpuOpKernelMod : public AscendKernelMod {
   void SetOutputList(const std::vector<int64_t> &outputList);
   void SetAnfNode(const AnfNodePtr &anf_node);
   void SetNodeDef(const std::string &nodeDef);
+  void SetExtInfo(const std::string &ext_info);
   void SetNodeName(const std::string &node_name);
 
   /**
@@ -58,6 +59,7 @@ class AicpuOpKernelMod : public AscendKernelMod {
   std::string node_def_str_;
   std::string node_name_;
   std::string node_so_;
+  std::string ext_info_;
   std::vector<int64_t> inputList_;
   std::vector<int64_t> outputList_;
   AnfNodePtr anf_node_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h
index d68aef3f860..01a8f577189 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h
@@ -21,7 +21,6 @@
 #include <map>
 #include <string>
 #include "backend/kernel_compiler/kernel.h"
-
 namespace mindspore {
 namespace kernel {
 constexpr auto kInitDataSetQueue = "InitDataSetQueue";
@@ -50,6 +49,41 @@ struct AicpuParamHead {
   uint64_t extInfoAddr;    // extInfo address
 } __attribute__((packed));
 
+// Size in bytes of the ExtInfo head (infoType + infoLen) that precedes every ext info payload.
+constexpr uint32_t kExtInfoHeadSize = 8;
+// One ext info record: a fixed head followed by infoLen bytes of payload.
+struct ExtInfo {
+  int32_t infoType;  // extend type, one of FWKTaskExtInfoType
+  uint32_t infoLen;  // length for infoMsg
+  char infoMsg[0];   // extend value
+} __attribute__((packed));
+static_assert(sizeof(ExtInfo) == kExtInfoHeadSize, "kExtInfoHeadSize must match the ExtInfo head layout");
+
+// Extend info ShapeAndType
+constexpr uint32_t kMaxShapeDims = 8;
+struct ShapeAndType {
+  int32_t type;
+  int64_t dims[kMaxShapeDims];
+} __attribute__((packed));
+static_assert(sizeof(ShapeAndType) == sizeof(int32_t) + kMaxShapeDims * sizeof(int64_t),
+              "ShapeAndType must be packed without padding");
+
+// Extend Info type for task
+enum FWKTaskExtInfoType {
+  FWK_ADPT_EXT_SHAPE_TYPE = 0,  // payload: int32 unknown-shape op type
+  FWK_ADPT_EXT_INPUT_SHAPE,     // payload: one ShapeAndType per input
+  FWK_ADPT_EXT_OUTPUT_SHAPE,    // payload: one ShapeAndType per output
+  FWK_ADPT_EXT_INVALID
+};
+
+// for unknown shape op type
+enum UnknowShapeOpType {
+  DEPEND_IN_SHAPE = 1,     // op out shape get by input shape
+  DEPEND_CONST_VALUE = 2,  // op out shape get by const op value
+  DEPEND_SHAPE_RANGE = 3,  // op out shape get by range
+  DEPEND_COMPUTE = 4       // op out shape get by totally computing
+};
+
 class AicpuOpUtil {
  public:
   static int MsTypeToProtoType(TypeId ms_type);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/attr.proto b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/attr.proto
index a0ab4bd1e76..fee2172c1d1 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/attr.proto
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/attr.proto
@@ -26,7 +26,7 @@ message AttrValue {
     repeated int64 i = 3 [ packed = true ];       //"array(int)"
     repeated float f = 4 [ packed = true ];       //"array(float)"
     repeated bool b = 5 [ packed = true ];        //"array(bool)"
-    repeated DataType type = 6 [ packed = true ]; //"array(type)"
+    repeated int32 type = 6 [ packed = true ]; //"array(type)"
     repeated TensorShape shape = 7;               //"array(shape)"
     repeated Tensor tensor = 8;                   //"array(tensor)"
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/node_def.proto b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/node_def.proto
index b0c0e0f349b..b1a76957d59 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/node_def.proto
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/node_def.proto
@@ -18,9 +18,16 @@ package mindspore;
 import "attr.proto";
 import "tensor.proto";
 
+message DynamicIdx {
+  int32 idx = 1;
+  int32 num = 2;
+}
+
 message NodeDef {
   string op = 2;
   map<string, AttrValue> attrs = 3;
   repeated Tensor inputs = 4;
   repeated Tensor outputs = 5;
+  map<string, DynamicIdx> dym_inputs = 6;
+  map<string, DynamicIdx> dym_outputs = 7;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor.proto b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor.proto
index b4fd66595a1..1240a97ab73 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor.proto
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor.proto
@@ -26,9 +26,12 @@ message Tensor {
   TensorShape tensor_shape = 1;
 
   // tensor content data type
-  DataType tensor_type = 2;
+  int32 tensor_type = 2;
 
   // tensor memory device
   // data located memory device , "DDR" "HBM" OR "NONE"
   string mem_device = 3;
+  string name = 4;
+  uint64 data_ptr = 5;
+  uint64 data_size = 6;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor_shape.proto b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor_shape.proto
index 70534e8ebab..12b07e09673 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor_shape.proto
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/tensor_shape.proto
@@ -31,5 +31,5 @@ message TensorShape {
   bool unknown_rank = 3;
 
   // data format "NHWC" "NCHW" "NC1HWC0" OR "NONE"
-  string data_format = 4;
+  int32 data_format = 4;
 };
diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/types.proto b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/types.proto
index 574259d97df..4cbff252bf5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/types.proto
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/types.proto
@@ -19,17 +19,30 @@ option cc_enable_arenas = true;
 package mindspore;
 
 enum DataType {
-  MS_UNKNOWN = 0;
-  MS_BOOL = 1;
+  MS_FLOAT32 = 0;
+  MS_FLOAT16 = 1;
   MS_INT8 = 2;
-  MS_UINT8 = 3;
-  MS_INT16 = 4;
-  MS_UINT16 = 5;
-  MS_INT32 = 6;
-  MS_UINT32 = 7;
-  MS_INT64 = 8;
-  MS_UINT64 = 9;
-  MS_FLOAT16 = 10;
-  MS_FLOAT32 = 11;
-  MS_FLOAT64 = 12;
+  MS_INT32 = 3;
+  MS_UINT8 = 4;
+  MS_INT16 = 6;
+  MS_UINT16 = 7;
+  MS_UINT32 = 8;
+  MS_INT64 = 9;
+  MS_UINT64 = 10;
+  MS_FLOAT64 = 11;
+  MS_BOOL = 12;
+  MS_STRING = 13;
+  MS_DUAL_SUB_INT8 = 14;
+  MS_DUAL_SUB_UINT8 = 15;
+  MS_COMPLEX64 = 16;
+  MS_COMPLEX128 = 17;
+  MS_QINT8 = 18;
+  MS_QINT16 = 19;
+  MS_QINT32 = 20;
+  MS_QUINT8 = 21;
+  MS_QUINT16 = 22;
+  MS_RESOURCE = 23;
+  MS_STRING_REF = 24;
+  MS_DUAL = 25;
+  MS_UNKNOWN = 26;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel.h b/mindspore/ccsrc/backend/kernel_compiler/kernel.h
index c41223220c4..add268374c4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel.h
@@ -37,7 +37,6 @@ enum FusionType {
   COMMREDUCE,
   SEGMENT,
   OPAQUE,
-  DYNAMIC,
   UNKNOWN_FUSION_TYPE = -1,
 };
 enum OpPattern {
@@ -80,8 +79,8 @@ class KernelPack {
   bool LoadKernelMeta(const std::string &json_f, const std::string &processor);
   bool ReadFromJsonFile(const std::string &json_f, const std::string &processor);
   const std::string Serialize() const;
-  const FlexArray *const GetJson() const { return json_; }
-  const FlexArray *const GetKernel() const { return kernel_; }
+  const FlexArray *GetJson() const { return json_; }
+  const FlexArray *GetKernel() const { return kernel_; }
   ~KernelPack() {
     if (json_) {
       delete[] json_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
index 1bae3fa2257..55d74686171 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
@@ -19,53 +19,36 @@
 #include <map>
 #include <string>
 #include <memory>
-#include <utility>
 #include "backend/kernel_compiler/tbe/tbe_kernel_build.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h"
 #include "backend/kernel_compiler/tbe/tbe_utils.h"
 #include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
+#include "utils/ms_context.h"
 
 namespace mindspore {
 namespace kernel {
 using mindspore::kernel::tbe::TbeUtils;
-static bool GenPreBuildKernelJson(const std::vector<AnfNodePtr> &compute_nodes,
-                                  std::vector<nlohmann::json> *prebuild_op_list) {
-  MS_EXCEPTION_IF_NULL(prebuild_op_list);
-  TbeKernelJsonCreator creator(PREBUILD);
-  for (const auto &anf_node : compute_nodes) {
-    nlohmann::json prebuild;
-    if (!creator.GenTbeSingleKernelJson(anf_node, &prebuild)) {
-      MS_LOG(ERROR) << "GenTbeSingleKernelJson failed";
-      return false;
-    }
-    (*prebuild_op_list).push_back(prebuild);
-  }
-  return true;
-}
-
 std::map<int32_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo> &fusion_scopes) {
   MS_LOG(INFO) << "kernel fusion build start, scope size:" << fusion_scopes.size();
   std::map<int32_t, KernelModPtr> kernel_mod_ret;
   auto build_manger = std::make_shared<ParallelBuildManager>();
   MS_EXCEPTION_IF_NULL(build_manger);
   for (const auto &fusion_scope_iter : fusion_scopes) {
-    auto scope_id = fusion_scope_iter.scope_id;
+    string fusion_kernel_name;
     nlohmann::json fusion_op;
-    string fusion_kernel = "te_fusion";
     if (!TbeKernelBuild::GenFusionScopeJson(fusion_scope_iter.input_nodes, fusion_scope_iter.compute_nodes, &fusion_op,
-                                            &fusion_kernel)) {
+                                            &fusion_kernel_name)) {
       continue;
     }
     // gen kernel_name & check cache
     std::string json_str = fusion_op.dump();
     size_t hash_id = std::hash<std::string>()(json_str);
-    auto json_name = fusion_kernel.append("_").append(std::to_string(hash_id));
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+    auto json_name =
+      fusion_kernel_name.append("_").append(std::to_string(hash_id)).append("_").append(std::to_string(device_id));
     fusion_op["fusion_op_name"] = json_name;
-    // gen json for prebuild
-    std::vector<nlohmann::json> prebuild_op_list;
-    if (!GenPreBuildKernelJson(fusion_scope_iter.compute_nodes, &prebuild_op_list)) {
-      continue;
-    }
     // get io size
     std::vector<size_t> input_size_list;
     std::vector<size_t> output_size_list;
@@ -80,20 +63,20 @@ std::map<int32_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo>
       auto kernel_mod =
         build_manger->GenKernelMod(json_name, tbe::kProcessorAiCore, input_size_list, output_size_list, kernel_pack);
       if (kernel_mod != nullptr) {
-        kernel_mod_ret[scope_id] = kernel_mod;
+        kernel_mod_ret[fusion_scope_iter.scope_id] = kernel_mod;
         continue;
       }
     }
     // fusion build
     nlohmann::json fusion_json;
     fusion_json["fusion_op"] = fusion_op;
-    fusion_json["prebuild_ops"] = prebuild_op_list;
     auto task_id = build_manger->StartCompileOp(fusion_json);
     TbeUtils::SaveJsonInfo(json_name, fusion_json.dump());
     if (task_id < 0) {
       MS_EXCEPTION(ArgumentError) << "start compile failed.";
     }
-    build_manger->SaveTaskInfo(task_id, nullptr, json_name, input_size_list, output_size_list, scope_id);
+    build_manger->SaveTaskInfo(task_id, nullptr, json_name, input_size_list, output_size_list,
+                               fusion_scope_iter.scope_id);
   }
 
   int build_failed_num = 0;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h
index 089f41f2b8b..1579953e36e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h
@@ -16,6 +16,7 @@
 
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_KERNELFUSION_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_KERNELFUSION_H_
+#include <utility>
 #include <vector>
 #include <map>
 #include "backend/kernel_compiler/kernel.h"
@@ -25,11 +26,9 @@ namespace kernel {
  * @brief fuse op and return a callable mod
  */
 struct FusionScopeInfo {
-  FusionScopeInfo() {}
-  FusionScopeInfo(int32_t id, const std::vector<AnfNodePtr> &in, const std::vector<AnfNodePtr> &comp,
-                  const std::vector<AnfNodePtr> &out)
-      : scope_id(id), input_nodes(in), compute_nodes(comp), output_nodes(out) {}
-  int32_t scope_id;
+  FusionScopeInfo(int32_t id, std::vector<AnfNodePtr> in, std::vector<AnfNodePtr> comp, std::vector<AnfNodePtr> out)
+      : scope_id(id), input_nodes(std::move(in)), compute_nodes(std::move(comp)), output_nodes(std::move(out)) {}
+  int32_t scope_id{};
   std::vector<AnfNodePtr> input_nodes;
   std::vector<AnfNodePtr> compute_nodes;
   std::vector<AnfNodePtr> output_nodes;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/oplib/oplib.h b/mindspore/ccsrc/backend/kernel_compiler/oplib/oplib.h
index 808fa14413c..2dfa0ea7728 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/oplib/oplib.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/oplib/oplib.h
@@ -40,14 +40,13 @@ class OpLib {
 
  private:
   static bool RegOpFromLocalInfo();
-  static bool DecodeOpInfo(const nlohmann::json &obj, const OpImplyType imply_type, const std::string &impl_path);
-  static bool DecodeAttr(const nlohmann::json &obj, const OpImplyType imply_type,
-                         const std::shared_ptr<OpInfo> &op_info);
+  static bool DecodeOpInfo(const nlohmann::json &obj, OpImplyType imply_type, const std::string &impl_path);
+  static bool DecodeAttr(const nlohmann::json &obj, OpImplyType imply_type, const std::shared_ptr<OpInfo> &op_info);
   static bool DecodeDtypeFormat(const nlohmann::json &dtype_format, const std::shared_ptr<OpIOInfo> &op_io,
                                 size_t index);
   static void DecodeTBESpecificInfo(const nlohmann::json &obj, const std::shared_ptr<OpInfo> &op_info);
   static void DecodeAKGSpecificInfo(const nlohmann::json &obj, const std::shared_ptr<OpInfo> &op_info);
-  static bool DecodeInputOutput(const nlohmann::json &obj, const OpImplyType imply_type, const OpIOType io_type,
+  static bool DecodeInputOutput(const nlohmann::json &obj, OpImplyType imply_type, OpIOType io_type,
                                 const std::shared_ptr<OpInfo> &op_info, const nlohmann::json &dtype_format);
   static bool GetRefInfo(const std::shared_ptr<OpInfo> &op_info);
   static bool CheckRepetition(const std::shared_ptr<OpInfo> &op_info);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc
index 07e2893294f..ca972899ba5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc
@@ -173,7 +173,7 @@ void TbeAdapter::NormalizeFuncName(std::string *func_name) {
   *func_name = name_tmp;
   auto iter = tbe_func_adapter_map.find(*func_name);
   if (iter != tbe_func_adapter_map.end()) {
-    MS_LOG(INFO) << "map actual op from me " << *func_name << " to tbe op" << iter->second;
+    MS_LOG(INFO) << "Map actual op from me: " << *func_name << " to tbe op: " << iter->second;
     *func_name = iter->second;
   }
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.h
index b37cf68da64..027b8e4b884 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.h
@@ -27,7 +27,7 @@
 //       the TBE back-end operator implementation difference
 namespace mindspore {
 namespace kernel {
-enum kCreaterType : int { SINGLE_BUILD = 0, PREBUILD, OP_SELECT_FORMAT, CHECK_SUPPORTED, OP_PRE_COMPILE };
+enum kCreaterType : int { SINGLE_BUILD = 0, OP_SELECT_FORMAT, CHECK_SUPPORTED, OP_PRE_COMPILE };
 namespace tbe {
 using FAttrsPass = void (*)(const AnfNodePtr &anf_node, const std::vector<std::shared_ptr<OpAttr>> &op_info_attrs,
                             nlohmann::json *attrs_json);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc
index 34165c47995..806b06d6f21 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc
@@ -63,7 +63,7 @@ const std::unordered_map<std::string, size_t> type_nbyte_maps = {
 
 const std::unordered_map<std::string, FusionType> fusion_type_maps = {
   {"CONVLUTION", FusionType::CONVLUTION}, {"ELEMWISE", FusionType::ELEMWISE}, {"COMMREDUCE", FusionType::COMMREDUCE},
-  {"SEGMENT", FusionType::SEGMENT},       {"DYNAMIC", FusionType::DYNAMIC},   {"OPAQUE", FusionType::OPAQUE},
+  {"SEGMENT", FusionType::SEGMENT},       {"OPAQUE", FusionType::OPAQUE},
 };
 
 TypeId DtypeToTypeId(const std::string &dtypes) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
index 39e788f89ca..b8f0562e085 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
@@ -24,6 +24,7 @@
 #include "backend/kernel_compiler/tbe/tbe_adapter.h"
 #include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
 #include "backend/kernel_compiler/tbe/tbe_utils.h"
+#include "utils/ms_context.h"
 
 namespace mindspore {
 namespace kernel {
@@ -71,14 +72,20 @@ constexpr auto kVTypeListListInt = "listListInt";
 constexpr auto kJValue = "value";
 constexpr auto kJDynIndex = "dyn_index";
 constexpr auto kJFuncName = "func_name";
-
-std::string NormalizeFullScopeName(const string &full_scope_name) {
-  // exp:Default/ReLU-op0 -->Default_ReLU_op0
-  string normal_ret = full_scope_name;
-  std::replace(normal_ret.begin(), normal_ret.end(), '/', '_');
-  std::replace(normal_ret.begin(), normal_ret.end(), '-', '_');
-  return normal_ret;
-}
+constexpr auto kJL1AddrOffset = "L1_addr_offset";
+constexpr auto kJL1FusionType = "L1_fusion_type";
+constexpr auto kJL1WorkspaceSize = "L1_workspace_size";
+constexpr auto kJAddrType = "addr_type";
+constexpr auto kJSliceOffset = "slice_offset";
+constexpr auto kJSplitIndex = "split_index";
+constexpr auto kJTotalShape = "total_shape";
+constexpr auto kJValidShape = "valid_shape";
+constexpr auto kJModuleName = "module_name";
+constexpr auto kJPattern = "pattern";
+constexpr auto kJPyModulePath = "py_module_path";
+constexpr auto kJPreBuildOutsAttrs = "prebuild_outs_attrs";
+constexpr auto kJKwdArgs = "kwds_args";
+constexpr auto kJListArgs = "list_args";
 
 bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspore::AnfNode> &anf_node,
                                                   nlohmann::json *kernel_json) {
@@ -117,13 +124,12 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor
   op_info_json[kJAttrs] = attrs_json;
   std::string json_str = op_info_json.dump();
   size_t hash_id = std::hash<std::string>()(json_str);
-  json_name_ = op_name + "_" + std::to_string(hash_id);
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  json_name_ = op_name + "_" + std::to_string(hash_id) + "_" + std::to_string(device_id);
   json_info_ = json_str;
-  if (creater_type_ == PREBUILD) {
-    op_info_json[kJKernelName] = NormalizeFullScopeName(anf_node->fullname_with_scope());
-  } else {
-    op_info_json[kJKernelName] = json_name_;
-  }
+  op_info_json[kJKernelName] = json_name_;
   (*kernel_json)[kJOpInfo] = op_info_json;
   (*kernel_json)[kJFullName] = anf_node->fullname_with_scope();
   if (creater_type_ == SINGLE_BUILD) {
@@ -581,25 +587,25 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &kernel_json, std::vector<si
 
 bool TbeKernelBuild::GenFusionScopeJson(const std::vector<mindspore::AnfNodePtr> &input_nodes,
                                         const std::vector<mindspore::AnfNodePtr> &compute_nodes,
-                                        nlohmann::json *fusion_str, std::string *fusion_kernel) {
-  MS_EXCEPTION_IF_NULL(fusion_str);
-  MS_EXCEPTION_IF_NULL(fusion_kernel);
+                                        nlohmann::json *fusion_json, std::string *fusion_kernel_name) {
+  MS_EXCEPTION_IF_NULL(fusion_json);
+  MS_EXCEPTION_IF_NULL(fusion_kernel_name);
   // get input layer info
   std::vector<std::vector<mindspore::AnfNodePtr>> input_layers;
   std::map<const AnfNodePtr, FusionDataType> spec_data_input;
   if (!GetInputLayers(input_nodes, compute_nodes, &input_layers, &spec_data_input)) {
     return false;
   }
-  // gen fusion scopre_op jsom
+  // gen fusion scope_op json
   std::vector<nlohmann::json> compute_list;
-  (*fusion_kernel) = kFusionKernelNamePrfix;
+  (*fusion_kernel_name) = kFusionKernelNamePrfix;
   // index: fusion build option input record, next one from 0
   static size_t index = 0;
   auto layer_iter = input_layers.begin();
   auto compute_op_iter = compute_nodes.begin();
   for (; compute_op_iter != compute_nodes.end(); ++compute_op_iter, ++layer_iter) {
     nlohmann::json compute_op_str;
-    (void)GenFusionComputeJson(*compute_op_iter, &layer_iter, &compute_op_str, fusion_kernel, &index);
+    (void)GenFusionComputeJson(*compute_op_iter, &layer_iter, &compute_op_str, fusion_kernel_name, &index);
     compute_list.push_back(compute_op_str);
   }
   index = 0;
@@ -617,36 +623,122 @@ bool TbeKernelBuild::GenFusionScopeJson(const std::vector<mindspore::AnfNodePtr>
   }
   index = 0;
   data_list.insert(data_list.end(), compute_list.begin(), compute_list.end());
-  (*fusion_str)[kFusionOpList] = data_list;
+  (*fusion_json)[kFusionOpList] = data_list;
   return true;
 }
 
+void TbeKernelBuild::GenPreDescJson(nlohmann::json *output_desc) {
+  MS_EXCEPTION_IF_NULL(output_desc);  // GenDescJson calls this first; the four keys below get fixed defaults
+  (*output_desc)[kJL1AddrOffset] = 0;
+  (*output_desc)[kJL1FusionType] = -1;
+  (*output_desc)[kJL1WorkspaceSize] = -1;
+  (*output_desc)[kJAddrType] = 0;
+}
+
+void TbeKernelBuild::GenFusionComputeCommonJson(const mindspore::CNodePtr &cnode, nlohmann::json *compute_op_str,
+                                                std::string *fusion_kernel_name) {
+  MS_EXCEPTION_IF_NULL(cnode);  // cnode is dereferenced below; checked as in GenFusionComputePreBuildJson
+  MS_EXCEPTION_IF_NULL(compute_op_str);
+  MS_EXCEPTION_IF_NULL(fusion_kernel_name);
+  // Fill the fields every fused compute op carries: type, func name, module name, name, pattern, module path.
+  // GetRealOpType replaces special op types for buffer fusion ops.
+  auto type = GetRealOpType(AnfAlgo::GetCNodeName(cnode));
+  (*compute_op_str)[kJtype] = type;
+  tbe::TbeAdapter::NormalizeFuncName(&type);  // type is passed by pointer and may be rewritten in place
+  (*compute_op_str)[kJFuncName] = type;
+  (*compute_op_str)[kJModuleName] = std::string("impl.") + type;
+  (*compute_op_str)[kJName] = cnode->fullname_with_scope();
+  (*compute_op_str)[kJPattern] = GetNodeFusionType(cnode);
+  // NOTE(review): the op implementation path is hard-coded; confirm it matches the installed location.
+  (*compute_op_str)[kJPyModulePath] = "/usr/local/Ascend/opp/op_impl/build_in/ai_core/tbe";
+  (void)fusion_kernel_name->append("_").append(type);  // fused kernel name grows by "_" + type
+}
+
+void TbeKernelBuild::GenFusionComputePreBuildJson(const mindspore::CNodePtr &cnode, nlohmann::json *compute_op_str) {
+  MS_EXCEPTION_IF_NULL(cnode);
+  MS_EXCEPTION_IF_NULL(compute_op_str);
+  // Builds (*compute_op_str)[kJPreBuildOutsAttrs] from two parts: kwds_args (left empty here) and list_args.
+  nlohmann::json json_prebuild_args;
+  json_prebuild_args[kJKwdArgs] = nlohmann::json::object();
+  // list_args: collects the output descs first, then the attr values
+  nlohmann::json json_list_args;
+  // list_args, part 1: one desc per output tensor; its kJDataType value is duplicated under kJDtype
+  auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
+  for (size_t i = 0; i < output_size; ++i) {
+    nlohmann::json output_desc;
+    GenDescJson(cnode, i, i, &output_desc);
+    output_desc[kJDtype] = output_desc[kJDataType];
+    json_list_args.push_back(output_desc);
+  }
+  // list_args, part 2: attr values, taken from the attr json filled in by TbeKernelJsonCreator::GenTbeAttrJson
+  auto op_name = AnfAlgo::GetCNodeName(cnode);
+  auto opinfo = OpLib::FindOp(op_name, OpImplyType::kTBE);
+  MS_EXCEPTION_IF_NULL(opinfo);
+  TbeKernelJsonCreator json_creater(SINGLE_BUILD);
+  nlohmann::json json_attr_args;
+  if (!json_creater.GenTbeAttrJson(cnode, opinfo, &json_attr_args)) {
+    MS_LOG(INFO) << "Fusion warning: get prebuild args of attr failed.";  // only logged; execution continues
+  }
+  for (const auto &attr : json_attr_args) {
+    // only attrs flagged valid are forwarded, and the attr named "isRef" is always left out
+    if (attr[kJName] != "isRef" && attr[kJValid] == true) {
+      json_list_args.push_back(attr[kJValue]);
+    }
+  }
+  json_prebuild_args[kJListArgs] = json_list_args;
+  (*compute_op_str)[kJPreBuildOutsAttrs] = json_prebuild_args;
+}
+
+void TbeKernelBuild::GenSuffixDescJson(nlohmann::json *output_desc) {
+  MS_EXCEPTION_IF_NULL(output_desc);  // GenDescJson calls this last; the keys below get empty/zero defaults
+  (*output_desc)[kJSliceOffset] = nlohmann::json::array();
+  (*output_desc)[kJSplitIndex] = 0;
+  (*output_desc)[kJTotalShape] = nlohmann::json::array();
+  (*output_desc)[kJValidShape] = nlohmann::json::array();
+}
+
+// anf_node: the node whose output desc (type/format/shape ...) is generated
+// node_out_idx: index of the output on anf_node
+// desc_output_idx: the index written into the json as output_index
+// nlohmann::json *output_desc: receives the generated desc
+// FusionDataType fusion_data_type: special-cases the json desc output shape [kFusionAddN, kFusionReLUGradV2]
 void TbeKernelBuild::GenDescJson(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t node_out_idx,
                                  size_t desc_output_idx, nlohmann::json *output_desc, FusionDataType fusion_data_type) {
+  GenPreDescJson(output_desc);
+  // data_type
+  auto type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, node_out_idx);
+  (*output_desc)[kJDataType] = tbe::TypeIdToString(type_id);
+  // name
   std::string output_desc_name = anf_node->fullname_with_scope();
   if (node_out_idx > 0) {
     output_desc_name = output_desc_name + "_" + std::to_string(node_out_idx);
   }
-  (*output_desc)[kJName] = NormalizeFullScopeName(output_desc_name);
-  auto type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, node_out_idx);
-  (*output_desc)[kJDataType] = tbe::TypeIdToString(type_id);
+  (*output_desc)[kJName] = output_desc_name;
+  // ori_format
+  (*output_desc)[kJOriFormat] = kOpFormat_NCHW;
+  // ori_shape
   auto ori_shape = AnfAlgo::GetOutputInferShape(anf_node, node_out_idx);
   if (ori_shape.empty()) {
     ori_shape.emplace_back(1);
   }
   (*output_desc)[kJOriShape] = ori_shape;
+  // !! Note: output_index, only node's output use it
+  (*output_desc)[kJOutputIndex] = desc_output_idx;
+  // shape
   auto shape = AnfAlgo::GetOutputDeviceShape(anf_node, node_out_idx);
   if (shape.empty()) {
     shape.emplace_back(1);
   }
   (*output_desc)[kJShape] = shape;
+  // !! Note: format: only data node's output use it
   auto format = AnfAlgo::GetOutputFormat(anf_node, node_out_idx);
   if (format == kOpFormat_DEFAULT) {
     format = ori_shape.size() == 4 ? kOpFormat_NCHW : kOpFormat_ND;
+  } else if (format == kOpFormat_FRAC_Z) {
+    format = kOpFormat_FRACTAL_Z;
   }
   (*output_desc)[kJFormat] = format;
-  (*output_desc)[kJOriFormat] = kOpFormat_NCHW;
-  (*output_desc)[kJOutputIndex] = desc_output_idx;
+  // special node
   if (fusion_data_type == kFusionAddN && format == kOpFormat_NC1HWC0) {
     std::vector<size_t> spec_shape = {};
     spec_shape.emplace_back(shape[0]);
@@ -663,12 +755,13 @@ void TbeKernelBuild::GenDescJson(const std::shared_ptr<mindspore::AnfNode> &anf_
     (*output_desc)[kJShape] = spec_shape;
     (*output_desc)[kJDataType] = kVTypeBool;
   }
+  GenSuffixDescJson(output_desc);
 }
 
 void TbeKernelBuild::GenReusedOutputDesc(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t index,
                                          size_t output_index, nlohmann::json *output_desc) {
   std::string output_desc_name = anf_node->fullname_with_scope() + "_" + std::to_string(index);
-  (*output_desc)[kJName] = NormalizeFullScopeName(output_desc_name);
+  (*output_desc)[kJName] = output_desc_name;
   (*output_desc)[kJOutputIndex] = output_index;
   std::vector<size_t> shape;
   (*output_desc)[kJShape] = shape;
@@ -692,6 +785,9 @@ bool TbeKernelBuild::GetSpecInputLayers(const std::string &op_name,
   return true;
 }
 
+// <input_nodes> : contains parameter/data nodes; their order may not match the tbe input order;
+// <compute_nodes> : contains cnodes; the order of their inputs may not match the tbe input order;
+// Nodes needing special processing: see tbe_adapter.cc [except: Conv2DBackpropInput]
 bool TbeKernelBuild::GetInputLayers(const std::vector<mindspore::AnfNodePtr> &input_nodes,
                                     const std::vector<mindspore::AnfNodePtr> &compute_nodes,
                                     std::vector<std::vector<mindspore::AnfNodePtr>> *input_layers,
@@ -722,7 +818,7 @@ bool TbeKernelBuild::GetInputLayers(const std::vector<mindspore::AnfNodePtr> &in
         MS_LOG(INFO) << "Fusion info: add compute node's [" << i << "] input: " << input->fullname_with_scope();
         layer.emplace_back((*find_iter));
       } else {
-        MS_LOG(INFO) << "Fusion warnig: this input [" << i << "] may be pre compute(" << input->fullname_with_scope()
+        MS_LOG(INFO) << "Fusion warning: this input [" << i << "] may be pre compute(" << input->fullname_with_scope()
                      << ") node's output.";
       }
     }
@@ -750,8 +846,9 @@ bool TbeKernelBuild::GenFusionDataInputJson(const std::shared_ptr<mindspore::Anf
   MS_EXCEPTION_IF_NULL(data_str);
   MS_EXCEPTION_IF_NULL(index);
   std::vector<nlohmann::json> output_desc_list;
+  // if data_input is null, this is optional input.
   if (!data_input) {
-    MS_LOG(INFO) << "Data input is optional node";
+    MS_LOG(INFO) << "Fusion info: data input is optional node";
     auto name = std::string(kOptional) + std::to_string(*index);
     (*data_str)[kJName] = name;
     nlohmann::json output_desc;
@@ -767,12 +864,16 @@ bool TbeKernelBuild::GenFusionDataInputJson(const std::shared_ptr<mindspore::Anf
     auto kernel_idx = AnfAlgo::VisitKernel(data_input, 0);
     auto real_node = kernel_idx.first;
     size_t real_idx = kernel_idx.second;
-    MS_LOG(INFO) << "Real name " << real_node->fullname_with_scope() << " index:" << real_idx;
+    MS_LOG(INFO) << "Fusion info: Real name: " << real_node->fullname_with_scope() << ". index:" << real_idx;
     // kJOutputDesc
     nlohmann::json output_desc;
     GenDescJson(real_node, real_idx, real_idx, &output_desc, fusion_data_type);
     output_desc_list.push_back(output_desc);
-    (*data_str)[kJName] = NormalizeFullScopeName(real_node->fullname_with_scope());
+    auto full_name = real_node->fullname_with_scope();
+    if (real_idx > 0) {
+      full_name = full_name.append("_").append(std::to_string(real_idx));
+    }
+    (*data_str)[kJName] = full_name;
   }
   (*data_str)[kJOutputDesc] = output_desc_list;
   (*data_str)[kJtype] = "Data";
@@ -808,6 +909,7 @@ bool TbeKernelBuild::IsDynamicInput(const mindspore::CNodePtr &cnode) {
 size_t TbeKernelBuild::GetOptionalInput(const mindspore::CNodePtr &cnode, bool is_dynamic_input) {
   MS_EXCEPTION_IF_NULL(cnode);
   if (is_dynamic_input) {
+    // Node can not have optional & dynamic input.
     return 0;
   }
   MS_EXCEPTION_IF_NULL(cnode);
@@ -831,22 +933,46 @@ std::string TbeKernelBuild::GetRealOpType(const std::string &origin_type) {
   return result;
 }
 
+// Returns the fusion pattern string for cnode's op name; op names missing from the table yield "" (and a log).
+std::string TbeKernelBuild::GetNodeFusionType(const mindspore::CNodePtr &cnode) {
+  MS_EXCEPTION_IF_NULL(cnode);
+  const auto node_type = AnfAlgo::GetCNodeName(cnode);
+  // The table is only ever read, so it is const; as a function-local static it is built once.
+  static const std::map<std::string, std::string> fusion_type_map = {
+    {kConv2DOpName, "Convolution"},         {kBNTrainingReduceOpName, "bn_reduce"},
+    {kBNTrainingUpdateOpName, "bn_update"}, {kReluV2OpName, "ElemWise"},
+    {kTensorAddOpName, "ElemWise"},         {kConv2DBackpropInputOpName, "Conv2d_backprop_input"},
+    {kAddNOpName, "ElemWise"},              {kReluGradV2OpName, "ElemWise"},
+    {kRealDivOpName, "ElemWise"},
+  };
+  const auto iter = fusion_type_map.find(node_type);
+  if (iter == fusion_type_map.end()) {
+    // An unknown op type is not fatal here: it is logged and reported as an empty pattern.
+    MS_LOG(INFO) << "Fusion warning: get node fusion type failed, origin node type: " << node_type
+                 << " return null string.";
+    return "";
+  }
+  return iter->second;
+}
+
 bool TbeKernelBuild::GenFusionComputeInputJson(const mindspore::CNodePtr &cnode,
                                                std::vector<std::vector<mindspore::AnfNodePtr>>::iterator *layer_iter,
                                                std::vector<nlohmann::json> *input_desc_list, size_t *index) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(input_desc_list);
   std::vector<nlohmann::json> input_desc_list_tmp = {};
+  // 1. input json
   bool is_dynamic_input = IsDynamicInput(cnode);
   for (size_t i = 1; i < cnode->inputs().size(); ++i) {
     auto input = cnode->input(i);
     auto kernel_idx = AnfAlgo::VisitKernel(input, 0);
     auto real_node = kernel_idx.first;
     size_t real_idx = kernel_idx.second;
-    MS_LOG(INFO) << "Real name" << real_node->fullname_with_scope() << "index:" << real_idx;
+    MS_LOG(INFO) << "Fusion info: real name: " << real_node->fullname_with_scope() << ". index:" << real_idx;
     nlohmann::json input_desc;
     GenDescJson(real_node, real_idx, real_idx, &input_desc);
     if (is_dynamic_input) {
+      // 2. dynamic input json
       MS_LOG(INFO) << "Node has dynamic input.";
       input_desc[kJDynIndex] = (i - 1);
     }
@@ -854,7 +980,8 @@ bool TbeKernelBuild::GenFusionComputeInputJson(const mindspore::CNodePtr &cnode,
   }
   size_t optional_num = GetOptionalInput(cnode, is_dynamic_input);
   if (optional_num > 0) {
-    MS_LOG(INFO) << "Node has optional input.";
+    // 3. optional input
+    MS_LOG(INFO) << "Fusion info: node has optional input.";
     for (size_t i = 0; i < optional_num; ++i) {
       nlohmann::json optional_input_desc;
       optional_input_desc[kJName] = std::string(kOptional) + std::to_string(*index);
@@ -872,7 +999,7 @@ std::vector<size_t> TbeKernelBuild::GetDescOutputIndex(const std::vector<int> &o
   std::vector<size_t> desc_output_index = {};
   for (size_t idx = 0; idx < output_used_nums.size(); ++idx) {
     auto output_use_num_item = output_used_nums[idx];
-    MS_LOG(INFO) << "Output used num[" << idx << "] = " << output_use_num_item;
+    MS_LOG(INFO) << "Fusion info: output used num[" << idx << "] = " << output_use_num_item;
     desc_output_index.emplace_back(idx);
     if (output_use_num_item > 1) {
       desc_output_index.emplace_back(idx);
@@ -887,7 +1014,7 @@ bool TbeKernelBuild::GenFusionComputeOutputJson(const mindspore::CNodePtr &cnode
   auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
   if (AnfAlgo::HasNodeAttr(kAttrOutputUsedNum, cnode)) {
     auto output_used_nums = AnfAlgo::GetNodeAttr<std::vector<int>>(cnode, kAttrOutputUsedNum);
-    MS_LOG(INFO) << "This node's output has been reused, node name: " << cnode->fullname_with_scope();
+    MS_LOG(INFO) << "Fusion info: this node's output has been reused, node name: " << cnode->fullname_with_scope();
     if (output_used_nums.size() != output_size) {
       MS_LOG(INFO) << "Fusion error: output tenor num(" << output_size << ")"
                    << " is not match output used num(" << output_used_nums.size() << ")";
@@ -930,20 +1057,14 @@ bool TbeKernelBuild::GenFusionComputeJson(const mindspore::AnfNodePtr &compute_n
   // gen output desc
   std::vector<nlohmann::json> output_desc_list;
   if (!GenFusionComputeOutputJson(cnode, &output_desc_list)) {
-    MS_LOG(INFO) << "Fusion Error: gen fusion output desc faild, node full name: " << cnode->fullname_with_scope();
+    MS_LOG(INFO) << "Fusion Error: gen fusion output desc failed, node full name: " << cnode->fullname_with_scope();
     return false;
   }
   (*compute_op_str)[kJOutputDesc] = output_desc_list;
-  // gen others
-  auto origin_type = AnfAlgo::GetCNodeName(cnode);
-  // replace special op type for buffer fusion op
-  auto type = GetRealOpType(origin_type);
-  (*compute_op_str)[kJtype] = type;
-  tbe::TbeAdapter::NormalizeFuncName(&type);
-  (*compute_op_str)[kJFuncName] = type;
-  (*compute_op_str)[kJName] = NormalizeFullScopeName(cnode->fullname_with_scope());
-  (void)(*fusion_kernel_name).append("_");
-  (void)(*fusion_kernel_name).append(type);
+  // gen common desc
+  GenFusionComputeCommonJson(cnode, compute_op_str, fusion_kernel_name);
+  // gen prebuild args
+  GenFusionComputePreBuildJson(cnode, compute_op_str);
   return true;
 }
 
@@ -965,7 +1086,7 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list,
   MS_EXCEPTION_IF_NULL(output_size_list);
   input_size_list->clear();
   output_size_list->clear();
-
+  // cal input size for malloc
   for (const auto &op : fusion_op_list) {
     if (op[kJtype] == "Data") {
       const auto &data_output_desc = op[kJOutputDesc];
@@ -975,23 +1096,23 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list,
         }
         auto ret = GetIOSizeImpl(data_output);
         input_size_list->push_back(ret);
-        MS_LOG(INFO) << "Fusion info: scope input name： " << op[kJName] << ", size: " << ret;
+        MS_LOG(INFO) << "Fusion info: input node name： " << op[kJName] << ", size: " << ret;
       }
     }
   }
-
+  // cal output size for malloc
   for (const auto &output_node : output_nodes) {
     auto kernel_idx = AnfAlgo::VisitKernel(output_node, 0);
     auto real_node = kernel_idx.first;
     size_t real_idx = kernel_idx.second;
-    auto normal_name = NormalizeFullScopeName(real_node->fullname_with_scope());
-    MS_LOG(INFO) << "Fusion info: real node name: " << normal_name << ", real output index: " << real_idx;
+    auto full_name = real_node->fullname_with_scope();
+    MS_LOG(INFO) << "Fusion info: real output node name: " << full_name << ", real output index: " << real_idx;
     for (const auto &op : fusion_op_list) {
-      if (op[kJName] == normal_name) {
+      if (op[kJName] == full_name) {
         auto op_output_desces = op[kJOutputDesc];
         if (output_node != real_node) {
           // tuple_get item
-          MS_LOG(INFO) << "Output is a tuple getitem node";
+          MS_LOG(INFO) << "Fusion info: output is a tuple get_item node";
           auto output_desc = op_output_desces[real_idx];
           if (output_desc[kJShape].empty()) {
             MS_LOG(INFO) << "Fusion error: output_desc's shape is empty. real_index " << real_idx;
@@ -1001,6 +1122,7 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &fusion_op_list,
           output_size_list->push_back(ret);
           MS_LOG(INFO) << "Fusion info: scope output index： " << real_idx << ", size: " << ret;
         } else {
+          MS_LOG(INFO) << "Fusion info: output is self.";
           for (const auto &output_desc : op_output_desces) {
             if (output_desc[kJShape].empty()) {
               MS_LOG(INFO) << "Fusion info: output_desc's shape is empty, may be this node output";
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
index 3a00169632c..d4cfe7866d1 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
@@ -41,8 +41,8 @@ class TbeKernelBuild {
                         std::vector<size_t> *output_size_list);
   // Ub Fuison
   static bool GenFusionScopeJson(const std::vector<AnfNodePtr> &input_nodes,
-                                 const std::vector<AnfNodePtr> &compute_nodes, nlohmann::json *fusion_str,
-                                 std::string *fusion_kernel);
+                                 const std::vector<AnfNodePtr> &compute_nodes, nlohmann::json *fusion_json,
+                                 std::string *fusion_kernel_name);
   static bool GetIOSize(const nlohmann::json &fusion_op_list, const std::vector<AnfNodePtr> &output_nodes,
                         std::vector<size_t> *input_size_list, std::vector<size_t> *output_size_list);
 
@@ -61,9 +61,14 @@ class TbeKernelBuild {
   static std::vector<size_t> GetDescOutputIndex(const std::vector<int> &output_used_nums);
   static bool GenFusionComputeOutputJson(const mindspore::CNodePtr &cnode,
                                          std::vector<nlohmann::json> *output_desc_list);
+  static void GenPreDescJson(nlohmann::json *output_desc);
+  static void GenFusionComputeCommonJson(const mindspore::CNodePtr &cnode, nlohmann::json *compute_op_str,
+                                         std::string *fusion_kernel_name);
+  static void GenFusionComputePreBuildJson(const mindspore::CNodePtr &cnode, nlohmann::json *compute_op_str);
   static void GenDescJson(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t node_out_idx,
                           size_t desc_output_idx, nlohmann::json *output_desc,
                           FusionDataType fusion_data_type = kFusionNormal);
+  static void GenSuffixDescJson(nlohmann::json *output_desc);
   static void GenReusedOutputDesc(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t index,
                                   size_t output_index, nlohmann::json *output_desc);
   static size_t GetIOSizeImpl(const nlohmann::json &desc);
@@ -76,6 +81,7 @@ class TbeKernelBuild {
   static bool IsDynamicInput(const CNodePtr &cnode);
   static size_t GetOptionalInput(const CNodePtr &cnode, bool is_dynamic_input);
   static std::string GetRealOpType(const std::string &origin_type);
+  static std::string GetNodeFusionType(const CNodePtr &cnode);
 };
 
 class TbeKernelJsonCreator {
@@ -84,14 +90,14 @@ class TbeKernelJsonCreator {
   ~TbeKernelJsonCreator() = default;
   bool GenTbeSingleKernelJson(const std::shared_ptr<AnfNode> &anf_node, nlohmann::json *kernel_json);
   std::string json_name() { return json_name_; }
+  bool GenTbeAttrJson(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<OpInfo> &op_info,
+                      nlohmann::json *attrs_json);
 
  private:
   bool GenTbeInputsJson(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<OpInfo> &op_info,
                         nlohmann::json *inputs_json);
   bool GenTbeOutputsJson(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<OpInfo> &op_info,
                          nlohmann::json *outputs_json);
-  bool GenTbeAttrJson(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<OpInfo> &op_info,
-                      nlohmann::json *attrs_json);
   static void ParseAttrValue(const std::string &type, const ValuePtr &value, nlohmann::json *attr_obj);
   bool GenInputDescJson(const std::shared_ptr<AnfNode> &anf_node, size_t real_input_index, bool value,
                         const std::shared_ptr<OpIOInfo> &input_ptr, const string &op_input_name, size_t input_i,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc
index 79a538acd3b..41f03b7f5da 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc
@@ -33,42 +33,6 @@
 namespace mindspore {
 namespace kernel {
 using mindspore::kernel::tbe::TbeUtils;
-
-bool TbeOpParallelPreBuild(const std::vector<AnfNodePtr> &anf_nodes) {
-  auto build_manger = std::make_shared<ParallelBuildManager>();
-  MS_EXCEPTION_IF_NULL(build_manger);
-  for (const auto &anf_node : anf_nodes) {
-    // gen kernel json
-    MS_EXCEPTION_IF_NULL(anf_node);
-    nlohmann::json kernel_json;
-    TbeKernelJsonCreator creator(OP_PRE_COMPILE);
-    if (!creator.GenTbeSingleKernelJson(anf_node, &kernel_json)) {
-      MS_LOG(ERROR) << "GenTbeSingleKernelJson failed";
-      return false;
-    }
-    kernel_json["compile_type"] = "pre_build";
-    // op build
-    auto task_id = build_manger->StartCompileOp(kernel_json);
-    build_manger->SavePreTaskInfo(task_id, anf_node);
-  }
-  while (!build_manger->IsAllPreTaskFinish()) {
-    int task_id = -1;
-    std::string task_result;
-    std::string pre_build_result;
-    auto ret = build_manger->WaitOne(&task_id, &task_result, &pre_build_result);
-    if (!ret) {
-      MS_EXCEPTION(ArgumentError) << "Pre Build Failed. wait one ret:" << ret << ", task id:" << task_id;
-    }
-
-    if (task_result != "Success") {
-      MS_EXCEPTION(ArgumentError) << "task pre compile Failed, task id:" << task_id << ", cause:" << task_result;
-    }
-
-    build_manger->PreTaskFinishProcess(task_id, pre_build_result);
-  }
-  return true;
-}
-
 bool TbeOpParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
   auto build_manger = std::make_shared<ParallelBuildManager>();
   MS_EXCEPTION_IF_NULL(build_manger);
@@ -122,15 +86,8 @@ bool TbeOpParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
   return build_manger->GenSameOpKernelMod();
 }
 
-ParallelBuildManager::ParallelBuildManager() {}
-
 ParallelBuildManager::~ParallelBuildManager() { ResetTaskInfo(); }
 
-void ParallelBuildManager::SavePreTaskInfo(int32_t task_id, const mindspore::AnfNodePtr &anf_node) {
-  MS_LOG(INFO) << "SavePreTaskInfo, task id: " << task_id;
-  pre_task_map_[task_id] = anf_node;
-}
-
 void ParallelBuildManager::SaveTaskInfo(int32_t task_id, const mindspore::AnfNodePtr &anf_node,
                                         const std::string &json_name, const std::vector<size_t> &input_size_list,
                                         const std::vector<size_t> &output_size_list, int32_t scope_id) {
@@ -149,42 +106,11 @@ void ParallelBuildManager::SaveTaskInfo(int32_t task_id, const mindspore::AnfNod
   task_map_[task_id] = task_info;
 }
 
-bool ParallelBuildManager::IsAllPreTaskFinish() const {
-  MS_LOG(INFO) << "wait pre build process task_num: " << pre_task_map_.size();
-  return pre_task_map_.empty();
-}
-
 bool ParallelBuildManager::IsAllTaskFinish() const {
   MS_LOG(INFO) << "wait process task_num: " << task_map_.size();
   return task_map_.empty();
 }
 
-void ParallelBuildManager::PreTaskFinishProcess(int32_t task_id, const std::string &pre_build_result) {
-  auto task_iter = pre_task_map_.find(task_id);
-  if (task_iter == pre_task_map_.end()) {
-    MS_EXCEPTION(ArgumentError) << "can find pre task_id:" << task_id;
-  }
-  auto node = task_iter->second;
-  auto builder =
-    std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(AnfAlgo::GetSelectKernelBuildInfo(node));
-  std::string start_flag = "fusion_pattern_start";
-  std::string end_flag = "fusion_pattern_end";
-  int start = pre_build_result.find(start_flag);
-  int end = pre_build_result.find(end_flag);
-  if (start != -1 && end != -1 && end >= start) {
-    std::string result = pre_build_result.substr(start + start_flag.size(), end - start - start_flag.size());
-    if (result == "") {
-      (void)pre_task_map_.erase(task_iter);
-      return;
-    }
-    transform(result.begin(), result.end(), result.begin(), ::toupper);
-    FusionType fusion_type = tbe::GetFusionType(result);
-    builder->SetFusionType(fusion_type);
-    AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get());
-  }
-  (void)pre_task_map_.erase(task_iter);
-}
-
 std::pair<int32_t, KernelModPtr> ParallelBuildManager::TaskFinishProcess(int32_t task_id, bool set_kernel_mod) {
   auto task_iter = task_map_.find(task_id);
   if (task_iter == task_map_.end()) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h
index a026f186c05..a7a28d45025 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h
@@ -28,7 +28,6 @@
 
 namespace mindspore {
 namespace kernel {
-bool TbeOpParallelPreBuild(const std::vector<AnfNodePtr> &anf_nodes);
 bool TbeOpParallelBuild(const std::vector<AnfNodePtr> &anf_nodes);
 
 struct KernelBuildTaskInfo {
@@ -42,9 +41,8 @@ struct KernelBuildTaskInfo {
 
 class ParallelBuildManager {
  public:
-  ParallelBuildManager();
+  ParallelBuildManager() = default;
   ~ParallelBuildManager();
-  void SavePreTaskInfo(int32_t task_id, const AnfNodePtr &anf_node);
   void SaveTaskInfo(int32_t task_id, const AnfNodePtr &anf_node, const std::string &json_name,
                     const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list,
                     int32_t scope_id = 0);
@@ -54,10 +52,7 @@ class ParallelBuildManager {
   bool SearchInCache(const std::string &json_name, const std::string &processor,
                      const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list,
                      AnfNode *node) const;
-
-  bool IsAllPreTaskFinish() const;
   bool IsAllTaskFinish() const;
-  void PreTaskFinishProcess(int32_t task_id, const std::string &pre_build_result);
   std::pair<int32_t, KernelModPtr> TaskFinishProcess(int32_t task_id, bool set_kernel_mod = true);
   KernelModPtr GenKernelMod(const string &json_name, const string &processor,
                             const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list,
diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
index 5662fdb4467..47b82d1435a 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
@@ -1187,6 +1187,19 @@ TypeId AnfRuntimeAlgorithm::GetPrevNodeOutputPrecision(const AnfNodePtr &node, s
   return GetCNodeOutputPrecision(kernel_with_index.first);
 }
 
+// Tells whether `node` is flagged as dynamic-shape: true only when it is a CNode whose
+// kAttrIsDynamicShape attribute exists and holds true; every other node yields false.
+bool AnfRuntimeAlgorithm::IsDynamicShape(const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);  // guard the dereference below, as IsCondControlKernel does for its node
+  if (!node->isa<CNode>()) {
+    return false;
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  // Probe for the attribute first, so GetNodeAttr is only called when the attribute is present.
+  return AnfAlgo::HasNodeAttr(kAttrIsDynamicShape, cnode) && AnfAlgo::GetNodeAttr<bool>(node, kAttrIsDynamicShape);
+}
+
 bool AnfRuntimeAlgorithm::IsCondControlKernel(const CNodePtr &node) {
   MS_EXCEPTION_IF_NULL(node);
   if (node->inputs().empty()) {
diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
index 2fff066166a..d4a5f00a259 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
@@ -217,6 +217,7 @@ class AnfRuntimeAlgorithm {
   static TypeId GetCNodeOutputPrecision(const AnfNodePtr &node);
   // get fix output precision from prev node, input_idx is the input index of current node related to prev node.
   static TypeId GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx);
+  static bool IsDynamicShape(const AnfNodePtr &node);
   static bool IsCondControlKernel(const CNodePtr &node);
   static bool IsIndependentNode(const CNodePtr &node);
 };
diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index 019fabecbdc..fd240d41cbb 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -445,7 +445,6 @@ void AscendSession::InitRuntimeResource() {
 }
 
 void AscendSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) const {
-  device::ascend::KernelPreBuild(kernel_graph.get());
   MS_LOG(INFO) << "HardwareOptimize start!";
   opt::AscendBackendOptimization(kernel_graph);
   opt::AscendGraphKernelCommonProcess(kernel_graph);
diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc
index d5b76edcf05..833104a1c50 100644
--- a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc
@@ -19,7 +19,8 @@
 #include <vector>
 #include <string>
 #include <memory>
-
+#include <set>
+#include <map>
 #include "runtime/device/ascend/kernel_select_ascend.h"
 #include "runtime/device/kernel_info.h"
 #include "backend/kernel_compiler/kernel.h"
@@ -61,32 +62,6 @@ static kernel::KernelModPtr SerialCompileImpl(const AnfNodePtr &anf_node) {
   return kernel_mod_ptr;
 }
 
-static bool KernelPreBuildParallelCompile(const mindspore::session::KernelGraph *kernel_graph_ptr) {
-  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
-  std::vector<AnfNodePtr> tbe_nodes;
-  for (const auto &anf_node : kernel_graph_ptr->execution_order()) {
-    MS_EXCEPTION_IF_NULL(anf_node);
-    if (!AnfAlgo::IsRealKernel(anf_node)) {
-      continue;
-    }
-    KernelType kernel_type = AnfAlgo::GetKernelType(anf_node);
-    switch (kernel_type) {
-      case KernelType::TBE_KERNEL: {
-        if (AnfAlgo::GetKernelMod(anf_node) == nullptr &&
-            AnfAlgo::GetFusionType(anf_node) == kernel::FusionType::DYNAMIC) {
-          tbe_nodes.push_back(anf_node);
-        }
-        break;
-      }
-      default: {
-        break;
-      }
-    }
-  }
-  bool ret = kernel::TbeOpParallelPreBuild(tbe_nodes);
-  return ret;
-}
-
 static bool KernelBuildParallelCompile(const mindspore::session::KernelGraph *kernel_graph_ptr) {
   MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
   std::vector<AnfNodePtr> tbe_nodes;
@@ -237,12 +212,6 @@ static bool IsAtomicNode(const CNodePtr &kernel_node) {
   return !(workspace_indexs.empty() && output_indexs.empty());
 }
 
-bool KernelPreBuild(const mindspore::session::KernelGraph *kernel_graph_ptr) {
-  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
-  bool ret = device::ascend::KernelPreBuildParallelCompile(kernel_graph_ptr);
-  return ret;
-}
-
 bool KernelBuild(const mindspore::session::KernelGraph *kernel_graph_ptr) {
   MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
   TbeUtils::LoadCache();
diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h
index b478f59c14d..6c41eed460b 100644
--- a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h
+++ b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h
@@ -22,10 +22,6 @@
 namespace mindspore {
 namespace device {
 namespace ascend {
-/**
- * @brief kernel pre build for ascend.
- */
-bool KernelPreBuild(const mindspore::session::KernelGraph *kernel_graph_ptr);
 /**
  * @brief kernel build for ascend.
  */
diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h
index 5a395eb018b..f7d905f649f 100644
--- a/mindspore/ccsrc/utils/utils.h
+++ b/mindspore/ccsrc/utils/utils.h
@@ -32,6 +32,7 @@ namespace mindspore {
 // op name. Op which not exists in operator/ops.h, so define it's name here
 constexpr auto kFour2FiveOpName = "Four2Five";
 constexpr auto kFive2FourOpName = "Five2Four";
+constexpr auto kConv2DOpName = "Conv2D";
 constexpr auto kConvBN1OpName = "ConvBN1";
 constexpr auto kBN2AddReluOpName = "BN2AddRelu";
 constexpr auto kBN2ReLUOpName = "BN2Relu";
@@ -273,6 +274,7 @@ constexpr auto kAttrPadDimSize = "pad_dim_size";
 constexpr auto kAttrNumSegments = "num_segments";
 constexpr auto kAttrBegin = "begin";
 constexpr auto kAttrSize = "size";
+constexpr auto kAttrIsDynamicShape = "is_dynamic_shape";
 
 // attr value
 constexpr auto kValueTargetSwitch = "target_switch";
diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py
index b321db47e08..bb63d4bf32e 100644
--- a/mindspore/ops/_op_impl/aicpu/__init__.py
+++ b/mindspore/ops/_op_impl/aicpu/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 """aicpu ops"""
+from .unique import _unique_aicpu
 from .init_data_set_queue import _init_data_set_queue_aicpu
 from .embedding_lookup import _embedding_lookup_aicpu
 from .padding import _padding_aicpu
diff --git a/mindspore/ops/_op_impl/aicpu/unique.py b/mindspore/ops/_op_impl/aicpu/unique.py
new file mode 100644
index 00000000000..849e9696093
--- /dev/null
+++ b/mindspore/ops/_op_impl/aicpu/unique.py
@@ -0,0 +1,31 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Unique op"""
+from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType
+
+unique_op_info = (AiCPURegOp("Unique")  # one input (x), two outputs (y, idx)
+                  .fusion_type("OPAQUE")
+                  .input(0, "x", "required")
+                  .output(0, "y", "required")
+                  .output(1, "idx", "required")
+                  .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default)
+                  .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.I64_Default)
+                  .get_op_info())
+
+@op_info_register(unique_op_info)
+def _unique_aicpu():
+    """Empty hook for the Unique AiCPU op; the decorator above is handed `unique_op_info`."""
+    return None
diff --git a/mindspore/ops/_op_impl/tbe/matmul.py b/mindspore/ops/_op_impl/tbe/matmul.py
index 0f68fa4c9da..e773191ae88 100644
--- a/mindspore/ops/_op_impl/tbe/matmul.py
+++ b/mindspore/ops/_op_impl/tbe/matmul.py
@@ -17,7 +17,7 @@
 from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
 
 matmul_op_info = TBERegOp("MatMul") \
-    .fusion_type("ELEMWISE") \
+    .fusion_type("OPAQUE") \
     .async_flag(False) \
     .binfile_name("matmul.so") \
     .compute_cost(10) \
diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py
index e4a47fc90c2..05d0b77eaad 100644
--- a/mindspore/ops/operations/__init__.py
+++ b/mindspore/ops/operations/__init__.py
@@ -91,6 +91,7 @@ from ._thor_ops import (CusBatchMatMul, CusCholeskyTrsm, CusFusedAbsMax1, CusImg
 from .sparse_ops import SparseToDense
 
 __all__ = [
+    'Unique',
     'ReverseSequence',
     'EditDistance',
     'CropAndResize',
diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py
index bfc2f316272..c42c2505520 100644
--- a/mindspore/ops/operations/array_ops.py
+++ b/mindspore/ops/operations/array_ops.py
@@ -597,9 +597,9 @@ class Unique(Primitive):
         containing indices of elements in the input coressponding to the output tensor.
 
     Examples:
-        >>> x = Tensor(np.array([1, 2, 5, 2]), mindspore.float32)
+        >>> x = Tensor(np.array([1, 2, 5, 2]), mindspore.int32)
         >>> out = P.Unique()(x)
-        (Tensor([1, 2, 5], mindspore.int32), Tensor([0, 1, 2, 1], mindspore.float32))
+        (Tensor([1, 2, 5], mindspore.int32), Tensor([0, 1, 2, 1], mindspore.int32))
     """
     @prim_attr_register
     def __init__(self):
diff --git a/tests/ut/cpp/stub/tdt/tdt_mock.cc b/tests/ut/cpp/stub/tdt/tdt_mock.cc
index 45725de173a..6b9c6f95a4b 100644
--- a/tests/ut/cpp/stub/tdt/tdt_mock.cc
+++ b/tests/ut/cpp/stub/tdt/tdt_mock.cc
@@ -35,39 +35,5 @@ StatusFactory::StatusFactory() {}
 
 std::mutex& StatusFactory::GetMutex() { return GetInstance()->rwMutex_; }
 
-TsdClient* TsdClient::GetInstance() {
-  static TsdClient instance;
-  return &instance;
-}
-
-/**
- * @ingroup TsdClient
- * @brief 构造函数
- */
-TsdClient::TsdClient() { rankSize_ = 1; }
-
-/**
- * @ingroup TsdClient
- * @brief 析构函数
- */
-TsdClient::~TsdClient() = default;
-
-/**
- * @ingroup TsdClient
- * @brief framework发送拉起hccp和computer process的命令
- * @param [in] phyDeviceId : FMK传入物理ID
- * @param [in] phyDeviceId : FMK传入rankSize
- * @return TDT_OK:成功 或者其他错误码
- */
-TDT_StatusT TsdClient::Open(const uint32_t deviceId, const uint32_t rankSize) { return TDT_OK; }
-
-/**
- * @ingroup TsdClient
- * @brief 通知TsdClient关闭相关资源
- * @param 无
- * @return TDT_OK:成功 或者其他错误码
- */
-TDT_StatusT TsdClient::Close() { return TDT_OK; }
-
 }  // namespace tdt
 #endif  // TDT_MOCK_H