diff --git a/.gitmodules b/.gitmodules index a241b6d69b..df7212b083 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "graphengine"] path = graphengine url = https://gitee.com/mindspore/graphengine.git +[submodule "akg"] + path = akg + url = https://gitee.com/mindspore/akg.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 1051aeb96c..34521d22d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,10 +86,14 @@ if (ENABLE_GE OR ENABLE_D OR ENABLE_TESTCASES) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/third_party/fwkacllib/inc/toolchain) endif() +if (ENABLE_AKG AND ENABLE_D) + add_subdirectory("${CMAKE_SOURCE_DIR}/akg") +endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") add_subdirectory(mindspore/ccsrc) if (ENABLE_TESTCASES) add_subdirectory(tests) endif() -include(cmake/package.cmake) \ No newline at end of file +include(cmake/package.cmake) diff --git a/akg b/akg new file mode 160000 index 0000000000..c460176523 --- /dev/null +++ b/akg @@ -0,0 +1 @@ +Subproject commit c460176523d039c8995f1d71089753725ebc0792 diff --git a/build.sh b/build.sh index dfed66aadf..7676665be7 100755 --- a/build.sh +++ b/build.sh @@ -246,6 +246,9 @@ checkopts "$@" echo "---------------- mindspore: build start ----------------" mkdir -pv "${BUILD_PATH}/package/mindspore/lib" git submodule update --init graphengine +if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" ]]; then + git submodule update --init --recursive akg +fi build_exit() { @@ -308,7 +311,7 @@ build_mindspore() if [[ "X$USE_GLOG" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DUSE_GLOG=ON" fi - if [[ "X$ENABLE_AKG" = "Xon" ]]; then + if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_AKG=ON" fi echo "${CMAKE_ARGS}" diff --git a/cmake/package.cmake b/cmake/package.cmake index 01f7bdabd8..1cff396ef1 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -236,6 +236,16 @@ if (ENABLE_GPU) endif () endif () +if (ENABLE_D AND ENABLE_AKG) + set (AKG_PATH ${CMAKE_SOURCE_DIR}/build/mindspore/akg) + install( + DIRECTORY + ${AKG_PATH}/akg + DESTINATION ${INSTALL_PY_DIR}/.. + COMPONENT mindspore + ) +endif () + if (EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset) install( DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/dataset diff --git a/mindspore/_extends/parallel_compile/akg_compiler/__init__.py b/mindspore/_extends/parallel_compile/akg_compiler/__init__.py new file mode 100644 index 0000000000..e30774307c --- /dev/null +++ b/mindspore/_extends/parallel_compile/akg_compiler/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ diff --git a/mindspore/_extends/parallel_compile/akg_compiler/compiler.py b/mindspore/_extends/parallel_compile/akg_compiler/compiler.py new file mode 100644 index 0000000000..de78aad7e4 --- /dev/null +++ b/mindspore/_extends/parallel_compile/akg_compiler/compiler.py @@ -0,0 +1,35 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Providing akg compile with json""" +import sys +def run_compiler(op_json): + """ + Run AKG compiler to compile op with subprocess, if this process of + compilation failed, an exception will be raised + + Args: + op_json (str): json string of the op + + Returns: + None + """ + p = __import__("akg", globals(), locals(), ['ms'], 0) + func = getattr(p.ms, "compilewithjson") + res = func(op_json) + if not res: + raise ValueError("Compile error") + +if __name__ == "__main__": + run_compiler(sys.argv[1]) diff --git a/mindspore/_extends/parallel_compile/akg_compiler/multi_process_compiler.py b/mindspore/_extends/parallel_compile/akg_compiler/multi_process_compiler.py new file mode 100644 index 0000000000..ffe9c85dc3 --- /dev/null +++ b/mindspore/_extends/parallel_compile/akg_compiler/multi_process_compiler.py @@ -0,0 +1,71 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Providing multi process compile with json""" +import os +import subprocess +import sys +from multiprocessing import Pool, cpu_count + + +def _compile_akg_task(*json_strs): + """ + compile func called in single process + + Parameters: + json_strs: list. List contains multiple kernel infos, suitable for json compile api. + """ + akg_compiler = os.path.join(os.path.split( + os.path.realpath(__file__))[0], "compiler.py") + for json_str in json_strs: + res = subprocess.run( + [sys.executable, akg_compiler, json_str], text=True) + if res.returncode != 0: + raise ValueError("Failed, args: {}!".format(json_str)) + + +def compile_akg_kernel_parallel(json_infos, process, waitime): + """ + compile kernel use multi processes + + Parameters: + json_infos: list. list contain kernel info(task id and json str) + process: int. processes num + waittime: int. max time the function blocked + + Returns: + True for all compile success, False for some failed. 
+ """ + if not isinstance(json_infos, list): + raise ValueError("json_infos must be a list") + if not isinstance(process, int): + raise ValueError("process must be a num") + if not isinstance(waitime, int): + raise ValueError("waittime must be a num") + + if process == 0 and json_infos: + process = 1 + + cpu_proc_num = cpu_count() + max_proc_num = 16 + process = min([cpu_proc_num, max_proc_num, process]) + + args = [[] for _ in range(process)] + for p, info in enumerate(json_infos): + args[p % process].append(info) + + with Pool(processes=process) as pool: + res = pool.starmap_async(_compile_akg_task, args) + res.get(timeout=waitime) + return True diff --git a/mindspore/_extends/parallel_compile/multi_compiler.py b/mindspore/_extends/parallel_compile/multi_compiler.py deleted file mode 100644 index 86e1b684d2..0000000000 --- a/mindspore/_extends/parallel_compile/multi_compiler.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Providing multi process compile with json""" -import json -import math -import os -import subprocess -import sys -from multiprocessing import Pool - - -def _compiletask(platform, *jsons): - """ - compile func called in single process - - Parameters: - platform: str. AKG platform or TBE platform - *jsons: str. json str contain kernel info, suitable for json compile - api - - """ - if platform == "AKG": - p = __import__("_akg", globals(), locals(), ['ms'], 0) - func = getattr(p.ms, "compilewithjson") - for json_item in jsons: - res = func(json_item) - if not res: - raise ValueError("Compile error") - if platform == "TBE": - tbe_compiler = os.path.join(os.path.split(os.path.realpath(__file__))[0], "tbe_compiler", "compiler.py") - for json_item in jsons: - res = subprocess.run([sys.executable, tbe_compiler], input=json_item, text=True) - if res.returncode != 0: - raise ValueError("Tbe compile error") - - -def compilekernelparallel(jsons, process, waitime): - """ - compile kernel use multi processes - - Parameters: - jsons: list. json str list contain kernel info - process: int. processes num - waittime: int. 
max time the function blocked - """ - if not isinstance(jsons, list): - raise ValueError("jsons must be a list") - if not isinstance(process, int): - raise ValueError("process must be a num") - if not isinstance(waitime, int): - raise ValueError("waittime must be a num") - - jsons_akg = [] - jsons_tbe = [] - for json_ in jsons: - j = json.loads(json_) - if j["platform"] == "TBE": - jsons_tbe.append(json_) - continue - if j["platform"] == "AKG": - jsons_akg.append(json_) - continue - raise RuntimeError( - "not support this platform {0}".format(j["platform"])) - if jsons_akg: - process_akg = math.floor(len(jsons)/len(jsons_akg)*process) - else: - process_akg = 0 - - if process_akg == 0 and jsons_akg: - process_akg = 1 - process_tbe = process-process_akg - if process_tbe == 0 and jsons_tbe: - process_tbe = 1 - raise RuntimeWarning("we add a process for compile more operator") - - args = [[] for _ in range(process_akg+process_tbe)] - args_lens = len(args) - for p in range(args_lens): - if p < process_tbe: - args[p].append("TBE") - else: - args[p].append("AKG") - jsons_tbe_lens = len(jsons_tbe) - for p in range(jsons_tbe_lens): - args[p % process_tbe].append(jsons_tbe[p]) - jsons_akg_lens = len(jsons_akg) - for p in range(jsons_akg_lens): - args[process-p % process_akg-1].append(jsons_akg[p]) - for p in range(args_lens): - args[p] = tuple(args[p]) - with Pool(processes=process) as pool: - res = pool.starmap_async(_compiletask, args) - res.get(timeout=waitime) - return True diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 44f78d6216..c9e224080a 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -39,7 +39,7 @@ if(ENABLE_GPU) "device/gpu/*.cu" "kernel/gpu/*.cu" "kernel/akg/gpu/*.cc" - "kernel/akg/akgkernelbuild.cc" + "kernel/akg/akg_kernel_build.cc" "kernel/akg/akg_kernel_attrs_process.cc" ) diff --git a/mindspore/ccsrc/common/trans.cc b/mindspore/ccsrc/common/trans.cc index a9ce32c8df..9cf6eb3a5a 100644 --- a/mindspore/ccsrc/common/trans.cc +++ b/mindspore/ccsrc/common/trans.cc @@ -428,6 +428,10 @@ std::vector TransShapeToDevice(const std::vector &shape, const s auto temp_shape = shape; std::vector device_shape; if (format == kOpFormat_FRAC_NZ) { + if (shape.size() == 1 && (shape[0] == 1 || shape[0] % kCubeSize == 0)) { + // For [1] and [1024] shape we can trait it as NZ shape + return shape; + } if (shape.size() < 2) { MS_LOG(EXCEPTION) << "Format" << format << " is not support shape " << shape.size(); } else { diff --git a/mindspore/ccsrc/debug/anf_ir_dump.cc b/mindspore/ccsrc/debug/anf_ir_dump.cc index 1fd3096e7c..fc32e0fb5f 100644 --- a/mindspore/ccsrc/debug/anf_ir_dump.cc +++ b/mindspore/ccsrc/debug/anf_ir_dump.cc @@ -111,9 +111,15 @@ void DumpGlobalInfoEntry(const FuncGraphPtr &graph, std::ostringstream &buffer) } buffer << "#IR entry : @" << graph->ToString() << "." 
<< graph->debug_info()->get_id() << std::endl; - buffer << "#flags :" << std::endl; - for (const auto &flag : graph->flags()) { - buffer << flag.first << " : " << flag.second << std::endl; + buffer << "#attrs :" << std::endl; + for (const auto &attr : graph->attrs()) { + buffer << attr.first << " : "; + if (attr.second->isa()) { + buffer << GetValue(attr.second); + } else if (attr.second->isa()) { + buffer << GetValue(attr.second); + } + buffer << std::endl; } } @@ -417,10 +423,16 @@ void DumpSubgraph(const OrderedMap fout << std::endl; for (const auto &sg : *sub_graphs) { - fout << "subgraph flag:" << std::endl; + fout << "subgraph attr:" << std::endl; MS_EXCEPTION_IF_NULL(sg.first); - for (const auto &flag : sg.first->flags()) { - fout << flag.first << " : " << flag.second << std::endl; + for (const auto &attr : sg.first->attrs()) { + fout << attr.first << " : "; + if (attr.second->isa()) { + fout << GetValue(attr.second); + } else if (attr.second->isa()) { + fout << GetValue(attr.second); + } + fout << std::endl; } fout << "subgraph @" << sg.first->ToString() << "."; fout << sg.first->debug_info()->get_id() << "("; diff --git a/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc b/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc index 125630fe22..f0bad6b492 100644 --- a/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc +++ b/mindspore/ccsrc/device/ascend/ascend_stream_assign.cc @@ -548,9 +548,15 @@ void AscendStreamAssign::GetNeedActiveStreams(const shared_ptrGetAttr(kStreamNeedActivedFirst); + if (primitive != nullptr) { + value_ptr = primitive->GetAttr(kStreamNeedActivedFirst); + } else { + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(cur_cnode_ptr); + MS_EXCEPTION_IF_NULL(func_graph); + value_ptr = func_graph->get_attr(kStreamNeedActivedFirst); + } if (value_ptr == nullptr) { continue; } diff --git a/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc b/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc index dcc4e6ace0..254c92afbf 100644 --- a/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc +++ b/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc @@ -26,10 +26,12 @@ #include "kernel/kernel.h" #include "kernel/tbe/tbe_kernel_build.h" #include "kernel/tbe/tbe_kernel_parallel_build.h" +#include "kernel/akg/ascend/akg_ascend_kernel_build.h" #include "kernel/aicpu/aicpu_kernel_build.h" #include "kernel/hccl/hccl_kernel_build.h" #include "kernel/rts/rt_kernel_build.h" #include "kernel/tbe/tbe_utils.h" +#include "kernel/common_utils.h" #include "operator/ops.h" #include "session/anf_runtime_algorithm.h" #include "./common.h" @@ -91,6 +93,7 @@ static bool KernelPreBuildParallelCompile(const mindspore::session::KernelGraph static bool KernelBuildParallelCompile(const mindspore::session::KernelGraph *kernel_graph_ptr) { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); std::vector tbe_nodes; + std::vector akg_nodes; std::vector other_nodes; for (const auto &anf_node : kernel_graph_ptr->execution_order()) { MS_EXCEPTION_IF_NULL(anf_node); @@ -105,19 +108,26 @@ static bool KernelBuildParallelCompile(const mindspore::session::KernelGraph *ke } break; } + case KernelType::AKG_KERNEL: { + akg_nodes.push_back(anf_node); + break; + } default: { other_nodes.push_back(anf_node); break; } } } - bool ret = kernel::TbeOpParallelBuild(tbe_nodes); + bool tbe_ret = kernel::TbeOpParallelBuild(tbe_nodes); + bool akg_ret = kernel::AkgAscendKernelParallelBuild(akg_nodes); + auto bin_map = kernel::tbe::KernelMeta::GetInstance(); + (void)bin_map->ReadIndex(kernel::kCceKernelMeta); for (const auto 
&anf_node : other_nodes) { kernel::KernelModPtr kernel_mod_ptr = SerialCompileImpl(anf_node); MS_EXCEPTION_IF_NULL(kernel_mod_ptr); AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); } - return ret; + return tbe_ret && akg_ret; } static std::vector CalCleanZerosSize(const CNodePtr &pre_node) { @@ -234,7 +244,7 @@ void KernelBuildPreprocess(mindspore::session::KernelGraph *kernel_graph) { for (const auto &anf_node : kernel_graph->execution_order()) { std::string apply_function_name = AnfAlgo::GetCNodeName(anf_node); if (apply_function_name == prim::kPrimMaxPoolGrad->name() && - AnfAlgo::GetKernelType(anf_node) == KernelType::AUTO_DIFF_KERNEL) { + AnfAlgo::GetKernelType(anf_node) == KernelType::AKG_KERNEL) { auto clear_zero_prim = std::make_shared(kClearZeroOpName); MS_EXCEPTION_IF_NULL(clear_zero_prim); auto new_value_node = NewValueNode(clear_zero_prim); diff --git a/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc b/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc index 3951e1a132..922f62329d 100644 --- a/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc +++ b/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc @@ -15,16 +15,27 @@ */ #include "device/ascend/kernel_select_ascend.h" + #include #include #include #include +#include #include -#include "kernel/oplib/oplib.h" -#include "kernel/kernel_query.h" -#include "session/anf_runtime_algorithm.h" -#include "utils/context/ms_context.h" +#include +#include + +#include "common/utils.h" #include "debug/anf_ir_dump.h" +#include "operator/ops.h" +#include "ir/func_graph.h" +#include "utils/context/ms_context.h" +#include "session/anf_runtime_algorithm.h" +#include "device/kernel_info.h" +#include "kernel/common_utils.h" +#include "kernel/kernel_query.h" +#include "kernel/oplib/oplib.h" +#include "kernel/kernel_build_info.h" namespace mindspore { namespace device { @@ -121,12 +132,23 @@ void UpdateCurMatchCounts(const kernel::KernelBuildInfo &kernel_build_info, cons } auto pri_match_format = GetPriorityMatchFormat(kernel_node); for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { + auto input_anf_node = kernel_node->input(input_index + 1); + // we do not take ValueNode into consideration in graph kernel. + if (kernel_build_info.kernel_type() == KernelType::AKG_KERNEL) { + if (input_anf_node->isa() && AnfAlgo::GetOutputDeviceDataType(input_anf_node, 0) == kTypeUnknown) { + continue; + } + } auto base_score = AnfAlgo::IsFeatureMapInput(kernel_node, input_index) ? kFeatureMapBaseScore : kWegihtBaseScore; if (kernel_build_info.GetInputFormat(input_index) == AnfAlgo::GetPrevNodeOutputFormat(kernel_node, input_index)) { (*cur_kernelinfo_match_counts)[MATCH_FORMAT_COUNT] += base_score; } - if (kernel_build_info.GetInputDeviceType(input_index) == - AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, input_index)) { + // we match output fix precision first. 
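// Note: "output fix precision" is a dtype the producer pins for its output (the kAttrFixPrecision
// attribute handled later in this patch by kernel_select_graph_kernel.cc). When
// AnfAlgo::GetPrevNodeOutputPrecision() reports such a dtype, candidate kernel infos are scored
// against it; only when it returns kTypeUnknown do we fall back to the dtype already selected on
// the producer's output. A producer could pin it roughly like this (hypothetical usage, not part
// of this patch):
//   prim->AddAttr(kAttrFixPrecision, MakeValue(std::string("float16")));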
+ auto prev_device_type = AnfAlgo::GetPrevNodeOutputPrecision(kernel_node, input_index); + if (prev_device_type == kTypeUnknown) { + prev_device_type = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, input_index); + } + if (kernel_build_info.GetInputDeviceType(input_index) == prev_device_type) { (*cur_kernelinfo_match_counts)[MATCH_DTYPE_COUNT] += base_score; } if (kernel_build_info.GetInputFormat(input_index) == pri_match_format) { @@ -146,41 +168,6 @@ void UpdateCurMatchCounts(const kernel::KernelBuildInfo &kernel_build_info, cons } } -void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { - auto input_kernel_node = AnfAlgo::GetInputNode(kernel_node, input_index); - MS_EXCEPTION_IF_NULL(input_kernel_node); - auto input_with_index = AnfAlgo::VisitKernel(input_kernel_node, 0); - MS_EXCEPTION_IF_NULL(input_with_index.first); - auto real_input_node = input_with_index.first; - if (real_input_node->isa()) { - continue; - } - std::shared_ptr builder = - std::make_shared(); - bool is_ref = false; - auto op_info = mindspore::kernel::OpLib::FindOp(AnfAlgo::GetCNodeName(kernel_node), kernel::kTBE); - if (op_info != nullptr) { - is_ref = op_info->is_ref(); - } - auto ms_context = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(ms_context); - if (ms_context->execution_mode() == kPynativeMode && - AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown) { - continue; - } - // we set special device info of a input tensor. - if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { - std::vector output_format = {selected_kernel_info.GetInputFormat(input_index)}; - builder->SetOutputsFormat(output_format); - std::vector output_type = {AnfAlgo::GetInputDeviceDataType(kernel_node, input_index)}; - builder->SetOutputsDeviceType(output_type); - AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), real_input_node.get()); - } - } -} - void AddSupportMixedPrecisionDataTypeIndex(TypeId data_type, std::vector *support_index) { MS_EXCEPTION_IF_NULL(support_index); int index = kUnSupportMixedDataTypeIndex; @@ -467,6 +454,51 @@ std::vector> FilterRaisedOrReducePrecis } } // namespace +void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { + auto input_kernel_node = AnfAlgo::GetInputNode(kernel_node, input_index); + MS_EXCEPTION_IF_NULL(input_kernel_node); + auto input_with_index = AnfAlgo::VisitKernel(input_kernel_node, 0); + MS_EXCEPTION_IF_NULL(input_with_index.first); + auto real_input_node = input_with_index.first; + if (real_input_node->isa()) { + continue; + } + if (real_input_node->isa() && !AnfAlgo::IsParameterWeight(real_input_node->cast())) { + continue; + } + auto builder = std::make_shared(); + if (IsValueNode(input_kernel_node) && + AnfAlgo::GetOutputDeviceDataType(input_kernel_node, 0) == kTypeUnknown) { + std::vector output_format = {selected_kernel_info.GetInputFormat(input_index)}; + builder->SetOutputsFormat(output_format); + std::vector output_type = {selected_kernel_info.GetInputDeviceType(input_index)}; + builder->SetOutputsDeviceType(output_type); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), input_kernel_node.get()); + continue; + } + // we set special device 
info of a input tensor. + bool is_ref = false; + auto op_info = kernel::OpLib::FindOp(AnfAlgo::GetCNodeName(kernel_node), kernel::kTBE); + if (op_info != nullptr) { + is_ref = op_info->is_ref(); + } + MS_EXCEPTION_IF_NULL(MsContext::GetInstance()); + if (MsContext::GetInstance()->execution_mode() == kPynativeMode && + AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) != kTypeUnknown) { + continue; + } + if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { + std::vector output_format = {selected_kernel_info.GetInputFormat(input_index)}; + builder->SetOutputsFormat(output_format); + std::vector output_type = {selected_kernel_info.GetInputDeviceType(input_index)}; + builder->SetOutputsDeviceType(output_type); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), real_input_node.get()); + } + } +} + KernelSelectStatus SetMatchedKernelInfo(const CNodePtr &kernel_node, const std::vector> &kernel_info_list) { MS_EXCEPTION_IF_NULL(kernel_node); @@ -498,11 +530,17 @@ KernelSelectStatus SetMatchedKernelInfo(const CNodePtr &kernel_node, return select_status; } -KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node) { +KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) { std::vector> kernel_info_list; std::vector> aicpu_kernel_info_list; MS_EXCEPTION_IF_NULL(kernel_node); - kernel::KernelQuery(kernel_node, &kernel_info_list); + if (AnfAlgo::IsGraphKernel(kernel_node)) { + auto func_graph = GetValueNode(kernel_node->input(kAnfPrimitiveIndex)); + MS_EXCEPTION_IF_NULL(func_graph); + SelectGraphKernelInfo(kernel_node, func_graph); + return kStatusAllMatched; + } + kernel::KernelQuery(kernel_node, &kernel_info_list, kernel_type); auto select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list); // If aicore not find valid kernel info reloading aicpu kernel info list to find it if (select_status == kNoMatched) { diff --git a/mindspore/ccsrc/device/ascend/kernel_select_ascend.h b/mindspore/ccsrc/device/ascend/kernel_select_ascend.h index c4c777c18a..7b7a7b9fb9 100644 --- a/mindspore/ccsrc/device/ascend/kernel_select_ascend.h +++ b/mindspore/ccsrc/device/ascend/kernel_select_ascend.h @@ -27,7 +27,10 @@ enum KernelSelectStatus { kStatusReducePrecision = 1, kStatusRaisePrecision = 2, }; -KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node); +KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, + KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE); +void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, const CNodePtr &kernel_node); +void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph); } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/kernel_select_graph_kernel.cc b/mindspore/ccsrc/device/ascend/kernel_select_graph_kernel.cc new file mode 100644 index 0000000000..b57ed1cd1b --- /dev/null +++ b/mindspore/ccsrc/device/ascend/kernel_select_graph_kernel.cc @@ -0,0 +1,516 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/ascend/kernel_select_ascend.h" +#include "session/anf_runtime_algorithm.h" +#include "device/kernel_info.h" +#include "ir/func_graph.h" +#include "kernel/common_utils.h" +#include "kernel/kernel_query.h" +#include "kernel/kernel_build_info.h" + +namespace mindspore { +namespace device { +namespace ascend { + +TypeId GetPrimitivePrecision(const CNodePtr &cnode) { + auto primitive = AnfAlgo::GetCNodePrimitive(cnode); + MS_EXCEPTION_IF_NULL(primitive); + + TypeId except_type = kTypeUnknown; + if (primitive->GetAttr(kAttrFixPrecision) != nullptr) { + auto strExceptDtype = GetValue(primitive->GetAttr(kAttrFixPrecision)); + if (strExceptDtype == "float16") { + except_type = kNumberTypeFloat16; + } else if (strExceptDtype == "float32") { + except_type = kNumberTypeFloat32; + } else { + MS_LOG(EXCEPTION) << "The fix precision must be float16 or float32, but got" << strExceptDtype; + } + } + + return except_type; +} + +void ResetKernelBuildInfo(const CNodePtr &kernel_node) { + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + for (size_t input_index = 0; input_index < input_num; ++input_index) { + auto input_kernel_node = AnfAlgo::GetInputNode(kernel_node, input_index); + MS_EXCEPTION_IF_NULL(input_kernel_node); + auto kernel_with_index = AnfAlgo::VisitKernel(input_kernel_node, 0); + if (!kernel::IsWeightBoundary(kernel_with_index.first)) { + continue; + } + // reset format and dtype. + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + builder.SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + builder.SetOutputsDeviceType(std::vector{kTypeUnknown}); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_kernel_node.get()); + } +} + +void UpdateKernelInfo(const std::vector &node_list) { + for (size_t i = 0; i < node_list.size(); ++i) { + // select nodes in subgraph. 
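// For each node of the fused subgraph: if its primitive carries kAttrFixPrecision, re-query the
// AKG kernel infos and switch to the first candidate whose first-input dtype equals the pinned
// precision (only the first input is considered) and whose input format still matches the
// producer, then refresh the input tensors' device info via SetTensorDeviceInfo.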
+ auto anf_node = node_list[i]; + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto fix_precision_type = GetPrimitivePrecision(cnode); + if (fix_precision_type != kTypeUnknown) { + std::vector> kernel_info_list; + kernel::KernelQuery(cnode, &kernel_info_list, KernelType::AKG_KERNEL); + + for (size_t index = 0; index < kernel_info_list.size(); ++index) + // only math the first input + if (kernel_info_list[index]->GetInputDeviceType(0) == fix_precision_type && + kernel_info_list[index]->GetInputFormat(0) == AnfAlgo::GetPrevNodeOutputFormat(cnode, 0) && + AnfAlgo::GetInputDeviceDataType(cnode, 0) != fix_precision_type) { + auto selected_kernel_info_ptr = kernel_info_list[index]; + ResetKernelBuildInfo(cnode); + AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_info_ptr, cnode.get()); + SetTensorDeviceInfo(*selected_kernel_info_ptr, cnode); + break; + } + } + } +} + +bool CanConvertDefaultShapeToNZ(const std::vector &shape) { + for (size_t i = 1; i <= shape.size(); ++i) { + if (i > 2) { + break; + } + if (shape[shape.size() - i] != 1 && shape[shape.size() - i] % kCubeSize != 0) { + return false; + } + } + return true; +} + +std::vector DefaultToFracNZAxis(const std::vector &ori_shape, const std::vector &axis) { + std::vector frac_nz_axis = axis; + auto shape_len = ori_shape.size(); + for (size_t i = 0; i < axis.size(); ++i) { + auto axis_idx = (frac_nz_axis[i] + shape_len) % shape_len; + if (axis_idx == shape_len - 1) { + frac_nz_axis[i] = axis_idx - 1; + frac_nz_axis.push_back(axis_idx + 2); + } else if (axis_idx == shape_len - 2) { + frac_nz_axis[i] = axis_idx + 1; + frac_nz_axis.push_back(axis_idx + 2); + } else { + frac_nz_axis[i] = axis_idx; + } + } + return frac_nz_axis; +} + +std::vector GetReducedFracNZShape(const std::vector &ori_shape, const std::vector &axis, + bool keep_dims) { + std::vector result; + std::set positive_idx; + for (const auto &a : axis) { + positive_idx.insert(a >= 0 ? 
a : ori_shape.size() + a); + } + for (size_t i = 0; i < ori_shape.size(); ++i) { + if (positive_idx.count(i) == 0) { + result.push_back(ori_shape[i]); + } else if (keep_dims) { + result.push_back(1); + } + } + return result; +} + +void UpdateFracNZReduceOp(const CNodePtr &cnode) { + MS_EXCEPTION_IF_NULL(cnode); + auto input_format = AnfAlgo::GetPrevNodeOutputFormat(cnode, 0); + if (input_format == kOpFormat_FRAC_NZ) { + // Clone primitive to modify it + auto prim = GetCNodePrimitive(cnode); + auto new_prim = std::make_shared(*prim); + auto new_prim_node = NewValueNode(new_prim); + cnode->set_input(0, new_prim_node); + + auto axis_value = new_prim->GetAttr(kAttrAxis); + std::vector default_axis; + if (axis_value->isa()) { + auto value_list = dyn_cast(axis_value); + for (const auto &item : value_list->value()) { + if (item->isa()) { + default_axis.push_back(GetValue(item)); + } + } + } else if (axis_value->isa()) { + auto value_tuple = dyn_cast(axis_value); + for (const auto &item : value_tuple->value()) { + if (item->isa()) { + default_axis.push_back(GetValue(item)); + } + } + } else { + MS_LOG(ERROR) << "Axis attr type is not correct!"; + } + auto infer_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, 0); + std::vector frac_nz_axis = DefaultToFracNZAxis(infer_shape, default_axis); + AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue>(frac_nz_axis), cnode); + auto output_shape = AnfAlgo::GetOutputInferShape(cnode, 0); + if (output_shape.size() == 1) { + AnfAlgo::SetNodeAttr(kAttrOutputDefault, MakeValue(true), cnode); + } + } +} + +void GetDefaultFormat(const CNodePtr &kernel_node, std::string *default_format, bool *use_same_format) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(default_format); + MS_EXCEPTION_IF_NULL(use_same_format); + std::unordered_map all_input_formats; + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + for (size_t i = 0; i < input_num; ++i) { + auto input_kernel_node = AnfAlgo::VisitKernel(kernel_node->input(i + 1), 0).first; + MS_EXCEPTION_IF_NULL(input_kernel_node); + if (!input_kernel_node->isa()) { + auto pre_format = AnfAlgo::GetPrevNodeOutputFormat(kernel_node, i); + ++all_input_formats[pre_format]; + continue; + } + auto para = input_kernel_node->cast(); + MS_EXCEPTION_IF_NULL(para); + if (AnfAlgo::GetOutputDeviceDataType(para, 0) != kTypeUnknown) { + auto pre_format = AnfAlgo::GetOutputFormat(para, 0); + ++all_input_formats[pre_format]; + continue; + } + *use_same_format = false; + } + + if (all_input_formats.empty()) { + // all inputs are parameter. 
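// No producer has a selected format yet (every input is an as-yet unselected weight Parameter),
// so fall back to NC1HWC0 as the subgraph-wide default. Otherwise the most frequent producer
// format wins, with kOpFormat_DEFAULT preferred on ties by cmp_func below.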
+ *default_format = kOpFormat_NC1HWC0; + } else { + std::vector> pairs; + for (auto iter = all_input_formats.begin(); iter != all_input_formats.end(); ++iter) { + pairs.push_back(std::make_pair(iter->first, iter->second)); + } + auto cmp_func = [](const std::pair &a, const std::pair &b) { + if (a.second != b.second) { + return a.second > b.second; + } else if (a.first == kOpFormat_DEFAULT) { + return a.second + 1 > b.second; + } else if (b.first == kOpFormat_DEFAULT) { + return a.second > b.second + 1; + } + return a.second > b.second; + }; + std::sort(pairs.begin(), pairs.end(), cmp_func); + *default_format = pairs.begin()->first; + } + + for (size_t i = 0; i < input_num; ++i) { + auto input_kernel_node = AnfAlgo::VisitKernel(kernel_node->input(i + 1), 0).first; + MS_EXCEPTION_IF_NULL(input_kernel_node); + if (!input_kernel_node->isa() || + AnfAlgo::GetOutputDeviceDataType(input_kernel_node, 0) != kTypeUnknown) { + continue; + } + auto weight_infer_shape = AnfAlgo::GetOutputInferShape(input_kernel_node, 0); + if (weight_infer_shape.size() < 2 && *default_format == kOpFormat_FRAC_NZ) { + *default_format = kOpFormat_DEFAULT; + *use_same_format = true; + break; + } + } +} + +void UpdateGraphKernelInputsKernelInfo(const CNodePtr &kernel_node, const std::vector &input_list, + const std::string &default_format, bool use_same_format, + std::vector *graph_input_format, + std::vector *graph_input_type) { + MS_EXCEPTION_IF_NULL(graph_input_format); + MS_EXCEPTION_IF_NULL(graph_input_type); + // We set same format to all inputs of graph kernel subgraph, and process this latter. + // We set dtype to inputs of graph kernel subgraph same as infer dtypes. + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + for (size_t i = 0; i < input_num; ++i) { + auto input_kernel_node = AnfAlgo::VisitKernel(kernel_node->input(i + 1), 0).first; + MS_EXCEPTION_IF_NULL(input_kernel_node); + if (use_same_format) { + bool can_convert = true; + if (default_format == kOpFormat_FRAC_NZ) { + auto infer_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i); + if (!CanConvertDefaultShapeToNZ(infer_shape)) { + MS_LOG(WARNING) << "Shape can't be converted to frac nz shape, so use default format instead"; + can_convert = false; + } + } + if (can_convert) { + graph_input_format->push_back(default_format); + } else { + graph_input_format->push_back(kOpFormat_DEFAULT); + } + graph_input_type->push_back(AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, i)); + continue; + } + + if (!input_kernel_node->isa()) { + // subgraph parameter from output of other nodes. + graph_input_format->push_back(AnfAlgo::GetPrevNodeOutputFormat(kernel_node, i)); + graph_input_type->push_back(AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, i)); + continue; + } + + auto para = input_kernel_node->cast(); + MS_EXCEPTION_IF_NULL(para); + if (AnfAlgo::GetOutputDeviceDataType(para, 0) != kTypeUnknown) { + // parameter already selected. + graph_input_format->push_back(AnfAlgo::GetOutputFormat(para, 0)); + graph_input_type->push_back(AnfAlgo::GetOutputDeviceDataType(para, 0)); + continue; + } + + // weight parameter. 
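// An as-yet unselected weight: give it the chosen default format and keep its inferred dtype; its
// device info is written back through the KernelBuildInfoBuilder loop right below.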
+ graph_input_format->push_back(default_format); + graph_input_type->push_back(AnfAlgo::GetOutputInferDataType(input_kernel_node, 0)); + } + + for (size_t i = 0; i < input_num; ++i) { + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + std::vector outputs_format = {(*graph_input_format)[i]}; + std::vector outputs_device_type = {(*graph_input_type)[i]}; + builder.SetOutputsFormat(outputs_format); + builder.SetOutputsDeviceType(outputs_device_type); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_list[i].get()); + } +} + +void UpdateEquivFormat(const std::vector> &output_index, + const std::vector &node_list, const FuncGraphPtr &func_graph, + const FuncGraphManagerPtr &mng) { + MS_EXCEPTION_IF_NULL(mng); + for (size_t i = 0; i < node_list.size(); ++i) { + // select nodes in subgraph. + auto anf_node = node_list[i]; + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + cnode->set_kernel_info(std::make_shared()); + SelectKernelInfo(cnode, KernelType::AKG_KERNEL); + // Update ReduceSum + if (!IsPrimitiveCNode(cnode, prim::kPrimReduceSum)) { + continue; + } + UpdateFracNZReduceOp(cnode); + // If ReduceSum's output is 1d and not Default format, convert it to Default format + auto out_format = AnfAlgo::GetOutputFormat(cnode, 0); + if (out_format == kOpFormat_DEFAULT || !AnfAlgo::HasNodeAttr(kAttrOutputDefault, cnode)) { + continue; + } + auto infer_shape = AnfAlgo::GetOutputInferShape(cnode, 0); + // Insert EquivFormat node, then select kernel info again + std::vector trans_inputs; + trans_inputs.push_back(NewValueNode(prim::kPrimEquivFormat)); + trans_inputs.push_back(cnode); + CNodePtr trans_node = func_graph->NewCNode(trans_inputs); + AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetPrevNodeOutputInferDataType(cnode, 0)}, + {AnfAlgo::GetOutputInferShape(cnode, 0)}, trans_node.get()); + AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue>({"x"}), trans_node); + + if (trans_node->kernel_info() == nullptr) { + trans_node->set_kernel_info(std::make_shared()); + } + SelectKernelInfo(trans_node, KernelType::AKG_KERNEL); + mng->Replace(cnode, trans_node); + } +} + +void UpdateFormatsAndDtypes(const CNodePtr &kernel_node, const std::vector &node_list, + const std::vector &input_list, const FuncGraphManagerPtr &mng, + const std::string &default_format, std::vector *graph_input_format, + std::vector *graph_input_type) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(mng); + MS_EXCEPTION_IF_NULL(graph_input_format); + MS_EXCEPTION_IF_NULL(graph_input_type); + // update graph input format and dtype use inner ops. + size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); + if (graph_input_format->size() != input_num) { + MS_LOG(EXCEPTION) << "Graph input format size is not equal to input num of cnode[" << kernel_node->DebugString() + << "], [%" << graph_input_format->size() << "] != [%" << input_num << "]"; + } + std::vector need_update(input_num, false); + auto &node_users = mng->node_users(); + for (size_t i = 0; i < input_num; ++i) { + auto &input = input_list[i]; + auto iter = node_users.find(input); + if (iter == node_users.end() || iter->second.empty()) { + continue; + } + for (auto &node_user : iter->second) { + if (node_user.first->kernel_info() == nullptr || + node_user.first->kernel_info()->select_kernel_build_info() == nullptr) { + // maybe not a real kernel. 
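// A user without selected kernel build info cannot constrain the boundary format/dtype, so skip it.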
+ continue; + } + auto user_format = AnfAlgo::GetInputFormat(node_user.first, IntToSize(node_user.second - 1)); + if (user_format != (*graph_input_format)[i]) { + MS_LOG(WARNING) << "Users of input: [" << i << "][" << input->DebugString(2) << " of [" + << kernel_node->DebugString() + << "] selected different format. we use defult: " << default_format; + (*graph_input_format)[i] = default_format; + need_update[i] = true; + } + + if (kernel_node->input(i + 1)->isa()) { + auto user_dtype = AnfAlgo::GetInputDeviceDataType(node_user.first, IntToSize(node_user.second - 1)); + if (user_dtype != (*graph_input_type)[i]) { + TypeId default_dtype = AnfAlgo::GetOutputInferDataType(input, 0); + MS_LOG(WARNING) << "Users of input: [" << i << "][" << input->DebugString(2) << " of [" + << kernel_node->DebugString() + << "] selected different dtype. we use default: " << TypeIdLabel(default_dtype); + (*graph_input_type)[i] = default_dtype; + need_update[i] = true; + } + } + } + } + + for (size_t i = 0; i < input_num; ++i) { + if (!need_update[i]) { + continue; + } + need_update[i] = false; + + MS_LOG(DEBUG) << "Update input format: " << i << " of: [" << kernel_node->DebugString() + << "] to: " << (*graph_input_format)[i]; + MS_LOG(DEBUG) << "Update input dtype: " << i << " of: [" << kernel_node->DebugString() + << "] to: " << TypeIdLabel((*graph_input_type)[i]); + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + std::vector outputs_format = {(*graph_input_format)[i]}; + std::vector outputs_device_type = {(*graph_input_type)[i]}; + builder.SetOutputsFormat(outputs_format); + builder.SetOutputsDeviceType(outputs_device_type); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_list[i].get()); + } + + ResetKernelBuildInfo(kernel_node); + // select nodes in subgraph again. + for (size_t i = 0; i < node_list.size(); ++i) { + auto anf_node = node_list[i]; + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + size_t cnode_input_num = AnfAlgo::GetInputTensorNum(cnode); + for (size_t j = 0; j < cnode_input_num; ++j) { + auto input_node = cnode->input(j + 1); + MS_EXCEPTION_IF_NULL(input_node); + if (!IsValueNode(input_node)) { + continue; + } + // reset format and dtype of const tensor. 
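// Const (value-node) inputs are reset to DEFAULT format / unknown dtype so that the per-node
// re-selection below (SelectKernelInfo with KernelType::AKG_KERNEL) can assign them consistently
// with the formats just propagated to the subgraph boundary.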
+ builder.SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + builder.SetOutputsDeviceType(std::vector{kTypeUnknown}); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), input_node.get()); + } + SelectKernelInfo(node_list[i]->cast(), KernelType::AKG_KERNEL); + } +} + +void SetGraphKernelInfo(const CNodePtr &kernel_node, const std::vector> &output_index, + const std::vector &graph_input_format, + const std::vector &graph_input_type) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector graph_output_format; + std::vector graph_output_type; + for (size_t i = 0; i < output_index.size(); ++i) { + auto const &output = output_index[i]; + graph_output_format.push_back(AnfAlgo::GetOutputFormat(output.first, output.second)); + TypeId output_type(kTypeUnknown); + if (output.first->isa()) { + output_type = AnfAlgo::GetCNodeOutputPrecision(output.first); + } + if (output_type == kTypeUnknown) { + output_type = AnfAlgo::GetOutputDeviceDataType(output.first, output.second); + } + graph_output_type.push_back(output_type); + } + + kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder; + graph_info_builder.SetInputsFormat(graph_input_format); + graph_info_builder.SetInputsDeviceType(graph_input_type); + graph_info_builder.SetOutputsFormat(graph_output_format); + graph_info_builder.SetOutputsDeviceType(graph_output_type); + graph_info_builder.SetProcessor(kernel::Processor::AICORE); + graph_info_builder.SetKernelType(KernelType::AKG_KERNEL); + graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE); + auto graph_selected_info = graph_info_builder.Build(); + MS_EXCEPTION_IF_NULL(graph_selected_info); + AnfAlgo::SetSelectKernelBuildInfo(graph_selected_info, kernel_node.get()); + SetTensorDeviceInfo(*graph_selected_info, kernel_node); +} + +void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(func_graph); + + // collect input info of funcgraph + std::vector node_list; + std::vector input_list; + std::vector output_list; + kernel::GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + if (input_list.size() != kernel_node->inputs().size() - 1) { + MS_EXCEPTION(ArgumentError) << "Input num of funcgraph[" << func_graph->ToString() << "] not equal input of cnode[" + << kernel_node->DebugString() << "], [%" << input_list.size() << "] != [" + << kernel_node->inputs().size() << "]"; + } + + std::string default_format; + bool use_same_format = true; + GetDefaultFormat(kernel_node, &default_format, &use_same_format); + MS_LOG(DEBUG) << "GraphKernel[" << func_graph->ToString() << "] use same input format[" << default_format + << "] for ParameterWeight."; + + std::vector graph_input_format; + std::vector graph_input_type; + UpdateGraphKernelInputsKernelInfo(kernel_node, input_list, default_format, use_same_format, &graph_input_format, + &graph_input_type); + + auto mng = func_graph->manager(); + if (mng == nullptr) { + mng = Manage(func_graph, true); + } + auto output_index = kernel::GetOutputIndex(node_list, input_list, output_list); + UpdateEquivFormat(output_index, node_list, func_graph, mng); + node_list.clear(); + input_list.clear(); + output_list.clear(); + kernel::GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + + // update graph input format and dtype use inner ops. 
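// i.e. reconcile the composite kernel's boundary formats/dtypes with what the inner nodes actually
// selected (UpdateFormatsAndDtypes), then apply any fix_precision attributes (UpdateKernelInfo),
// and finally assemble the fused node's own KernelBuildInfo (SetGraphKernelInfo).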
+ UpdateFormatsAndDtypes(kernel_node, node_list, input_list, mng, default_format, &graph_input_format, + &graph_input_type); + + // set fix_precision for kernel when the me prim has fix_precision attr + UpdateKernelInfo(node_list); + + output_index = kernel::GetOutputIndex(node_list, input_list, output_list); + SetGraphKernelInfo(kernel_node, output_index, graph_input_format, graph_input_type); +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc b/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc index 7f6b424f2e..1f2d1570bb 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/graph_desc_reporter.cc @@ -24,7 +24,7 @@ namespace device { namespace ascend { void GraphDescReporter::ReportData() { for (const auto &node : cnode_list_) { - if (AnfAlgo::GetKernelType(node) != TBE_KERNEL && AnfAlgo::GetKernelType(node) != AUTO_DIFF_KERNEL) { + if (AnfAlgo::GetKernelType(node) != TBE_KERNEL && AnfAlgo::GetKernelType(node) != AKG_KERNEL) { MS_LOG(WARNING) << "Skip non tbe kernel"; continue; } diff --git a/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc b/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc index f05cb8bbdb..0bd66e31ef 100644 --- a/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc +++ b/mindspore/ccsrc/device/ascend/profiling/reporter/task_desc_reporter.cc @@ -31,7 +31,7 @@ void TaskDescReporter::ReportData() { size_t task_index = 0; for (const auto &node : cnode_list_) { - if (AnfAlgo::GetKernelType(node) != TBE_KERNEL && AnfAlgo::GetKernelType(node) != AUTO_DIFF_KERNEL) { + if (AnfAlgo::GetKernelType(node) != TBE_KERNEL && AnfAlgo::GetKernelType(node) != AKG_KERNEL) { MS_LOG(WARNING) << "Skip non tbe kernel"; ++task_index; continue; diff --git a/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc b/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc index 18da966575..3281ba9b5f 100644 --- a/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc +++ b/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc @@ -43,7 +43,37 @@ bool TaskGenerator::GenTasks(const std::vector &anf_node_list, std::ve void TaskGenerator::LaunchAddrCleanKernel(const CNodePtr &anf_node_ptr, AddressPtrList *kernel_inputs) { MS_EXCEPTION_IF_NULL(anf_node_ptr); if (anf_node_ptr->inputs().size() != 2) { - MS_LOG(EXCEPTION) << "atomic Addr clean Node Input nodes not equal 2."; + // akg process + // set atomic clean addr + if (AnfAlgo::HasNodeAttr(kAttrAutomicOutputIndexs, anf_node_ptr)) { + auto clean_output_indexs = AnfAlgo::GetNodeAttr>(anf_node_ptr, kAttrAutomicOutputIndexs); + auto graph = anf_node_ptr->func_graph(); + MS_EXCEPTION_IF_NULL(graph); + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + auto node_users = manager->node_users(); + if (node_users[anf_node_ptr].empty()) { + MS_LOG(EXCEPTION) << "Node users of " << anf_node_ptr->ToString() << " is empty."; + } + auto depend_node = node_users[anf_node_ptr].pop().first; + if (!IsPrimitiveCNode(depend_node, prim::kPrimDepend)) { + MS_LOG(EXCEPTION) << "Checking Depend node failed"; + } + if (node_users[depend_node].empty()) { + MS_LOG(EXCEPTION) << "Node users of " << depend_node->ToString() << " is empty."; + } + auto post_node = node_users[depend_node].pop().first; + for (auto index : clean_output_indexs) { + auto device_address = 
AnfAlgo::GetOutputAddr(post_node, index); + kernel::AddressPtr input = std::make_shared(); + input->addr = device_address->ptr_; + MS_EXCEPTION_IF_NULL(input->addr); + input->size = device_address->size_; + kernel_inputs->push_back(input); + } + MS_LOG(DEBUG) << "AtomicAddClean clean output size: " << clean_output_indexs.size(); + } + return; } MS_EXCEPTION_IF_NULL(anf_node_ptr->inputs()[1]); auto pre_node = (anf_node_ptr->inputs()[1])->cast(); @@ -59,7 +89,7 @@ void TaskGenerator::LaunchAddrCleanKernel(const CNodePtr &anf_node_ptr, AddressP input->size = device_address->size_; kernel_inputs->push_back(input); } - MS_LOG(INFO) << "AtomicAddClean clean output size:" << clean_output_indexs.size(); + MS_LOG(DEBUG) << "AtomicAddClean clean output size:" << clean_output_indexs.size(); } // set clean workspace address if (AnfAlgo::HasNodeAttr(kAttrAutomicWorkspaceSize, pre_node)) { diff --git a/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc b/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc index b6bc22603f..19d2284510 100644 --- a/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc +++ b/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc @@ -16,7 +16,7 @@ #include "device/gpu/gpu_kernel_build.h" #include #include "kernel/kernel.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "kernel/akg/gpu/akg_gpu_kernel_build.h" #include "kernel/gpu/gpu_kernel_factory.h" #include "operator/ops.h" @@ -37,7 +37,7 @@ void GpuBuild(const KernelGraphPtr &kernel_graph) { continue; } - if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) == KernelType::AUTO_DIFF_KERNEL) { + if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) == KernelType::AKG_KERNEL) { auto gpu_kernel_ptr = kernel::AkgGpuKernelBuild(kernel); if (!gpu_kernel_ptr) { MS_LOG(EXCEPTION) << "Build akg kernel op[" << kernel_name << "] failed"; diff --git a/mindspore/ccsrc/device/gpu/kernel_info_setter.cc b/mindspore/ccsrc/device/gpu/kernel_info_setter.cc index 05d6679f76..42e76e2483 100644 --- a/mindspore/ccsrc/device/gpu/kernel_info_setter.cc +++ b/mindspore/ccsrc/device/gpu/kernel_info_setter.cc @@ -184,7 +184,7 @@ void SetKernelInfo(const CNodePtr &kernel_node) { if (!result) { result = SelectAkgKernel(kernel_node, builder->Build()); - kernel_type = AUTO_DIFF_KERNEL; + kernel_type = AKG_KERNEL; } if (!result) { diff --git a/mindspore/ccsrc/ir/anf.cc b/mindspore/ccsrc/ir/anf.cc index 29a74b79ba..3b2402172b 100644 --- a/mindspore/ccsrc/ir/anf.cc +++ b/mindspore/ccsrc/ir/anf.cc @@ -26,6 +26,8 @@ #include "ir/func_graph.h" #include "ir/primitive_base.h" +#include "operator/ops.h" + namespace mindspore { // namespace to support intermediate representation definition CNode::CNode(const std::vector &inputs, const FuncGraphPtr &func_graph) @@ -106,10 +108,14 @@ std::string ValueNode::fullname_with_scope() { bool IsPrimitiveCNode(const AnfNodePtr &node, const PrimitivePtr &value) { MS_EXCEPTION_IF_NULL(node); auto cnode = node->cast(); - if (cnode != nullptr) { + if (cnode == nullptr) { + return false; + } + if (value != nullptr) { return cnode->IsApply(value); } - return false; + const auto &prim = GetValueNode(cnode->input(0)); + return prim != nullptr; } PrimitivePtr GetCNodePrimitive(const AnfNodePtr &node) { diff --git a/mindspore/ccsrc/ir/anf.h b/mindspore/ccsrc/ir/anf.h index c2db17aec5..c2bd3ab208 100644 --- a/mindspore/ccsrc/ir/anf.h +++ b/mindspore/ccsrc/ir/anf.h @@ -124,6 +124,7 @@ class AnfNode : public Base { const KernelInfoDevice *kernel_info() const { return kernel_info_.get(); } KernelInfoDevice 
*kernel_info() { return kernel_info_.get(); } + const KernelInfoDevicePtr &kernel_info_ptr() { return kernel_info_; } void set_kernel_info(const KernelInfoDevicePtr &kernel_info) { kernel_info_ = kernel_info; } AbstractBasePtr abstract() const { return abstract_; } @@ -395,9 +396,9 @@ static S GetValue(const ValuePtr &value) { std::string GetCNodeFuncName(CNodePtr cnode); // used to check whether an AnfNode is a cnode with a kind of Primitive as first input -bool IsPrimitiveCNode(const AnfNodePtr &node, const PrimitivePtr &value); +bool IsPrimitiveCNode(const AnfNodePtr &node, const PrimitivePtr &value = nullptr); -// used to check whether an AnfNode is a cnode with a Primitive as first input +// used to get PrimitivePtr from a cnode first input PrimitivePtr GetCNodePrimitive(const AnfNodePtr &node); // used to check whether an AnfNode is a valuenode having some Primitive value diff --git a/mindspore/ccsrc/ir/anf_extends.cc b/mindspore/ccsrc/ir/anf_extends.cc index 0345ad29f5..432ffdb606 100644 --- a/mindspore/ccsrc/ir/anf_extends.cc +++ b/mindspore/ccsrc/ir/anf_extends.cc @@ -70,7 +70,7 @@ std::string CNode::fullname_with_scope() { } fullname_with_scope_ = name; } else { - // cnode input 0 should be primitive ptr + // cnode input 0 should be primitive ptr or funcgraph ptr auto value_ptr = input(0)->cast(); if (value_ptr == nullptr) { MS_LOG(WARNING) << "Input 0 of cnode is not a value node, its type is " << input(0)->type_name() << "."; @@ -84,11 +84,23 @@ std::string CNode::fullname_with_scope() { return fullname_with_scope_; } - PrimitivePtr prim = GetValue(input_value); + auto prim = input_value->cast(); MS_EXCEPTION_IF_NULL(scope()); - MS_EXCEPTION_IF_NULL(prim); - fullname_with_scope_ = - scope()->name() + "/" + prim->name() + "-op" + id_generator::get_id(shared_from_base()); + fullname_with_scope_ = scope()->name() + "/"; + if (prim != nullptr) { + fullname_with_scope_ += prim->name(); + } else { + auto func_graph = input_value->cast(); + MS_EXCEPTION_IF_NULL(func_graph); + auto fg_flag = func_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + if (fg_flag != nullptr) { + auto fg_name = GetValue(fg_flag); + fullname_with_scope_ += "GraphKernel_" + fg_name; + } else { + fullname_with_scope_ += func_graph->ToString(); + } + } + fullname_with_scope_ += "-op" + id_generator::get_id(shared_from_base()); } return fullname_with_scope_; diff --git a/mindspore/ccsrc/ir/dtype/number.h b/mindspore/ccsrc/ir/dtype/number.h index 3930f51d73..f8a746f8d6 100644 --- a/mindspore/ccsrc/ir/dtype/number.h +++ b/mindspore/ccsrc/ir/dtype/number.h @@ -77,9 +77,9 @@ class Bool : public Number { TypeId generic_type_id() const override { return kNumberTypeBool; } TypePtr DeepCopy() const override { return std::make_shared(); } - std::string ToString() const override { return "Bool_"; } - std::string ToReprString() const override { return "bool_"; } - std::string DumpText() const override { return "Bool_"; } + std::string ToString() const override { return "Bool"; } + std::string ToReprString() const override { return "bool"; } + std::string DumpText() const override { return "Bool"; } }; // Int diff --git a/mindspore/ccsrc/ir/func_graph.cc b/mindspore/ccsrc/ir/func_graph.cc index d5d80eb2f0..cdca98fc61 100644 --- a/mindspore/ccsrc/ir/func_graph.cc +++ b/mindspore/ccsrc/ir/func_graph.cc @@ -34,7 +34,7 @@ namespace mindspore { * Methods of Graph */ FuncGraph::FuncGraph() - : flags_(), + : attrs_(), transforms_(), parameter_default_value_(), seen_(0), @@ -95,13 +95,27 @@ ParameterPtr 
FuncGraph::AddWeightParameter(const std::string &name) { return p; } -bool FuncGraph::has_flag(const std::string &flag) { - if (flags_.count(flag)) { - return flags_[flag]; +bool FuncGraph::has_flag(const std::string &key) { + auto iter = attrs_.find(key); + if (iter != attrs_.cend()) { + if (iter->second->isa()) { + return GetValue(iter->second); + } + MS_LOG(WARNING) << "key " << key << " is not a flag, please use has_attr function."; } return false; } +bool FuncGraph::has_attr(const std::string &key) { + auto iter = attrs_.find(key); + return !(iter == attrs_.cend()); +} + +ValuePtr FuncGraph::get_attr(const std::string &key) { + auto iter = attrs_.find(key); + return iter == attrs_.cend() ? nullptr : iter->second; +} + CNodePtr FuncGraph::NewCNode(const std::vector &inputs) { CNodePtr cnode = std::make_shared(inputs, shared_from_base()); if (has_flag(GRAPH_FLAG_HAS_EFFECT)) { diff --git a/mindspore/ccsrc/ir/func_graph.h b/mindspore/ccsrc/ir/func_graph.h index c66fee2d13..5f09dfe6b5 100644 --- a/mindspore/ccsrc/ir/func_graph.h +++ b/mindspore/ccsrc/ir/func_graph.h @@ -74,6 +74,7 @@ using FuncGraphMap = OrderedMap; const char FUNC_GRAPH_FLAG_IGNORE_VALUES[] = "ignore_values"; const char FUNC_GRAPH_FLAG_DEFER_INLINE[] = "defer_inline"; const char FUNC_GRAPH_FLAG_CORE[] = "core"; +const char FUNC_GRAPH_ATTR_GRAPH_KERNEL[] = "graph_kernel"; const char FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER[] = "spec_param"; namespace abstract { @@ -195,10 +196,19 @@ class FuncGraph : public FuncGraphBase { void set_is_generate(bool generated) { is_generated_ = generated; } bool is_generated() const { return is_generated_; } - bool has_flag(const std::string &flag); - std::unordered_map &flags() { return flags_; } - void set_flags(const std::unordered_map &flags) { flags_ = flags; } - void set_flags(const std::string &key, const bool value) { flags_[key] = value; } + std::unordered_map &attrs() { return attrs_; } + void set_attrs(const std::unordered_map &attrs) { + for (auto &attr : attrs) { + attrs_[attr.first] = attr.second; + } + } + bool has_flag(const std::string &key); + void set_flag(const std::string &key, bool flag) { attrs_[key] = MakeValue(flag); } + void erase_flag(const std::string &key) { (void)attrs_.erase(key); } + + bool has_attr(const std::string &key); + ValuePtr get_attr(const std::string &key); + void set_attr(const std::string &key, const ValuePtr &value) { attrs_[key] = value; } std::unordered_map &transforms() { return transforms_; } void set_transforms(const std::unordered_map &transforms) { @@ -317,7 +327,7 @@ class FuncGraph : public FuncGraphBase { std::unordered_map &make_ref_params() { return make_ref_params_; } - std::unordered_map flags_; + std::unordered_map attrs_; std::unordered_map transforms_; // parameter default value std::map parameter_default_value_; diff --git a/mindspore/ccsrc/ir/func_graph_cloner.cc b/mindspore/ccsrc/ir/func_graph_cloner.cc index 4622bf9ea2..4a0c69d99a 100644 --- a/mindspore/ccsrc/ir/func_graph_cloner.cc +++ b/mindspore/ccsrc/ir/func_graph_cloner.cc @@ -90,6 +90,7 @@ void Cloner::CloneCNode(const AnfNodePtr &node, const FuncGraphPtr &target) { new_node->set_abstract(old_node->abstract()); ScopePtr scope = (node->scope() != kDefaultScope) ? 
node->scope() : this->scope(); new_node->set_scope(scope); + new_node->set_kernel_info(old_node->kernel_info_ptr()); repl_node_[old_node] = new_node; nodes_.emplace_back(old_node, new_node); TraceManager::EndTrace(); @@ -211,7 +212,7 @@ void Cloner::SetFuncGraphInfo(const FuncGraphPtr &func_graph, FuncGraphPtr *cons MS_EXCEPTION_IF_NULL(target_func_graph); TraceManager::DebugTrace(func_graph->debug_info(), target_relation_); *target_func_graph = std::make_shared(); - (*target_func_graph)->set_flags(func_graph->flags()); + (*target_func_graph)->set_attrs(func_graph->attrs()); (*target_func_graph)->set_transforms(func_graph->transforms()); (*target_func_graph)->set_has_vararg(func_graph->has_vararg()); (*target_func_graph)->set_has_kwarg(func_graph->has_kwarg()); @@ -636,9 +637,14 @@ FuncGraphPtr TransformableClone(const FuncGraphPtr &func_graph, const TraceInfoP if (MsContext::GetInstance()->is_multi_graph_sink()) { if (func_graph->has_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES)) { - new_func_graph->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + new_func_graph->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } } + + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + new_func_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, func_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + } + return new_func_graph; } } // namespace mindspore diff --git a/mindspore/ccsrc/ir/func_graph_extends.cc b/mindspore/ccsrc/ir/func_graph_extends.cc index 14998a1eaa..ad7aa6ee0c 100644 --- a/mindspore/ccsrc/ir/func_graph_extends.cc +++ b/mindspore/ccsrc/ir/func_graph_extends.cc @@ -399,8 +399,8 @@ void FuncGraph::ReleaseFullOrderToEffectOrder() { depend_inputs.push_back(*iter); } } - set_flags(GRAPH_FLAG_HAS_EFFECT, false); - set_flags(GRAPH_FLAG_EFFECT_PATIAL_ORDER, true); + set_flag(GRAPH_FLAG_HAS_EFFECT, false); + set_flag(GRAPH_FLAG_EFFECT_PATIAL_ORDER, true); if (!depend_inputs.empty()) { SetEffectDepends(depend_inputs); } diff --git a/mindspore/ccsrc/kernel/CMakeLists.txt b/mindspore/ccsrc/kernel/CMakeLists.txt index 01fc7faa79..ceea6b1a99 100644 --- a/mindspore/ccsrc/kernel/CMakeLists.txt +++ b/mindspore/ccsrc/kernel/CMakeLists.txt @@ -9,6 +9,10 @@ if (ENABLE_D) file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "kernel_query.cc" "kernel_fusion.cc" + "akg/ascend/*.cc" + "akg/akg_kernel_build.cc" + "akg/akg_kernel_attrs_process.cc" + "akg/akg_kernel_metadata.cc" "tbe/*.cc" "aicpu/*.cc" "rts/*.cc" @@ -33,7 +37,7 @@ if (ENABLE_GPU) file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cu" "akg/gpu/*.cc" - "akg/akgkernelbuild.cc" + "akg/akg_kernel_build.cc" "akg/akg_kernel_attrs_process.cc" ) diff --git a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc index 1afe01bd6a..c83994b5f2 100644 --- a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc +++ b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_build.cc @@ -24,7 +24,7 @@ #include #include "device/kernel_runtime.h" #include "kernel/aicpu/aicpu_kernel_mod.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "proto/tensor.pb.h" #include "proto/tensor_shape.pb.h" #include "proto/attr.pb.h" diff --git a/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc b/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc index c9ff41dc55..3a0cc3eb25 100644 --- a/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_attrs_process.cc @@ -79,6 +79,10 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) { 
dst_type = "float32"; } else if (output_type == kFloat16->type_id()) { dst_type = "float16"; + } else if (output_type == kInt32->type_id()) { + dst_type = "int32"; + } else { + MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString(); } AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node); } diff --git a/mindspore/ccsrc/kernel/akg/akgkernelbuild.cc b/mindspore/ccsrc/kernel/akg/akg_kernel_build.cc similarity index 78% rename from mindspore/ccsrc/kernel/akg/akgkernelbuild.cc rename to mindspore/ccsrc/kernel/akg/akg_kernel_build.cc index c0759172a5..1f88bbb89a 100644 --- a/mindspore/ccsrc/kernel/akg/akgkernelbuild.cc +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include #include #include @@ -43,7 +43,9 @@ namespace kernel { constexpr int ME_MAX_KERNEL_NAME_LENGTH = 200; constexpr int32_t ARGS_SIZE = 1; constexpr auto kCompileWithJsonFunc = "compilewithjson"; + // json key +constexpr auto kOpDesc = "op_desc"; constexpr auto kInputDesc = "input_desc"; constexpr auto kShape = "shape"; constexpr auto kDataType = "data_type"; @@ -51,13 +53,24 @@ constexpr auto kOutputDesc = "output_desc"; constexpr auto kName = "name"; constexpr auto kTensorName = "tensor_name"; constexpr auto kValue = "value"; -constexpr auto KInpputNames = "input_names"; +constexpr auto KDynInputSizes = "dyn_input_sizes"; +constexpr auto KInputNames = "input_names"; constexpr auto KInput = "input"; constexpr auto KDtype = "dtype"; -int AkgKernelBuild::op_cnt_ = 0; -std::mutex AkgKernelBuild::op_cnt_mtx_; +namespace { +template +std::string Vector2Str(const std::vector &inputs) { + if (!inputs.empty()) { + std::ostringstream oss; + (void)std::copy(inputs.begin(), inputs.end() - 1, std::ostream_iterator(oss, ", ")); + oss << inputs.back(); + return oss.str(); + } + return ""; +} +} // namespace -std::string PyObjectToStr(PyObject *const PyObj) { +std::string AkgKernelBuild::PyObjectToStr(PyObject *const PyObj) { char *pChar = nullptr; std::string str_res; if (PyObj == nullptr) { @@ -76,6 +89,72 @@ std::string PyObjectToStr(PyObject *const PyObj) { return str_res; } +std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag, + const std::pair &position) { + if (node_json.count(tag) == 0) { + MS_LOG(ERROR) << "Node [" << node_json.dump() << "] has no key [" << tag << "]."; + return ""; + } + + auto const &tag_desc = node_json[tag]; + nlohmann::json first_index; + if (tag == kOutputDesc) { + first_index = tag_desc; + } else if (!tag_desc.is_array() || tag_desc.size() <= position.first) { + MS_LOG(ERROR) << "Node [" << tag_desc.dump() << "] has no enough value [" << position.first << "]."; + return ""; + } else { + first_index = tag_desc[position.first]; + } + + if (!first_index.is_array() || first_index.size() <= position.second) { + MS_LOG(ERROR) << "Node [" << first_index.dump() << "] has no enough value [" << position.second << "]."; + return ""; + } + auto const &second_index = first_index[position.second]; + if (second_index.count(kTensorName) == 0) { + MS_LOG(ERROR) << "Node [" << second_index.dump() << "] has no key [" << kTensorName << "]."; + return ""; + } + + return second_index[kTensorName]; +} + +void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair &position, + nlohmann::json *const node_json) { + MS_EXCEPTION_IF_NULL(node_json); + if (node_json->count(tag) == 0) { + MS_LOG(ERROR) << 
"Node [" << node_json->dump() << "] has no key [" << tag << "]."; + return; + } + + nlohmann::json *tag_desc = &((*node_json)[tag]); + nlohmann::json *first_index; + if (tag == kOutputDesc) { + first_index = tag_desc; + } else if (!tag_desc->is_array() || tag_desc->size() <= position.first) { + MS_LOG(ERROR) << "Node [" << tag_desc->dump() << "] has no enough value [" << position.first << "]."; + return; + } else { + first_index = &((*tag_desc)[position.first]); + } + + if (!first_index->is_array() || first_index->size() <= position.second) { + MS_LOG(ERROR) << "Node [" << first_index->dump() << "] has no enough value [" << position.second << "]."; + return; + } + nlohmann::json *second_index = &((*first_index)[position.second]); + if (second_index->count(kTensorName) == 0) { + MS_LOG(ERROR) << "Node [" << second_index->dump() << "] has no key [" << kTensorName << "]."; + return; + } + (*second_index)[kTensorName] = new_name; + return; +} + +int AkgKernelBuild::op_cnt_ = 0; +std::mutex AkgKernelBuild::op_cnt_mtx_; + std::string AkgKernelBuild::GetProcessor(const AnfNodePtr &anf_node) { MS_EXCEPTION_IF_NULL(anf_node); std::string device; @@ -187,10 +266,7 @@ bool AkgKernelBuild::CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::j for (size_t input_i = 0; input_i < input_tensor_num; input_i++) { // dtype : float16 auto type_id = AnfAlgo::GetInputDeviceDataType(anf_node, real_input_index); - TypePtr type_ptr = TypeIdToType(type_id); - MS_EXCEPTION_IF_NULL(type_ptr); - std::string dtype = type_ptr->ToString(); - dtype = Dtype2String(dtype); + std::string dtype = TypeId2String(type_id); if (dtype.empty()) { MS_LOG(ERROR) << "Op [" << op_name << "] input [" << input_i << "] data type is null. "; return false; @@ -198,13 +274,23 @@ bool AkgKernelBuild::CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::j nlohmann::json input_desc_json; input_desc_json[kDataType] = dtype; input_desc_json[kName] = op_input_name; - input_desc_json[kTensorName] = - op_input_name + "_" + std::to_string(real_input_index) + "_" + std::to_string(input_i); - input_desc_json[kShape] = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index); + input_desc_json[kTensorName] = "input_" + std::to_string(GetInputTensorIdxInc(anf_node, real_input_index)); + auto input_shape = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index); + if (GetInputTensorValue(anf_node, real_input_index, &input_desc_json)) { + MS_LOG(WARNING) << "we take input[" << real_input_index << "] of [" << anf_node->DebugString(2) + << "] as const tensor, shape: [" << Vector2Str(input_shape) + << "], value: " << input_desc_json[kValue]; + + input_shape.clear(); + } + if (input_shape.empty()) { + input_shape.push_back(1); + } + input_desc_json[kShape] = input_shape; input_list.emplace_back(input_desc_json); + real_input_index++; } inputs_json->emplace_back(input_list); - real_input_index++; } return true; } @@ -220,10 +306,7 @@ bool AkgKernelBuild::CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann:: for (size_t i = 0; i < output_tensor_num; i++) { nlohmann::json output_json; auto type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, i); - TypePtr type_ptr = TypeIdToType(type_id); - MS_EXCEPTION_IF_NULL(type_ptr); - std::string dtype = type_ptr->ToString(); - dtype = Dtype2String(dtype); + std::string dtype = TypeId2String(type_id); if (dtype.empty()) { MS_LOG(ERROR) << "Op [" << op_name << "] output [" << i << "] data type is null. 
"; return false; @@ -232,7 +315,7 @@ bool AkgKernelBuild::CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann:: std::string output_name = outputs[i]->name(); output_json[kDataType] = dtype; output_json[kName] = output_name; - output_json[kTensorName] = output_name + "_" + std::to_string(i); + output_json[kTensorName] = "output_" + std::to_string(i) + "_" + std::to_string(GetOutputTensorIdxInc()); output_json[kShape] = AnfAlgo::GetOutputDeviceShape(anf_node, i); outputs_json->push_back(output_json); } @@ -358,15 +441,14 @@ bool AkgKernelBuild::GenerateSingleKernelJson(const AnfNodePtr &anf_node, const MS_EXCEPTION_IF_NULL(op_info_ptr); // get basic params from currentNodeOpDesc - (*node_json)["platform"] = "AKG"; (*node_json)[kName] = op_name; - (*node_json)["fusion_type"] = AnfAlgo::GetFusionType(anf_node); (*node_json)["impl_path"] = op_info_ptr->impl_path(); (*node_json)["process"] = AkgKernelBuild::GetProcessor(anf_node); + (*node_json)["composite"] = false; auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); MS_EXCEPTION_IF_NULL(primitive); - ValuePtr input_names_v = primitive->GetAttr(KInpputNames); + ValuePtr input_names_v = primitive->GetAttr(KInputNames); if (input_names_v == nullptr) { MS_LOG(ERROR) << "ApplyKernel has no input_names, op[" << op_name << "]."; return false; @@ -465,12 +547,12 @@ KernelPackPtr AkgKernelBuild::OpBuild(const std::string &node_json, const AnfNod (void)alarm(0); if (pRes == nullptr) { MS_LOG(ERROR) << "No ret got, failed to call function [" << kCompileWithJsonFunc << "], args:\n(" - << PyObjectToStr(pArg) << ")."; + << AkgKernelBuild::PyObjectToStr(pArg) << ")."; return nullptr; } if (PyObject_IsTrue(pRes) != 1) { MS_LOG(ERROR) << "Illegal ret, failed to call function [" << kCompileWithJsonFunc << "], args:\n(" - << PyObjectToStr(pArg) << ")."; + << AkgKernelBuild::PyObjectToStr(pArg) << ")."; return nullptr; } @@ -513,5 +595,29 @@ KernelPackPtr AkgKernelBuild::BuildByJson(const AnfNodePtr &anf_node, std::vecto << "]"; return kernel_pack; } + +size_t AkgKernelBuild::GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx) { + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (input_idx + 1 >= cnode->inputs().size()) { + MS_EXCEPTION(ArgumentError) << "input_idx [" << input_idx << "] is out of index of inputs of [" + << cnode->inputs().size() - 1 << "][" << cnode->DebugString() << "]"; + } + + auto input_node = cnode->input(input_idx + 1); + if (input_tensor_idx_.find(input_node) == input_tensor_idx_.end()) { + size_t index = input_tensor_idx_.size(); + input_tensor_idx_[input_node] = index; + } + + return input_tensor_idx_[input_node]; +} + +size_t AkgKernelBuild::GetOutputTensorIdxInc() { + size_t idx = output_tensor_idx_++; + return idx; +} + } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/akgkernelbuild.h b/mindspore/ccsrc/kernel/akg/akg_kernel_build.h similarity index 70% rename from mindspore/ccsrc/kernel/akg/akgkernelbuild.h rename to mindspore/ccsrc/kernel/akg/akg_kernel_build.h index f8127843bd..d32bd48ce6 100644 --- a/mindspore/ccsrc/kernel/akg/akgkernelbuild.h +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_build.h @@ -32,29 +32,45 @@ namespace mindspore { namespace kernel { class AkgKernelBuild { public: - AkgKernelBuild() = default; + AkgKernelBuild() { + input_tensor_idx_ = {}; + output_tensor_idx_ = 0; + } ~AkgKernelBuild() = default; KernelPackPtr BuildByJson(const AnfNodePtr &anf_node, std::vector *const input_size, std::vector *const 
output_size); + static std::string GetProcessor(const AnfNodePtr &anf_node); + static std::string PyObjectToStr(PyObject *const PyObj); - private: + protected: bool CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const inputs_json); bool CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann::json *const outputs_json); bool CreateAttrDescJson(const AnfNodePtr &anf_node, const std::string &op_name, const std::shared_ptr &op_info, nlohmann::json *const attrs_json); + KernelPackPtr OpBuild(const std::string &node_json, const AnfNodePtr &anf_node); + int GetOpCntInc(); + size_t GetInputTensorIdxInc(const AnfNodePtr &anf_node, size_t input_idx); + size_t GetOutputTensorIdxInc(); bool GenerateSingleKernelJson(const AnfNodePtr &anf_node, const std::string &op_name, nlohmann::json *const node_json); - KernelPackPtr OpBuild(const std::string &node_json, const AnfNodePtr &anf_node); - int GetOpCntInc(); - std::string GetProcessor(const AnfNodePtr &anf_node); static int op_cnt_; // lock for variable fusionOpCnt in singleton mode static std::mutex op_cnt_mtx_; std::string json_name_; std::string json_info_; + std::unordered_map input_tensor_idx_; + size_t output_tensor_idx_; }; + +bool GetIOSize(const nlohmann::json &node_json, std::vector *const input_size, + std::vector *const output_size); +void SetTensorName(const std::string &tag, const std::string &new_name, const std::pair &position, + nlohmann::json *const node_json); +std::string GetTensorName(const nlohmann::json &node_json, const std::string &tag, + const std::pair &position); + } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.cc b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.cc new file mode 100644 index 0000000000..3515add1e0 --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.cc @@ -0,0 +1,50 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
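Note on the AkgKernelBuild JSON helpers above: CreateInputDescJson/CreateOutputDescJson name tensors positionally rather than by node name. Each distinct input node gets "input_<k>", with k assigned once per node by GetInputTensorIdxInc (so a node feeding several ports reuses its index), and output i gets "output_<i>_<k>" from GetOutputTensorIdxInc. A minimal Python sketch of the resulting single-op JSON shape; the op name, dtypes, shapes and impl_path value are illustrative, not taken from a real kernel:

import json

# Illustrative only: mirrors the layout produced by GenerateSingleKernelJson for a
# hypothetical two-input, one-output op. Keys follow the constants in akg_kernel_build.cc
# (kName, kInputDesc, kOutputDesc, kDataType, kShape, kTensorName, "process", "composite").
single_op_json = {
    "name": "TensorAdd",          # op name (illustrative)
    "impl_path": "",              # taken from the op info registry in the real code
    "process": "aicore",
    "composite": False,
    "input_desc": [               # one inner list per input port
        [{"data_type": "float16", "shape": [16, 16], "name": "x", "tensor_name": "input_0"}],
        [{"data_type": "float16", "shape": [16, 16], "name": "y", "tensor_name": "input_1"}],
    ],
    "output_desc": [
        {"data_type": "float16", "shape": [16, 16], "name": "output", "tensor_name": "output_0_0"},
    ],
}

print(json.dumps(single_op_json, indent=2))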
+ */ + +#include "kernel/akg/akg_kernel_metadata.h" +#include +#include "session/anf_runtime_algorithm.h" +#include "kernel/oplib/oplib.h" +#include "kernel/common_utils.h" + +namespace mindspore { +namespace kernel { +void AkgMetadataInfo(const CNodePtr &kernel_node, + std::vector> *const kernel_info_list) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_info_list); + + std::string op_name = AnfAlgo::GetCNodeName(kernel_node); + for (size_t i = 0; i < support_devices.size(); i++) { + auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAKG); + if (op_info_ptr == nullptr) { + continue; + } + + if (!ParseMetadata(kernel_node, op_info_ptr, Processor(i), kernel_info_list)) { + MS_LOG(WARNING) << "Akg parsed metadata of op[" << op_name << "], device[" << support_devices[i] << "] failed."; + } else { + MS_LOG(DEBUG) << "Akg parsed metadata of op[" << op_name << "], device[" << support_devices[i] << "]."; + break; + } + } + + if (kernel_info_list->empty()) { + MS_LOG(WARNING) << "Akg dose not has metadata of op[" << op_name << "]."; + } +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.h b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.h new file mode 100644 index 0000000000..5e329f0080 --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/akg_kernel_metadata.h @@ -0,0 +1,31 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ +#define MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ + +#include +#include +#include +#include +#include "kernel/kernel_build_info.h" + +namespace mindspore { +namespace kernel { +void AkgMetadataInfo(const CNodePtr &kernel_node, std::vector> *kernel_info_list); +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_KERNEL_AKG_AKG_KERNEL_METADATA_H_ diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.cc b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.cc new file mode 100644 index 0000000000..454b8052ab --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.cc @@ -0,0 +1,385 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/akg/ascend/akg_ascend_kernel_build.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ir/dtype.h" +#include "ir/func_graph.h" +#include "kernel/kernel.h" +#include "kernel/common_utils.h" +#include "kernel/tbe/tbe_utils.h" +#include "kernel/akg/ascend/akg_ascend_kernel_mod.h" +#include "kernel/akg/akg_kernel_attrs_process.h" +#include "session/anf_runtime_algorithm.h" + +namespace mindspore { +namespace kernel { + +constexpr int32_t PARALLEL_ARGS_SIZE = 3; +constexpr int32_t PROCESS_NUM = 16; +constexpr int32_t TIME_OUT = 300; + +constexpr auto kOpDesc = "op_desc"; +constexpr auto kShape = "shape"; +constexpr auto kDataType = "data_type"; +constexpr auto kInputDesc = "input_desc"; +constexpr auto kOutputDesc = "output_desc"; +constexpr auto kTensorName = "tensor_name"; +constexpr auto kCompileAkgKernelParallelFunc = "compile_akg_kernel_parallel"; +constexpr auto kMultiProcModule = "mindspore._extends.parallel_compile.akg_compiler.multi_process_compiler"; + +bool AkgAscendKernelBuilder::CollectJson(const AnfNodePtr &anf_node) { + MS_EXCEPTION_IF_NULL(anf_node); + std::string op_name = AnfAlgo::GetCNodeName(anf_node); + MS_LOG(INFO) << "AKG start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]"; + auto it = kAkgKernelAttrsProcessMap.find(op_name); + if (it != kAkgKernelAttrsProcessMap.end()) { + it->second(anf_node); + } + MS_LOG(INFO) << "Akg start compile, op[" << op_name << "], device[" << AkgKernelBuild::GetProcessor(anf_node) << "]"; + nlohmann::json node_json; + if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) { + MS_LOG(ERROR) << "Op[" << op_name << "] create single kernel json failed."; + } + + kernel_json_ = node_json.dump(); + + if (!GetIOSize(node_json, &input_size_list_, &output_size_list_)) { + MS_LOG(ERROR) << "Cal mem size failed."; + return false; + } + + return true; +} + +bool AkgAscendKernelBuilder::CollectFusedJson(const std::vector &anf_nodes, + const std::vector &input_list, + const std::vector &output_list) { + if (anf_nodes.empty() || input_list.empty()) { + MS_LOG(ERROR) << "Invalid input size, anf_nodes [" << anf_nodes.size() << "], input_list [" << input_list.size() + << "]."; + return false; + } + MS_LOG(INFO) << "anf_nodes [" << output_list.size() << "], input_list [" << anf_nodes.size() << "], output_list [" + << input_list.size() << "]."; + + std::map node_json_map; + + for (auto const &anf_node : anf_nodes) { + MS_EXCEPTION_IF_NULL(anf_node); + std::string op_name = AnfAlgo::GetCNodeName(anf_node); + if (!AnfAlgo::IsRealKernel(anf_node)) { + MS_LOG(ERROR) << "Invalid anf node to build [" << anf_node->fullname_with_scope() << "]."; + return false; + } + auto it = kAkgKernelAttrsProcessMap.find(op_name); + if (it != kAkgKernelAttrsProcessMap.end()) { + it->second(anf_node); + } + + nlohmann::json node_json; + if (!GenerateSingleKernelJson(anf_node, op_name, &node_json)) { + MS_LOG(ERROR) << "Op [" << op_name << "] create single kernel json failed."; + return false; + } + // No need for composite op. 
+ node_json.erase("id"); + node_json.erase("op"); + node_json.erase("composite"); + + auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); + MS_EXCEPTION_IF_NULL(primitive); + + if (primitive->GetAttr("fusion") != nullptr) { + node_json["fusion"] = primitive->GetAttr("fusion")->ToString(); + } + + node_json_map[anf_node] = node_json; + } + + for (auto const &anf_node : anf_nodes) { + std::vector dyn_input_sizes; + auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); + MS_EXCEPTION_IF_NULL(primitive); + + if (primitive->GetAttr(kAttrDynInputSizes) != nullptr) { + dyn_input_sizes = GetValue>(primitive->GetAttr(kAttrDynInputSizes)); + } + + bool is_dynamic_input = !dyn_input_sizes.empty(); + size_t input_num = is_dynamic_input ? dyn_input_sizes.size() : AnfAlgo::GetInputTensorNum(anf_node); + size_t real_input_index = 0; + for (size_t i = 0; i < input_num; ++i) { + size_t input_tensor_num = is_dynamic_input ? IntToSize(dyn_input_sizes[i]) : 1; + for (size_t j = 0; j < input_tensor_num; ++j) { + auto tmp_input = GetKernelInput(anf_node, real_input_index); + std::string tensor_name = GetTensorName(node_json_map[anf_node], kInputDesc, std::make_pair(i, j)); + if (node_json_map.find(tmp_input.first) != node_json_map.end()) { + std::string new_tensor_name = + GetTensorName(node_json_map[tmp_input.first], kOutputDesc, std::make_pair(0, tmp_input.second)); + SetTensorName(kInputDesc, new_tensor_name, std::make_pair(i, j), &(node_json_map[anf_node])); + MS_LOG(DEBUG) << "Update [" << real_input_index << "] input [" << tensor_name << "] of [" + << anf_node->fullname_with_scope() << "] to [" << tmp_input.second << "] output [" + << new_tensor_name << "] of [" << tmp_input.first->fullname_with_scope() << "]."; + } else { + MS_LOG(DEBUG) << "[" << real_input_index << "] input " << tensor_name << "] of [" + << anf_node->fullname_with_scope() << "] is out input."; + } + real_input_index++; + } + } + } + + nlohmann::json fused_node_json; + std::vector node_json_desc; + std::transform(anf_nodes.begin(), anf_nodes.end(), std::back_inserter(node_json_desc), + [&node_json_map](const AnfNodePtr &anf_node) { return node_json_map[anf_node]; }); + fused_node_json[kOpDesc] = node_json_desc; + + nlohmann::json inputs_json; + auto input_index = GetInputIndex(anf_nodes, input_list); + for (size_t i = 0; i < input_index.size(); ++i) { + auto tmp_input = input_index[i]; + auto type_id = AnfAlgo::GetInputDeviceDataType(tmp_input.first, tmp_input.second.first); + std::string dtype = TypeId2String(type_id); + nlohmann::json input_desc_json; + input_desc_json[kTensorName] = GetTensorName(node_json_map[tmp_input.first], kInputDesc, tmp_input.second); + input_desc_json[kDataType] = dtype; + input_desc_json[kShape] = AnfAlgo::GetInputDeviceShape(tmp_input.first, tmp_input.second.first); + inputs_json.emplace_back(std::vector{input_desc_json}); + } + fused_node_json[kInputDesc] = inputs_json; + + nlohmann::json outputs_json; + auto output_index = GetOutputIndex(anf_nodes, input_list, output_list); + for (size_t i = 0; i < output_index.size(); ++i) { + auto tmp_output = output_index[i]; + bool found = false; + nlohmann::json output_desc_json; + for (size_t input_i = 0; input_i < input_list.size(); ++input_i) { + if (tmp_output.first == input_list[input_i]) { + output_desc_json = inputs_json[input_i][0]; + found = true; + break; + } + } + if (!found) { + auto type_id = AnfAlgo::GetOutputDeviceDataType(tmp_output.first, tmp_output.second); + std::string dtype = TypeId2String(type_id); + output_desc_json[kTensorName] = + 
GetTensorName(node_json_map[tmp_output.first], kOutputDesc, std::make_pair(0, tmp_output.second)); + output_desc_json[kDataType] = dtype; + auto output_shape = AnfAlgo::GetOutputDeviceShape(tmp_output.first, tmp_output.second); + if (output_shape.empty()) { + output_shape.push_back(1); + } + output_desc_json[kShape] = output_shape; + } + outputs_json.emplace_back(output_desc_json); + } + fused_node_json[kOutputDesc] = outputs_json; + + size_t hash_id = std::hash()(fused_node_json.dump()); + json_name_ = "Fused_"; + auto fg = anf_nodes[0]->func_graph(); + MS_EXCEPTION_IF_NULL(fg); + auto attr_val = fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + if (attr_val != nullptr) { + auto fg_attr = GetValue(attr_val); + (void)json_name_.append(fg_attr).append("_"); + } + (void)json_name_.append(std::to_string(hash_id)); + fused_node_json["composite_graph"] = fg->ToString(); + fused_node_json["op"] = json_name_; + fused_node_json["platform"] = "AKG"; + fused_node_json["process"] = "aicore"; + fused_node_json["composite"] = true; + + kernel_json_ = fused_node_json.dump(); + + if (!GetIOSize(fused_node_json, &input_size_list_, &output_size_list_)) { + MS_LOG(ERROR) << "Cal mem size failed."; + return false; + } + + return true; +} + +void GenParallelCompileFuncArgs(const std::vector &kernel_jsons, PyObject **p_args) { + MS_EXCEPTION_IF_NULL(p_args); + *p_args = PyTuple_New(PARALLEL_ARGS_SIZE); + + PyObject *arg1 = PyList_New(kernel_jsons.size()); + for (int i = 0; i < PyList_Size(arg1); ++i) { + PyList_SetItem(arg1, i, Py_BuildValue("s", kernel_jsons[i].c_str())); + } + PyObject *arg2 = Py_BuildValue("i", PROCESS_NUM); + PyObject *arg3 = Py_BuildValue("i", TIME_OUT); + + (void)PyTuple_SetItem(*p_args, 0, arg1); + (void)PyTuple_SetItem(*p_args, 1, arg2); + (void)PyTuple_SetItem(*p_args, 2, arg3); +} + +bool AkgOpParallelBuild(const std::vector> &build_args) { + // Remove cached nodes, gather unique nodes, and collect repeated nodes which need postprocessing. + std::vector jsons; + std::unordered_set json_name_set; + std::vector> repeat_nodes; + for (const auto &[builder, anf_node] : build_args) { + MS_EXCEPTION_IF_NULL(anf_node); + auto json_name = builder.json_name(); + MS_LOG(DEBUG) << "Akg start compile op: " << json_name; + auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); + if (cached_kernel_pack != nullptr) { + MS_LOG(DEBUG) << "Use cached kernel, json_name_[" << json_name << "], fullname_with_scope[" + << anf_node->fullname_with_scope() << "]."; + auto kernel_mod_ptr = std::make_shared(cached_kernel_pack); + kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); + continue; + } + + if (json_name_set.count(json_name) != 0) { + repeat_nodes.push_back({builder, anf_node}); + continue; + } + json_name_set.insert(json_name); + auto node_json = builder.kernel_json(); + kernel::SaveJsonInfo(json_name, node_json); + jsons.push_back(node_json); + } + + // No nodes need to be compiled! + if (jsons.empty()) { + return true; + } + + // Try to call the python method to compile nodes in parallel.
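GenParallelCompileFuncArgs above marshals three arguments for the Python helper named by kCompileAkgKernelParallelFunc in kMultiProcModule: the list of kernel-JSON strings, PROCESS_NUM and TIME_OUT. On the Python side the call therefore has roughly this shape (a sketch; the callee here is a stand-in, only its calling convention and truth-tested return value are taken from the code above):

kernel_jsons = ['{"op": "Square_123"}', '{"op": "Fused_demo_456"}']  # illustrative JSON strings
PROCESS_NUM = 16   # matches the C++ constant above
TIME_OUT = 300     # matches the C++ constant above

# Equivalent of PyTuple_New(3) + PyTuple_SetItem(...)
args = (list(kernel_jsons), PROCESS_NUM, TIME_OUT)

def fake_compile_akg_kernel_parallel(json_strs, process, wait_time):
    # Stand-in for the real entry point; the C++ caller only checks the truth value
    # of the result (PyObject_IsTrue), so a boolean success flag is expected.
    return all(isinstance(s, str) and s for s in json_strs)

assert fake_compile_akg_kernel_parallel(*args)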
+ PyObject *p_module = nullptr; + PyObject *p_func = nullptr; + PyObject *p_arg = nullptr; + PyObject *p_res = nullptr; + + p_module = PyImport_ImportModule(kMultiProcModule); + if (p_module == nullptr) { + MS_LOG(ERROR) << "Failed to import [" << kMultiProcModule << "]."; + return false; + } + + p_func = PyObject_GetAttrString(p_module, kCompileAkgKernelParallelFunc); + GenParallelCompileFuncArgs(jsons, &p_arg); + MS_LOG(DEBUG) << "Call function [" << kCompileAkgKernelParallelFunc << "], try to compile " << jsons.size() + << " Akg kernels parallelly."; + p_res = PyEval_CallObject(p_func, p_arg); + if (p_res == nullptr) { + PyErr_Print(); + MS_LOG(ERROR) << "No ret got, failed to call function [" << kCompileAkgKernelParallelFunc << "], args:\n(" + << AkgKernelBuild::PyObjectToStr(p_arg) << ")."; + return false; + } + if (PyObject_IsTrue(p_res) != 1) { + PyErr_Print(); + MS_LOG(ERROR) << "Illegal ret, failed to call function [" << kCompileAkgKernelParallelFunc << "], args:\n(" + << AkgKernelBuild::PyObjectToStr(p_arg) << ")."; + return false; + } + + // All unique done here, cache them and set kernel. + for (const auto &[builder, anf_node] : build_args) { + auto json_name = builder.json_name(); + auto new_kernel_pack = tbe::TbeUtils::InsertCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); + if (new_kernel_pack == nullptr) { + MS_LOG(ERROR) << "Insert to cache failed, json_name_[" << json_name << "], fullname_with_scope[" + << anf_node->fullname_with_scope() << "]."; + return false; + } + auto kernel_mod_ptr = std::make_shared(new_kernel_pack); + kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); + MS_LOG(DEBUG) << "Akg compile " << json_name << " kernel and insert cache successfully!"; + } + + // Handle repeated nodes. 
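The repeated-node handling below relies on the assumption that two nodes with the same json_name describe the same kernel, which holds because the name embeds a hash of the dumped kernel JSON ("Fused_<graph_kernel attr>_<hash>" for composites). Only the first node per name is compiled; the rest are collected in repeat_nodes and attached to the cached result afterwards. A small Python sketch of that grouping, with hypothetical names:

from collections import OrderedDict

# (json_name, node) pairs; the names are hypothetical but follow the
# "Fused_<graph_kernel attr>_<hash>" pattern built in CollectFusedJson above.
build_args = [("Fused_demo_1111", "node_a"), ("Fused_demo_1111", "node_b"), ("Square_2222", "node_c")]

to_compile = OrderedDict()  # first occurrence of each json_name goes to the compiler
repeats = []                # later occurrences are wired to the cached kernel afterwards
for json_name, node in build_args:
    if json_name in to_compile:
        repeats.append((json_name, node))
    else:
        to_compile[json_name] = node

print(list(to_compile.keys()), repeats)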
+ for (const auto &[builder, anf_node] : repeat_nodes) { + auto node_json = builder.kernel_json(); + auto json_name = builder.json_name(); + auto cached_kernel_pack = tbe::TbeUtils::SearchCache(json_name, AkgKernelBuild::GetProcessor(anf_node)); + if (cached_kernel_pack == nullptr) return false; + MS_LOG(INFO) << "Use just compiled kernel, json_name_[" << json_name << "], fullname_with_scope[" + << anf_node->fullname_with_scope() << "]."; + auto kernel_mod_ptr = std::make_shared(cached_kernel_pack); + kernel_mod_ptr->SetInputSizeList(builder.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(builder.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); + } + + return true; +} + +bool AkgAscendKernelParallelBuild(const std::vector &anf_nodes) { + std::vector> json_and_node; + for (const auto &anf_node : anf_nodes) { + MS_EXCEPTION_IF_NULL(anf_node); + AkgAscendKernelBuilder akg_cce_kernel_builder; + KernelPackPtr kernel_pack = nullptr; + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (AnfAlgo::IsGraphKernel(cnode)) { + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(cnode); + auto mng = func_graph->manager(); + if (mng == nullptr) { + mng = Manage(func_graph, true); + func_graph->set_manager(mng); + } + MS_EXCEPTION_IF_NULL(func_graph); + std::vector node_list; + std::vector input_list; + std::vector output_list; + std::string op_name = AnfAlgo::GetCNodeName(anf_node); + MS_LOG(INFO) << "Akg start compile composite op[" << op_name << "]"; + GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + if (!akg_cce_kernel_builder.CollectFusedJson(node_list, input_list, output_list)) { + MS_EXCEPTION(UnknownError) << "Akg build failed composite op[" << op_name << "]."; + } + } else { + if (!akg_cce_kernel_builder.CollectJson(anf_node)) { + MS_EXCEPTION(UnknownError) << "Akg build failed op[" << AnfAlgo::GetCNodeName(anf_node) << "]."; + } + } + json_and_node.push_back({akg_cce_kernel_builder, anf_node}); + } + + if (json_and_node.empty()) { + MS_LOG(DEBUG) << "There is no kernel needed to be compiled."; + return true; + } + + return AkgOpParallelBuild(json_and_node); +} + +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.h b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.h new file mode 100644 index 0000000000..619b583fde --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_build.h @@ -0,0 +1,52 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
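AkgAscendKernelParallelBuild above chooses, per node, between the composite path (a graph-kernel node whose sub-graph is flattened into one fused JSON) and the single-op path, then hands the whole batch to AkgOpParallelBuild. A schematic of that dispatch in Python; the helper callables are hypothetical stand-ins for AnfAlgo::IsGraphKernel, CollectFusedJson, CollectJson and AkgOpParallelBuild, not the real API:

def build_akg_kernels(nodes, is_graph_kernel, collect_fused_json, collect_json, parallel_build):
    """Hypothetical stand-ins: the callables mirror the roles of IsGraphKernel,
    CollectFusedJson, CollectJson and AkgOpParallelBuild in the C++ code above."""
    build_args = []
    for node in nodes:
        if is_graph_kernel(node):
            json_str = collect_fused_json(node)   # fuse the nodes of the sub-graph
        else:
            json_str = collect_json(node)         # single-op JSON
        build_args.append((json_str, node))
    if not build_args:
        return True                               # nothing to compile
    return parallel_build(build_args)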
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ +#define MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ + +#include +#include +#include +#include "ir/anf.h" +#include "kernel/kernel.h" +#include "kernel/akg/akg_kernel_build.h" + +namespace mindspore { +namespace kernel { +class AkgAscendKernelBuilder : public AkgKernelBuild { + public: + AkgAscendKernelBuilder() = default; + ~AkgAscendKernelBuilder() = default; + + bool CollectJson(const AnfNodePtr &anf_node); + bool CollectFusedJson(const std::vector &anf_nodes, const std::vector &input_list, + const std::vector &output_list); + std::string json_name() const { return json_name_; } + std::string kernel_json() const { return kernel_json_; } + const std::vector &input_size_list() const { return input_size_list_; } + const std::vector &output_size_list() const { return output_size_list_; } + + private: + std::string kernel_json_; + std::vector input_size_list_; + std::vector output_size_list_; +}; + +bool AkgAscendKernelParallelBuild(const std::vector &anf_nodes); +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_BUILD_H_ diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc new file mode 100644 index 0000000000..24324f70e0 --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.cc @@ -0,0 +1,181 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/akg/ascend/akg_ascend_kernel_mod.h" +#include +#include +#include +#include +#include +#include +#include +#include "nlohmann/json.hpp" +#include "runtime/rt.h" +#include "utils/log_adapter.h" +#include "utils/convert_utils.h" + +namespace mindspore { +namespace kernel { +using std::fstream; +using std::map; +using std::mutex; +using std::string; +using TbeTaskInfoPtr = std::shared_ptr; +using tbe::KernelManager; +constexpr uint32_t DEFAULT_BLOCK_DIM = 1; +/** + * @brief infotable contain func_stub\blockdim\kernel file buffer + */ +AkgKernelMod::AkgKernelMod(const KernelPackPtr &kernel_pack) : kernel_pack_(kernel_pack) {} + +void AkgKernelMod::SetInputSizeList(const std::vector &size_list) { input_size_list_ = size_list; } + +void AkgKernelMod::SetOutputSizeList(const std::vector &size_list) { output_size_list_ = size_list; } + +void AkgKernelMod::SetWorkspaceSizeList(const std::vector &size_list) { workspace_size_list_ = size_list; } + +const std::vector &AkgKernelMod::GetInputSizeList() const { return input_size_list_; } + +const std::vector &AkgKernelMod::GetOutputSizeList() const { return output_size_list_; } + +const std::vector &AkgKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } + +void DumpData(const std::vector &inputs, const std::vector &outputs) { + const char *dump_data = getenv("MS_KERNEL_DUMP_DATA"); + if (dump_data) { + int idx = 0; + for (const auto &x : inputs) { + std::vector buf(x->size); + if (RT_ERROR_NONE != rtMemcpy(buf.data(), buf.size(), reinterpret_cast(x->addr), x->size, + RT_MEMCPY_DEVICE_TO_HOST)) { + MS_LOG(WARNING) << "Call runtime rtMemcpy error."; + return; + } + + std::string file_name("input_"); + file_name += std::to_string(idx); + std::ofstream file(file_name, std::ios::binary); + if (file.is_open()) { + (void)file.write(buf.data(), SizeToLong(buf.size())); + file.close(); + idx++; + } else { + MS_LOG(ERROR) << "Open file failed."; + return; + } + } + idx = 0; + for (const auto &x : outputs) { + std::vector buf(x->size); + if (RT_ERROR_NONE != rtMemcpy(buf.data(), buf.size(), reinterpret_cast(x->addr), x->size, + RT_MEMCPY_DEVICE_TO_HOST)) { + MS_LOG(WARNING) << "Call runtime rtMemcpy error."; + return; + } + + std::string file_name("output_"); + file_name += std::to_string(idx); + std::ofstream file(file_name, std::ios::binary); + if (file.is_open()) { + (void)file.write(buf.data(), SizeToLong(buf.size())); + file.close(); + idx++; + } else { + MS_LOG(ERROR) << "Open file failed."; + return; + } + } + } +} + +bool AkgKernelMod::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) { + if (stream_ptr == 0) { + MS_LOG(ERROR) << "stream_ptr should not be nullptr."; + return false; + } + + if (kernel_pack_ == nullptr) { + MS_LOG(ERROR) << "kernel pack should not be nullptr."; + return false; + } + + uint32_t block_dim = DEFAULT_BLOCK_DIM; // default blockdim equal to 1. + auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim); + if (func_stub == 0) { + MS_LOG(ERROR) << "GenFuncStub failed."; + return false; + } + + // pack all addresses into a vector. 
+ std::vector runtime_args; + (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtime_args), + [](const AddressPtr &input) -> void * { return input->addr; }); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtime_args), + [](const AddressPtr &output) -> void * { return output->addr; }); + + rtL2Ctrl_t *l2ctrl = nullptr; + auto stream = reinterpret_cast(stream_ptr); + if (RT_ERROR_NONE != rtKernelLaunch(reinterpret_cast(func_stub), block_dim, runtime_args.data(), + SizeToUint(sizeof(void *) * runtime_args.size()), l2ctrl, stream)) { + MS_LOG(ERROR) << "Call runtime rtKernelLaunch error."; + return false; + } + + DumpData(inputs, outputs); + + return true; +} + +std::vector AkgKernelMod::GenTask(const std::vector &inputs, const std::vector &, + const std::vector &outputs, uint32_t stream_id) { + if (kernel_pack_ == nullptr) { + MS_LOG(EXCEPTION) << "kernel pack should not be nullptr."; + } + + std::vector args; + uint32_t args_size = 0; + std::vector sm_desc; + void *binary = nullptr; + uint32_t binary_size = 0; + std::vector meta_data; + std::vector input_data_addrs; + std::vector output_data_addrs; + std::vector workspace_addrs; + + // pack all addresses into a vector. + (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(input_data_addrs), + [](const AddressPtr &input) -> void * { return input->addr; }); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs), + [](const AddressPtr &output) -> void * { return output->addr; }); + + uint32_t block_dim = DEFAULT_BLOCK_DIM; // default blockdim equal to 1. + auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim); + if (func_stub == 0) { + MS_LOG(EXCEPTION) << "GenFuncStub failed."; + } + + std::string stub_func = KernelManager::GetStubFuncName(kernel_pack_); + + MS_LOG(DEBUG) << "The block_dim is:" << block_dim; + + TbeTaskInfoPtr task_info_ptr = make_shared( + stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data, input_data_addrs, + output_data_addrs, workspace_addrs); + return {task_info_ptr}; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.h b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.h new file mode 100644 index 0000000000..18d342f629 --- /dev/null +++ b/mindspore/ccsrc/kernel/akg/ascend/akg_ascend_kernel_mod.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ +#define MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ +#include +#include +#include +#include "kernel/ascend_kernel_mod.h" +#include "kernel/tbe/tbe_utils.h" + +namespace mindspore { +namespace kernel { +class AkgKernelMod : public AscendKernelMod { + public: + explicit AkgKernelMod(const KernelPackPtr &kernel_pack); + ~AkgKernelMod() final {} + + void SetInputSizeList(const std::vector &size_list); + void SetOutputSizeList(const std::vector &size_list); + void SetWorkspaceSizeList(const std::vector &size_list); + const std::vector &GetInputSizeList() const override; + const std::vector &GetOutputSizeList() const override; + const std::vector &GetWorkspaceSizeList() const override; + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override; + std::vector GenTask(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, uint32_t stream_id) override; + + private: + KernelPackPtr kernel_pack_; + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; +}; + +using AkgKernelModPtr = std::shared_ptr; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_AKG_ASCEND_AKG_ASCEND_KERNEL_MOD_H_ diff --git a/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc b/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc index 2bb2cfd267..534e355802 100644 --- a/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc +++ b/mindspore/ccsrc/kernel/akg/gpu/akg_gpu_kernel_build.cc @@ -18,7 +18,7 @@ #include #include #include "kernel/kernel.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "kernel/akg/gpu/akg_gpu_kernel_mod.h" #include "common/utils.h" diff --git a/mindspore/ccsrc/kernel/common_utils.cc b/mindspore/ccsrc/kernel/common_utils.cc index e80037fa6e..3de03069ed 100644 --- a/mindspore/ccsrc/kernel/common_utils.cc +++ b/mindspore/ccsrc/kernel/common_utils.cc @@ -23,6 +23,11 @@ #include "nlohmann/json.hpp" #include "session/anf_runtime_algorithm.h" #include "common/utils.h" +#include "ir/manager.h" +#include "ir/meta_tensor.h" +#include "ir/func_graph.h" +#include "operator/ops.h" +#include "utils/graph_utils.h" namespace mindspore { namespace kernel { @@ -48,12 +53,6 @@ const std::map type_id_str_map = { {TypeId::kNumberTypeBool, "bool"}, }; -const std::map DATATYPE_STRING_MAP{ - {"Float32", "float32"}, {"Float16", "float16"}, {"Int8", "int8"}, {"Int16", "int16"}, - {"UInt16", "uint16"}, {"UInt8", "uint8"}, {"Int32", "int32"}, {"UInt32", "uint32"}, - {"Int64", "int64"}, {"UInt64", "uint64"}, {"Bool_", "bool"}, {"Float64", "double"}, -}; - const std::unordered_map dtype_shortdtype_map_ = { {"float16", "f16"}, {"float32", "f32"}, {"float64", "f64"}, {"int8", "i8"}, {"int16", "i16"}, {"int32", "i32"}, {"int64", "i64"}, {"uint8", "u8"}, {"uint16", "u16"}, {"uint32", "u32"}, {"uint64", "u64"}, {"bool", "bool"}, @@ -243,14 +242,6 @@ TypeId DtypeToTypeId(const std::string &dtypes) { } } -std::string Dtype2String(const std::string &dtypes) { - auto iter = DATATYPE_STRING_MAP.find(dtypes); - if (iter == DATATYPE_STRING_MAP.end()) { - MS_EXCEPTION(ArgumentError) << "Illegal input dtype:" << dtypes; - } - return iter->second; -} - std::string TypeId2String(TypeId type_id) { auto iter = type_id_str_map.find(type_id); if (iter == type_id_str_map.end()) { @@ -361,7 +352,7 @@ bool SetOutputKernelBuilderInfo(const 
std::vector> &ou output_num = 1; } else { if (output_idx < real_output_num) { - MS_LOG(INFO) << "Set output kernel builder info, output type is optional, output index is :" << output_idx; + MS_LOG(DEBUG) << "Set output kernel builder info, output type is optional, output index is :" << output_idx; output_num = 1; } } @@ -403,7 +394,7 @@ void SetKernelBuildInfo(const std::shared_ptrSetKernelType(AUTO_DIFF_KERNEL); + builder->SetKernelType(AKG_KERNEL); } else if (imply_type == kAICPU) { builder->SetKernelType(AICPU_KERNEL); } else { @@ -634,5 +625,256 @@ void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradie } unique_grad->indices_size_ = unique_indices_size + 1; } + +std::pair GetKernelInput(const AnfNodePtr &anf_node, size_t index) { + MS_EXCEPTION_IF_NULL(anf_node); + + if (index >= AnfAlgo::GetInputTensorNum(anf_node)) { + MS_EXCEPTION(ArgumentError) << "Index is out of the size of anf_node inputs."; + } + + auto cnode = anf_node->cast(); + if (cnode == nullptr) { + return AnfAlgo::VisitKernel(anf_node, 0); + } else { + return AnfAlgo::VisitKernel(anf_node->cast()->input(index + 1), 0); + } +} + +std::vector>> GetInputIndex(const std::vector &node_list, + const std::vector &input_list) { + std::vector>> input_index; + for (size_t i = 0; i < input_list.size(); ++i) { + auto const &input = input_list[i]; + MS_EXCEPTION_IF_NULL(input); + bool found = false; + // using NodeUsersMap = std::unordered_map>>; + auto mng = input->func_graph()->manager(); + MS_EXCEPTION_IF_NULL(mng); + const NodeUsersMap &users = mng->node_users(); + auto input_users = users.find(input); + if (input_users == users.end() || input_users->second.empty()) { + MS_EXCEPTION(ArgumentError) << "Input [" << i << "][" << input->DebugString(2) << "] of [" + << input->func_graph()->ToString() << "] has no users."; + } + + for (auto const &input_user : input_users->second) { + for (auto const &anf_node : node_list) { + if (anf_node != input_user.first) { + continue; + } + + std::vector dyn_input_sizes; + auto prim = AnfAlgo::GetCNodePrimitive(anf_node); + MS_EXCEPTION_IF_NULL(prim); + if (prim->GetAttr(kAttrDynInputSizes) != nullptr) { + dyn_input_sizes = GetValue>(prim->GetAttr(kAttrDynInputSizes)); + } + + if (dyn_input_sizes.empty()) { + input_index.push_back(std::make_pair(anf_node, std::make_pair(IntToSize(input_user.second - 1), 0))); + found = true; + break; + } else { + int used_as_idx = input_user.second - 1; + int accum_idx = 0; + size_t dyn_i = 0; + for (; dyn_i < dyn_input_sizes.size(); ++dyn_i) { + accum_idx += dyn_input_sizes[dyn_i]; + if (used_as_idx < accum_idx) { + input_index.push_back(std::make_pair( + anf_node, std::make_pair(dyn_i, IntToSize(used_as_idx - (accum_idx - dyn_input_sizes[dyn_i]))))); + break; + } + } + if (dyn_i != dyn_input_sizes.size()) { + found = true; + break; + } + } + } + if (found) { + break; + } + } + + if (!found) { + MS_EXCEPTION(ArgumentError) << "Input [" << i << "][" << input->DebugString(2) << "] of [" + << input->func_graph()->ToString() << "] found no related kernel info."; + } + } + return input_index; +} + +std::vector> GetOutputIndex(const std::vector &node_list, + const std::vector &input_list, + const std::vector &output_list) { + std::vector> output_index; + for (size_t i = 0; i < output_list.size(); ++i) { + auto const &output = output_list[i]; + MS_EXCEPTION_IF_NULL(output); + bool found = false; + auto pree_node = AnfAlgo::VisitKernel(output, 0); + + auto pos = std::find(std::begin(node_list), std::end(node_list), pree_node.first); + if (pos 
!= std::end(node_list)) { + output_index.push_back(pree_node); + continue; + } + + auto ret = std::find(std::begin(input_list), std::end(input_list), pree_node.first); + if (ret != std::end(input_list)) { + output_index.push_back(std::make_pair(pree_node.first, 0)); + found = true; + } + + if (!found) { + MS_EXCEPTION(ArgumentError) << "Output [" << i << "][" << output->DebugString(2) << "] of [" + << output->func_graph()->ToString() << "] found no related kernel info."; + } + } + return output_index; +} + +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list) { + MS_EXCEPTION_IF_NULL(node_list); + + MS_EXCEPTION_IF_NULL(func_graph); + + std::vector node_lists = TopoSort(func_graph->get_return()); + for (auto const &node : node_lists) { + if (!AnfAlgo::IsRealKernel(node) || !node->isa()) { + continue; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + + if (IsValueNode(cnode->input(kAnfPrimitiveIndex))) { + node_list->push_back(node); + } + } +} + +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list, + std::vector *input_list, std::vector *output_list) { + MS_EXCEPTION_IF_NULL(node_list); + MS_EXCEPTION_IF_NULL(input_list); + MS_EXCEPTION_IF_NULL(output_list); + MS_EXCEPTION_IF_NULL(func_graph); + + GetValidKernelNodes(func_graph, node_list); + + auto parameters = func_graph->parameters(); + input_list->insert(input_list->begin(), parameters.begin(), parameters.end()); + + auto func_output = func_graph->output(); + MS_EXCEPTION_IF_NULL(func_output); + if (func_output->isa()) { + // multi output. + auto cnode = func_output->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto input0 = cnode->input(kAnfPrimitiveIndex); + MS_EXCEPTION_IF_NULL(input0); + if (IsPrimitive(input0, prim::kPrimMakeTuple)) { + for (size_t input_idx = 1; input_idx < cnode->inputs().size(); ++input_idx) { + auto input_node = cnode->input(input_idx); + MS_EXCEPTION_IF_NULL(input_node); + output_list->push_back(AnfAlgo::VisitKernel(input_node, 0).first); + } + } else { + // single output. + output_list->push_back(AnfAlgo::VisitKernel(func_output, 0).first); + } + } else { + // single output. + output_list->push_back(AnfAlgo::VisitKernel(func_output, 0).first); + } +} + +bool GetInputTensorValue(const AnfNodePtr &anf_node, size_t input_idx, nlohmann::json *const node_json) { + MS_EXCEPTION_IF_NULL(anf_node); + MS_EXCEPTION_IF_NULL(node_json); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (input_idx + 1 >= cnode->size()) { + MS_EXCEPTION(ArgumentError) << "input_idx [" << input_idx << "] is out of index of inputs of [" + << cnode->inputs().size() << "][" << cnode->DebugString() << "]"; + } + + auto input_node = cnode->input(input_idx + 1); + if (!IsValueNode(input_node)) { + return false; + } + + auto tensor = GetValueNode(input_node); + if (tensor == nullptr) { + return false; + } + + auto type_id = tensor->data_type(); + auto *data = tensor->data_c(); + MS_EXCEPTION_IF_NULL(data); + if (tensor->DataDim() > 1 || tensor->DataSize() != 1) { + // not const tensor. 
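In GetInputIndex above, when a consumer carries kAttrDynInputSizes, the flat position at which an input is used has to be translated into a (port, offset-within-port) pair by accumulating the per-port sizes. A small Python sketch of that translation, mirroring the accumulation loop; the example sizes are illustrative:

def map_flat_input_to_port(used_as_idx, dyn_input_sizes):
    """Map a flat input position (0-based) to (port index, offset inside that port),
    mirroring the accumulation loop in GetInputIndex when kAttrDynInputSizes is set."""
    accum = 0
    for port, size in enumerate(dyn_input_sizes):
        accum += size
        if used_as_idx < accum:
            return port, used_as_idx - (accum - size)
    raise ValueError("index {} out of range for {}".format(used_as_idx, dyn_input_sizes))

# e.g. a node whose first port is dynamic with 3 inputs, followed by one normal input:
assert map_flat_input_to_port(1, [3, 1]) == (0, 1)
assert map_flat_input_to_port(3, [3, 1]) == (1, 0)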
+ MS_LOG(WARNING) << "We take first value of tensor whose datasize != 1, [" << input_node->DebugString(2) << "]"; + } + + if (type_id == kFloat32->type_id()) { + float *val = static_cast(data); + MS_EXCEPTION_IF_NULL(val); + (*node_json)["value"] = val[0]; + MS_LOG(DEBUG) << "Value of tensor[" << cnode->DebugString() << "] is [float32][" << *val << "]."; + return true; + } else if (type_id == kFloat16->type_id()) { + float16 *val = static_cast(data); + MS_EXCEPTION_IF_NULL(val); + (*node_json)["value"] = static_cast(val[0]); + MS_LOG(INFO) << "Value of tensor[" << cnode->DebugString() << "] is [float16][" << *val << "]."; + return true; + } else if (type_id == kInt32->type_id()) { + int *val = static_cast(data); + MS_EXCEPTION_IF_NULL(val); + (*node_json)["value"] = val[0]; + MS_LOG(INFO) << "Value of tensor[" << cnode->DebugString() << "] is [int32][" << *val << "]."; + return true; + } + MS_LOG(ERROR) << "Unknown value type of tensor[" << cnode->DebugString() << "]"; + return false; +} + +void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector> *node_list) { + MS_EXCEPTION_IF_NULL(func_graph); + MS_EXCEPTION_IF_NULL(node_list); + auto output = func_graph->output(); + MS_EXCEPTION_IF_NULL(output); + if (AnfAlgo::IsRealKernel(output)) { + // single output. + node_list->push_back(std::make_pair(output, 0)); + return; + } else if (IsPrimitiveCNode(output, prim::kPrimMakeTuple)) { + auto output_cnode = output->cast(); + MS_EXCEPTION_IF_NULL(output_cnode); + // multi output. + auto &inputs = output_cnode->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + auto in_with_idx = AnfAlgo::VisitKernel(inputs[i], 0); + node_list->push_back(in_with_idx); + } + return; + } + MS_EXCEPTION(ArgumentError) << "Unknown output type: " << output->DebugString(2) + << " of graph: " << func_graph->ToString(); +} + +bool IsWeightBoundary(const AnfNodePtr &node) { + if (node->isa()) { + return true; + } + if (node->isa() && AnfAlgo::IsParameterWeight(node->cast())) { + return true; + } + return false; +} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/common_utils.h b/mindspore/ccsrc/kernel/common_utils.h index c93c7d0ac9..244d8e4e9a 100644 --- a/mindspore/ccsrc/kernel/common_utils.h +++ b/mindspore/ccsrc/kernel/common_utils.h @@ -20,9 +20,12 @@ #include #include #include +#include #include #include #include +#include +#include #include "kernel/kernel.h" #include "kernel/oplib/opinfo.h" #include "kernel/kernel_build_info.h" @@ -79,13 +82,11 @@ bool CheckCache(const std::string &kernel_name); KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &processor); KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor); TypeId DtypeToTypeId(const std::string &dtypes); -std::string Dtype2String(const std::string &dtypes); std::string Dtype2ShortType(const std::string &dtypes); std::string TypeId2String(TypeId type_id); size_t GetDtypeNbyte(const std::string &dtypes); bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr &op_info_ptr, Processor processor, std::vector> *const kernel_info_list); -bool IsAtomicNode(const CNodePtr &kernel_node); void SaveJsonInfo(const std::string &json_name, const std::string &info); std::string GetProcessor(const AnfNodePtr &anf_node); bool IsSameShape(const std::vector &shape_a, const std::vector &shape_b); @@ -94,6 +95,18 @@ void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGr size_t outer_dim); void ReduceSparseGradient(const 
SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, size_t outer_dim); +std::pair GetKernelInput(const AnfNodePtr &anf_node, size_t index); +std::vector>> GetInputIndex(const std::vector &node_list, + const std::vector &input_list); +std::vector> GetOutputIndex(const std::vector &node_list, + const std::vector &input_list, + const std::vector &output_list); +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list, + std::vector *input_list, std::vector *output_list); +void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector *node_list); +bool GetInputTensorValue(const AnfNodePtr &anf_node, size_t input_idx, nlohmann::json *const node_json); +void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector> *node_list); +bool IsWeightBoundary(const AnfNodePtr &node); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/kernel/kash/kernel_pack.cc b/mindspore/ccsrc/kernel/kash/kernel_pack.cc index 31f81d5d02..79e2ab9dbb 100644 --- a/mindspore/ccsrc/kernel/kash/kernel_pack.cc +++ b/mindspore/ccsrc/kernel/kash/kernel_pack.cc @@ -17,7 +17,7 @@ #include #include "mindspore/ccsrc/kernel/kernel.h" #include "kernel/kernel.h" -#include "kernel/akg/akgkernelbuild.h" +#include "kernel/akg/akg_kernel_build.h" #include "nlohmann/json.hpp" #include "securec/include/securec.h" #include "pipeline/parse/python_adapter.h" diff --git a/mindspore/ccsrc/kernel/kernel.h b/mindspore/ccsrc/kernel/kernel.h index 4adb3ea025..7bccce49c3 100644 --- a/mindspore/ccsrc/kernel/kernel.h +++ b/mindspore/ccsrc/kernel/kernel.h @@ -27,7 +27,7 @@ #include "utils/log_adapter.h" namespace mindspore { -enum KernelType : int { UNKNOWN_KERNEL_TYPE = 0, AUTO_DIFF_KERNEL, AICPU_KERNEL, RT_KERNEL, HCCL_KERNEL, TBE_KERNEL }; +enum KernelType : int { UNKNOWN_KERNEL_TYPE = 0, AKG_KERNEL, AICPU_KERNEL, RT_KERNEL, HCCL_KERNEL, TBE_KERNEL }; namespace kernel { diff --git a/mindspore/ccsrc/kernel/kernel_query.cc b/mindspore/ccsrc/kernel/kernel_query.cc index f96d0cbebf..6538c28765 100755 --- a/mindspore/ccsrc/kernel/kernel_query.cc +++ b/mindspore/ccsrc/kernel/kernel_query.cc @@ -21,6 +21,7 @@ #include "kernel/rts/rt_kernel_info.h" #include "kernel/hccl/hccl_kernel_metadata.h" #include "kernel/tbe/tbe_kernel_select/tbe_kernel_select.h" +#include "kernel/akg/akg_kernel_metadata.h" #include "session/anf_runtime_algorithm.h" namespace mindspore { @@ -59,10 +60,14 @@ void FilterInvalidKernelInfo(const CNodePtr &kernel_node, } } } // namespace -void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list) { + +void KernelQueryAll(const CNodePtr &kernel_node, + std::vector> *kernel_info_list) { MS_EXCEPTION_IF_NULL(kernel_node); MS_EXCEPTION_IF_NULL(kernel_info_list); + TbeMetadataInfo(kernel_node, kernel_info_list); + if (kernel_info_list->empty()) { AicpuMetadataInfo(kernel_node, kernel_info_list); if (!kernel_info_list->empty()) { @@ -82,6 +87,28 @@ void KernelQuery(const CNodePtr &kernel_node, std::vectorempty()) { MS_LOG(EXCEPTION) << "Op " << kernel_node->DebugString() << "kernel query fail!"; } +} + +void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list, + KernelType kernel_type) { + MS_EXCEPTION_IF_NULL(kernel_node); + MS_EXCEPTION_IF_NULL(kernel_info_list); + + std::string op_name = AnfAlgo::GetCNodeName(kernel_node); + + switch (kernel_type) { + case KernelType::AKG_KERNEL: + AkgMetadataInfo(kernel_node, kernel_info_list); + break; + default: + KernelQueryAll(kernel_node, kernel_info_list); + break; + } + + if 
(kernel_info_list->empty()) { + MS_EXCEPTION(NotExistsError) << "Op[" << kernel_node->DebugString() << "] kernel query fail!"; + } + // check output FilterInvalidKernelInfo(kernel_node, kernel_info_list); } diff --git a/mindspore/ccsrc/kernel/kernel_query.h b/mindspore/ccsrc/kernel/kernel_query.h index fe8696a919..257b0cf073 100644 --- a/mindspore/ccsrc/kernel/kernel_query.h +++ b/mindspore/ccsrc/kernel/kernel_query.h @@ -25,7 +25,8 @@ namespace mindspore { namespace kernel { -void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list); +void KernelQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list, + KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE); void AICPUQuery(const CNodePtr &kernel_node, std::vector> *kernel_info_list); bool IsSupportedByAICPU(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info); bool IsSupportedByAICore(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info); diff --git a/mindspore/ccsrc/kernel/oplib/oplib.cc b/mindspore/ccsrc/kernel/oplib/oplib.cc index 42ec534ae0..35bc407026 100644 --- a/mindspore/ccsrc/kernel/oplib/oplib.cc +++ b/mindspore/ccsrc/kernel/oplib/oplib.cc @@ -272,8 +272,7 @@ std::shared_ptr OpLib::FindOp(const std::string &op_name, OpImplyType im auto context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context); bool is_gpu = (context->device_target() == kGPUDevice); - if ((is_gpu && (imply_type == kTBE || imply_type == kAICPU)) || - (!is_gpu && (imply_type != kTBE && imply_type != kAICPU))) { + if (is_gpu && (imply_type == kTBE || imply_type == kAICPU)) { MS_LOG(ERROR) << "FindOp failed: opname: " << op_name << ", imply_type: " << ImplTypeToStr(imply_type) << ", current op num: " << op_info_.size(); return nullptr; diff --git a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc index f1e827d6dd..3007280a14 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_adapter.cc @@ -347,7 +347,7 @@ static int TypeStrToDstType(const std::string &type_str) { ret = 4; } else if (type_str == "UInt64") { ret = 10; - } else if (type_str == "Bool_") { + } else if (type_str == "Bool") { ret = 12; } else { MS_LOG(INFO) << "Error type str is invailed: " << type_str; diff --git a/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc b/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc index ee9be22120..90c5557253 100644 --- a/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc +++ b/mindspore/ccsrc/kernel/tbe/tbe_convert_utils.cc @@ -51,7 +51,7 @@ const std::map type_id_str_maps = { const std::map type_str_maps = { {"Float32", "float32"}, {"Float16", "float16"}, {"Int8", "int8"}, {"Int16", "int16"}, {"UInt16", "uint16"}, {"UInt8", "uint8"}, {"Int32", "int32"}, {"UInt32", "uint32"}, - {"Int64", "int64"}, {"UInt64", "uint64"}, {"Bool_", "int8"}, {"Float64", "float64"}, + {"Int64", "int64"}, {"UInt64", "uint64"}, {"Bool", "int8"}, {"Float64", "float64"}, }; const std::unordered_map type_nbyte_maps = { diff --git a/mindspore/ccsrc/operator/composite/composite.cc b/mindspore/ccsrc/operator/composite/composite.cc index 221d2b9aac..75532b9fbd 100644 --- a/mindspore/ccsrc/operator/composite/composite.cc +++ b/mindspore/ccsrc/operator/composite/composite.cc @@ -334,8 +334,8 @@ ArgsPairList HyperMap::Harmonize(const FuncGraphPtr &func_graph, const ArgsPairL FuncGraphPtr HyperMap::GenerateFromTypes(const TypePtrList &args_spec_list) { FuncGraphPtr ptrGraph = std::make_shared(); - 
ptrGraph->set_flags(FUNC_GRAPH_FLAG_CORE, true); - ptrGraph->set_flags(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); ptrGraph->debug_info()->set_name("hyper_map"); AnfNodePtr ptrFnArg = nullptr; @@ -389,7 +389,7 @@ FuncGraphPtr Tail::GenerateTupleFuncGraph(const abstract::AbstractTuplePtr &a_tu MS_EXCEPTION_IF_NULL(a_tuple); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); ret->debug_info()->set_name("tail"); AnfNodePtr ptrTup = ret->add_parameter(); @@ -409,7 +409,7 @@ FuncGraphPtr Tail::GenerateListFuncGraph(const abstract::AbstractListPtr &a_list MS_EXCEPTION_IF_NULL(a_list); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); ret->debug_info()->set_name("tail"); AnfNodePtr ptrList = ret->add_parameter(); @@ -481,10 +481,10 @@ FuncGraphPtr MakeTupleGradient::GenerateFuncGraph(const AbstractBasePtrList &arg grads.push_back(b->NewCNode({NewValueNode(prim::kPrimTupleGetItem), dout, NewValueNode(i)})); } - b->set_flags(FUNC_GRAPH_FLAG_CORE, true); + b->set_flag(FUNC_GRAPH_FLAG_CORE, true); b->set_output(b->NewCNode(grads)); - fg->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fg->set_flag(FUNC_GRAPH_FLAG_CORE, true); fg->set_output(fg->NewCNode({NewValueNode(prim::kPrimMakeTuple), out, NewValueNode(b)})); (void)fg->transforms().emplace("primal", FuncGraphTransform(prim::kPrimMakeTuple)); return fg; @@ -504,7 +504,7 @@ FuncGraphPtr GradOperation::GetGrad(AnfNodePtr node, const AnfNodePtr &weights, const std::vector ¶ms_list, const std::vector &args, bool applyJ) { FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); auto weights_node = weights; if (weights == nullptr && !args.empty()) { @@ -625,7 +625,7 @@ FuncGraphPtr GradOperation::GenerateFuncGraph(const AbstractBasePtrList &args_sp std::ostringstream ss; ss << "grad{" << nparam << "}"; - dfBuilder->set_flags(FUNC_GRAPH_FLAG_CORE, true); + dfBuilder->set_flag(FUNC_GRAPH_FLAG_CORE, true); dfBuilder->debug_info()->set_name(ss.str()); ParameterPtr param_graph = dfBuilder->add_parameter(); @@ -671,7 +671,7 @@ FuncGraphPtr ListMap::GenerateFuncGraph(const AbstractBasePtrList &args_spec_lis } FuncGraphPtr fg_ptr = std::make_shared(); - fg_ptr->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fg_ptr->set_flag(FUNC_GRAPH_FLAG_CORE, true); fg_ptr->debug_info()->set_name("list_map"); AnfNodePtr fn = fg_ptr->add_parameter(); @@ -741,7 +741,7 @@ void ListMap::MakeCond(const std::vector &lists, const FuncGraphPtr // cond = reduce(lambda a, b: g.apply(P.bool_and, a, b), hasnexts) FuncGraphPtr fgtrue_ptr = std::make_shared(); fgtrue_ptr->debug_info()->set_name("ftrue"); - fgtrue_ptr->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fgtrue_ptr->set_flag(FUNC_GRAPH_FLAG_CORE, true); CNodePtr fgtrue_output_cnode = fgtrue_ptr->NewCNode({NewValueNode(fgnext_ptr), fn, resl}); auto inputs = fgtrue_output_cnode->inputs(); @@ -751,7 +751,7 @@ void ListMap::MakeCond(const std::vector &lists, const FuncGraphPtr FuncGraphPtr fgfalse_ptr = std::make_shared(); fgfalse_ptr->debug_info()->set_name("ffalse"); - fgfalse_ptr->set_flags(FUNC_GRAPH_FLAG_CORE, true); + fgfalse_ptr->set_flag(FUNC_GRAPH_FLAG_CORE, true); fgfalse_ptr->set_output(resl); AnfNodePtr output_cnode = fg_ptr->NewCNode({NewValueNode(prim::kPrimSwitch), 
NewValueNode(std::string("cond")), @@ -808,7 +808,7 @@ FuncGraphPtr TupleAdd::GenerateFuncGraph(const AbstractBasePtrList &args_spec_li } FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr p_tup_a = ret->add_parameter(); AnfNodePtr p_tup_b = ret->add_parameter(); @@ -912,7 +912,7 @@ FuncGraphPtr TupleSlice::GenerateFuncGraph(const AbstractBasePtrList &args_spec_ GenerateTupleSliceParameter(tuple, slice, &start_index, &stop_index, &step_value); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr p_tuple = ret->add_parameter(); (void)ret->add_parameter(); @@ -941,7 +941,7 @@ FuncGraphPtr TupleGetItemTensor::GenerateFuncGraph(const AbstractBasePtrList &ar AbstractBasePtrList branches = branches_abs->elements(); if (branches.size() > 0 && branches[0] != nullptr && branches[0]->isa()) { FuncGraphPtr ret_graph = std::make_shared(); - ret_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr functions = ret_graph->add_parameter(); auto index = ret_graph->add_parameter(); diff --git a/mindspore/ccsrc/operator/composite/do_signature.cc b/mindspore/ccsrc/operator/composite/do_signature.cc index 283afe5d5b..3569662d29 100644 --- a/mindspore/ccsrc/operator/composite/do_signature.cc +++ b/mindspore/ccsrc/operator/composite/do_signature.cc @@ -304,7 +304,7 @@ FuncGraphPtr DoSignatureMetaFuncGraph::GenerateFuncGraph(const AbstractBasePtrLi } auto new_cnode = BuildNewCNode(func_graph, name_, function_, args_spec_list, func_graph->parameters()); func_graph->set_output(new_cnode); - func_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + func_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); return func_graph; } } // namespace prim diff --git a/mindspore/ccsrc/operator/composite/list_append_operation.cc b/mindspore/ccsrc/operator/composite/list_append_operation.cc index b5a4fc626e..236a5b7062 100644 --- a/mindspore/ccsrc/operator/composite/list_append_operation.cc +++ b/mindspore/ccsrc/operator/composite/list_append_operation.cc @@ -35,7 +35,7 @@ FuncGraphPtr ListAppend::GenerateFuncGraph(const abstract::AbstractBasePtrList & MS_EXCEPTION_IF_NULL(arg0_list); FuncGraphPtr ret = std::make_shared(); - ret->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret->set_flag(FUNC_GRAPH_FLAG_CORE, true); ret->debug_info()->set_name("append"); AnfNodePtr arg0_node = ret->add_parameter(); diff --git a/mindspore/ccsrc/operator/composite/map.cc b/mindspore/ccsrc/operator/composite/map.cc index 6752cfe078..a054da5f4d 100644 --- a/mindspore/ccsrc/operator/composite/map.cc +++ b/mindspore/ccsrc/operator/composite/map.cc @@ -51,8 +51,8 @@ AnfNodePtr Map::FullMakeLeaf(const FuncGraphPtr &func_graph, const AnfNodePtr &f FuncGraphPtr Map::GenerateLeafFunc(const size_t &args_size) { // Generate func for leaf nodes FuncGraphPtr ptrGraph = std::make_shared(); - ptrGraph->set_flags(FUNC_GRAPH_FLAG_CORE, true); - ptrGraph->set_flags(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); ptrGraph->debug_info()->set_name("map"); AnfNodePtr ptrFnArg = nullptr; if (fn_leaf_ == nullptr) { @@ -237,8 +237,8 @@ AnfNodePtr Map::Make(const FuncGraphPtr &func_graph, const AnfNodePtr &fn_arg, c FuncGraphPtr Map::GenerateFromTypes(const TypePtrList &args_spec_list) { FuncGraphPtr ptrGraph = std::make_shared(); - 
ptrGraph->set_flags(FUNC_GRAPH_FLAG_CORE, true); - ptrGraph->set_flags(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true); + ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true); ptrGraph->debug_info()->set_name("map"); AnfNodePtr ptrFnArg = nullptr; diff --git a/mindspore/ccsrc/operator/composite/unpack_call.cc b/mindspore/ccsrc/operator/composite/unpack_call.cc index 6363d495c5..3993d41597 100644 --- a/mindspore/ccsrc/operator/composite/unpack_call.cc +++ b/mindspore/ccsrc/operator/composite/unpack_call.cc @@ -51,7 +51,7 @@ FuncGraphPtr UnpackCall::GenerateFuncGraph(const AbstractBasePtrList &args_spec_ (void)abstract::CheckArg(op_name, args_spec_list, 0); auto ret_graph = std::make_shared(); - ret_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); AnfNodePtr fnNode = ret_graph->add_parameter(); std::vector elems; diff --git a/mindspore/ccsrc/operator/composite/zip_operation.cc b/mindspore/ccsrc/operator/composite/zip_operation.cc index 4d34163f28..33e21da044 100644 --- a/mindspore/ccsrc/operator/composite/zip_operation.cc +++ b/mindspore/ccsrc/operator/composite/zip_operation.cc @@ -57,7 +57,7 @@ FuncGraphPtr ZipOperation::GenerateFuncGraph(const AbstractBasePtrList &args_spe return (x->cast()->size() < y->cast()->size()); }); FuncGraphPtr ret_graph = std::make_shared(); - ret_graph->set_flags(FUNC_GRAPH_FLAG_CORE, true); + ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true); for (size_t idx = 0; idx < args_spec_list.size(); idx++) { (void)ret_graph->add_parameter(); } diff --git a/mindspore/ccsrc/operator/ops.cc b/mindspore/ccsrc/operator/ops.cc index cae61f64d0..f86cbd7fd2 100755 --- a/mindspore/ccsrc/operator/ops.cc +++ b/mindspore/ccsrc/operator/ops.cc @@ -50,6 +50,12 @@ const PrimitivePtr kPrimBoolNot = std::make_shared("bool_not"); const PrimitivePtr kPrimBoolAnd = std::make_shared("bool_and"); const PrimitivePtr kPrimBoolOr = std::make_shared("bool_or"); const PrimitivePtr kPrimBoolEq = std::make_shared("bool_eq"); +const PrimitivePtr kPrimGreater = std::make_shared("Greater"); +const PrimitivePtr kPrimGreaterEqual = std::make_shared("GreaterEqual"); +const PrimitivePtr kPrimLess = std::make_shared("Less"); +const PrimitivePtr kPrimLessEqual = std::make_shared("LessEqual"); +const PrimitivePtr kPrimEqual = std::make_shared("Equal"); +const PrimitivePtr kPrimNotEqual = std::make_shared("NotEqual"); // Type introspection const PrimitivePtr kPrimTypeOf = std::make_shared("typeof"); @@ -166,17 +172,20 @@ const PrimitivePtr kPrimMul = std::make_shared("Mul"); const PrimitivePtr kPrimMinimum = std::make_shared("Minimum"); const PrimitivePtr kPrimMaximum = std::make_shared("Maximum"); const PrimitivePtr kPrimSquare = std::make_shared("Square"); -const PrimitivePtr kPrimEqual = std::make_shared("Equal"); -const PrimitivePtr kPrimLess = std::make_shared("Less"); -const PrimitivePtr kPrimLessEqual = std::make_shared("LessEqual"); const PrimitivePtr kPrimCumSum = std::make_shared("CumSum"); const PrimitivePtr kPrimCumProd = std::make_shared("CumProd"); const PrimitivePtr kPrimSubscalar = std::make_shared("Subscalar"); const PrimitivePtr kPrimInplaceAdd = std::make_shared("InplaceAdd"); const PrimitivePtr kPrimInplaceSub = std::make_shared("InplaceSub"); +const PrimitivePtr kPrimPow = std::make_shared("Pow"); +const PrimitivePtr kPrimRealDiv = std::make_shared("RealDiv"); +const PrimitivePtr kPrimSqrt = std::make_shared("Sqrt"); +const PrimitivePtr kPrimReciprocal = std::make_shared("Reciprocal"); 
+const PrimitivePtr kPrimExpandDims = std::make_shared("ExpandDims"); // NN const PrimitivePtr kPrimFlatten = std::make_shared("Flatten"); +const PrimitivePtr kPrimSoftmax = std::make_shared("Softmax"); const PrimitivePtr kPrimLogSoftmax = std::make_shared("LogSoftmax"); const PrimitivePtr kPrimLogSoftmaxGrad = std::make_shared("LogSoftmaxGrad"); const PrimitivePtr kPrimTanh = std::make_shared("Tanh"); @@ -253,6 +262,7 @@ const PrimitivePtr kPrimInDict = std::make_shared("in_dict"); const PrimitivePtr kPrimNotInDict = std::make_shared("not_in_dict"); const PrimitivePtr kPrimMixedPrecisionCast = std::make_shared("mixed_precision_cast"); const PrimitivePtr kPrimIsConsant = std::make_shared("is_constant"); +const PrimitivePtr kPrimEquivFormat = std::make_shared("EquivFormat"); // Comm ops const PrimitivePtr kPrimMirror = std::make_shared("_MirrorOperator"); diff --git a/mindspore/ccsrc/operator/ops.h b/mindspore/ccsrc/operator/ops.h index 3b9ac01089..65327cf407 100755 --- a/mindspore/ccsrc/operator/ops.h +++ b/mindspore/ccsrc/operator/ops.h @@ -59,6 +59,12 @@ extern const PrimitivePtr kPrimBoolNot; extern const PrimitivePtr kPrimBoolAnd; extern const PrimitivePtr kPrimBoolOr; extern const PrimitivePtr kPrimBoolEq; +extern const PrimitivePtr kPrimGreater; +extern const PrimitivePtr kPrimGreaterEqual; +extern const PrimitivePtr kPrimLess; +extern const PrimitivePtr kPrimLessEqual; +extern const PrimitivePtr kPrimEqual; +extern const PrimitivePtr kPrimNotEqual; // Type introspection extern const PrimitivePtr kPrimTypeOf; @@ -157,6 +163,10 @@ extern const PrimitivePtr KPrimTransData; extern const PrimitivePtr kPrimNMSWithMask; extern const PrimitivePtr kPrimPad; extern const PrimitivePtr kPrimArgMaxWithValue; +extern const PrimitivePtr kPrimRealDiv; +extern const PrimitivePtr kPrimSqrt; +extern const PrimitivePtr kPrimReciprocal; +extern const PrimitivePtr kPrimExpandDims; // Maths extern const PrimitivePtr kPrimTensorAdd; @@ -183,9 +193,11 @@ extern const PrimitivePtr kPrimCumProd; extern const PrimitivePtr kPrimSubscalar; extern const PrimitivePtr kPrimInplaceAdd; extern const PrimitivePtr kPrimInplaceSub; +extern const PrimitivePtr kPrimPow; // NN extern const PrimitivePtr kPrimFlatten; +extern const PrimitivePtr kPrimSoftmax; extern const PrimitivePtr kPrimLogSoftmax; extern const PrimitivePtr kPrimLogSoftmaxGrad; extern const PrimitivePtr kPrimApplyCenteredRMSProp; @@ -263,6 +275,7 @@ extern const PrimitivePtr kPrimInDict; extern const PrimitivePtr kPrimNotInDict; extern const PrimitivePtr kPrimMixedPrecisionCast; extern const PrimitivePtr kPrimIsConsant; +extern const PrimitivePtr kPrimEquivFormat; // Comm ops extern const PrimitivePtr kPrimAllReduce; diff --git a/mindspore/ccsrc/optimizer/ad/dfunctor.cc b/mindspore/ccsrc/optimizer/ad/dfunctor.cc index cde90db346..e192f3912e 100644 --- a/mindspore/ccsrc/optimizer/ad/dfunctor.cc +++ b/mindspore/ccsrc/optimizer/ad/dfunctor.cc @@ -45,10 +45,19 @@ DFunctor::DFunctor(const FuncGraphPtr &primal_graph, const pipeline::ResourceBas : primal_graph_(primal_graph), resources_(resources), need_cut_(false), is_top_(false) { TraceManager::DebugTrace(std::make_shared(primal_graph->debug_info())); k_graph_ = std::make_shared(); + if (primal_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + std::string grad_op_name = GetValue(primal_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + k_graph_->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(grad_op_name)); + } TraceManager::EndTrace(); TraceManager::DebugTrace(std::make_shared(primal_graph->debug_info())); 
tape_ = std::make_shared(); + // Add "_Grad" postfix + if (primal_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + std::string grad_op_name = GetValue(primal_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) + "_Grad"; + tape_->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(grad_op_name)); + } TraceManager::EndTrace(); dout_ = tape_->add_parameter(); @@ -368,7 +377,7 @@ FuncGraphPtr DFunctor::KUserDefined(const FuncGraphPtr &primal) { (void)primal->transforms().insert(std::make_pair("grad", FuncGraphTransform(fg))); (void)fg->transforms().insert(std::make_pair("primal", FuncGraphTransform(primal))); // Reset defer_inline to enable successive inlining - primal->set_flags(FUNC_GRAPH_FLAG_DEFER_INLINE, false); + primal->set_flag(FUNC_GRAPH_FLAG_DEFER_INLINE, false); auto functor = std::make_shared(primal, resources_); functor->Init(); diff --git a/mindspore/ccsrc/optimizer/ad/grad.cc b/mindspore/ccsrc/optimizer/ad/grad.cc index 43d2a66ad2..d141dc6eea 100644 --- a/mindspore/ccsrc/optimizer/ad/grad.cc +++ b/mindspore/ccsrc/optimizer/ad/grad.cc @@ -37,7 +37,7 @@ FuncGraphPtr Grad(const FuncGraphPtr &func_graph, const pipeline::ResourceBasePt auto multi_graph_sink = [&func_graph](const FuncGraphPtr &f) { if (MsContext::GetInstance()->is_multi_graph_sink()) { if (func_graph->has_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES)) { - f->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + f->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } } }; diff --git a/mindspore/ccsrc/optimizer/clean.cc b/mindspore/ccsrc/optimizer/clean.cc index fafe26e2ed..6a54597282 100644 --- a/mindspore/ccsrc/optimizer/clean.cc +++ b/mindspore/ccsrc/optimizer/clean.cc @@ -78,7 +78,10 @@ AnfNodePtr ConvertGetAttrToTupleGetItem(const CNodePtr &node) { MS_EXCEPTION_IF_NULL(cons); auto dt = data->abstract(); - MS_EXCEPTION_IF_NULL(dt); + if (dt == nullptr) { + return nullptr; + } + if (!dt->isa()) { MS_LOG(EXCEPTION) << "First parameter of getattr is not AbstractClass, but " << dt->type_name() << "."; } diff --git a/mindspore/ccsrc/optimizer/graph_kernel_reuse.cc b/mindspore/ccsrc/optimizer/graph_kernel_reuse.cc new file mode 100644 index 0000000000..dc20ad925e --- /dev/null +++ b/mindspore/ccsrc/optimizer/graph_kernel_reuse.cc @@ -0,0 +1,157 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "optimizer/graph_kernel_reuse.h" +#include +#include +#include +#include "./common.h" +#include "utils/graph_utils.h" + +namespace mindspore { +/* namespace to support opt */ +namespace opt { + +bool GraphKernelReuse::CompareNode(const AnfNodePtr a, const AnfNodePtr b) { + if (a->abstract() && b->abstract()) { + auto a_type = a->abstract()->GetTypeTrack(); + auto b_type = b->abstract()->GetTypeTrack(); + + if (a_type != b_type) { + return false; + } + + auto a_shape = a->abstract()->GetShapeTrack(); + auto b_shape = b->abstract()->GetShapeTrack(); + if (a_shape != nullptr && a_shape == b_shape) { + return true; + } + + if (a_shape != nullptr && b_shape != nullptr && a_shape->isa() && + b_shape->isa()) { + return a_shape->cast()->shape() == b_shape->cast()->shape(); + } + } + return false; +} + +bool GraphKernelReuse::DoReplace(const FuncGraphManagerPtr manager) { + bool changed = false; + auto fgs = manager->func_graphs(); + for (FuncGraphPtr &fg : fgs) { + if (!fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + continue; + } + std::string key = GetValue(fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + if (graph_kernel_ops.find(key) != graph_kernel_ops.end()) { + if (find(graph_kernel_ops[key].begin(), graph_kernel_ops[key].end(), fg) == graph_kernel_ops[key].end()) { + FuncGraphPtr new_fg = nullptr; + for (auto &cfg : graph_kernel_ops[key]) { + // If two graphs have different size then continue + auto fg_topos = TopoSort(fg->get_return()); + auto cfg_topos = TopoSort(cfg->get_return()); + if (fg_topos.size() != cfg_topos.size()) { + continue; + } + + // Compare const tensor + bool has_same = true; + for (size_t i = 0; i < fg_topos.size(); ++i) { + if (IsValueNode(fg_topos[i])) { + if (!IsValueNode(cfg_topos[i])) { + has_same = false; + break; + } + + auto tensor1 = GetValueNode(fg_topos[i]); + auto tensor2 = GetValueNode(cfg_topos[i]); + if (!tensor1->ValueEqual(*tensor2)) { + has_same = false; + break; + } + } + } + + if (!has_same) { + continue; + } + + auto fg_input = fg->parameters(); + auto cfg_input = cfg->parameters(); + if (fg_input.size() != cfg_input.size()) { + continue; + } + // Compare input + for (size_t i = 0; i < fg_input.size(); ++i) { + if (!CompareNode(fg_input[i], cfg_input[i])) { + has_same = false; + break; + } + } + if (!has_same) { + continue; + } + + // Compare output + if (!CompareNode(fg->output(), cfg->output())) { + continue; + } + + // Find reusable fg + new_fg = cfg; + break; + } + + if (new_fg != nullptr) { + // Replace current fg with existing fg + auto users = fg->func_graph_cnodes_index(); + for (auto &iter : users) { + auto cnode = iter.first->first->cast(); + auto new_input = cnode->inputs(); + auto main_graph = cnode->func_graph(); + MS_EXCEPTION_IF_NULL(main_graph); + if (IsPrimitiveCNode(cnode, prim::kPrimPartial)) { + new_input[1] = NewValueNode(new_fg); + } else { + new_input[0] = NewValueNode(new_fg); + } + auto new_cnode = main_graph->NewCNode(new_input); + manager->Replace(iter.first->first, new_cnode); + changed = true; + } + + } else { + // Add current fg to map + graph_kernel_ops[key].push_back(fg); + } + } + } else { + graph_kernel_ops[key] = {fg}; + } + } + + return changed; +} + +bool GraphKernelReuse::ReuseGraphKernel(const FuncGraphPtr root, const FuncGraphManagerPtr manager) { + MS_EXCEPTION_IF_NULL(manager); + manager->AddFuncGraph(root); + + return DoReplace(manager); +} + +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/graph_kernel_reuse.h 
b/mindspore/ccsrc/optimizer/graph_kernel_reuse.h new file mode 100644 index 0000000000..ed5cc93d18 --- /dev/null +++ b/mindspore/ccsrc/optimizer/graph_kernel_reuse.h @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_OPTIMIZER_GRAPH_KERNEL_OP_REUSE_H +#define MINDSPORE_CCSRC_OPTIMIZER_GRAPH_KERNEL_OP_REUSE_H + +#include +#include +#include +#include + +#include "optimizer/optimizer.h" + +namespace mindspore { +namespace opt { + +// Common subexpression elimination. +class GraphKernelReuse { + public: + GraphKernelReuse() : count(0) {} + virtual ~GraphKernelReuse() = default; + + bool operator()(const FuncGraphPtr &root, const OptimizerPtr &optimizer) { + bool chg = ReuseGraphKernel(root, optimizer->resource()->manager()); + return chg; + } + + bool CompareNode(const AnfNodePtr a, const AnfNodePtr other); + bool DoReplace(const FuncGraphManagerPtr manager); + + bool ReuseGraphKernel(const FuncGraphPtr root, const FuncGraphManagerPtr manager); + + private: + std::unordered_map> graph_kernel_ops; + int count; +}; + +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_OPTIMIZER_GRAPH_KERNEL_OP_REUSE_H diff --git a/mindspore/ccsrc/optimizer/irpass.cc b/mindspore/ccsrc/optimizer/irpass.cc index 5daf080492..72177ccb06 100644 --- a/mindspore/ccsrc/optimizer/irpass.cc +++ b/mindspore/ccsrc/optimizer/irpass.cc @@ -41,6 +41,8 @@ #include "optimizer/irpass/incorporate_call.h" #include "optimizer/irpass/grad_var_prepare.h" #include "optimizer/irpass/param_replace.h" +#include "optimizer/irpass/mark_interface_fusion.h" +#include "optimizer/opt.h" namespace mindspore { namespace opt { @@ -48,7 +50,7 @@ namespace irpass { OptimizeIRPassLib::OptimizeIRPassLib() { arithmetic_simplify_ = MakeSubstitution(ArithmeticSimplify(), "arithmetic_simplify", {prim::kPrimScalarAdd, prim::kPrimScalarMul, prim::kPrimTensorAdd, - prim::kPrimIdentity, prim::kPrimMomentum, prim::kPrimMul}); + prim::kPrimIdentity, prim::kPrimMomentum, prim::kPrimMul, prim::kPrimPow}); special_op_eliminate_ = MakeSubstitution(SpecialOpEliminater(), "special_op_eliminate", {prim::kPrimInsertGradientOf, prim::kPrimStopGradient, prim::kPrimHookBackward, @@ -90,7 +92,6 @@ OptimizeIRPassLib::OptimizeIRPassLib() { replace_refkey_by_param_ = MakeSubstitution(ReplaceRefkeyByParam(), "replace_refkey_by_param", IsValueNode, opt::FORCE_RENORM); replace_old_param_ = MakeSubstitution(ReplaceOldParam(), "replace_old_param", IsParam); - // Gradient transforms expand_jprim_ = MakeSubstitution(ExpandJPrim(), "expand_jprim", prim::kPrimJ); minmaximum_grad_ = MakeSubstitution(MinMaximumGrad(), "minmaximum_grad", prim::kPrimTupleGetItem); @@ -115,6 +116,8 @@ OptimizeIRPassLib::OptimizeIRPassLib() { // Incorporation incorporate_getitem_set_ = MakeSubstitution(IncorporateGetitemSet(), "incorporate_getitem_set", prim::kPrimTupleGetItem); + incorporate_getitem_from_param_ = + MakeSubstitution(IncorporateGetitemFromParam(), 
"incorporate_getitem_from_param", IsCNodeGraphKernel); incorporate_call_ = MakeSubstitution(IncorporateCall(), "incorporate_call", IsCNodeDup); incorporate_call_switch_ = MakeSubstitution(IncorporateCallSwitch(), "incorporate_call_switch", IsCNodeDup); @@ -124,6 +127,17 @@ OptimizeIRPassLib::OptimizeIRPassLib() { // Convert print_tuple_wrapper_ = MakeSubstitution(PrintTupleWrapper(), "print_tuple_wrapper", prim::kPrimPrint); + + // Unused parameter eliminate + unused_parameter_eliminate_ = + MakeSubstitution(UnusedParasEliminater(), "unused_parameter_eliminate", IsCNodeGraphKernel); + unused_output_eliminate_ = MakeSubstitution(UnusedOutputEliminater(), "unused_output_eliminate", IsCNodeGraphKernel); + + // AddN eliminate + addn_eliminate_ = MakeSubstitution(AddNEliminater(), "addn_eliminate", IsCNodeGraphKernel); + + // Mark interface fusion + mark_interface_fusion_ = MakeSubstitution(MarkInterfaceFusion(), "mark_interface_fusion", prim::kPrimSelect); } ResolveIRPassLib::ResolveIRPassLib() { diff --git a/mindspore/ccsrc/optimizer/irpass.h b/mindspore/ccsrc/optimizer/irpass.h index ac0c6eda6f..5e1550c883 100644 --- a/mindspore/ccsrc/optimizer/irpass.h +++ b/mindspore/ccsrc/optimizer/irpass.h @@ -84,6 +84,7 @@ class OptimizeIRPassLib { // Incorporation SubstitutionPtr incorporate_getitem_set_; + SubstitutionPtr incorporate_getitem_from_param_; SubstitutionPtr incorporate_call_; SubstitutionPtr incorporate_call_switch_; @@ -92,6 +93,16 @@ class OptimizeIRPassLib { // Convert SubstitutionPtr print_tuple_wrapper_; + + // Unused parameter eliminate + SubstitutionPtr unused_parameter_eliminate_; + SubstitutionPtr unused_output_eliminate_; + + // AddN eliminate + SubstitutionPtr addn_eliminate_; + + // Fusion + SubstitutionPtr mark_interface_fusion_; }; // the collection of irpass for resolve action @@ -145,6 +156,23 @@ inline bool IsCNodeGraph(const AnfNodePtr &node) { return IsValueNode(inp0); } +// Check if CNode Input 0 is Func Graph of graph kernel. 
+inline bool IsCNodeGraphKernel(const AnfNodePtr &node) { + if (node == nullptr || !node->isa<CNode>()) { + return false; + } + + auto inp0 = node->cast<CNodePtr>()->input(0); + if (IsValueNode<FuncGraph>(inp0)) { + auto fg = GetValueNode<FuncGraphPtr>(inp0); + if (fg == nullptr) { + return false; + } + return fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + } + return false; +} + // Check if CNode Input 0 is CNode inline bool IsCNodeDup(const AnfNodePtr &node) { if (node == nullptr || !node->isa<CNode>()) { return false; } diff --git a/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h b/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h index 1a2ba87165..1836a88dbc 100644 --- a/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h +++ b/mindspore/ccsrc/optimizer/irpass/arithmetic_simplify.h @@ -83,6 +83,216 @@ class MultiplyByZeroOrOne : public AnfVisitor { AnfNodePtr x_{nullptr}; }; + +// Support class used for checking if all values of a Tensor are equal to `check_value_` +// Supported data types: double, float/float32, int/int32 +class CheckTensorConstant { + public: + explicit CheckTensorConstant(int _check_value = 0) : check_value_(_check_value) {} + ~CheckTensorConstant() = default; + bool IsTensorConstant(const ValuePtr &value) { + if (!value->isa<tensor::Tensor>()) { + return false; + } + auto tensor_ptr = dyn_cast<tensor::Tensor>(value); + TypeId tensor_type = tensor_ptr->Dtype()->type_id(); + if ((tensor_type == TypeId::kNumberTypeFloat32) || (tensor_type == TypeId::kNumberTypeFloat)) { + float *data2 = reinterpret_cast<float *>(tensor_ptr->data_c()); + for (int i = 0; i < tensor_ptr->DataSize(); i++) { + if (fabs(data2[i] - check_value_) > FLT_EPSILON) { + return false; + } + } + return true; + } else if (tensor_type == TypeId::kNumberTypeFloat64) { + double *data2 = reinterpret_cast<double *>(tensor_ptr->data_c()); + for (int i = 0; i < tensor_ptr->DataSize(); i++) { + if (fabs(data2[i] - check_value_) > DBL_EPSILON) { + return false; + } + } + return true; + } else if ((tensor_type == TypeId::kNumberTypeInt32) || (tensor_type == TypeId::kNumberTypeInt)) { + int *data2 = reinterpret_cast<int *>(tensor_ptr->data_c()); + for (int i = 0; i < tensor_ptr->DataSize(); i++) { + if (data2[i] != check_value_) { + return false; + } + } + return true; + } + // Unsupported data types + return false; + } + + bool IsTensorScalarConstant(const ValuePtr &value) { + if (!value->isa<tensor::Tensor>()) { + return false; + } + auto tensor_ptr = dyn_cast<tensor::Tensor>(value); + if ((tensor_ptr->DataSize() > 1) || (tensor_ptr->DataDim() > 0)) { + return false; + } + return IsTensorConstant(value); + } + + private: + int check_value_; +}; + +// {prim::kPrimMul, 0, X}, {prim::kPrimMul, X, 0} +// {prim::kPrimMul, 1, X}, {prim::kPrimMul, X, 1} +class TensorMultiplyByZeroOrOne : public AnfVisitor { + public: + TensorMultiplyByZeroOrOne() : zero_(MakeValue(0)) {} + ~TensorMultiplyByZeroOrOne() override = default; + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + Reset(); + AnfVisitor::Match(prim::kPrimMul)(node); + + if (is_zero_) { + if (x_->func_graph() != node->func_graph()) { + return nullptr; + } + return NewTensorFilledWithData(node); + } + if (is_one_) { + return NewTensorFilledWithData(node, x_); + } + return nullptr; + } + + void Visit(const AnfNodePtr &node) override { + if (is_zero_ || is_one_) { + x_ = node; + return; + } + + if (IsParam(node)) { + x_ = node; + return; + } + + if (IsCNode(node)) { + CNodePtr cnode = node->cast<CNodePtr>(); + if (IsPrimitive(cnode->input(0), prim::kPrimZerosLike)) { + is_zero_ = true; + return; + } + x_ = node; + return; + } + auto value = node->cast<ValueNodePtr>()->value(); + if
(CheckTensorConstant(0).IsTensorConstant(value)) { + is_zero_ = true; + return; + } else if (CheckTensorConstant(1).IsTensorConstant(value)) { + is_one_ = true; + return; + } + x_ = node; + } + + void Visit(const ValueNodePtr &vnode) override { + auto value = vnode->value(); + if (CheckTensorConstant(0).IsTensorConstant(value)) { + is_zero_ = true; + return; + } else if (CheckTensorConstant(1).IsTensorConstant(value)) { + is_one_ = true; + return; + } + x_ = vnode; + } + void Reset() { + x_ = nullptr; + is_one_ = false; + is_zero_ = false; + } + + void *GetPointerToTensorData(const AnfNodePtr &node, bool writable = false) { + if (!node->isa<ValueNode>()) { + return nullptr; + } + + auto value = node->cast<ValueNodePtr>()->value(); + + if (!value->isa<tensor::Tensor>()) { + return nullptr; + } + + tensor::TensorPtr tensor_ptr = dyn_cast<tensor::Tensor>(value); + return tensor_ptr->data_c(writable); + } + + // Make a new tensor (when possible) with the same shape as `node` + // If x is nullptr, fill the new tensor with "0" + // If x is a tensor with an empty shape, fill the new tensor with the single value of x + // If x is a tensor with the same shape as `node`, return x as the result + AnfNodePtr NewTensorFilledWithData(const AnfNodePtr &node, const AnfNodePtr &x = nullptr) { + if ((node->abstract() == nullptr) || !node->abstract()->isa<abstract::AbstractTensor>()) { + return nullptr; + } + + auto tensor_abstract = node->abstract()->cast<abstract::AbstractTensorPtr>(); + TypePtr tensor_type_ptr = tensor_abstract->element()->BuildType(); + std::vector<int> tensor_shape = tensor_abstract->shape()->shape(); + + auto new_tensor_ptr = std::make_shared<tensor::Tensor>(tensor_type_ptr->type_id(), tensor_shape); + size_t mem_size = GetTypeByte(tensor_type_ptr) * IntToSize(new_tensor_ptr->ElementsNum()); + char *data = reinterpret_cast<char *>(new_tensor_ptr->data_c(true)); + + if (x == nullptr) { + std::memset(data, 0, mem_size); + auto new_vnode = NewValueNode(new_tensor_ptr); + new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); + return new_vnode; + } + // x is not nullptr + if (x->isa()) { + if ((x->abstract() == nullptr) || !x->abstract()->isa<abstract::AbstractTensor>()) { + return nullptr; + } + auto x_abstract = x->abstract()->cast<abstract::AbstractTensorPtr>(); + std::vector<int> x_shape = x_abstract->shape()->shape(); + + if (x_shape != tensor_shape) { + return nullptr; + } + return x; + } + + if (!x->isa<ValueNode>()) { + return nullptr; + } + auto x_value = x->cast<ValueNodePtr>()->value(); + if (!x_value->isa<tensor::Tensor>()) { + return nullptr; + } + + auto x_tensor_ptr = dyn_cast<tensor::Tensor>(x_value); + + if ((x_tensor_ptr->DataSize() > 1) && (x_tensor_ptr->DataSize() != new_tensor_ptr->DataSize())) { + return nullptr; + } + char *source_data = reinterpret_cast<char *>(GetPointerToTensorData(x)); + if (x_tensor_ptr->DataSize() == 1) { + // Broadcast the single source value into every element of the new tensor. + for (int i = 0; i < new_tensor_ptr->ElementsNum(); i++) { + memcpy(data + i * GetTypeByte(tensor_type_ptr), source_data, GetTypeByte(tensor_type_ptr)); + } + } else { + memcpy(data, source_data, mem_size); + } + auto new_vnode = NewValueNode(new_tensor_ptr); + new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); + return new_vnode; + } + + private: + bool is_zero_{false}, is_one_{false}; + ValuePtr zero_; + AnfNodePtr x_{nullptr}; +}; + // {prim::kPrimScalarAdd, X, 0} // {prim::kPrimScalarAdd, 0, X} class AddByZero : public AnfVisitor { @@ -101,7 +311,8 @@ class AddByZero : public AnfVisitor { } void Visit(const AnfNodePtr &node) override { - if (node->isa<ValueNode>() && *GetValueNode(node) == *zero_) { + if (node->isa<ValueNode>() && + ((*GetValueNode(node) == *zero_) || CheckTensorConstant(0).IsTensorScalarConstant(GetValueNode(node)))) { is_zero_ = true; return; } @@ -139,10 +350,22 @@ class TensorAddByZero : public
AnfVisitor { is_zero_ = true; return; } + if (node->isa() && CheckTensorConstant(0).IsTensorScalarConstant(GetValueNode(node))) { + is_zero_ = true; + return; + } x_ = node; } + void Visit(const ValueNodePtr &vnode) override { + auto value = vnode->value(); + if (CheckTensorConstant(0).IsTensorConstant(value)) { + is_zero_ = true; + return; + } + } + void Reset() { x_ = nullptr; is_zero_ = false; @@ -183,29 +406,143 @@ class OptUpdateZeroTensor : public AnfVisitor { // {prim::kPrimMul, {...}, {prim::kPrimMul, Tensor1, Tensor2}} class ConstantDuplicateMul : public AnfVisitor { public: + // Support function to multiply two constant tensors: partially support broadcasting shapes + template + void Multiply(void *in_data_1, int in_data_1_size, void *in_data_2, int in_data_2_size, void **out_data, + int out_data_size) { + T *data_1 = reinterpret_cast(in_data_1); + T *data_2 = reinterpret_cast(in_data_2); + T *data_out = new T[out_data_size]; + + if (in_data_1_size == 1) { + for (int i = 0; i < out_data_size; i++) { + data_out[i] = data_1[0]; + } + } else { + for (int i = 0; i < out_data_size; i++) { + data_out[i] = data_1[i]; + } + } + if (in_data_2_size == 1) { + for (int i = 0; i < out_data_size; i++) { + data_out[i] *= data_2[0]; + } + } else { + for (int i = 0; i < out_data_size; i++) { + data_out[i] *= data_2[i]; + } + } + *out_data = reinterpret_cast(data_out); + return; + } + + AnfNodePtr MulConstantTensors(const AnfNodePtr &vnode_1, const AnfNodePtr &vnode_2, const AnfNodePtr &node_3) { + if (!vnode_1->isa() || !vnode_2->isa() || (vnode_1->abstract() == nullptr) || + (vnode_2->abstract() == nullptr) || (node_3->abstract() == nullptr)) { + return nullptr; + } + + auto value_1 = GetValueNode(vnode_1); + auto value_2 = GetValueNode(vnode_2); + + if (!value_1->isa() || !value_2->isa()) { + return nullptr; + } + + auto tensor_ptr_1 = dyn_cast(value_1); + auto tensor_ptr_2 = dyn_cast(value_2); + + auto tensor_1_abstract = vnode_1->abstract()->cast(); + auto tensor_2_abstract = vnode_1->abstract()->cast(); + auto tensor_3_abstract = node_3->abstract()->cast(); + + TypePtr tensor_1_type_ptr = tensor_1_abstract->element()->BuildType(); + TypePtr tensor_2_type_ptr = tensor_2_abstract->element()->BuildType(); + TypePtr tensor_3_type_ptr = tensor_3_abstract->element()->BuildType(); + + if ((tensor_1_type_ptr->type_id() != tensor_3_type_ptr->type_id()) || + (tensor_2_type_ptr->type_id() != tensor_3_type_ptr->type_id())) { + return nullptr; + } + + std::vector tensor_out_shape = tensor_3_abstract->shape()->shape(); + + int data_out_size = 1; + for (auto it : tensor_out_shape) { + data_out_size *= it; + } + if ((tensor_ptr_1->DataSize() > 1) && (tensor_ptr_1->DataSize() != data_out_size)) { + return nullptr; + } + if ((tensor_ptr_2->DataSize() > 1) && (tensor_ptr_2->DataSize() != data_out_size)) { + return nullptr; + } + + void *data_out; + + if ((tensor_3_type_ptr->type_id() == TypeId::kNumberTypeFloat32) || + (tensor_3_type_ptr->type_id() == TypeId::kNumberTypeFloat)) { + Multiply(tensor_ptr_1->data_c(), tensor_ptr_1->DataSize(), tensor_ptr_2->data_c(), + tensor_ptr_2->DataSize(), &data_out, data_out_size); + } else { + if (tensor_3_type_ptr->type_id() == TypeId::kNumberTypeFloat64) { + Multiply(tensor_ptr_1->data_c(), tensor_ptr_1->DataSize(), tensor_ptr_2->data_c(), + tensor_ptr_2->DataSize(), &data_out, data_out_size); + } else { + if ((tensor_3_type_ptr->type_id() == TypeId::kNumberTypeInt32) || + (tensor_3_type_ptr->type_id() == TypeId::kNumberTypeInt)) { + Multiply(tensor_ptr_1->data_c(), 
tensor_ptr_1->DataSize(), tensor_ptr_2->data_c(), + tensor_ptr_2->DataSize(), &data_out, data_out_size); + } else { + // Un-support data types + return nullptr; + } + } + } + + auto new_tensor_ptr = std::make_shared(tensor_3_type_ptr->type_id(), tensor_out_shape); + size_t mem_size = GetTypeByte(tensor_3_type_ptr) * IntToSize(new_tensor_ptr->ElementsNum()); + char *data = reinterpret_cast(new_tensor_ptr->data_c(true)); + memcpy(data, data_out, mem_size); + + auto new_vnode = NewValueNode(new_tensor_ptr); + new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); + return new_vnode; + } + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { Reset(); // {prim::kPrimMul, Tensor1, {...}} AnfVisitor::Match(prim::kPrimMul, {IsNode, IsNode})(node); - if (vnode_ == nullptr || cnode_ == nullptr) { + if (vnode_ == nullptr || c_p_node_ == nullptr) { return nullptr; } + + if (!IsCNode(c_p_node_)) { + return nullptr; + } + auto tensor1 = vnode_; - auto mul = cnode_; + auto mul = c_p_node_->cast(); Reset(); // {prim::kPrimMul, Tensor2, {...}} AnfVisitor::Match(prim::kPrimMul, {IsNode, IsNode})(mul); - if (vnode_ == nullptr || cnode_ == nullptr) { + if (vnode_ == nullptr || c_p_node_ == nullptr) { return nullptr; } auto tensor2 = vnode_; - auto cnode = cnode_; + auto c_p_node = c_p_node_; auto PrimMul = GetValueNode(mul->input(0)); auto fg = node->func_graph(); - auto ttmul = NewCNode({NewValueNode(PrimMul), tensor1, tensor2}, fg); - return NewCNode({NewValueNode(PrimMul), cnode, ttmul}, fg); + + auto new_mul_tensor = MulConstantTensors(tensor1, tensor2, c_p_node); + if (new_mul_tensor == nullptr) { + auto ttmul = NewCNode({NewValueNode(PrimMul), tensor1, tensor2}, fg); + return NewCNode({NewValueNode(PrimMul), c_p_node, ttmul}, fg); + } + return NewCNode({NewValueNode(PrimMul), c_p_node, new_mul_tensor}, fg); } void Visit(const AnfNodePtr &node) override { @@ -213,19 +550,40 @@ class ConstantDuplicateMul : public AnfVisitor { vnode_ = node; } - if (IsCNode(node)) { - cnode_ = node->cast(); + if (IsCNode(node) || IsParam(node)) { + c_p_node_ = node; } } void Reset() { vnode_ = nullptr; - cnode_ = nullptr; + c_p_node_ = nullptr; } private: AnfNodePtr vnode_; - CNodePtr cnode_; + AnfNodePtr c_p_node_; +}; + +class PowerOneEliminate : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!IsPrimitiveCNode(node, prim::kPrimPow) || node->func_graph() == nullptr) { + return nullptr; + } + + auto &inputs = node->cast()->inputs(); + if (!IsValueNode(inputs[2])) { + return nullptr; + } + auto scalar = GetValueNode(inputs[2]); + if (scalar->isa() && GetValue(scalar) == 1.0) { + return inputs[1]; + } else if (scalar->isa() && GetValue(scalar) == 1) { + return inputs[1]; + } + return nullptr; + } }; // grad = AllReduce(grad) / worker_number @@ -341,17 +699,21 @@ class ArithmeticSimplify { public: ArithmeticSimplify() : multiply_by_zero_or_one_(), + tensor_multiply_by_zero_or_one_(), add_by_zero_(), tensor_add_by_zero_(), identity_(prim::kPrimIdentity), opt_update_zero_tensor_(), - constant_duplicate_mul_() { + constant_duplicate_mul_(), + power_one_() { eliminaters_.emplace_back(multiply_by_zero_or_one_); + eliminaters_.emplace_back(tensor_multiply_by_zero_or_one_); eliminaters_.emplace_back(add_by_zero_); eliminaters_.emplace_back(tensor_add_by_zero_); eliminaters_.emplace_back(identity_); eliminaters_.emplace_back(opt_update_zero_tensor_); eliminaters_.emplace_back(constant_duplicate_mul_); + 
eliminaters_.emplace_back(power_one_); } ~ArithmeticSimplify() = default; @@ -368,11 +730,13 @@ class ArithmeticSimplify { private: MultiplyByZeroOrOne multiply_by_zero_or_one_; + TensorMultiplyByZeroOrOne tensor_multiply_by_zero_or_one_; AddByZero add_by_zero_; TensorAddByZero tensor_add_by_zero_; PrimEliminater identity_; OptUpdateZeroTensor opt_update_zero_tensor_; ConstantDuplicateMul constant_duplicate_mul_; + PowerOneEliminate power_one_; std::vector eliminaters_{}; }; } // namespace irpass diff --git a/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h b/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h index 5b973dc334..5afee45e95 100644 --- a/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h +++ b/mindspore/ccsrc/optimizer/irpass/incorporate_getitem.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "optimizer/irpass.h" #include "optimizer/optimizer.h" @@ -28,7 +29,6 @@ #include "ir/func_graph.h" #include "ir/func_graph_cloner.h" #include "operator/ops.h" - namespace mindspore { namespace opt { namespace irpass { @@ -81,13 +81,32 @@ class IncorporateGetitem : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { Reset(); AnfVisitor::Match(prim::kPrimTupleGetItem, {IsCNode, IsValueNode})(node); - - if (node->func_graph() != nullptr && idx_ >= 0 && fg_ != nullptr) { - auto new_fg = getitem_transform_(fg_, idx_); - (void)args_.insert(args_.begin(), NewValueNode(new_fg)); - return node->func_graph()->NewCNode(args_); + if (node->func_graph() == nullptr || idx_ == -1 || fg_ == nullptr) { + return nullptr; } - return nullptr; + + if (fg_->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + // If graph kernel has muti output, do not split. + // some graph kernel output has EnvInstance node or DeadCode node should split. + auto output = fg_->output(); + if (IsPrimitiveCNode(output, prim::kPrimMakeTuple)) { + auto output_cnode = output->cast(); + auto outputs = output_cnode->inputs(); + int real_output_cnt = 0; + for (size_t i = 1; i < outputs.size(); ++i) { + if (IsCNode(outputs[i]) || IsValueNode(outputs[i]) || IsParam(outputs[i])) { + real_output_cnt++; + if (real_output_cnt > 1) { + return nullptr; + } + } + } + } + } + + auto new_fg = getitem_transform_(fg_, idx_); + (void)args_.insert(args_.begin(), NewValueNode(new_fg)); + return node->func_graph()->NewCNode(args_); } void Visit(const CNodePtr &cnode) override { @@ -115,6 +134,172 @@ class IncorporateGetitem : public AnfVisitor { internal::GetitemTransform getitem_transform_; }; +class IncorporateGetitemFromParam : public AnfVisitor { + public: + void Process(const FuncGraphPtr &func_graph, const CNodePtr &cnode, const AnfNodePtr ¶m, size_t input_idx) { + auto mng = func_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto &node_users = mng->node_users(); + if (node_users.find(param) == node_users.end() || node_users[param].empty()) { + args_.push_back(cnode->input(input_idx + 1)); + return; + } + + for (auto &user : node_users[param]) { + if (!IsPrimitiveCNode(user.first, prim::kPrimTupleGetItem)) { + // we do not process this case. + args_.push_back(cnode->input(input_idx + 1)); + return; + } + } + + // update new args. 
+ if (IsPrimitiveCNode(cnode->input(input_idx + 1), prim::kPrimMakeTuple)) { + // case 1 + replace_parameters_[input_idx] = true; + need_update_ = true; + auto make_tuple_cnode = cnode->input(input_idx + 1)->cast(); + auto &make_tuple_cnode_inputs = make_tuple_cnode->inputs(); + inputs_num_[input_idx] = make_tuple_cnode_inputs.size() - 1; + args_.insert(args_.end(), make_tuple_cnode_inputs.begin() + 1, make_tuple_cnode_inputs.end()); + } else { + // case 2 + auto prev_cnode = cnode->input(input_idx + 1)->cast(); + auto prev_fg = GetValueNode(prev_cnode->input(0)); + auto fg_output = prev_fg->output(); + if (!IsPrimitiveCNode(fg_output, prim::kPrimMakeTuple)) { + MS_LOG(ERROR) << "The return of: " << prev_fg->ToString() + << " should be a make tuple, but got: " << fg_output->DebugString(); + return; + } + replace_parameters_[input_idx] = true; + need_update_ = true; + auto make_tuple_cnode = fg_output->cast(); + inputs_num_[input_idx] = make_tuple_cnode->inputs().size() - 1; + for (size_t output_i = 0; output_i < inputs_num_[input_idx]; ++output_i) { + auto new_getitem = + func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), prev_cnode, NewValueNode(SizeToInt(output_i))}); + auto aptr = std::make_shared(std::make_shared(SizeToInt(output_i))); + new_getitem->input(2)->set_abstract(aptr); + new_getitem->set_abstract(make_tuple_cnode->input(output_i + 1)->abstract()); + args_.push_back(new_getitem); + } + } + } + + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (node->func_graph() == nullptr) { + return nullptr; + } + + Reset(); + + auto cnode = node->cast(); + if (cnode == nullptr) { + return nullptr; + } + auto &inputs = cnode->inputs(); + auto fg = GetValueNode(inputs[0]); + if (fg == nullptr) { + return nullptr; + } + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto parameters = fg->parameters(); + if (parameters.size() != inputs.size() - 1) { + return nullptr; + } + replace_parameters_ = std::vector(parameters.size(), false); + inputs_num_ = std::vector(parameters.size(), 1); + auto node_fg = node->func_graph(); + + for (size_t i = 1; i < inputs.size(); ++i) { + if (IsPrimitiveCNode(inputs[i], prim::kPrimMakeTuple) || IsCNodeGraphKernel(inputs[i])) { + Process(node_fg, cnode, parameters[i - 1], i - 1); + } else { + args_.push_back(inputs[i]); + } + } + + if (!need_update_) { + return nullptr; + } + + FuncGraphPtr new_fg = TransformableClone(fg, std::make_shared("sp")); + mng->AddFuncGraph(new_fg); + + auto node_users = mng->node_users(); + std::vector new_fg_parameters = new_fg->parameters(); + std::vector new_parameters; + size_t curr_input_idx{0}; + for (size_t param_i = 0; param_i < new_fg_parameters.size(); ++param_i) { + if (!replace_parameters_[param_i]) { + if (parameters[param_i]->abstract() != nullptr) { + new_fg_parameters[param_i]->set_abstract(parameters[param_i]->abstract()); + } + new_parameters.push_back(new_fg_parameters[param_i]); + curr_input_idx++; + continue; + } + + // make a new parameter. + for (size_t input_i = 0; input_i < inputs_num_[param_i]; ++input_i) { + auto new_param = std::make_shared(new_fg); + new_param->set_abstract(args_.at(curr_input_idx)->abstract()); + + // update users of new parameter. 
+ for (auto &user : node_users[new_fg_parameters[param_i]]) { + idx_ = -1; + AnfVisitor::Match(prim::kPrimTupleGetItem, {IsParam, IsValueNode})(user.first); + if (idx_ == -1) { + MS_LOG(ERROR) << "User of: " << new_fg_parameters[param_i]->DebugString() + << " must be tuple getitem here, but got: " << user.first->DebugString(); + return nullptr; + } + + if (input_i == IntToSize(idx_)) { + for (auto &sub_user : node_users[user.first]) { + auto sub_user_cnode = sub_user.first->cast(); + MS_EXCEPTION_IF_NULL(sub_user_cnode); + sub_user_cnode->set_input(sub_user.second, new_param); + (void)mng->Replace(sub_user.first, sub_user_cnode); + } + } + } + + // (void)mng->Replace(new_fg_parameters[param_i], new_param); + new_parameters.push_back(new_param); + curr_input_idx++; + } + } + + mng->SetParameters(new_fg, new_parameters); + (void)args_.insert(args_.begin(), NewValueNode(new_fg)); + auto new_call = node_fg->NewCNode(args_); + new_call->set_abstract(node->abstract()); + return new_call; + } + + void Visit(const ValueNodePtr &vnode) override { idx_ = GetValue(vnode->value()); } + + void Visit(const CNodePtr &cnode) override {} + + void Reset() { + replace_parameters_.clear(); + args_.clear(); + inputs_num_.clear(); + need_update_ = false; + idx_ = -1; + } + + private: + std::vector replace_parameters_{}; + std::vector args_{}; + std::vector inputs_num_{}; + bool need_update_{false}; + int idx_{-1}; +}; + // {prim::kPrimTupleGetItem, {{prim::kPrimSwitch, X, G1, G2}, Xs}, C} class IncorporateGetitemSwitch : public AnfVisitor { public: diff --git a/mindspore/ccsrc/optimizer/irpass/inline.h b/mindspore/ccsrc/optimizer/irpass/inline.h index 854b568453..64f192347c 100644 --- a/mindspore/ccsrc/optimizer/irpass/inline.h +++ b/mindspore/ccsrc/optimizer/irpass/inline.h @@ -86,20 +86,10 @@ bool IsUniqueUse(const FuncGraphPtr &fg, AnfNodePtr) { bool IsInside(FuncGraphPtr, const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(node->func_graph()); - auto &flags = node->func_graph()->flags(); - if (flags.find("inline_inside") != flags.end()) { - return flags["inline_inside"]; - } - return false; + return node->func_graph()->has_flag("inline_inside"); } -bool IsCore(const FuncGraphPtr &fg, AnfNodePtr) { - auto &flags = fg->flags(); - if (flags.find("core") != flags.end()) { - return flags["core"]; - } - return false; -} +bool IsCore(const FuncGraphPtr &fg, AnfNodePtr) { return fg->has_flag("core"); } bool NoCriterion(FuncGraphPtr, AnfNodePtr) { return true; } @@ -123,6 +113,13 @@ class InlinerBase : public AnfVisitor { if (fg->has_flag(FUNC_GRAPH_FLAG_DEFER_INLINE)) { return nullptr; } + // Do not inline GraphKernel to Cell. + if (fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) && !node->func_graph()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + // If the GraphKernel only contains a return node, we make it inlined. + if (fg->nodes().size() - fg->parameters().size() > 1) { + return nullptr; + } + } Reset(); bool is_match = false; diff --git a/mindspore/ccsrc/optimizer/irpass/mark_interface_fusion.h b/mindspore/ccsrc/optimizer/irpass/mark_interface_fusion.h new file mode 100644 index 0000000000..6f2bcc187f --- /dev/null +++ b/mindspore/ccsrc/optimizer/irpass/mark_interface_fusion.h @@ -0,0 +1,86 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_MARK_INTERFACE_FUSION_H +#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_MARK_INTERFACE_FUSION_H + +#include +#include +#include + +#include "session/anf_runtime_algorithm.h" +#include "optimizer/optimizer.h" +#include "optimizer/irpass.h" +#include "ir/visitor.h" +#include "operator/ops.h" +#include "utils/graph_utils.h" +#include "operator/composite/composite.h" + +namespace mindspore { +namespace opt { +namespace irpass { + +static int count = 0; + +std::string GetFusionNumber() { + std::stringstream ss; + ss << std::setw(4) << std::setfill('0') << count; + std::string num = ss.str(); + ++count; + + return "_" + num; +} + +// Mark CNodes which can be merged in kernel build +class MarkInterfaceFusion : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (node->func_graph()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) && IsPrimitiveCNode(node, prim::kPrimSelect)) { + auto cnode = node->cast(); + auto condition = cnode->input(1); + std::string cmp; + std::unordered_map cmp_list = {{"GreaterEqual", "GE"}, {"Greater", "GT"}, + {"LessEqual", "LE"}, {"Less", "LT"}, + {"Equal", "EQ"}, {"NotEqual", "NE"}}; + if (IsPrimitiveCNode(condition)) { + auto prim_name = GetCNodeFuncName(condition->cast()); + if (cmp_list.count(prim_name) != 0) { + // Mark Select and compare node + cmp = cmp_list[prim_name]; + auto cnt = GetFusionNumber(); + AnfAlgo::SetNodeAttr("fusion", MakeValue("Select" + cmp + cnt), condition); + AnfAlgo::SetNodeAttr("fusion", MakeValue("Select" + cmp + cnt + "_end"), node); + for (size_t i = 1; i < cnode->inputs().size(); ++i) { + if (IsPrimitiveCNode(cnode->input(i), prim::kPrimZerosLike)) { + AnfAlgo::SetNodeAttr("fusion", MakeValue("Select" + cmp + cnt), cnode->input(i)); + } + } + } + } + } + return nullptr; + } + + void Visit(const AnfNodePtr &) override {} + + private: + AnfNodePtr y_{nullptr}; +}; + +} // namespace irpass +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_OPTIMIZER_IRPASS_MARK_INTERFACE_FUSION_H diff --git a/mindspore/ccsrc/optimizer/irpass/merge_addn.h b/mindspore/ccsrc/optimizer/irpass/merge_addn.h index 35162ce4fe..94f9e26c5b 100644 --- a/mindspore/ccsrc/optimizer/irpass/merge_addn.h +++ b/mindspore/ccsrc/optimizer/irpass/merge_addn.h @@ -19,6 +19,7 @@ #include #include +#include #include "optimizer/irpass.h" #include "optimizer/optimizer.h" @@ -196,6 +197,131 @@ class AddNZeroFilter : public AnfVisitor { std::vector filtered_Xs_{}, Xs_{}; bool has_zero_like_{false}; }; + +// {PrimAddN, {kPrimMakeTuple, Xs}} +// Akg don't support AddN(ValueNode, Tensor, ...), converted to TensorAdd. 
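+// The pass below clones the called sub graph and rewrites each AddN inside it as follows: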
+// case0: AddN(inputs)(inputs size < 2) -> error +// case1: AddN(inputs)(all inputs is ValueNode) -> error +// case2: AddN(inputs)(inputs size = 2) -> TensorAdd(Tensor, Tensor) +// case3: AddN(ValueNode, Tensor, Tensor, ...)(has one ValueNode input) +// -> TensorAdd(ValueNode, AddN(Tensor, Tensor, ...)) +class AddNEliminater : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!node->isa() || node->func_graph() == nullptr) { + return nullptr; + } + + auto &inputs = node->cast()->inputs(); + auto fg = GetValueNode(inputs[0]); + MS_EXCEPTION_IF_NULL(fg); + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + if (fg->recursive()) { + return nullptr; + } + + auto new_fg = TransformableClone(fg, std::make_shared("fg")); + mng->AddFuncGraph(new_fg); + need_update_ = false; + bool changed = false; + do { + changed = false; + changed |= Process(new_fg); + } while (changed); + + if (!need_update_) { + return nullptr; + } else { + auto new_sx = inputs; + new_sx[0] = NewValueNode(new_fg); + return node->func_graph()->NewCNode(new_sx); + } + } + + bool Process(const FuncGraphPtr &func_graph) { + auto mng = func_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto nodes = TopoSort(func_graph->output()); + bool changed = false; + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + if (!IsPrimitiveCNode(node, prim::kPrimAddN)) { + continue; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto &tuple_input = cnode->input(1); + MS_EXCEPTION_IF_NULL(tuple_input); + auto tuple_input_cnode = tuple_input->cast(); + MS_EXCEPTION_IF_NULL(tuple_input_cnode); + auto &tuple_inputs = tuple_input_cnode->inputs(); + if (tuple_inputs.size() < 3) { + // case0: inputs size < 2, error + MS_EXCEPTION(ArgumentError) << "Inputs size of AddN less than 2. " << cnode->DebugString(2); + } + + int valuenode_num = + std::accumulate(tuple_inputs.begin() + 1, tuple_inputs.end(), 0, [](int accumulator, const AnfNodePtr &node) { + if (IsValueNode(node)) { + return accumulator + 1; + } else { + return accumulator; + } + }); + if (IntToSize(valuenode_num) == tuple_inputs.size()) { + // case1: all inputs is ValueNode, error + MS_EXCEPTION(ArgumentError) << "All inputs of AddN is ValueNode. " << cnode->DebugString(2); + } + + if (tuple_inputs.size() == 3) { + // case2: inputs size = 2, -> TensorAdd(Tensor, Tensor) + MS_LOG(DEBUG) << "Replace AddN with two inputs with TensorAdd. " << cnode->DebugString(2); + ValuePtr prim_tensoradd = prim::GetPythonOps("TensorAdd", "mindspore.ops.operations"); + std::vector new_xs{func_graph->NewCNode({NewValueNode(prim_tensoradd)}), tuple_inputs[1], + tuple_inputs[2]}; + mng->Replace(node, func_graph->NewCNode(new_xs)); + changed = true; + continue; + } + + auto first_valuenode = std::find_if(tuple_inputs.begin() + 1, tuple_inputs.end(), + [](const AnfNodePtr &node) { return IsValueNode(node); }); + if (first_valuenode == tuple_inputs.end()) { + // no ValueNode input found. 
+ continue; + } else { + // case3: has one ValueNode input -> TensorAdd(ValueNode, AddN(Tensor, Tensor, ...)) + std::vector make_tuple_new_xs{ + NewValueNode(prim::kPrimMakeTuple), + }; + std::for_each(tuple_inputs.begin() + 1, tuple_inputs.end(), + [&make_tuple_new_xs, &first_valuenode](const AnfNodePtr &node) { + if (node != *first_valuenode) { + make_tuple_new_xs.push_back(node); + } + }); + ValuePtr prim_addn = prim::GetPythonOps("AddN", "mindspore.ops.operations"); + auto new_addn = func_graph->NewCNode( + {func_graph->NewCNode({NewValueNode(prim_addn)}), func_graph->NewCNode(make_tuple_new_xs)}); + ValuePtr prim_tensoradd = prim::GetPythonOps("TensorAdd", "mindspore.ops.operations"); + auto new_add = + func_graph->NewCNode({func_graph->NewCNode({NewValueNode(prim_tensoradd)}), *first_valuenode, new_addn}); + (void)mng->Replace(node, new_add); + changed = true; + continue; + } + } + + need_update_ |= changed; + return changed; + } + + private: + bool need_update_{false}; +}; } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h b/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h index 73dbc152e5..d2e1d15f91 100644 --- a/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/reduce_eliminate.h @@ -79,7 +79,7 @@ class ReduceOneEliminater : public AnfVisitor { } void Visit(const AnfNodePtr &node) override { - if (x_ == nullptr) { + if (!IsVNode(node) && x_ == nullptr) { if (IsValueNode(node)) { is_tensor_ = true; } diff --git a/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h b/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h index ab4f9bc32e..8d700ec7f8 100644 --- a/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/ref_eliminate.h @@ -23,6 +23,8 @@ #include "optimizer/irpass.h" #include "ir/visitor.h" #include "operator/ops.h" +#include "utils/graph_utils.h" +#include "operator/composite/composite.h" namespace mindspore { namespace opt { @@ -36,6 +38,7 @@ class MakeRefEliminater : public AnfVisitor { this->y_ = node; return true; }; + AnfVisitor::Match(prim::kPrimMakeRef, {IsNode, gety, IsNode})(node); return y_; } diff --git a/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h b/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h index ed4ac24148..1dc8fbb344 100644 --- a/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h +++ b/mindspore/ccsrc/optimizer/irpass/special_op_eliminate.h @@ -142,7 +142,7 @@ class ResetDeferInline : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { if (IsValueNode(node)) { auto fg = GetValueNode(node); - fg->set_flags(FUNC_GRAPH_FLAG_DEFER_INLINE, false); + fg->set_flag(FUNC_GRAPH_FLAG_DEFER_INLINE, false); } return nullptr; } diff --git a/mindspore/ccsrc/optimizer/irpass/specialize_transform.h b/mindspore/ccsrc/optimizer/irpass/specialize_transform.h index 905479df77..6ac4e40f5e 100644 --- a/mindspore/ccsrc/optimizer/irpass/specialize_transform.h +++ b/mindspore/ccsrc/optimizer/irpass/specialize_transform.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "optimizer/irpass.h" #include "optimizer/optimizer.h" @@ -41,7 +42,7 @@ class SpecializeTransform { ~SpecializeTransform() = default; FuncGraphPtr operator()(const FuncGraphPtr &func_graph, std::vector graph_args, - std::vector prim_args) { + std::vector prim_args, std::vector value_args) { if (cache_.count(func_graph) == 0) { cache_[func_graph] = {}; } @@ -69,6 +70,13 @@ class 
SpecializeTransform { (void)mng->Replace(params[i], arg); continue; } + if (value_args[i] != nullptr) { + auto const_tensor = *value_args[i]; + auto const_tensor_ptr = std::make_shared(const_tensor); + AnfNodePtr arg = NewValueNode(const_tensor_ptr); + (void)mng->Replace(params[i], arg); + continue; + } new_params.push_back(params[i]); } @@ -108,6 +116,7 @@ class SpecializeOnGraphArguments : public AnfVisitor { std::vector graph_args; std::vector prim_args; + std::vector value_node_args; std::vector new_xs; bool hasVNode = false; for (size_t i = 1; i < inputs.size(); i++) { @@ -115,15 +124,24 @@ class SpecializeOnGraphArguments : public AnfVisitor { auto fg_vnode = GetValueNode(inputs[i]); graph_args.push_back(fg_vnode); prim_args.emplace_back(nullptr); + value_node_args.emplace_back(nullptr); hasVNode = true; } else if (IsValueNode(inputs[i])) { auto p_vnode = GetValueNode(inputs[i]); graph_args.emplace_back(nullptr); prim_args.push_back(p_vnode); + value_node_args.emplace_back(nullptr); + hasVNode = true; + } else if (IsValueNode(inputs[i])) { + tensor::TensorPtr t_vnode = GetValueNode(inputs[i]); + graph_args.emplace_back(nullptr); + prim_args.emplace_back(nullptr); + value_node_args.emplace_back(t_vnode); hasVNode = true; } else { graph_args.emplace_back(nullptr); prim_args.emplace_back(nullptr); + value_node_args.emplace_back(nullptr); new_xs.push_back(inputs[i]); } } @@ -132,7 +150,7 @@ class SpecializeOnGraphArguments : public AnfVisitor { return nullptr; } - auto new_fg = specialize_transform_(inp0_fg, graph_args, prim_args); + auto new_fg = specialize_transform_(inp0_fg, graph_args, prim_args, value_node_args); (void)new_xs.insert(new_xs.begin(), NewValueNode(new_fg)); return node->func_graph()->NewCNode(new_xs); @@ -141,6 +159,146 @@ class SpecializeOnGraphArguments : public AnfVisitor { private: internal::SpecializeTransform specialize_transform_; }; + +// Eliminate unused parameters. 
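+// A parameter of G with no users inside G is dropped from a cloned G, together with its matching argument X.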
+// {G, Xs} +class UnusedParasEliminater : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!node->isa() || node->func_graph() == nullptr) { + return nullptr; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto &inputs = cnode->inputs(); + auto fg = GetValueNode(inputs[0]); + MS_EXCEPTION_IF_NULL(fg); + + std::vector parameters = fg->parameters(); + size_t size = parameters.size(); + if (size != inputs.size() - 1) { + return nullptr; + } + + std::vector new_xs; + std::vector keep_parameters; + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + auto &node_users = mng->node_users(); + bool has_unused_para = false; + for (size_t i = 0; i < size; ++i) { + auto iter = node_users.find(parameters[i]); + if (iter != node_users.end() && !iter->second.empty()) { + keep_parameters.push_back(true); + new_xs.push_back(inputs[i + 1]); + continue; + } + keep_parameters.push_back(false); + has_unused_para = true; + } + + if (!has_unused_para) { + return nullptr; + } + FuncGraphPtr new_fg = TransformableClone(fg, std::make_shared("sp")); + mng->AddFuncGraph(new_fg); + + std::vector new_fg_parameters = new_fg->parameters(); + std::vector new_parameters; + for (size_t i = 0; i < size; i++) { + if (keep_parameters[i]) { + if (parameters[i]->abstract() != nullptr) { + new_fg_parameters[i]->set_abstract(parameters[i]->abstract()); + } + new_parameters.push_back(new_fg_parameters[i]); + } + } + mng->SetParameters(new_fg, new_parameters); + + (void)new_xs.insert(new_xs.begin(), NewValueNode(new_fg)); + return node->func_graph()->NewCNode(new_xs); + } +}; + +// Eliminate unused outputs. +// {G, Xs} +class UnusedOutputEliminater : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + if (!node->isa() || node->func_graph() == nullptr) { + return nullptr; + } + + auto &inputs = node->cast()->inputs(); + auto fg = GetValueNode(inputs[0]); + MS_EXCEPTION_IF_NULL(fg); + auto mng = fg->manager(); + MS_EXCEPTION_IF_NULL(mng); + if (fg->recursive()) { + return nullptr; + } + + auto new_fg = TransformableClone(fg, std::make_shared("fg")); + mng->AddFuncGraph(new_fg); + auto new_fg_output = new_fg->output(); + if (!IsPrimitiveCNode(new_fg_output, prim::kPrimMakeTuple)) { + return nullptr; + } + + auto output_cnode = new_fg_output->cast(); + auto &node_users = mng->node_users(); + if (node_users.count(node) == 0 || node_users[node].empty()) { + return nullptr; + } + std::unordered_set used_output_idx; + std::vector> all_users; + for (auto &node_user : node_users[node]) { + if (!IsPrimitiveCNode(node_user.first, prim::kPrimTupleGetItem)) { + return nullptr; + } + auto user_cnode = node_user.first->cast(); + size_t used_idx = GetValue(user_cnode->input(2)->cast()->value()); + used_output_idx.insert(used_idx); + all_users.push_back(std::make_pair(node_user.first, used_idx)); + } + + if (used_output_idx.size() >= output_cnode->inputs().size() - 1) { + // all output has users. + return nullptr; + } + + if (used_output_idx.empty()) { + // we do not process this case. + return nullptr; + } else if (used_output_idx.size() == 1) { + // after eliminate, only one output left. + new_fg->set_output(output_cnode->input(*used_output_idx.begin() + 1)); + // update users. + for (auto &ret_user : all_users) { + (void)mng->Replace(ret_user.first, node); + } + } else { + // after eliminate, create new multi output. 
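+      // Keep only the used outputs and remap each user's tuple_getitem index to its
+      // position in the shrunken make_tuple.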
+ std::vector new_output_inputs{output_cnode->input(0)}; + std::unordered_map new_idx_map; + for (auto idx : used_output_idx) { + new_idx_map[idx] = SizeToInt(new_output_inputs.size() - 1); + new_output_inputs.push_back(output_cnode->input(idx + 1)); + } + new_fg->set_output(new_fg->NewCNode(new_output_inputs)); + // update users. + for (auto &ret_user : all_users) { + auto ret_user_cnode = ret_user.first->cast(); + ret_user_cnode->set_input(2, NewValueNode(new_idx_map[ret_user.second])); + } + } + + auto new_sx = inputs; + new_sx[0] = NewValueNode(new_fg); + return node->func_graph()->NewCNode(new_sx); + } +}; } // namespace irpass } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/optimizer/optimizer.h b/mindspore/ccsrc/optimizer/optimizer.h index 805543f45b..3e77edc1e9 100644 --- a/mindspore/ccsrc/optimizer/optimizer.h +++ b/mindspore/ccsrc/optimizer/optimizer.h @@ -89,7 +89,7 @@ using OptPassGroupMap = std::vector>; class Optimizer : public std::enable_shared_from_this { public: Optimizer(const std::string &name, const pipeline::ResourceBasePtr &resource_ptr) - : name_(name), resource_(resource_ptr), run_only_once_(false), is_watch_renormalize_(false) {} + : name_(name), resource_(resource_ptr), run_only_once_(false), is_watch_renormalize_(false), is_enable_(true) {} virtual ~Optimizer() = default; void Init(const OptPassGroupMap &passes, bool run_only_once) { @@ -132,6 +132,9 @@ class Optimizer : public std::enable_shared_from_this { } FuncGraphPtr step(FuncGraphPtr func_graph, bool use_profile = true) { + if (!is_enable_) { + return func_graph; + } // Optimizer step counter; int counter = -1; bool changes = true; @@ -171,7 +174,7 @@ class Optimizer : public std::enable_shared_from_this { }; use_profile ? (WITH(MsProfile::GetProfile()->Step(pass_names_[i])) opt_func) : opt_func(); if (IS_OUTPUT_ON(mindspore::DEBUG) && MsContext::GetInstance()->save_graphs_flag()) { - MS_LOG(DEBUG) << name_ << " round " << counter << " OptPass " << pass_names_[i] << " end."; + MS_LOG(DEBUG) << "The opt " << name_ << " round " << counter << " OptPass " << pass_names_[i] << " end."; auto fg_name = "opt_substep_" + name_ + "_r" + std::to_string(counter) + "_" + std::to_string(i) + "_" + pass_names_[i]; func_graph->DumpFuncGraph(fg_name); @@ -211,6 +214,7 @@ class Optimizer : public std::enable_shared_from_this { void enable_watch_renormalize() { is_watch_renormalize_ = true; } void disable_watch_renormalize() { is_watch_renormalize_ = false; } bool is_watch_renormalize() { return is_watch_renormalize_; } + void set_enable(bool enable) { is_enable_ = enable; } private: const std::string name_; @@ -220,6 +224,7 @@ class Optimizer : public std::enable_shared_from_this { bool run_only_once_; std::vector untyped_nodes_; bool is_watch_renormalize_; + bool is_enable_; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc b/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc index 687bc12f05..999c4a85a9 100644 --- a/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc +++ b/mindspore/ccsrc/parallel/allreduce_fusion/step_allreduce_fusion.cc @@ -64,7 +64,7 @@ bool StepAllreduceFusion(const FuncGraphPtr &root, const opt::OptimizerPtr &opti DumpGraph(root, std::string(ALLREDUCE_FUSION_END)); // allreduce fusion only run once - root->flags()[ALLREDUCE_FUSION_RUN_ONCE_ONLY] = true; + root->set_flag(ALLREDUCE_FUSION_RUN_ONCE_ONLY, true); res->results()[pipeline::kStepParallelGraph] = root; #if 
defined(_WIN32) || defined(_WIN64) auto end_time = std::chrono::steady_clock::now(); diff --git a/mindspore/ccsrc/parallel/context.cc b/mindspore/ccsrc/parallel/context.cc index 6802292cb4..8957dc842c 100644 --- a/mindspore/ccsrc/parallel/context.cc +++ b/mindspore/ccsrc/parallel/context.cc @@ -158,8 +158,8 @@ void ParallelParameterContextRestoreInNoTraining(const FuncGraphPtr &func_graph, MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(param_node); MS_EXCEPTION_IF_NULL(ptr); - if (!func_graph->has_flag(AUTO_PARALLEL) || (func_graph->flags().count(TRAINING) == 0) || - func_graph->flags()[TRAINING]) { + if (!func_graph->has_flag(AUTO_PARALLEL) || (func_graph->attrs().count(TRAINING) == 0) || + func_graph->has_flag(TRAINING)) { return; } diff --git a/mindspore/ccsrc/parallel/step_auto_parallel.cc b/mindspore/ccsrc/parallel/step_auto_parallel.cc index 429241c8b7..8b4f7e2dec 100644 --- a/mindspore/ccsrc/parallel/step_auto_parallel.cc +++ b/mindspore/ccsrc/parallel/step_auto_parallel.cc @@ -107,7 +107,7 @@ bool StepAutoParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &) { time += static_cast(end_time.tv_usec - start_time.tv_usec); MS_LOG(INFO) << "Now leaving step auto parallel, used time: " << time << " us"; - root->flags()[AUTO_PARALLEL_RUN_ONCE_ONLY] = true; + root->set_flag(AUTO_PARALLEL_RUN_ONCE_ONLY, true); return changes; } diff --git a/mindspore/ccsrc/parallel/step_parallel.cc b/mindspore/ccsrc/parallel/step_parallel.cc index a5e5dee990..fc7b48d267 100644 --- a/mindspore/ccsrc/parallel/step_parallel.cc +++ b/mindspore/ccsrc/parallel/step_parallel.cc @@ -2270,10 +2270,10 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) (root->has_flag(SEMI_AUTO_PARALLEL_RUN_ONCE_ONLY))) { if (!root->has_flag(CHECK_SET_STRATEGY_VALID_ONCE_ONLY)) { if (HasStrategy(root)) { - MS_LOG(INFO) << "strategies ignored in " << parallel_mode + MS_LOG(INFO) << "Strategies ignored in " << parallel_mode << ", set_strategy() only valid in [semi_]auto_parallel."; } - root->flags()[CHECK_SET_STRATEGY_VALID_ONCE_ONLY] = true; + root->set_flag(CHECK_SET_STRATEGY_VALID_ONCE_ONLY, true); } return changes; @@ -2330,11 +2330,11 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) DumpGraph(root, std::string(STEP_PARALLEL_END)); // step parallel only run once - root->flags()[SEMI_AUTO_PARALLEL_RUN_ONCE_ONLY] = true; + root->set_flag(SEMI_AUTO_PARALLEL_RUN_ONCE_ONLY, true); res->results()[pipeline::kStepParallelGraph] = root; // in auto parallel mode, no need to check if stategies set - root->flags()[CHECK_SET_STRATEGY_VALID_ONCE_ONLY] = true; + root->set_flag(CHECK_SET_STRATEGY_VALID_ONCE_ONLY, true); (void)gettimeofday(&end_time, nullptr); uint64_t time = kUSecondInSecond * static_cast(end_time.tv_sec - start_time.tv_sec); diff --git a/mindspore/ccsrc/pipeline/init.cc b/mindspore/ccsrc/pipeline/init.cc index b8472de409..7025447a29 100644 --- a/mindspore/ccsrc/pipeline/init.cc +++ b/mindspore/ccsrc/pipeline/init.cc @@ -151,7 +151,10 @@ PYBIND11_MODULE(_c_expression, m) { .def("set_check_bprop_flag", &mindspore::MsContext::set_check_bprop_flag, "Set whether to check bprop.") .def("get_max_device_memory", &mindspore::MsContext::max_device_memory, "Get deivce memory max size.") .def("set_max_device_memory", &mindspore::MsContext::set_max_device_memory, "Set deivce memory max size.") - .def("set_print_file_path", &mindspore::MsContext::set_print_file_path, "Set path to print."); + .def("set_print_file_path", &mindspore::MsContext::set_print_file_path, 
"Set path to print.") + .def("set_enable_graph_kernel", &mindspore::MsContext::set_enable_graph_kernel, + "Set the GraphKernel switch to on or off.") + .def("get_enable_graph_kernel", &mindspore::MsContext::enable_graph_kernel, "Get the value of GraphKernel switch."); (void)py::class_>(m, "MpiConfig") .def_static("get_instance", &mindspore::MpiConfig::GetInstance, "Get mpi config instance.") diff --git a/mindspore/ccsrc/pipeline/parse/data_converter.cc b/mindspore/ccsrc/pipeline/parse/data_converter.cc index 20f7c0c9ce..330d03d11c 100644 --- a/mindspore/ccsrc/pipeline/parse/data_converter.cc +++ b/mindspore/ccsrc/pipeline/parse/data_converter.cc @@ -278,7 +278,7 @@ bool ConvertCellObjToFuncGraph(py::object obj, ValuePtr *const data) { if (bprop_graph != nullptr) { (void)func_graph->transforms().insert(std::make_pair(CUSTOM_BPROP_NAME, FuncGraphTransform(bprop_graph))); (void)bprop_graph->transforms().insert(std::make_pair("primal", FuncGraphTransform(func_graph))); - func_graph->set_flags(FUNC_GRAPH_FLAG_DEFER_INLINE, true); + func_graph->set_flag(FUNC_GRAPH_FLAG_DEFER_INLINE, true); } } *data = func_graph; diff --git a/mindspore/ccsrc/pipeline/parse/parse.cc b/mindspore/ccsrc/pipeline/parse/parse.cc index 972f11230e..6d5c28c98c 100644 --- a/mindspore/ccsrc/pipeline/parse/parse.cc +++ b/mindspore/ccsrc/pipeline/parse/parse.cc @@ -1448,15 +1448,23 @@ bool ParseAst::UpdateFuncGraphFlags(const FuncGraphPtr &func_graph) { } py::dict flags = python_adapter::GetPyObjAttr(obj_, PYTHON_EXTERN_MINDSPORE_FLAG); for (auto &item : flags) { - if (!py::isinstance(item.first) || !py::isinstance(item.second)) { + if (!py::isinstance(item.first)) { MS_LOG(ERROR) << "Type error in flags dict convert"; return false; } auto name = py::cast(item.first); - auto value = py::cast(item.second); - MS_LOG(DEBUG) << "Flag name: " << name << ". Value: " << value; - - func_graph->set_flags(name, value); + if (py::isinstance(item.second)) { + auto value = py::cast(item.second); + MS_LOG(DEBUG) << "Flag name: " << name << ". Value: " << value; + func_graph->set_flag(name, value); + } else if (py::isinstance(item.second)) { + auto value = py::cast(item.second); + MS_LOG(DEBUG) << "Flag name: " << name << ". Value: " << value; + func_graph->set_attr(name, MakeValue(value)); + } else { + MS_LOG(ERROR) << "Type error in flags/attrs dict convert"; + return false; + } } return true; diff --git a/mindspore/ccsrc/pipeline/parse/parse.h b/mindspore/ccsrc/pipeline/parse/parse.h index 969effbd18..0a56ccaed9 100644 --- a/mindspore/ccsrc/pipeline/parse/parse.h +++ b/mindspore/ccsrc/pipeline/parse/parse.h @@ -223,8 +223,8 @@ class Parser { FunctionBlockPtr block = std::make_shared(parse); // In order to keep effect order in the sub-graphs which generated by control flow. // We copy the flags from the top graph to the sub-graphs. 
- if (func_graph_ && !func_graph_->flags().empty()) { - block->func_graph()->set_flags(func_graph_->flags()); + if (func_graph_ && !func_graph_->attrs().empty()) { + block->func_graph()->set_attrs(func_graph_->attrs()); } func_block_list_.push_back(block); return block; diff --git a/mindspore/ccsrc/pipeline/pass.cc b/mindspore/ccsrc/pipeline/pass.cc index 0ffaebac4c..94063fb780 100644 --- a/mindspore/ccsrc/pipeline/pass.cc +++ b/mindspore/ccsrc/pipeline/pass.cc @@ -25,12 +25,14 @@ #include #include "ir/func_graph_cloner.h" +#include "debug/anf_ir_utils.h" #include "pipeline/parse/parse_base.h" #include "pipeline/parse/data_converter.h" #include "pipeline/resource.h" #include "pipeline/validator.h" #include "optimizer/optimizer.h" #include "optimizer/cse.h" +#include "optimizer/graph_kernel_reuse.h" #include "optimizer/clean.h" #include "optimizer/irpass.h" #include "optimizer/control_depend.h" @@ -38,6 +40,7 @@ #include "parallel/step_auto_parallel.h" #include "parallel/allreduce_fusion/step_allreduce_fusion.h" #include "utils/any.h" +#include "utils/log_adapter.h" namespace mindspore { namespace pipeline { @@ -162,6 +165,40 @@ OptPassGroupMap GetOptPassesB(const opt::irpass::OptimizeIRPassLib &irpass) { return map; } +OptPassGroupMap GetOptPassesGraphKernelA(const opt::irpass::OptimizeIRPassLib &irpass) { + opt::OptPassConfig interface_fusion = opt::OptPassConfig({ + irpass.mark_interface_fusion_, + }); + OptPassGroupMap map({ + {"graph_kernel_reuse", opt::OptPassConfig(opt::GraphKernelReuse())}, + {"interface_fusion", interface_fusion}, + {"renormalize", opt::OptPassConfig::Renormalize()}, + {"cse", opt::OptPassConfig(opt::CSE(false))}, + }); + return map; +} + +OptPassGroupMap GetOptPassesGraphKernelB(const opt::irpass::OptimizeIRPassLib &irpass) { + opt::OptPassConfig elim_1 = opt::OptPassConfig({ + irpass.addn_eliminate_, + irpass.incorporate_getitem_from_param_, + }); + opt::OptPassConfig elim_2 = opt::OptPassConfig({ + irpass.unused_parameter_eliminate_, + irpass.unused_output_eliminate_, + }); + OptPassGroupMap map({ + {"elim_1", elim_1}, + {"renormalize", opt::OptPassConfig::Renormalize()}, + {"elim_2", elim_2}, + }); + return map; +} + +OptPassGroupMap GetOptPassesC(const opt::irpass::OptimizeIRPassLib &irpass) { + return OptPassGroupMap({{"renormalize", opt::OptPassConfig::Renormalize()}}); +} + OptPassGroupMap GetControlPhases(const opt::irpass::OptimizeIRPassLib &irpass) { opt::OptPassConfig control_group = opt::OptPassConfig({irpass.convert_switch_replacement_}, true); OptPassGroupMap map({ @@ -191,8 +228,19 @@ void InitOpt(const ResourcePtr &res) { opt::irpass::OptimizeIRPassLib irpass; g_pass_opts["opt_a"] = Optimizer::MakeOptimizer("opt_a", res, GetOptPassesA(irpass)); g_pass_opts["opt_b"] = Optimizer::MakeOptimizer("opt_b", res, GetOptPassesB(irpass), false, true); + g_pass_opts["opt_graph_kernel_a"] = + Optimizer::MakeOptimizer("opt_graph_kernel_a", res, GetOptPassesGraphKernelA(irpass), true); + g_pass_opts["opt_graph_kernel_b"] = + Optimizer::MakeOptimizer("opt_graph_kernel_b", res, GetOptPassesGraphKernelB(irpass), false); + g_pass_opts["renormal"] = Optimizer::MakeOptimizer("renormal", res, GetOptPassesC(irpass)); g_pass_opts["opt_control"] = Optimizer::MakeOptimizer("opt_control", res, GetControlPhases(irpass), false, true); g_pass_opts["opt_prepare"] = Optimizer::MakeOptimizer("opt_prepare", res, GetPreparePhases(irpass)); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + 
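+    // The GraphKernel optimizers are always registered; when the context switch is off they are
+    // merely disabled, so Optimizer::step() returns the graph unchanged.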
g_pass_opts["opt_graph_kernel_a"]->set_enable(false); + g_pass_opts["opt_graph_kernel_b"]->set_enable(false); + } } } } // namespace @@ -224,9 +272,13 @@ bool OptPassGroup(const ResourcePtr &res, const std::string &name) { bool OptPassAGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_a"); } bool OptPassBGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_b"); } +bool OptPassGraphKernelGroupA(const ResourcePtr &res) { return OptPassGroup(res, "opt_graph_kernel_a"); } +bool OptPassGraphKernelGroupB(const ResourcePtr &res) { return OptPassGroup(res, "opt_graph_kernel_b"); } bool ControlGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_control"); } bool PrepareGroup(const ResourcePtr &res) { return OptPassGroup(res, "opt_prepare"); } +bool OptPassRNGroup(const ResourcePtr &res) { return OptPassGroup(res, "renormal"); } + bool AddControlDependPass(const ResourcePtr &res) { FuncGraphPtr func_graph = res->func_graph(); MS_EXCEPTION_IF_NULL(func_graph); @@ -270,8 +322,10 @@ bool InferenceOptPreparePass(const ResourcePtr &res) { std::vector kVmPasses = {{"simplify_data_structures", SimplifyDataStructuresPass}, {"opt_a", OptPassAGroup}, {"opt_b", OptPassBGroup}, - {"add_control_depend", AddControlDependPass}, - {"cconv", CconvPass}}; + {"cconv", CconvPass}, + {"opt_graph_kernel_a", OptPassGraphKernelGroupA}, + {"opt_graph_kernel_b", OptPassGraphKernelGroupB}, + {"add_control_depend", AddControlDependPass}}; std::vector kGePasses = {{"simplify_data_structures", SimplifyDataStructuresPass}, {"opt_a", OptPassAGroup}, diff --git a/mindspore/ccsrc/pipeline/pipeline_ge.cc b/mindspore/ccsrc/pipeline/pipeline_ge.cc index 2776109655..ea0ca14c7a 100644 --- a/mindspore/ccsrc/pipeline/pipeline_ge.cc +++ b/mindspore/ccsrc/pipeline/pipeline_ge.cc @@ -488,7 +488,7 @@ py::object ExecDFGraph(const std::map &info, const #ifdef ENABLE_INFER // Now don't use the graph because the exec ge function don't take effect MS_EXCEPTION_IF_NULL(info.at(phase)->func_graph); - if (ENABLE_TRAIN != info.at(phase)->func_graph->flags()["training"]) { + if (ENABLE_TRAIN != info.at(phase)->func_graph->has_flag("training")) { MS_LOG(ERROR) << "Graph training mode mismatch mode of libraries"; ConfigManager::GetInstance().ResetConfig(); return py::none(); diff --git a/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc b/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc index 254fd43c0b..c9b1ce4f93 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/evaluator.cc @@ -165,7 +165,7 @@ AbstractBasePtrList FuncGraphEvaluator::BroadenUndeterminedArgs(const AbstractBa MS_LOG(DEBUG) << "Joined args: " << ::mindspore::ToString(joined_args_spec_list); // If there is loop variant, all arguments need to be broaden to avoid wrong constant propagation. if (!(joined_args_spec_list == args_spec_list)) { - func_graph_->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + func_graph_->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } return joined_args_spec_list; } @@ -178,7 +178,7 @@ AbstractBasePtrList FuncGraphEvaluator::BroadenUndeterminedArgs(const AbstractBa // If there is loop variant, all arguments need to be broaden to avoid wrong constant propagation. 
if (!(joined_args_spec_list == args_spec_list)) { trace_.push_back(joined_args_spec_list); - func_graph_->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); + func_graph_->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true); } MS_LOG(DEBUG) << "Joined eval args: " << ::mindspore::ToString(joined_args_spec_list); return joined_args_spec_list; diff --git a/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc b/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc index 9299a02002..9da148d2a7 100644 --- a/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc +++ b/mindspore/ccsrc/pipeline/static_analysis/static_analysis.cc @@ -479,7 +479,7 @@ void AnalysisEngine::SetUndeterminedFlag(const EvaluatorPtr &evaluator) { if (undetermined_fgs) { auto fg_parent = fg->parent(); MS_EXCEPTION_IF_NULL(fg_parent); - fg_parent->set_flags(kFuncGraphFlagUndetermined, true); + fg_parent->set_flag(kFuncGraphFlagUndetermined, true); MS_LOG(DEBUG) << "Set graph undetermined: " << fg_parent->ToString(); } } diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc index f4f9d8da14..981e2255f3 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc @@ -16,6 +16,7 @@ #include "pre_activate/ascend/ascend_backend_optimization.h" #include #include +#include #include "pre_activate/common/optimizer.h" #include "pre_activate/ascend/ir_fission/bn_split.h" #include "pre_activate/ascend/ir_fission/bn_grad_split.h" @@ -63,6 +64,9 @@ #include "pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.h" #include "pre_activate/pass/eliminate_redundant_op.h" #include "pre_activate/pass/common_subexpression_elimination.h" +#include "pre_activate/pass/fuse_graph_kernel.h" +#include "pre_activate/pass/fuse_basic.h" +#include "pre_activate/pass/add_atomic_clean.h" #include "pre_activate/ascend/format_type/merge_cast_to_op.h" #include "pre_activate/ascend/format_type/check_consistency.h" #include "pre_activate/ascend/buffer_fusion/ub_pattern_fusion.h" @@ -88,6 +92,8 @@ #include "pre_activate/ascend/enhancer/insert_memcpy_async_for_getnext.h" #include "pre_activate/ascend/ir_fission/batch_norm_grad_infer_fission.h" #include "pre_activate/ascend/ir_fission/split_fission.h" +#include "pre_activate/ascend/format_type/modify_ops_attrs.h" +#include "pre_activate/ascend/format_type/remove_no_use_reshape_op.h" #include "utils/context/ms_context.h" #include "utils/config_manager.h" #include "debug/anf_ir_dump.h" @@ -164,6 +170,19 @@ void RunOpAscendDataLayout(const std::shared_ptr &kernel_g kernel_graph->SetExecOrderByDefault(); } +void AscendGraphKernelCommonProcess(const std::shared_ptr &kernel_graph) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto optimizer = std::make_shared(); + MS_EXCEPTION_IF_NULL(optimizer); + auto common_process = std::make_shared("graph_kernel_common_process"); + MS_EXCEPTION_IF_NULL(common_process); + common_process->AddPass(std::make_shared()); + common_process->AddPass(std::make_shared()); + optimizer->AddPassManager(common_process); + (void)optimizer->Optimize(kernel_graph); + kernel_graph->SetExecOrderByDefault(); +} + void AscendDataLayout(const std::shared_ptr &kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto optimizer = std::make_shared(); @@ -332,7 +351,94 @@ void AscendBackendOptimization(const std::shared_ptr &kern std::string file_path = save_graphs_path + "/" + "hwopt_d_end" + "_graph_" + 
std::to_string(kernel_graph->graph_id()) + ".ir"; DumpIR(file_path, kernel_graph, true); - DumpIRProto(kernel_graph, "after_hwopt_" + std::to_string(kernel_graph->graph_id())); + DumpIRProto(kernel_graph, "after_hwopt"); + kernel_graph->DumpFuncGraph("hwopt_d_end"); + } +} + +void AscendBackendGraphKernelOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + return; + } + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_graph_kernel_opt_before_graph_" + + std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph); + } + + // Fuse graph kernels with basic ops + FuseGraphKernel(kernel_graph, is_before_kernel_select); + + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_graph_kernel_opt_end_graph_" + + std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph, true); + } +} + +void AscendBackendFuseBasicOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + return; + } + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_fuse_basic_opt_before_graph_" + + std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph, true); + } + + // Fuse basic ops with basic ops + FuseBasic(kernel_graph, is_before_kernel_select); + + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_fuse_basic_opt_end_graph_" + + std::to_string(!is_before_kernel_select) + "_" + std::to_string(kernel_graph->graph_id()) + + ".ir"; + DumpIR(file_path, kernel_graph, true); + } +} + +void AscendBackendAddAtomicClean(const std::shared_ptr &kernel_graph) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (!(context_ptr->enable_graph_kernel())) { + return; + } + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = save_graphs_path + "/" + "hwopt_d_add_atomic_clean_before" + "_graph_" + + std::to_string(kernel_graph->graph_id()) + ".ir"; + DumpIR(file_path, kernel_graph); + } + + AddAtomicClean(kernel_graph); + + if (save_graphs) { + std::string file_path = + save_graphs_path + "/" + "hwopt_d_end" + "_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir"; + DumpIR(file_path, kernel_graph, true); } } diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h index 46d9f9bd1b..222c4b90b5 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.h @@ -24,6 +24,12 @@ 
void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr &kernel_graph); void AscendMixPrecision(const std::shared_ptr &kernel_graph); void AscendBackendOptimization(const std::shared_ptr &kernel_graph); +void AscendGraphKernelCommonProcess(const std::shared_ptr &kernel_graph); +void AscendBackendGraphKernelOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select = false); +void AscendBackendFuseBasicOpt(const std::shared_ptr &kernel_graph, + bool is_before_kernel_select = false); +void AscendBackendAddAtomicClean(const std::shared_ptr &kernel_graph); void AscendBackendIRFusionOptimization(const std::shared_ptr &kernel_graph); void AscendBackendUBFusionOptimization(const std::shared_ptr &kernel_graph); } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc index 8a14b438bb..9c498bd736 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc @@ -22,6 +22,7 @@ #include "utils/utils.h" #include "device/kernel_info.h" #include "kernel/oplib/oplib.h" +#include "kernel/common_utils.h" #include "operator/ops.h" #include "session/anf_runtime_algorithm.h" #include "session/kernel_graph.h" @@ -229,7 +230,7 @@ AnfNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr if (kernel::OpLib::FindOp(prim::kPrimCast->name(), kernel::kTBE) != nullptr) { builder.SetKernelType(KernelType::TBE_KERNEL); } else { - builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder.SetKernelType(KernelType::AKG_KERNEL); } // if kernel info is null , it remarks this function is running ut if (cast->kernel_info() == nullptr) { @@ -284,22 +285,17 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod MS_EXCEPTION_IF_NULL(cnode); std::vector new_inputs = {AnfAlgo::GetCNodePrimitiveNode(cnode)}; for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(cnode); ++input_index) { - TypeId origin_type; + const auto infer_type = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index); + TypeId origin_type(kTypeUnknown); auto cur_input = AnfAlgo::GetInputNode(cnode, input_index); auto kernel_with_index = AnfAlgo::VisitKernel(cur_input, 0); - auto is_weight_boundary = [](const AnfNodePtr &node) -> bool { - if (node->isa()) { - return true; - } - if (node->isa() && AnfAlgo::IsParameterWeight(node->cast())) { - return true; - } - return false; - }; auto real_input_node = kernel_with_index.first; - if (is_weight_boundary(real_input_node)) { + if (kernel::IsWeightBoundary(real_input_node) || func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { // weight - origin_type = AnfAlgo::GetPrevNodeOutputDeviceDataType(cnode, input_index); + origin_type = AnfAlgo::GetPrevNodeOutputPrecision(cnode, input_index); + if (origin_type == kTypeUnknown) { + origin_type = AnfAlgo::GetPrevNodeOutputDeviceDataType(cnode, input_index); + } } else { // feature map origin_type = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index); @@ -307,9 +303,13 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod const std::string dev_fmt = AnfAlgo::GetInputFormat(cnode, input_index); const std::vector origin_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, input_index); const TypeId device_type = AnfAlgo::GetInputDeviceDataType(cnode, input_index); - if (origin_type != device_type) { + // In graph kernel, we check parameter, + // the eliminate pass will not eliminate this case, so we 
just do not insert the noused cast. + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) && IsValueNode(cur_input)) { + new_inputs.push_back(cur_input); + } else if (origin_type != device_type) { auto cast = - AddCastOpNodeToGraph(func_graph, cur_input, dev_fmt, origin_type, device_type, origin_shape, origin_type); + AddCastOpNodeToGraph(func_graph, cur_input, dev_fmt, origin_type, device_type, origin_shape, infer_type); MS_EXCEPTION_IF_NULL(cast); cast->set_scope(cnode->scope()); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), cast); diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc index d2557a4bb7..7c8fb70fda 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/check_consistency.cc @@ -17,9 +17,12 @@ #include #include +#include #include "utils/utils.h" #include "session/anf_runtime_algorithm.h" +#include "common/utils.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { @@ -74,11 +77,21 @@ const AnfNodePtr CheckConsistency::Process(const FuncGraphPtr &, const AnfNodePt if (node == nullptr || !node->isa() || !AnfAlgo::IsRealKernel(node)) { return nullptr; } - CNodePtr cnode = node->cast(); - for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(cnode); i++) { - if (!CheckFormatForConsistency(cnode, i) || !CheckDataTypeForConsistency(cnode, i)) { - MS_LOG(EXCEPTION) << "Found inconsistent format or data type! Op: " << AnfAlgo::GetCNodeName(node) << "[" - << node->DebugString() << "]"; + + std::vector todos = {node}; + if (AnfAlgo::IsGraphKernel(node)) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + kernel::GetValidKernelNodes(sub_graph, &todos); + } + + for (auto &t : todos) { + CNodePtr cnode = t->cast(); + for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(cnode); i++) { + if (!CheckFormatForConsistency(cnode, i) || !CheckDataTypeForConsistency(cnode, i)) { + MS_LOG(EXCEPTION) << "Found inconsistent format or data type! 
Op: " << AnfAlgo::GetCNodeName(cnode) << "[" + << cnode->DebugString() << "]"; + } } } return nullptr; diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc index 0fefab10d0..3d09233d99 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "device/kernel_info.h" #include "pre_activate/ascend/ascend_helper.h" @@ -27,34 +28,45 @@ #include "session/anf_runtime_algorithm.h" #include "session/kernel_graph.h" #include "utils/utils.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { namespace { -AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { +AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode, + const std::vector &need_insert_cast) { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(cnode); std::vector make_tuple_inputs; AbstractBasePtrList abstract_list; make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) { - const std::string dev_fmt = AnfAlgo::GetOutputFormat(cnode, output_idx); - const std::vector origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); - const TypeId origin_type = AnfAlgo::GetOutputInferDataType(cnode, output_idx); - const TypeId device_type = AnfAlgo::GetOutputDeviceDataType(cnode, output_idx); + AnfNodePtr replace_node = nullptr; + const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); + const auto infer_type = AnfAlgo::GetOutputInferDataType(cnode, output_idx); auto idx = NewValueNode(SizeToInt(output_idx)); MS_EXCEPTION_IF_NULL(idx); auto imm = std::make_shared(output_idx); idx->set_abstract(std::make_shared(imm)); auto getitem = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), cnode, idx}); - AnfAlgo::SetOutputInferTypeAndShape({origin_type}, {origin_shape}, getitem.get()); - AnfNodePtr replace_node = nullptr; - if (origin_type != device_type) { - replace_node = - AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape, origin_type); - MS_EXCEPTION_IF_NULL(replace_node); - replace_node->set_scope(cnode->scope()); - AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); + AnfAlgo::SetOutputInferTypeAndShape({infer_type}, {origin_shape}, getitem.get()); + if (need_insert_cast[output_idx]) { + const auto dev_fmt = AnfAlgo::GetOutputFormat(cnode, output_idx); + TypeId origin_type(kTypeUnknown); + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + origin_type = AnfAlgo::GetCNodeOutputPrecision(cnode); + } + origin_type = origin_type == kTypeUnknown ? 
infer_type : origin_type; + const auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, output_idx); + if (origin_type != device_type) { + replace_node = + AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape, infer_type); + MS_EXCEPTION_IF_NULL(replace_node); + replace_node->set_scope(cnode->scope()); + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); + } else { + replace_node = getitem; + } } else { replace_node = getitem; } @@ -65,9 +77,10 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo MS_EXCEPTION_IF_NULL(make_tuple); make_tuple->set_abstract(std::make_shared(abstract_list)); return make_tuple; -} +} // namespace -AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { +AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &cnode, + const std::vector &need_insert_cast) { MS_EXCEPTION_IF_NULL(func_graph); MS_EXCEPTION_IF_NULL(cnode); if (AnfAlgo::GetOutputTensorNum(cnode) == 0) { @@ -76,14 +89,23 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c MS_EXCEPTION_IF_NULL(cnode->Type()); // Single output if (!cnode->Type()->isa()) { + if (!need_insert_cast[0]) { + return cnode; + } + const std::string dev_fmt = AnfAlgo::GetOutputFormat(cnode, 0); std::vector origin_shape = AnfAlgo::GetOutputInferShape(cnode, 0); - const TypeId origin_type = AnfAlgo::GetOutputInferDataType(cnode, 0); + const auto infer_type = AnfAlgo::GetOutputInferDataType(cnode, 0); + TypeId origin_type(kTypeUnknown); + if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + origin_type = AnfAlgo::GetCNodeOutputPrecision(cnode); + } + origin_type = origin_type == kTypeUnknown ? infer_type : origin_type; const TypeId device_type = AnfAlgo::GetOutputDeviceDataType(cnode, 0); AnfNodePtr replace_node = cnode; if (origin_type != device_type) { replace_node = - AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape, origin_type); + AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape, infer_type); MS_EXCEPTION_IF_NULL(replace_node); replace_node->set_scope(cnode->scope()); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); @@ -91,7 +113,57 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c return replace_node; } // Multiple output - return InsertCastForMultipleOutput(func_graph, cnode); + return InsertCastForMultipleOutput(func_graph, cnode, need_insert_cast); +} + +AnfNodePtr ProcessGraphKernelOp(const FuncGraphPtr &func_graph, const AnfNodePtr &node) { + // insert cast for ops in graph kernel. 
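+  // First cast the inputs/outputs of every real kernel inside the sub graph, then fall through
+  // and cast the inputs/outputs of the graph kernel call node itself.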
+ auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + auto mng = sub_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + std::vector todo; + std::vector> graph_rets; + kernel::GetValidKernelNodes(sub_graph, &todo); + kernel::GetGraphRealOutput(sub_graph, &graph_rets); + for (auto &t : todo) { + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), t); + // process input + CNodePtr t_cnode = t->cast(); + MS_EXCEPTION_IF_NULL(t_cnode); + auto t_new_node = InsertCastForInput(sub_graph, t_cnode); + AnfNodePtr t_new_node_1 = nullptr; + std::vector need_insert_cast(AnfAlgo::GetOutputTensorNum(t), true); + // process output + auto iter = std::find_if(graph_rets.begin(), graph_rets.end(), + [&t](const std::pair &ret) { return ret.first == t; }); + if (iter != graph_rets.end()) { + auto t_fix_output_type = AnfAlgo::GetCNodeOutputPrecision(t); + auto t_output_type = AnfAlgo::GetOutputDeviceDataType(t, iter->second); + auto graph_output_type = AnfAlgo::GetOutputDeviceDataType(node, iter - graph_rets.begin()); + if (t_fix_output_type == kTypeUnknown && t_output_type == graph_output_type) { + need_insert_cast[iter->second] = false; + } else if (t_fix_output_type == t_output_type && t_output_type == graph_output_type) { + need_insert_cast[iter->second] = false; + } + t_new_node_1 = InsertCastForOutput(sub_graph, t_new_node, need_insert_cast); + } else { + t_new_node_1 = InsertCastForOutput(sub_graph, t_new_node, need_insert_cast); + } + + if (t_new_node_1 != nullptr && t_new_node_1 != t) { + (void)mng->Replace(t, t_new_node_1); + } + } + + // insert cast for graph kernel. + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); + // process input + CNodePtr cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto new_node = InsertCastForInput(func_graph, cnode); + // process output + return InsertCastForOutput(func_graph, new_node, std::vector(AnfAlgo::GetOutputTensorNum(new_node), true)); } } // namespace @@ -106,13 +178,27 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo if (!AnfAlgo::IsRealCNodeKernel(node) || func_graph == nullptr) { return nullptr; } + + if (AnfAlgo::IsGraphKernel(node)) { + return ProcessGraphKernelOp(func_graph, node); + } else { + // insert cast for single op. + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); + // process input + CNodePtr cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto new_node = InsertCastForInput(func_graph, cnode); + // process output + return InsertCastForOutput(func_graph, new_node, std::vector(AnfAlgo::GetOutputTensorNum(new_node), true)); + } + // insert cast for single op. 
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); // process input CNodePtr cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); auto new_node = InsertCastForInput(func_graph, cnode); // process output - return InsertCastForOutput(func_graph, new_node); + return InsertCastForOutput(func_graph, new_node, std::vector(AnfAlgo::GetOutputTensorNum(new_node), true)); } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc index 8bb58c18a5..3c37e098e7 100644 --- a/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/merge_cast_to_op.cc @@ -133,6 +133,9 @@ AnfNodePtr MergeCastToNextOp(const FuncGraphPtr &graph, const CNodePtr &node, co return nullptr; } auto next_cnode = next_node->cast(); + if (AnfAlgo::IsGraphKernel(next_node)) { + return nullptr; + } auto next_op_name = AnfAlgo::GetCNodeName(next_node); std::vector> kernel_info_list; kernel_query->Query(next_cnode, &kernel_info_list); @@ -206,6 +209,9 @@ AnfNodePtr MergeCastToPriorOp(const FuncGraphPtr &graph, const CNodePtr &cur_nod return nullptr; } MS_EXCEPTION_IF_NULL(prior_op); + if (AnfAlgo::IsGraphKernel(prior_op)) { + return nullptr; + } std::vector> kernel_info_list; kernel_query->Query(prior_op, &kernel_info_list); diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.cc new file mode 100644 index 0000000000..42061957b9 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pre_activate/ascend/format_type/modify_ops_attrs.h" +#include +#include +#include "utils/utils.h" +#include "pre_activate/common/helper.h" +#include "kernel/common_utils.h" +#include "session/anf_runtime_algorithm.h" +#include "operator/ops.h" + +namespace mindspore { +namespace opt { +namespace { +AnfNodePtr ModifyReduceOpsAttrs(const CNodePtr &cnode) { + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, 0); + auto input_format = AnfAlgo::GetInputFormat(cnode, 0); + if (input_shape.size() == 5 || input_format != kOpFormat_NC1HWC0) { + return nullptr; + } + if (!AnfAlgo::HasNodeAttr(kAttrKeepDims, cnode)) { + return nullptr; + } + + AnfAlgo::SetNodeAttr(kAttrKeepDims, MakeValue(true), cnode); + return cnode; +} + +AnfNodePtr ModifyTileOpAttrs(const CNodePtr &cnode) { + auto input_shape = AnfAlgo::GetInputDeviceShape(cnode, 0); + if (input_shape.size() != 5) { + return nullptr; + } + if (!AnfAlgo::HasNodeAttr(kAttrMultiples, cnode)) { + return nullptr; + } + + auto multiples = AnfAlgo::GetNodeAttr>(cnode, kAttrMultiples); + if (multiples.size() == 4 && multiples[1] == 1) { + multiples.push_back(1); + AnfAlgo::SetNodeAttr(kAttrMultiples, MakeValue(multiples), cnode); + } + + return cnode; +} + +AnfNodePtr ModifyAttrs(const CNodePtr &cnode) { + MS_EXCEPTION_IF_NULL(cnode); + auto op_name = AnfAlgo::GetCNodeName(cnode); + if (op_name == prim::kPrimTile->name()) { + return ModifyTileOpAttrs(cnode); + } else if (op_name == prim::kPrimReduceSum->name()) { + // kPrimReduceMean + // kPrimReduceSum + // kPrimReduceAll + // kPrimReduceMax + // kPrimReduceMin + return ModifyReduceOpsAttrs(cnode); + } + return nullptr; +} +} // namespace + +const AnfNodePtr ModifyOpAttrs::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + if (node == nullptr || !node->isa() || !AnfAlgo::IsGraphKernel(node)) { + return nullptr; + } + MS_LOG(DEBUG) << "====Process op: " << AnfAlgo::GetCNodeName(node); + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + auto manager = fg->manager(); + MS_EXCEPTION_IF_NULL(manager); + std::vector todos; + kernel::GetValidKernelNodes(fg, &todos); + for (auto &t : todos) { + auto new_node = ModifyAttrs(t->cast()); + if (new_node != nullptr && new_node != t) { + (void)manager->Replace(t, new_node); + } + } + return node; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.h b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.h new file mode 100644 index 0000000000..25ec94b6b4 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/modify_ops_attrs.h @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_MODIFY_OPS_ATTRS_H +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_MODIFY_OPS_ATTRS_H + +#include "pre_activate/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ModifyOpAttrs : public PatternProcessPass { + public: + explicit ModifyOpAttrs(bool multigraph = true) : PatternProcessPass("modify_ops_attrs", multigraph) {} + ~ModifyOpAttrs() override = default; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; +}; +} // namespace opt +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_MODIFY_OPS_ATTRS_H diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.cc b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.cc new file mode 100644 index 0000000000..dde40a5090 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.cc @@ -0,0 +1,66 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pre_activate/ascend/format_type/remove_no_use_reshape_op.h" +#include +#include +#include "pre_activate/common/helper.h" +#include "kernel/common_utils.h" +#include "session/anf_runtime_algorithm.h" +#include "operator/ops.h" + +namespace mindspore { +namespace opt { +namespace { +AnfNodePtr RemoveReshapeOp(const CNodePtr &cnode) { + MS_EXCEPTION_IF_NULL(cnode); + auto op_name = AnfAlgo::GetCNodeName(cnode); + if (op_name != prim::kPrimReshape->name()) { + return nullptr; + } + + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode, 0); + auto input_format = AnfAlgo::GetPrevNodeOutputFormat(cnode, 0); + if (input_shape.size() != 1 || input_format != kOpFormat_NC1HWC0) { + return nullptr; + } + + return cnode->input(1); +} +} // namespace + +const AnfNodePtr RemoveNoUseReshapeOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + if (node == nullptr || !node->isa() || !AnfAlgo::IsGraphKernel(node)) { + return nullptr; + } + MS_LOG(DEBUG) << "====process op: " << AnfAlgo::GetCNodeName(node); + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + auto manager = fg->manager(); + MS_EXCEPTION_IF_NULL(manager); + std::vector todos; + kernel::GetValidKernelNodes(fg, &todos); + for (auto &t : todos) { + auto new_node = RemoveReshapeOp(t->cast()); + if (new_node != nullptr && new_node != t) { + (void)manager->Replace(t, new_node); + } + } + return node; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.h b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.h new file mode 100644 index 0000000000..4942c2fc08 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/ascend/format_type/remove_no_use_reshape_op.h @@ -0,0 +1,33 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_NO_USE_RESHAPE_OP_H +#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_NO_USE_RESHAPE_OP_H + +#include "pre_activate/common/optimizer.h" + +namespace mindspore { +namespace opt { +class RemoveNoUseReshapeOp : public PatternProcessPass { + public: + explicit RemoveNoUseReshapeOp(bool multigraph = true) : PatternProcessPass("remove_no_use_reshape_op", multigraph) {} + ~RemoveNoUseReshapeOp() override = default; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; +}; +} // namespace opt +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_NO_USE_RESHAPE_OP_H diff --git a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc index fba1ab40af..b16387d8f1 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.cc @@ -121,6 +121,9 @@ const AnfNodePtr LayerNormBetaGammaBackpropFusion::Process(const FuncGraphPtr &f if (node == nullptr || !node->isa()) { return nullptr; } + if (AnfAlgo::IsGraphKernel(node)) { + return nullptr; + } auto cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); std::vector cast_nodes; diff --git a/mindspore/ccsrc/pre_activate/common/helper.cc b/mindspore/ccsrc/pre_activate/common/helper.cc index c59260564a..1c2ade201c 100644 --- a/mindspore/ccsrc/pre_activate/common/helper.cc +++ b/mindspore/ccsrc/pre_activate/common/helper.cc @@ -102,9 +102,12 @@ bool UnVisited(const BaseRef &n) { auto prim_py = value->cast(); MS_EXCEPTION_IF_NULL(prim_py); return !prim_py->HasAttr(kAttrVisited); - } else { - return false; + } else if (IsValueNode(in)) { + auto func_graph = GetValueNode(in); + MS_EXCEPTION_IF_NULL(func_graph); + return !func_graph->has_flag(kAttrVisited); } + return false; } return false; } @@ -188,9 +191,12 @@ bool Visited(const BaseRef &n) { auto prim_py = value->cast(); MS_EXCEPTION_IF_NULL(prim_py); return prim_py->HasAttr(kAttrVisited); - } else { - return false; + } else if (IsValueNode(in)) { + auto func_graph = GetValueNode(in); + MS_EXCEPTION_IF_NULL(func_graph); + return func_graph->has_flag(kAttrVisited); } + return false; } return false; } diff --git a/mindspore/ccsrc/pre_activate/common/node_pass.cc b/mindspore/ccsrc/pre_activate/common/node_pass.cc index a6e93d2f07..876da8667b 100644 --- a/mindspore/ccsrc/pre_activate/common/node_pass.cc +++ b/mindspore/ccsrc/pre_activate/common/node_pass.cc @@ -22,6 +22,7 @@ #include "ir/anf.h" #include "ir/func_graph.h" #include "ir/manager.h" +#include "session/anf_runtime_algorithm.h" namespace mindspore { namespace opt { @@ -52,8 +53,13 @@ bool NodePass::Run(const FuncGraphPtr &func_graph) { if (new_node && IsValueNode(new_node)) { auto const_func_graph = 
GetValueNode(new_node); MS_EXCEPTION_IF_NULL(const_func_graph); - todo.push_back(const_func_graph->output()); + if (!const_func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + todo.push_back(const_func_graph->output()); + } } else if (new_node && new_node->isa()) { + if (AnfAlgo::IsGraphKernel(new_node)) { + todo.push_back(new_node); + } auto cnode = new_node->cast(); MS_EXCEPTION_IF_NULL(cnode); auto inputs = cnode->inputs(); diff --git a/mindspore/ccsrc/pre_activate/common/optimizer.cc b/mindspore/ccsrc/pre_activate/common/optimizer.cc index fa51a0bd8c..71a523ea1d 100644 --- a/mindspore/ccsrc/pre_activate/common/optimizer.cc +++ b/mindspore/ccsrc/pre_activate/common/optimizer.cc @@ -86,11 +86,8 @@ void GraphOptimizer::AddPassManager(const PassManagerPtr &pass_manager) { FuncGraphPtr GraphOptimizer::Optimize(const FuncGraphPtr &func_graph, bool run_only_once) { MS_EXCEPTION_IF_NULL(func_graph); run_only_once_ = (pass_managers_.size() == 1) ? true : run_only_once; - auto manager = func_graph->manager(); - if (manager == nullptr) { - manager = Manage(func_graph, false); - func_graph->set_manager(manager); - } + // Performance risk by creating new manager each time + auto manager = Manage(func_graph, true); bool changed = true; while (changed) { diff --git a/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.cc b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.cc new file mode 100644 index 0000000000..0c2b22578f --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.cc @@ -0,0 +1,122 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pre_activate/pass/add_atomic_clean.h" +#include +#include +#include +#include "operator/ops.h" +#include "utils/utils.h" +#include "utils/graph_utils.h" +#include "utils/log_adapter.h" +#include "session/anf_runtime_algorithm.h" +#include "session/kernel_graph.h" +#include "debug/anf_ir_dump.h" + +namespace mindspore { +namespace opt { +namespace { + +static std::vector g_output_idx; + +bool HasAtomic(const AnfNodePtr &input) { + if (IsPrimitiveCNode(input)) { + const auto &cnode = input->cast(); + const auto &prim = GetValueNode(cnode->input(0)); + return prim->HasAttr("atomic_add"); + } + return false; +} + +std::vector CalCleanSize(const CNodePtr &pre_node) { + MS_EXCEPTION_IF_NULL(pre_node); + std::vector clean_size_list; + // clean output + for (auto &index : g_output_idx) { + TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(pre_node, index); + size_t type_size = GetTypeByte(TypeIdToType(output_type_id)); + std::vector shape = AnfAlgo::GetOutputDeviceShape(pre_node, index); + auto size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies()); + clean_size_list.push_back((size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize); + } + MS_LOG(DEBUG) << "Clear output size: " << clean_size_list.size() << ", pre_node: " << pre_node->fullname_with_scope(); + return clean_size_list; +} + +CNodePtr CreateTbeAtomicCleanNode(const std::shared_ptr &kernel_graph, + const mindspore::CNodePtr &pre_node) { + MS_EXCEPTION_IF_NULL(kernel_graph); + MS_EXCEPTION_IF_NULL(pre_node); + auto clean_zero_prim = std::make_shared(kAtomicAddrCleanOpName); + auto new_value_node = NewValueNode(clean_zero_prim); + std::vector inputs = {new_value_node}; + CNodePtr clean_zero = kernel_graph->NewCNode(inputs); + AbstractBasePtr abstract = std::make_shared(); + clean_zero->set_abstract(abstract); + auto builder = std::make_shared(); + builder->SetKernelType(KernelType::TBE_KERNEL); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_zero.get()); + auto clean_size = CalCleanSize(pre_node); + AnfAlgo::SetNodeAttr(kAttrAutomicAddMemSize, MakeValue(clean_size), clean_zero); + AnfAlgo::SetNodeAttr(kAttrAutomicOutputIndexs, MakeValue(g_output_idx), clean_zero); + AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(pre_node.get()), clean_zero.get()); + return clean_zero; +} +} // namespace + +void AddAtomicClean(const std::shared_ptr &kernel_graph) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + if (mng == nullptr) { + mng = Manage(kernel_graph, true); + kernel_graph->set_manager(mng); + } + auto &todos = kernel_graph->execution_order(); + for (auto iter = todos.cbegin(); iter != todos.end(); ++iter) { + auto node = *iter; + if (AnfAlgo::IsGraphKernel(node) && kernel_graph->nodes().contains(node)) { + auto fg = GetValueNode(node->input(kAnfPrimitiveIndex)); + MS_EXCEPTION_IF_NULL(fg); + auto input = fg->get_return()->input(1); + if (IsPrimitiveCNode(input, prim::kPrimMakeTuple)) { + const auto &cnode = input->cast(); + for (size_t i = 0; i < cnode->inputs().size(); ++i) { + if (HasAtomic(cnode->input(i))) { + g_output_idx.push_back(i - 1); + } + } + } else if (HasAtomic(input)) { + g_output_idx.push_back(0); + } + + if (!g_output_idx.empty()) { + auto zero_node = CreateTbeAtomicCleanNode(kernel_graph, node); + auto depend = kernel_graph->NewCNode({NewValueNode(prim::kPrimDepend), node->input(1), zero_node}); + std::vector new_input = node->inputs(); + new_input[1] = depend; + auto new_cnode = std::make_shared(new_input, 
kernel_graph); + // Set abstract + new_cnode->set_abstract(node->abstract()); + // Set kernel info + new_cnode->set_kernel_info(node->kernel_info_ptr()); + mng->Replace(node, new_cnode); + g_output_idx.clear(); + } + } + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.h b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.h new file mode 100644 index 0000000000..bb1edb0e35 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/add_atomic_clean.h @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ADD_ATOMIC_CLEAN_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ADD_ATOMIC_CLEAN_H_ + +#include +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +void AddAtomicClean(const std::shared_ptr &kernel_graph); +} // namespace opt +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ADD_ATOMIC_CLEAN_H diff --git a/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc b/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc index f8604d7638..9af50eac33 100644 --- a/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc +++ b/mindspore/ccsrc/pre_activate/pass/common_subexpression_elimination.cc @@ -45,6 +45,8 @@ bool BackendCSE::CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node) co auto node_value = GetValueNode(node); if (main_value->isa() && node_value->isa()) { replace = false; + } else if (main_value->isa() && node_value->isa()) { + replace = (AbsOf(main) == AbsOf(node)) && CheckEqualKernelBuildInfo(main, node); } else { replace = (AbsOf(main) == AbsOf(node)) && (*main_value == *node_value); } diff --git a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc index 1f9e2712a6..38d629c415 100644 --- a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc +++ b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_attr.cc @@ -26,6 +26,7 @@ #include "utils/context/ms_context.h" #include "operator/ops.h" #include "session/anf_runtime_algorithm.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { @@ -34,14 +35,24 @@ const AnfNodePtr ConvertConstInputToAttr::Process(const FuncGraphPtr &, const An if (node == nullptr || !AnfAlgo::IsRealCNodeKernel(node)) { return nullptr; } - CNodePtr cnode = node->cast(); - - ConstInputToAttrInfoRegister reg; - if (!ConstInputToAttrInfoRegistry::Instance().GetRegisterByOpName(AnfAlgo::GetCNodeName(cnode), ®)) { - return nullptr; + std::vector todos; + if (AnfAlgo::IsGraphKernel(node)) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + kernel::GetValidKernelNodes(sub_graph, &todos); + } else { + todos.push_back(node); } - ConstInputToAttr(cnode, reg.GetConstInputAttrInfo()); - return cnode; + + for (auto &t : todos) { + CNodePtr 
cnode = t->cast(); + ConstInputToAttrInfoRegister reg; + if (!ConstInputToAttrInfoRegistry::Instance().GetRegisterByOpName(AnfAlgo::GetCNodeName(cnode), ®)) { + continue; + } + ConstInputToAttr(cnode, reg.GetConstInputAttrInfo()); + } + return node; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc index 56be2e273d..b4f98cc6d7 100644 --- a/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc +++ b/mindspore/ccsrc/pre_activate/pass/convert_const_input_to_tensor_input.cc @@ -17,15 +17,39 @@ #include #include +#include #include "utils/graph_utils.h" #include "pre_activate/common/helper.h" #include "session/anf_runtime_algorithm.h" #include "session/kernel_graph.h" +#include "kernel/common_utils.h" +#include "device/kernel_info.h" namespace mindspore { namespace opt { namespace { +ValueNodePtr MakeValueNode(const ValueNodePtr &value_node) { + MS_EXCEPTION_IF_NULL(value_node); + ValueNodePtr new_value_node = std::make_shared(value_node->value()); + new_value_node->set_abstract(value_node->abstract()); + // create kernel_info fo new value node + auto kernel_info = std::make_shared(); + new_value_node->set_kernel_info(kernel_info); + // create kernel_build_info for new value node + auto kernel_build_info_builder = std::make_shared(); + // set the format of value_node to DEFAULT_FORMAT + kernel_build_info_builder->SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + // set value node initial device data type = infer data type + std::vector types; + for (size_t index = 0; index < AnfAlgo::GetOutputTensorNum(value_node); ++index) { + types.push_back(kTypeUnknown); + } + kernel_build_info_builder->SetOutputsDeviceType(types); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); + return new_value_node; +} + AnfNodePtr CreateTensorInput(const KernelGraphPtr &kernel_graph, const AnfNodePtr &input_node) { MS_EXCEPTION_IF_NULL(input_node); auto value_node = input_node->cast(); @@ -50,6 +74,8 @@ AnfNodePtr CreateTensorInput(const KernelGraphPtr &kernel_graph, const AnfNodePt if (kernel_graph != nullptr) { tensor_input = kernel_graph->NewValueNode(tensor_input); kernel_graph->AddValueNodeToGraph(tensor_input); + } else { + tensor_input = MakeValueNode(tensor_input); } tensor_input->set_scope(input_node->scope()); return tensor_input; @@ -89,6 +115,26 @@ AnfNodePtr ConstInputToTensorInput(const FuncGraphPtr &func_graph, const CNodePt } return nullptr; } + +AnfNodePtr ProcessGraphKernelOp(const AnfNodePtr &node) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + auto mng = sub_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + std::vector todo; + std::vector> graph_rets; + kernel::GetValidKernelNodes(sub_graph, &todo); + kernel::GetGraphRealOutput(sub_graph, &graph_rets); + + for (auto &t : todo) { + auto t_new_node = ConstInputToTensorInput(sub_graph, t->cast()); + if (t_new_node != nullptr && t_new_node != t) { + (void)mng->Replace(t, t_new_node); + } + } + + return node; +} } // namespace const AnfNodePtr ConvertConstInputToTensorInput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, @@ -96,8 +142,11 @@ const AnfNodePtr ConvertConstInputToTensorInput::Process(const FuncGraphPtr &fun if (node == nullptr || func_graph == nullptr || !AnfAlgo::IsRealCNodeKernel(node)) { return nullptr; } - CNodePtr cnode = node->cast(); - return 
ConstInputToTensorInput(func_graph, cnode); + if (AnfAlgo::IsGraphKernel(node)) { + return ProcessGraphKernelOp(node); + } else { + return ConstInputToTensorInput(func_graph, node->cast()); + } } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc b/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc index ab2395b1f5..a03087c1a4 100644 --- a/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc +++ b/mindspore/ccsrc/pre_activate/pass/convert_tuple_input_to_dynamic_input.cc @@ -21,10 +21,37 @@ #include "session/anf_runtime_algorithm.h" #include "pre_activate/common/helper.h" #include "session/kernel_graph.h" +#include "kernel/common_utils.h" +#include "device/kernel_info.h" namespace mindspore { namespace opt { namespace { +bool MakeValueNode(const AnfNodePtr &node) { + auto value_node = node->cast(); + if (value_node == nullptr) { + return false; + } + + // create kernel_info fo new value node + auto kernel_info = std::make_shared(); + value_node->set_kernel_info(kernel_info); + // create kernel_build_info for new value node + auto kernel_build_info_builder = std::make_shared(); + // set the format of value_node to DEFAULT_FORMAT + kernel_build_info_builder->SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + // set value node initial device data type = infer data type + TypeId infer_data_type; + if (AnfAlgo::GetOutputTensorNum(value_node) == 0) { + infer_data_type = kTypeUnknown; + } else { + infer_data_type = AnfAlgo::GetOutputInferDataType(value_node, 0); + } + kernel_build_info_builder->SetOutputsDeviceType(std::vector{infer_data_type}); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), value_node.get()); + return true; +} + void ConvertTupleOuputToPlantInputs(const FuncGraphPtr &graph, const AnfNodePtr &input_node, std::vector *plant_inputs, std::vector *dyn_input_sizes) { MS_EXCEPTION_IF_NULL(plant_inputs); @@ -50,12 +77,12 @@ void ConvertTupleOuputToPlantInputs(const FuncGraphPtr &graph, const AnfNodePtr (void)std::copy(convert_inputs.begin(), convert_inputs.end(), std::back_inserter(*plant_inputs)); } -CNodePtr ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNodePtr &cnode_ptr) { +void ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNodePtr &cnode_ptr) { MS_EXCEPTION_IF_NULL(cnode_ptr); MS_EXCEPTION_IF_NULL(graph); auto &ori_args = cnode_ptr->inputs(); if (ori_args.size() < 1) { - return nullptr; + return; } std::vector plant_inputs; std::vector dyn_input_sizes; @@ -68,7 +95,16 @@ CNodePtr ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNo auto cnode = input_node->cast(); MS_EXCEPTION_IF_NULL(cnode); auto inputs = cnode->inputs(); - (void)std::copy(inputs.begin() + 1, inputs.end(), std::back_inserter(plant_inputs)); + for (size_t j = 1; j < inputs.size(); ++j) { + MS_EXCEPTION_IF_NULL(inputs[j]); + if (IsValueNode(inputs[j])) { + auto success = MakeValueNode(inputs[j]); + if (!success) { + MS_LOG(WARNING) << "Make value node failed, " << inputs[j]->DebugString(); + } + } + plant_inputs.push_back(inputs[j]); + } } else if (input_node->Type() != nullptr && AnfAlgo::IsTupleOutput(input_node)) { ConvertTupleOuputToPlantInputs(graph, input_node, &plant_inputs, &dyn_input_sizes); } else { @@ -81,7 +117,6 @@ CNodePtr ConvertMakeTupleInputToPlantInputs(const FuncGraphPtr &graph, const CNo AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(dyn_input_sizes), cnode_ptr); 
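// [Illustrative note, not part of the patch] At this point the pass has walked every original
// argument: tuple arguments were expanded element by element into plant_inputs, and
// dyn_input_sizes records, per original argument, how many flattened inputs it contributed, so
// the backend can tell which flattened inputs belong together. A minimal sketch of that
// bookkeeping, assuming <vector> and hypothetical types (Arg is invented here; using -1 as the
// sentinel for a plain input is an assumption, not necessarily this pass's exact convention):
//
//   struct Arg { bool is_tuple; std::vector<int> ids; };  // ids: tuple element ids, or one plain id
//   void Flatten(const std::vector<Arg> &args, std::vector<int> *plant, std::vector<int> *sizes) {
//     for (const auto &a : args) {
//       if (a.is_tuple) {
//         sizes->push_back(static_cast<int>(a.ids.size()));         // group length
//         plant->insert(plant->end(), a.ids.begin(), a.ids.end());  // expand the elements
//       } else {
//         sizes->push_back(-1);                                     // plain, non-dynamic input
//         plant->push_back(a.ids.front());
//       }
//     }
//   }
//
// e.g. op(make_tuple(a, b, c), x) would yield plant = {a, b, c, x} and sizes = {3, -1}.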
cnode_ptr->set_inputs(plant_inputs); } - return cnode_ptr; } } // namespace @@ -96,7 +131,18 @@ const AnfNodePtr ConvertTupleInputToDynamicInput::Process(const FuncGraphPtr &fu if (node == nullptr || !node->isa() || !AnfAlgo::IsRealKernel(node)) { return nullptr; } - return ConvertMakeTupleInputToPlantInputs(func_graph, node->cast()); + if (AnfAlgo::IsGraphKernel(node)) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + std::vector todos; + kernel::GetValidKernelNodes(sub_graph, &todos); + for (auto &t : todos) { + ConvertMakeTupleInputToPlantInputs(sub_graph, t->cast()); + } + } else { + ConvertMakeTupleInputToPlantInputs(func_graph, node->cast()); + } + return node; } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc index 2fc971881d..4d3dcfccc0 100644 --- a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc +++ b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.cc @@ -18,10 +18,12 @@ #include #include #include +#include #include "session/anf_runtime_algorithm.h" #include "utils/utils.h" #include "pre_activate/common/helper.h" #include "operator/ops.h" +#include "kernel/common_utils.h" namespace mindspore { namespace opt { @@ -125,13 +127,7 @@ void EliminateRedundantOp::Init() { kTransDataOpName, std::pair(kTransDataOpName, TransDataOpEliminateCondition))); } -const AnfNodePtr EliminateRedundantOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, - const EquivPtr &) const { - MS_EXCEPTION_IF_NULL(node); - auto cnode = node->cast(); - if (cnode == nullptr || func_graph == nullptr) { - return nullptr; - } +const AnfNodePtr EliminateRedundantOp::DoEliminate(const FuncGraphPtr &func_graph, const CNodePtr &cnode) const { // match the first name auto name1 = AnfAlgo::GetCNodeName(cnode); auto it = redundant_process_map_.find(name1); @@ -160,5 +156,35 @@ const AnfNodePtr EliminateRedundantOp::Process(const FuncGraphPtr &func_graph, c return ProcessMatchedNodes(func_graph, cnode, prev_cnode, &pass_vector); } + +const AnfNodePtr EliminateRedundantOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, + const EquivPtr &) const { + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + if (cnode == nullptr || func_graph == nullptr) { + return nullptr; + } + + if (AnfAlgo::IsGraphKernel(node)) { + // do eliminate for ops in graph kernel. + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(sub_graph); + auto mng = sub_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + std::vector todo; + kernel::GetValidKernelNodes(sub_graph, &todo); + for (auto &t : todo) { + CNodePtr t_cnode = t->cast(); + MS_EXCEPTION_IF_NULL(t_cnode); + auto t_new_node = DoEliminate(sub_graph, t_cnode); + if (t_new_node != nullptr && t_new_node != t) { + (void)mng->Replace(t, t_new_node); + } + } + return node; + } + // do eliminate for single op. 
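// [Illustrative note, not part of the patch] The graph-kernel branch above is the pattern this
// change set repeats across several passes (ConvertConstInputToAttr, ConvertConstInputToTensorInput,
// ConvertTupleInputToDynamicInput, EraseVisitAttr, ModifyOpAttrs, RemoveNoUseReshapeOp): fetch the
// node's sub-FuncGraph, collect the real kernel nodes inside it, apply the per-node rewrite, and
// splice results back through the manager. Sketched as a reusable helper
// (ForEachKernelNodeInGraphKernel is hypothetical, not an existing MindSpore API):
//
//   template <typename Rewrite>  // Rewrite: AnfNodePtr(const FuncGraphPtr &, const CNodePtr &)
//   void ForEachKernelNodeInGraphKernel(const AnfNodePtr &node, Rewrite rewrite) {
//     auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node);
//     MS_EXCEPTION_IF_NULL(sub_graph);
//     auto mng = sub_graph->manager();
//     MS_EXCEPTION_IF_NULL(mng);
//     std::vector<AnfNodePtr> todos;
//     kernel::GetValidKernelNodes(sub_graph, &todos);
//     for (auto &t : todos) {
//       auto new_node = rewrite(sub_graph, t->cast<CNodePtr>());
//       if (new_node != nullptr && new_node != t) {
//         (void)mng->Replace(t, new_node);
//       }
//     }
//   }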
+ return DoEliminate(func_graph, cnode); +} } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h index 9e0dacecb1..c44190f645 100644 --- a/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h +++ b/mindspore/ccsrc/pre_activate/pass/eliminate_redundant_op.h @@ -40,6 +40,7 @@ class EliminateRedundantOp : public PatternProcessPass { private: void Init(); + const AnfNodePtr DoEliminate(const FuncGraphPtr &func_graph, const CNodePtr &cnode) const; std::unordered_map redundant_process_map_; }; } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc b/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc index 4ea817df85..3b566b4f7c 100644 --- a/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc +++ b/mindspore/ccsrc/pre_activate/pass/erase_visit_attr.cc @@ -16,6 +16,8 @@ #include "pre_activate/pass/erase_visit_attr.h" #include +#include +#include "kernel/common_utils.h" #include "session/anf_runtime_algorithm.h" #include "pre_activate/common/helper.h" @@ -28,7 +30,20 @@ const BaseRef EraseVisitAttr::DefinePattern() const { } const AnfNodePtr EraseVisitAttr::Process(const FuncGraphPtr &, const AnfNodePtr &node, const EquivPtr &) const { - AnfAlgo::EraseNodeAttr(kAttrVisited, node); + if (node != nullptr && AnfAlgo::IsRealCNodeKernel(node)) { + if (AnfAlgo::IsGraphKernel(node)) { + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + std::vector todos; + kernel::GetValidKernelNodes(fg, &todos); + for (auto &t : todos) { + AnfAlgo::EraseNodeAttr(kAttrVisited, t); + } + } + AnfAlgo::EraseNodeAttr(kAttrVisited, node); + } else { + AnfAlgo::EraseNodeAttr(kAttrVisited, node); + } return nullptr; } } // namespace opt diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_basic.cc b/mindspore/ccsrc/pre_activate/pass/fuse_basic.cc new file mode 100644 index 0000000000..84edd5c5e2 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_basic.cc @@ -0,0 +1,222 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pre_activate/pass/fuse_basic.h" +#include "pre_activate/pass/fuse_graph_kernel.h" + +#include +#include +#include +#include +#include +#include + +#include "operator/ops.h" +#include "utils/utils.h" +#include "utils/graph_utils.h" +#include "pre_activate/common/helper.h" +#include "session/anf_runtime_algorithm.h" +#include "vm/segment_runner.h" +#include "debug/draw.h" +#include "debug/anf_ir_dump.h" +#include "ir/func_graph_cloner.h" + +namespace mindspore { +namespace opt { +namespace { +std::vector get_fusable_basic_ops(bool is_before_kernel_select) { + std::vector fusable_basic_ops = {prim::kPrimTensorAdd, prim::kPrimMul, prim::kPrimSub, + prim::kPrimExpandDims}; + if (!is_before_kernel_select) { + fusable_basic_ops.push_back(prim::kPrimCast); + } + return fusable_basic_ops; +} + +IncludeType IncludeFusedBasicOpForward(const AnfNodePtr &cur_node, const GraphKernelInfo &info, + const AnfNodePtr &node) { + if (cur_node == node) { + return FOLLOW; + } + if (!IsPrimitiveCNode(node)) { + return EXCLUDE; + } + + auto fusable_basic_ops = get_fusable_basic_ops(info.is_before_kernel_select); + bool is_fusable = std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + + return is_fusable ? FOLLOW : EXCLUDE; +} + +std::vector FindFuseCNodes(const CNodePtr &cnode, bool is_before_kernel_select) { + GraphKernelInfo info; + info.is_before_kernel_select = is_before_kernel_select; + // Search fusable nodes according input direction. + auto include_func_forward = std::bind(IncludeFusedBasicOpForward, cnode, info, std::placeholders::_1); + auto used_nodes = DeepLinkedGraphSearch(cnode, include_func_forward); + if (used_nodes.size() > 1) { + used_nodes = RemoveCircle(used_nodes, false); + } + TopoSortForNodeList(&used_nodes); + return used_nodes; +} + +void RemoveControlDependOut(const FuncGraphPtr &fg, AnfNodePtrList *outputs, const FuncGraphManagerPtr &mng) { + AnfNodeSet outputs_set; + for (auto out : *outputs) { + outputs_set.insert(out); + } + + AnfNodePtrList vir_outputs; + std::unordered_map eqv; + auto fg_outputs = fg->output(); + if (IsPrimitiveCNode(fg_outputs, prim::kPrimMakeTuple)) { + auto cnode = fg_outputs->cast(); + for (size_t i = 1; i < cnode->size(); ++i) { + vir_outputs.push_back(cnode->input(i)); + } + } else { + vir_outputs.push_back(fg_outputs); + } + + if (vir_outputs.size() != outputs->size()) { + MS_LOG(EXCEPTION) << "The size of virtual output of the fg is not the same with the real output"; + } + bool has_erase_outs = false; + size_t index = -1; + for (auto it = outputs->begin(); it != outputs->end();) { + index++; + auto out = *it; + eqv[out] = vir_outputs[index]; + auto users = mng->node_users()[out]; + bool is_only_control_depend_use = true; + std::vector control_depend_use_index; + std::vector control_depend_nodes; + AnfNodePtr use_out = nullptr; + for (auto &user : users) { + auto use_node = user.first; + if (outputs_set.count(use_node) == 0 && !(IsPrimitiveCNode(use_node, prim::kPrimControlDepend))) { + is_only_control_depend_use = false; + continue; + } + if (outputs_set.count(use_node) != 0) { + use_out = use_node; + } + + if (IsPrimitiveCNode(use_node, prim::kPrimControlDepend)) { + control_depend_nodes.push_back(use_node->cast()); + control_depend_use_index.push_back(user.second); + } + } + + if (is_only_control_depend_use && !control_depend_nodes.empty()) { + MS_EXCEPTION_IF_NULL(use_out); + it = outputs->erase(it); + for (size_t i = 0; i < 
control_depend_nodes.size(); ++i) { + auto control_depend_node = control_depend_nodes[i]; + std::vector new_control_depend_inputs; + for (size_t j = 0; j < control_depend_node->size(); ++j) { + if (j == control_depend_use_index[i]) { + new_control_depend_inputs.push_back(use_out); + } else { + new_control_depend_inputs.push_back(control_depend_node->input(j)); + } + } + auto new_control_depend = control_depend_node->func_graph()->NewCNode(new_control_depend_inputs); + mng->Replace(control_depend_node, new_control_depend); + has_erase_outs = true; + } + } else { + it++; + } + } + + if (!has_erase_outs) { + return; + } + + AnfNodePtr fg_new_output; + if (outputs->size() > 1) { + std::vector output_args; + output_args.push_back(NewValueNode(prim::kPrimMakeTuple)); + (void)std::transform(std::begin(*outputs), std::end(*outputs), std::back_inserter(output_args), + [&eqv](const AnfNodePtr &o) -> AnfNodePtr { return eqv[o]; }); + // Set output for AnfGraph + fg_new_output = fg->NewCNode(output_args); + } else { + fg_new_output = eqv[(*outputs)[0]]; + } + fg->set_output(fg_new_output, true); +} + +void FuseBasic(const std::shared_ptr &kernel_graph, const std::vector &todos, + std::unordered_set *fused_ops, bool is_before_kernel_select) { + auto mng = kernel_graph->manager(); + for (auto iter = todos.cbegin(); iter != todos.cend(); ++iter) { + auto node = (*iter)->cast(); + if (node == nullptr) { + continue; + } + if (fused_ops->count(node)) { + continue; + } + auto fusable_basic_ops = get_fusable_basic_ops(is_before_kernel_select); + bool is_basic_op = std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + if (!is_basic_op || !kernel_graph->nodes().contains(node)) { + continue; + } + + auto fuse_nodes = FindFuseCNodes(node, is_before_kernel_select); + if (fuse_nodes.size() <= 1) { + continue; + } + + FuncGraphPtr fg; + AnfNodePtrList inputs; + AnfNodePtrList outputs; + std::tie(fg, inputs, outputs) = compile::TransformSegmentToAnfGraph(fuse_nodes); + RemoveControlDependOut(fg, &outputs, mng); + auto fuse_new_node = CreateNewFuseCNode(kernel_graph, fg, inputs, outputs, is_before_kernel_select); + + ReplaceNewFuseCNode(kernel_graph, fuse_new_node, outputs); + + // Set graph kernel attr + std::string fuse_op_name = ""; + for (auto &fuse_node : fuse_nodes) { + fuse_op_name += AnfAlgo::GetCNodePrimitive(fuse_node)->name() + "_"; + } + fused_ops->insert(fuse_nodes.begin(), fuse_nodes.end()); + fg->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(fuse_op_name)); + } +} +} // namespace + +void FuseBasic(const std::shared_ptr &kernel_graph, bool is_before_kernel_select) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + if (mng == nullptr) { + mng = Manage(kernel_graph, true); + kernel_graph->set_manager(mng); + } + std::unordered_set fused_ops; + auto todos = TopoSort(kernel_graph->get_return()); + std::reverse(todos.begin(), todos.end()); + FuseBasic(kernel_graph, todos, &fused_ops, is_before_kernel_select); +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_basic.h b/mindspore/ccsrc/pre_activate/pass/fuse_basic.h new file mode 100644 index 0000000000..fbbf5d9937 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_basic.h @@ -0,0 +1,29 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_BASIC_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_BASIC_H_ + +#include +#include "pre_activate/common/optimizer.h" +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +void FuseBasic(const std::shared_ptr &kernel_graph, bool is_before_kernel_select); +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_BASIC_H_ diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.cc b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.cc new file mode 100644 index 0000000000..591b210335 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.cc @@ -0,0 +1,562 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "pre_activate/pass/fuse_graph_kernel.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "operator/ops.h" +#include "utils/utils.h" +#include "utils/graph_utils.h" +#include "pre_activate/common/helper.h" +#include "session/anf_runtime_algorithm.h" +#include "vm/segment_runner.h" +#include "debug/draw.h" +#include "debug/anf_ir_dump.h" +#include "ir/func_graph_cloner.h" + +namespace mindspore { +namespace opt { +std::vector get_fusable_basic_ops(bool is_before_kernel_select) { + std::vector fusable_basic_ops = { + prim::kPrimAddN, prim::kPrimTensorAdd, prim::kPrimMul, prim::kPrimSub, prim::kPrimMaximum, + prim::kPrimMinimum, prim::kPrimNeg, prim::kPrimRealDiv, prim::kPrimPow, prim::kPrimSqrt, + prim::kPrimReciprocal, prim::kPrimExpandDims, prim::kPrimLessEqual}; + if (!is_before_kernel_select) { + fusable_basic_ops.push_back(prim::kPrimCast); + } + return fusable_basic_ops; +} + +std::vector get_fusable_basic_ops_with_reduce(bool is_before_kernel_select) { + std::vector fusable_basic_ops_with_reduce; + if (!is_before_kernel_select) { + fusable_basic_ops_with_reduce.push_back(prim::kPrimCast); + } + return fusable_basic_ops_with_reduce; +} + +std::vector get_reduce_ops() { + std::vector reduce_ops = {prim::kPrimReduceSum, prim::kPrimReduceMean, prim::kPrimReduceMin, + prim::kPrimReduceMax, prim::kPrimReduceAll}; + return reduce_ops; +} + +void GetGraphKernelInfo(const FuncGraphPtr fg, GraphKernelInfo *info) { + MS_EXCEPTION_IF_NULL(fg); + auto reduce_ops = get_reduce_ops(); + const auto &nodes = fg->nodes(); + info->op_type = ELEWISE; + info->cal_step = -1; + info->reduce_op_num = 0; + for (auto node : nodes) { + auto cnode = node->cast(); + if (cnode == nullptr) { + continue; + } + info->cal_step++; + auto 
prim = GetValueNode(cnode->input(0)); + if (prim != nullptr) { + bool is_reudce = std::any_of(reduce_ops.begin(), reduce_ops.end(), [&prim](const PrimitivePtr &op) { + return op->hash() == prim->hash() && op->name() == prim->name(); + }); + if (is_reudce) { + info->op_type = REDUCE; + info->reduce_op_num++; + } + } + } +} + +bool IsFuse(const GraphKernelInfo &info, const AnfNodePtr &node) { + auto fusable_basic_ops = get_fusable_basic_ops(info.is_before_kernel_select); + auto fusable_basic_ops_with_reduce = get_fusable_basic_ops_with_reduce(info.is_before_kernel_select); + bool is_fusable = false; + if (info.op_type == REDUCE && + (info.cal_step >= MAX_REDUCE_OP_FUSION_CAL_STEP || info.reduce_op_num >= MAX_REDUCE_OP_FUSION_REDUCE_NUM)) { + is_fusable = std::any_of(fusable_basic_ops_with_reduce.begin(), fusable_basic_ops_with_reduce.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + } else { + is_fusable = std::any_of(fusable_basic_ops.begin(), fusable_basic_ops.end(), + [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); }); + } + + return is_fusable; +} + +IncludeType IncludeFusedBasicOpForward(const AnfNodePtr &cur_node, const GraphKernelInfo &info, + const AnfNodePtr &node) { + if (cur_node == node) { + return FOLLOW; + } + if (!IsPrimitiveCNode(node)) { + return EXCLUDE; + } + + bool is_fusable = IsFuse(info, node); + return is_fusable ? FOLLOW : EXCLUDE; +} + +IncludeType IncludeFusedBasicOpBackward(const AnfNodePtr &cur_node, const GraphKernelInfo &info, + const AnfNodePtr &node) { + if (cur_node == node) { + return FOLLOW; + } + if (AnfAlgo::IsGraphKernel(node)) { + auto cnode = node->cast(); + auto fg = GetValueNode(cnode->input(kAnfPrimitiveIndex)); + auto fg_attr_val = fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + MS_EXCEPTION_IF_NULL(fg_attr_val); + auto fg_attr = GetValue(fg_attr_val); + if (fg_attr == kApplyMomentumOpName) { + return FOLLOW; + } + return EXCLUDE; + } + if (!IsPrimitiveCNode(node)) { + return EXCLUDE; + } + + bool is_fusable = IsFuse(info, node); + return is_fusable ? 
FOLLOW : EXCLUDE; +} + +bool CheckCircle(const std::set &fused_op_set, const AnfNodePtr &check_node, + std::set *cached_unconnected_set) { + if (!check_node->isa() || AnfAlgo::IsGraphKernel(check_node)) { + return false; + } + + auto cnode = check_node->cast(); + const auto &inputs = cnode->inputs(); + // there is a input not in fused_op_set, but the input depends on the fused_op_set + bool has_circle = false; + for (auto input : inputs) { + if (input->isa() && !fused_op_set.count(input)) { + std::set done; + std::vector todos = {input}; + while (!todos.empty()) { + auto node = todos.back(); + todos.pop_back(); + if (done.count(node) || cached_unconnected_set->count(node)) { + continue; + } + + done.insert(node); + if (fused_op_set.count(node)) { + has_circle = true; + break; + } + + if (node->isa()) { + auto cnode_ptr = node->cast(); + for (auto it : cnode_ptr->inputs()) { + if (it->isa()) { + todos.push_back(it); + } + } + } + } + + if (has_circle) { + return true; + } + cached_unconnected_set->insert(done.begin(), done.end()); + } + } + + return false; +} + +bool IsMakeTupleOut(const AnfNodePtr &out, AnfNodePtrList *real_outs) { + if (IsPrimitiveCNode(out, prim::kPrimMakeTuple)) { + auto &inputs = out->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + real_outs->push_back(inputs[i]); + } + return true; + } + + if (AnfAlgo::GetCNodeFuncGraphPtr(out) != nullptr) { + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(out); + auto fg_out = fg->output(); + if (IsPrimitiveCNode(fg_out, prim::kPrimMakeTuple)) { + auto inputs = fg_out->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + real_outs->push_back(inputs[i]); + } + return true; + } + } + return false; +} + +std::vector RemoveCircle(const std::vector &fused_op, bool is_backward) { + std::set cached_unconnected_set; + std::set fused_op_set(fused_op.begin(), fused_op.end()); + auto include = [&fused_op_set](const AnfNodePtr &node) { + if (fused_op_set.count(node)) { + return FOLLOW; + } + return EXCLUDE; + }; + for (auto iter = fused_op.rbegin(); iter != fused_op.rend(); ++iter) { + bool has_circle = CheckCircle(fused_op_set, *iter, &cached_unconnected_set); + // delete the circle node and the node which depend on the circle node in fused op + if (has_circle) { + auto mng = (*iter)->func_graph()->manager(); + std::vector erase_nodes; + if (is_backward) { + erase_nodes = DeepUsersSearch(*iter, include, mng); + } else { + erase_nodes = DeepLinkedGraphSearch(*iter, include); + } + for (auto erase_node : erase_nodes) { + fused_op_set.erase(erase_node); + } + } + } + + std::vector res; + for (auto node : fused_op) { + if (fused_op_set.count(node)) { + res.push_back(node); + } + } + return res; +} + +void TopoSortForNodeList(std::vector *lst) { + if (lst->size() < 2) { + return; + } + + std::vector res; + std::set node_sets(lst->begin(), lst->end()); + std::map> ins; + std::map> outs; + std::queue q; + for (auto node : *lst) { + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + for (auto input : cnode->inputs()) { + if (!node_sets.count(input)) { + continue; + } + // out_degree + outs[input].insert(node); + // in_degree + ins[node].insert(input); + } + if (!ins.count(node)) { + ins[node] = {}; + } + } + + for (auto p : ins) { + if (p.second.size() == 0) { + q.push(p.first); + } + } + + while (!q.empty()) { + auto node = q.front(); + q.pop(); + res.push_back(node); + if (!outs.count(node)) { + continue; + } + for (auto out : outs[node]) { + if (!ins.count(out)) { + continue; + } + ins[out].erase(node); + if 
(ins[out].size() == 0) { + q.push(out); + } + } + } + + lst->assign(res.begin(), res.end()); +} + +std::vector FindFuseCNodes(const CNodePtr &cnode, bool is_before_kernel_select) { + auto func_graph = cnode->func_graph(); + auto graph_kernel_g = GetValueNode(cnode->input(0)); + GraphKernelInfo info; + info.is_before_kernel_select = is_before_kernel_select; + GetGraphKernelInfo(graph_kernel_g, &info); + auto mng = func_graph->manager(); + // Search fusable nodes according input direction. + auto include_func_forward = std::bind(IncludeFusedBasicOpForward, cnode, info, std::placeholders::_1); + auto used_nodes = DeepLinkedGraphSearch(cnode, include_func_forward); + std::reverse(used_nodes.begin(), used_nodes.end()); + // Search fusable nodes according output direction. + auto include_func_backward = std::bind(IncludeFusedBasicOpBackward, cnode, info, std::placeholders::_1); + auto user_nodes = DeepUsersSearch(cnode, include_func_backward, mng); + + used_nodes.insert(used_nodes.end(), user_nodes.begin() + 1, user_nodes.end()); + if (used_nodes.size() > 1) { + used_nodes = RemoveCircle(used_nodes); + } + TopoSortForNodeList(&used_nodes); + return used_nodes; +} + +AbstractBasePtr GetOutputAbstract(const AnfNodePtr &node, size_t output_idx) { + auto out_spec = node->abstract(); + if (out_spec->isa()) { + return out_spec->cast()->elements()[output_idx]; + } + return out_spec; +} + +AnfNodePtr CreateNewFuseCNode(const std::shared_ptr &kernel_graph, const FuncGraphPtr &fg, + const AnfNodePtrList &inputs, const AnfNodePtrList &outputs, + bool is_before_kernel_select) { + auto func_node = NewValueNode(fg); + std::vector fn_inputs; + fn_inputs.push_back(func_node); + fn_inputs.insert(fn_inputs.end(), inputs.begin(), inputs.end()); + auto fuse_cnode = kernel_graph->NewCNode(fn_inputs); + // Set output abstract + if (outputs.size() > 1) { + std::vector out_specs; + for (size_t i = 0; i < outputs.size(); ++i) { + out_specs.push_back(outputs[i]->abstract()); + } + auto out_spec = std::make_shared(out_specs); + fuse_cnode->set_abstract(out_spec); + } else { + fuse_cnode->set_abstract(outputs[0]->abstract()); + } + // Set parameter abstract. + for (size_t i = 0; i < inputs.size(); ++i) { + auto kernel_with_index = AnfAlgo::VisitKernel(inputs[i], 0); + auto input_abs = GetOutputAbstract(kernel_with_index.first, kernel_with_index.second); + fg->parameters()[i]->set_abstract(input_abs); + if (is_before_kernel_select) { + fg->parameters()[i]->set_kernel_info(std::make_shared()); + } + } + // Set kernel info. 
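// [Illustrative note, not part of the patch] The block below assembles the fused node's
// KernelBuildInfo from the producers of its inputs and outputs: each edge is resolved to its real
// producer with VisitKernel, that producer's format and device dtype are copied over, and the
// fused node is finally tagged as an AKG kernel for the AICORE processor with OPAQUE fusion type.
// For a hypothetical fusion of two float16 element-wise ops laid out as NC1HWC0, the result would
// look roughly like (the concrete formats/dtypes are an invented example, not taken from the
// sources):
//
//   inputs : formats {"NC1HWC0", "NC1HWC0"}, device types {kNumberTypeFloat16, kNumberTypeFloat16}
//   outputs: formats {"NC1HWC0"},            device types {kNumberTypeFloat16}
//   kernel type AKG_KERNEL, processor AICORE, fusion type OPAQUE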
+ if (!is_before_kernel_select) { + std::vector graph_input_format; + std::vector graph_input_type; + std::vector graph_output_format; + std::vector graph_output_type; + for (size_t i = 0; i < inputs.size(); ++i) { + auto kernel_with_index = AnfAlgo::VisitKernel(inputs[i], 0); + auto input_format = AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second); + graph_input_format.push_back(input_format); + auto input_type = AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second); + graph_input_type.push_back(input_type); + auto input_abs = GetOutputAbstract(kernel_with_index.first, kernel_with_index.second); + fg->parameters()[i]->set_abstract(input_abs); + } + auto new_outputs = outputs; + if (outputs.size() == 1 && AnfAlgo::IsGraphKernel(outputs[0])) { + std::vector real_outs; + if (IsMakeTupleOut(outputs[0], &real_outs)) { + new_outputs = real_outs; + } + } + for (size_t i = 0; i < new_outputs.size(); ++i) { + auto kernel_with_index = AnfAlgo::VisitKernel(new_outputs[i], 0); + auto output_format = AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second); + auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second); + graph_output_format.push_back(output_format); + graph_output_type.push_back(output_type); + } + kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder; + graph_info_builder.SetInputsFormat(graph_input_format); + graph_info_builder.SetInputsDeviceType(graph_input_type); + graph_info_builder.SetOutputsFormat(graph_output_format); + graph_info_builder.SetOutputsDeviceType(graph_output_type); + graph_info_builder.SetProcessor(kernel::Processor::AICORE); + graph_info_builder.SetKernelType(KernelType::AKG_KERNEL); + graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE); + auto graph_selected_info = graph_info_builder.Build(); + AnfAlgo::SetSelectKernelBuildInfo(graph_selected_info, fuse_cnode.get()); + } + return fuse_cnode; +} + +void ReplaceNewFuseCNode(const std::shared_ptr &kernel_graph, const AnfNodePtr &new_fuse_cnode, + const AnfNodePtrList &outputs) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + MS_EXCEPTION_IF_NULL(mng); + // single out + if (outputs.size() == 1) { + mng->Replace(outputs[0], new_fuse_cnode); + return; + } + + std::vector fn_inputs; + for (size_t out_idx = 0; out_idx < outputs.size(); out_idx++) { + AnfNodePtrList real_outs; + // not make tuple out, replace + if (!IsMakeTupleOut(outputs[out_idx], &real_outs)) { + fn_inputs.clear(); + fn_inputs.push_back(NewValueNode(prim::kPrimTupleGetItem)); + fn_inputs.push_back(new_fuse_cnode); + fn_inputs.push_back(NewValueNode(MakeValue(SizeToInt(out_idx)))); + auto new_out = kernel_graph->NewCNode(fn_inputs); + new_out->set_abstract(outputs[out_idx]->abstract()); + mng->Replace(outputs[out_idx], new_out); + continue; + } + + // the out is make tuple , modify the get_item node's value + auto users = mng->node_users()[outputs[out_idx]]; + for (auto &user : users) { + auto use_node = user.first; + if (use_node->isa() && (IsPrimitiveCNode(use_node, prim::kPrimTupleGetItem))) { + auto get_item_cnode = use_node->cast(); + auto value_input = get_item_cnode->input(kInputNodeOutputIndexInTupleGetItem); + MS_EXCEPTION_IF_NULL(value_input); + auto value_node = value_input->cast(); + MS_EXCEPTION_IF_NULL(value_node); + int item_idx = GetValue(value_node->value()); + int new_item_idx = SizeToInt(out_idx) + item_idx; + fn_inputs.clear(); + 
fn_inputs.push_back(NewValueNode(prim::kPrimTupleGetItem)); + fn_inputs.push_back(new_fuse_cnode); + fn_inputs.push_back(NewValueNode(new_item_idx)); + auto new_out = kernel_graph->NewCNode(fn_inputs); + new_out->set_abstract(get_item_cnode->abstract()); + mng->Replace(get_item_cnode, new_out); + } + } + } +} + +AnfNodePtrList EliminateMakeTuple(FuncGraphPtr *fg, FuncGraphManagerPtr *mng) { + AnfNodePtrList outs; + auto out_node = (*fg)->output(); + if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple)) { + std::vector output_args; + auto out_cnode = out_node->cast(); + for (auto out : out_cnode->inputs()) { + if (IsPrimitiveCNode(out, prim::kPrimMakeTuple)) { + auto inputs = out->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + output_args.push_back(inputs[i]); + } + } else { + output_args.push_back(out); + } + } + if (output_args.size() != out_cnode->inputs().size()) { + auto new_out = (*fg)->NewCNode(output_args); + (*mng)->Replace(out_node, new_out); + } + + for (size_t i = 1; i < output_args.size(); ++i) { + outs.push_back(output_args[i]); + } + return outs; + } + + outs.push_back(out_node); + return outs; +} + +AnfNodePtrList GetExpandOuts(const AnfNodePtrList &outs) { + AnfNodePtrList res; + if (outs.size() <= 1) { + return outs; + } + + for (auto out : outs) { + AnfNodePtrList real_outs; + if (IsMakeTupleOut(out, &real_outs)) { + res.insert(res.end(), real_outs.begin(), real_outs.end()); + continue; + } + res.push_back(out); + } + return res; +} + +void FuseGraphKernel(const std::shared_ptr &kernel_graph, bool is_before_kernel_select) { + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + if (mng == nullptr) { + mng = Manage(kernel_graph, true); + kernel_graph->set_manager(mng); + } + auto &todos = kernel_graph->execution_order(); + for (auto iter = todos.cbegin(); iter != todos.cend(); ++iter) { + auto node = *iter; + if (!AnfAlgo::IsGraphKernel(node) || !kernel_graph->nodes().contains(node)) { + continue; + } + + auto origin_fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + auto fg_attr = origin_fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); + if (fg_attr != nullptr) { + auto fg_name = GetValue(fg_attr); + if (graph_kernel_black_list.count(fg_name) != 0) { + continue; + } + } + + auto fuse_nodes = FindFuseCNodes(node, is_before_kernel_select); + if (fuse_nodes.size() <= 1) { + continue; + } + + FuncGraphPtr fg; + AnfNodePtrList inputs; + AnfNodePtrList outputs; + std::tie(fg, inputs, outputs) = compile::TransformSegmentToAnfGraph(fuse_nodes); + + // Remove nest make tuple in outs + auto expand_out = GetExpandOuts(outputs); + auto fuse_new_node = CreateNewFuseCNode(kernel_graph, fg, inputs, expand_out, is_before_kernel_select); + + ReplaceNewFuseCNode(kernel_graph, fuse_new_node, outputs); + + // Inline origin graphkernel + auto cnodes = fg->GetOrderedCnodes(); + for (const auto &n : cnodes) { + if (!AnfAlgo::IsGraphKernel(n)) { + continue; + } + auto graph_kernel_g = GetValueNode(n->input(0)); + AnfNodePtrList ins; + ins.insert(ins.end(), n->inputs().begin() + 1, n->inputs().end()); + auto out = InlineClone(graph_kernel_g, fg, ins, n->input(0)->scope()); + mng->Replace(n, out); + } + + EliminateMakeTuple(&fg, &mng); + // Set graphkernel flag + auto ori_fg = GetValueNode(node->input(kAnfPrimitiveIndex)); + fg->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, ori_fg->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.h 
b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.h new file mode 100644 index 0000000000..a5a26765a3 --- /dev/null +++ b/mindspore/ccsrc/pre_activate/pass/fuse_graph_kernel.h @@ -0,0 +1,63 @@ + +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_GRAPH_KERNEL_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_GRAPH_KERNEL_H_ + +#include +#include +#include +#include +#include "pre_activate/common/optimizer.h" +#include "session/kernel_graph.h" + +namespace mindspore { +namespace opt { +enum GraphKernelType { + ELEWISE = 0, // only contain elewise basic ops + REDUCE, // contain reduce ops + CUBE, // contain cube ops +}; +struct GraphKernelInfo { + GraphKernelType op_type = ELEWISE; + bool is_before_kernel_select = false; + int reduce_op_num = 0; + int cal_step = 0; +}; + +// when reduce graph kernel's cal step is greater than this number, not fuse +const int MAX_REDUCE_OP_FUSION_CAL_STEP = 5; +// when reduce graph kernel contain reduce op num is greater than this number, not fuse +const int MAX_REDUCE_OP_FUSION_REDUCE_NUM = 2; + +const std::set graph_kernel_black_list = {"BNTrainingUpdateSum", "ApplyMomentum", "LayerNormForward", + "LambNextMV", "LambUpdateWithLR"}; + +std::vector RemoveCircle(const std::vector &fused_op, bool is_backward = true); + +void TopoSortForNodeList(std::vector *lst); + +AnfNodePtr CreateNewFuseCNode(const std::shared_ptr &kernel_graph, const FuncGraphPtr &fg, + const AnfNodePtrList &inputs, const AnfNodePtrList &outputs, + bool is_before_kernel_select); + +void ReplaceNewFuseCNode(const std::shared_ptr &kernel_graph, const AnfNodePtr &new_fuse_cnode, + const AnfNodePtrList &outputs); + +void FuseGraphKernel(const std::shared_ptr &kernel_graph, bool is_before_kernel_select = false); +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_FUSE_GRAPH_KERNEL_H_ diff --git a/mindspore/ccsrc/pynative/pynative_execute.cc b/mindspore/ccsrc/pynative/pynative_execute.cc index ceaf4bd43e..d65807b9f6 100644 --- a/mindspore/ccsrc/pynative/pynative_execute.cc +++ b/mindspore/ccsrc/pynative/pynative_execute.cc @@ -985,7 +985,7 @@ FuncGraphPtr PynativeExecutor::GradGraph(FuncGraphPtr g, const GradOperationPtr auto nparam = top_g_->parameters().size(); std::ostringstream ss; ss << "grad{" << nparam << "}"; - df_builder_->set_flags(FUNC_GRAPH_FLAG_CORE, true); + df_builder_->set_flag(FUNC_GRAPH_FLAG_CORE, true); df_builder_->debug_info()->set_name(ss.str()); auto df = grad_op->GetGrad(NewValueNode(g), nullptr, top_g_->parameters(), weights); diff --git a/mindspore/ccsrc/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/session/anf_runtime_algorithm.cc index 1ec11d50db..5db7dbc324 100644 --- a/mindspore/ccsrc/session/anf_runtime_algorithm.cc +++ b/mindspore/ccsrc/session/anf_runtime_algorithm.cc @@ -178,12 +178,29 @@ bool AnfRuntimeAlgorithm::CheckPrimitiveType(const AnfNodePtr &node, const Primi return 
IsPrimitive(cnode->input(kAnfPrimitiveIndex), primitive_type); } +FuncGraphPtr AnfRuntimeAlgorithm::GetCNodeFuncGraphPtr(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto attr_input = cnode->input(kAnfPrimitiveIndex); + MS_EXCEPTION_IF_NULL(attr_input); + auto value_node = attr_input->cast(); + MS_EXCEPTION_IF_NULL(value_node); + auto value = value_node->value(); + MS_EXCEPTION_IF_NULL(value); + return value->cast(); +} + std::string AnfRuntimeAlgorithm::GetCNodeName(const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(node); if (node->isa()) { auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - return primitive->name(); + if (primitive != nullptr) { + return primitive->name(); + } + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(func_graph); + return func_graph->ToString(); } MS_LOG(EXCEPTION) << "Unknown anf node type " << node->DebugString(); } @@ -198,9 +215,16 @@ void AnfRuntimeAlgorithm::SetNodeAttr(const std::string &key, const ValuePtr &va if (!node->isa()) { MS_LOG(EXCEPTION) << "Only cnode has attr, but this anf is " << node->DebugString(); } + // single op cnode. auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - primitive->set_attr(key, value); + if (primitive != nullptr) { + primitive->set_attr(key, value); + return; + } + // graph kernel cnode. + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + fg->set_attr(key, value); } void AnfRuntimeAlgorithm::CopyNodeAttr(const std::string &key, const AnfNodePtr &from, const AnfNodePtr &to) { @@ -241,16 +265,33 @@ void AnfRuntimeAlgorithm::EraseNodeAttr(const std::string &key, const AnfNodePtr if (!node->isa()) { MS_LOG(EXCEPTION) << "Only cnode has attr, but this anf is " << node->DebugString(); } + // single op cnode. auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - primitive->EraseAttr(key); + if (primitive != nullptr) { + primitive->EraseAttr(key); + return; + } + // graph kernel cnode. + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + fg->erase_flag(key); } bool AnfRuntimeAlgorithm::HasNodeAttr(const std::string &key, const CNodePtr &node) { MS_EXCEPTION_IF_NULL(node); + if (!node->isa()) { + MS_LOG(WARNING) << "Only cnode has attr, but this anf is " << node->DebugString(); + return false; + } + // single op cnode. auto primitive = AnfAlgo::GetCNodePrimitive(node); - MS_EXCEPTION_IF_NULL(primitive); - return primitive->HasAttr(key); + if (primitive != nullptr) { + return primitive->HasAttr(key); + } + // graph kernel cnode. + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(node); + MS_EXCEPTION_IF_NULL(fg); + return fg->has_flag(key); } size_t AnfRuntimeAlgorithm::GetInputTensorNum(const AnfNodePtr &node) { @@ -782,6 +823,26 @@ bool AnfRuntimeAlgorithm::IsRealCNodeKernel(const AnfNodePtr &node) { return IsRealKernel(node); } +bool AnfRuntimeAlgorithm::IsGraphKernel(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + // graph kernel should be a real cnode kernel. + if (!IsRealCNodeKernel(node)) { + return false; + } + + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + auto input = cnode->input(kAnfPrimitiveIndex); + // graph kernel should has func_graph as first input. 
+ if (!IsValueNode(input)) { + return false; + } + + auto func_graph = GetValueNode(input); + MS_EXCEPTION_IF_NULL(func_graph); + return func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL); +} + bool AnfRuntimeAlgorithm::IsParameterWeight(const ParameterPtr &node) { MS_EXCEPTION_IF_NULL(node); return node->has_default(); @@ -1014,5 +1075,44 @@ void AnfRuntimeAlgorithm::ReorderExecList(NotNull *> node_ std::copy(all_opt_list.begin(), all_opt_list.end(), std::back_inserter(*node_list)); } +TypeId AnfRuntimeAlgorithm::GetCNodeOutputPrecision(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto prim = AnfAlgo::GetCNodePrimitive(node); + if (prim == nullptr) { + return kTypeUnknown; + } + + TypeId except_type = kTypeUnknown; + if (prim->GetAttr(kAttrOutputPrecision) != nullptr) { + auto output_type_str = GetValue(prim->GetAttr(kAttrOutputPrecision)); + if (output_type_str == "float16") { + except_type = kNumberTypeFloat16; + } else if (output_type_str == "float32") { + except_type = kNumberTypeFloat32; + } else { + MS_LOG(EXCEPTION) << "The fix precision must be float16 or float32, but got " << output_type_str; + } + } + + return except_type; +} + +TypeId AnfRuntimeAlgorithm::GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx) { + if (!node->isa()) { + MS_LOG(EXCEPTION) << node->DebugString() << ", input node is not CNode."; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (input_idx + 1 >= cnode->inputs().size()) { + MS_LOG(EXCEPTION) << "Input index " << input_idx << " is larger than input number " << GetInputTensorNum(cnode); + } + auto input_node = cnode->input(input_idx + 1); + MS_EXCEPTION_IF_NULL(input_node); + auto kernel_with_index = VisitKernel(input_node, 0); + if (!kernel_with_index.first->isa()) { + return kTypeUnknown; + } + return GetCNodeOutputPrecision(kernel_with_index.first); +} } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/session/anf_runtime_algorithm.h b/mindspore/ccsrc/session/anf_runtime_algorithm.h index cd14a8b20d..c46f0b5955 100644 --- a/mindspore/ccsrc/session/anf_runtime_algorithm.h +++ b/mindspore/ccsrc/session/anf_runtime_algorithm.h @@ -54,6 +54,8 @@ class AnfRuntimeAlgorithm { static PrimitivePtr GetCNodePrimitive(const AnfNodePtr &node); // check whether anf node is a node of 'primitive_type',such as make_tuple is a cnode of kPrimMakeTuple static bool CheckPrimitiveType(const AnfNodePtr &node, const PrimitivePtr &primitive_type); + // get cnode primitive + static FuncGraphPtr GetCNodeFuncGraphPtr(const AnfNodePtr &node); // get kernel_name of anf node static std::string GetCNodeName(const AnfNodePtr &node); // get detail info of anf node @@ -161,6 +163,8 @@ class AnfRuntimeAlgorithm { static bool IsRealKernel(const AnfNodePtr &node); // checkout whether the anf node is a real kernel that is a cnode and can run on device static bool IsRealCNodeKernel(const AnfNodePtr &node); + // checkout whether the anf node is a graph kernel. 
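The hunks above give a cnode two possible homes for its attributes: an ordinary single-op cnode keeps them on the Primitive in input 0, while a graph-kernel cnode, whose input 0 is a FuncGraph tagged with FUNC_GRAPH_ATTR_GRAPH_KERNEL, keeps them on that FuncGraph. Below is a minimal Python sketch of that dispatch, using hypothetical stand-in classes rather than the real MindSpore types, purely to illustrate the control flow.

GRAPH_KERNEL_ATTR = "graph_kernel"   # stand-in for FUNC_GRAPH_ATTR_GRAPH_KERNEL

class Primitive:                     # placeholder, not the real class
    def __init__(self, name):
        self.name, self.attrs = name, {}

class FuncGraph:                     # placeholder, not the real class
    def __init__(self, name, attrs=None):
        self.name, self.attrs = name, dict(attrs or {})

class CNode:                         # inputs[0] is a Primitive or a fused FuncGraph
    def __init__(self, inputs):
        self.inputs = inputs

def is_graph_kernel(node):
    """Mirrors IsGraphKernel: input 0 must be a FuncGraph carrying the graph_kernel attr."""
    fn = node.inputs[0]
    return isinstance(fn, FuncGraph) and GRAPH_KERNEL_ATTR in fn.attrs

def set_node_attr(node, key, value):
    """Mirrors SetNodeAttr: primitive attrs for single ops, graph attrs for graph kernels."""
    node.inputs[0].attrs[key] = value

single = CNode([Primitive("TensorAdd")])
fused = CNode([FuncGraph("fused_relu", {GRAPH_KERNEL_ATTR: "Relu"})])
assert not is_graph_kernel(single) and is_graph_kernel(fused)
set_node_attr(fused, "output_precision", "float16")

GetCNodeName follows the same pattern: the primitive's name for single-op cnodes, the fused graph's name for graph-kernel cnodes.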
+ static bool IsGraphKernel(const AnfNodePtr &node); // check parameter is weight or data static bool IsParameterWeight(const ParameterPtr &node); // set stream id of kernel,which will be set in stream assign and be used in stream generate @@ -190,6 +194,11 @@ class AnfRuntimeAlgorithm { static bool IsScalarInput(const CNodePtr &cnode, size_t index); static bool IsScalarOutput(const CNodePtr &cnode, size_t index); static void ReorderExecList(NotNull *> node_list); + static bool IsWhileTrueGraph(const KernelGraphPtr &child_graph); + // get fix output precision of cnode. + static TypeId GetCNodeOutputPrecision(const AnfNodePtr &node); + // get fix output precision from prev node, input_idx is the input index of current node related to prev node. + static TypeId GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx); }; } // namespace session using AnfAlgo = session::AnfRuntimeAlgorithm; diff --git a/mindspore/ccsrc/session/ascend_session.cc b/mindspore/ccsrc/session/ascend_session.cc index 7173a26ed1..c69fa63bc8 100644 --- a/mindspore/ccsrc/session/ascend_session.cc +++ b/mindspore/ccsrc/session/ascend_session.cc @@ -37,6 +37,7 @@ #include "ir/scalar.h" #include "debug/anf_ir_dump.h" #include "debug/anf_ir_utils.h" +#include "debug/draw.h" #include "common/utils.h" #include "pre_activate/common/helper.h" #include "device/kernel_runtime_manager.h" @@ -48,7 +49,7 @@ namespace mindspore { namespace session { const size_t kInvalidIndex = SIZE_MAX; namespace { -void DumpGraphExeOrder(const std::vector &execution_order) { +void DumpGraphExeOrder(const std::vector &execution_order, const std::string &tag = "") { MS_LOG(INFO) << "Dump execution_order size " << execution_order.size(); MS_LOG(INFO) << "[index][stream_label][graph_id][node string]"; int i = 0; @@ -60,6 +61,24 @@ void DumpGraphExeOrder(const std::vector &execution_order) { << "[" << cnode->DebugString() << "]"; i++; } + + std::stringstream buf; + buf << "================== execution order ==================\n"; + if (!tag.empty()) { + buf << tag << "\n"; + } + buf << "execution_order size: " << execution_order.size() << "\n"; + i = 0; + for (auto &cnode : execution_order) { + MS_EXCEPTION_IF_NULL(cnode); + buf << i << ":\n"; + buf << "\t" << cnode->DebugString() << "\n"; + buf << "\t" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "\n"; + buf << "\t" << AnfAlgo::GetGraphId(cnode.get()) << "\n"; + i++; + } + buf << "================== execution order ==================\n"; + // std::cout << buf.str() << std::endl; } void DumpGraphInputArgs(const VectorRef &args) { @@ -378,8 +397,28 @@ void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) { MS_EXCEPTION_IF_NULL(child_graph); MS_LOG(INFO) << "CompileChildGraph " << child_graph->ToString(); opt::AscendBackendIRFusionOptimization(child_graph); + opt::AscendBackendFuseBasicOpt(child_graph, true); + opt::AscendBackendGraphKernelOpt(child_graph, true); + child_graph->SetExecOrderByDefault(); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool save_graphs = context_ptr->save_graphs_flag(); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = + save_graphs_path + "/" + "select_kernel_before" + "_graph_" + std::to_string(child_graph->graph_id()) + ".ir"; + DumpIR(file_path, child_graph); + } // select kernel build info SelectKernel(*child_graph); + if (save_graphs) { + std::string file_path = + 
save_graphs_path + "/" + "select_kernel_after" + "_graph_" + std::to_string(child_graph->graph_id()) + ".ir"; + DumpIR(file_path, child_graph); + } // convert kernel Graph to model predictmodel::StepConvertGraph(child_graph); // optimize graph @@ -543,6 +582,9 @@ void AscendSession::HardwareOptimize(const std::shared_ptr &kernel_ device::ascend::KernelPreBuild(kernel_graph.get()); MS_LOG(INFO) << "HardwareOptimize start!"; opt::AscendBackendOptimization(kernel_graph); + opt::AscendGraphKernelCommonProcess(kernel_graph); + opt::AscendBackendFuseBasicOpt(kernel_graph, false); + opt::AscendBackendAddAtomicClean(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph); kernel_graph->SetExecOrderByDefault(); MS_LOG(INFO) << "HardwareOptimize Finish!"; diff --git a/mindspore/ccsrc/session/kernel_graph.cc b/mindspore/ccsrc/session/kernel_graph.cc index 6bc0ec8677..7e9bb62aab 100644 --- a/mindspore/ccsrc/session/kernel_graph.cc +++ b/mindspore/ccsrc/session/kernel_graph.cc @@ -24,6 +24,7 @@ #include "device/kernel_info.h" #include "kernel/kernel_build_info.h" #include "device/kernel_runtime_manager.h" +#include "kernel/common_utils.h" namespace mindspore { namespace session { @@ -75,6 +76,31 @@ std::vector GetCallRealOutputs(const AnfNodePtr &call_node) { } return real_inputs; } + +AnfNodePtr MakeValueNode(const AnfNodePtr &node) { + auto value_node = node->cast(); + if (value_node == nullptr) { + return nullptr; + } + + ValueNodePtr new_value_node = std::make_shared(value_node->value()); + new_value_node->set_abstract(value_node->abstract()); + // create kernel_info fo new value node + auto kernel_info = std::make_shared(); + new_value_node->set_kernel_info(kernel_info); + // create kernel_build_info for new value node + auto kernel_build_info_builder = std::make_shared(); + // set the format of value_node to DEFAULT_FORMAT + kernel_build_info_builder->SetOutputsFormat(std::vector{kOpFormat_DEFAULT}); + // set value node initial device data type = infer data type + std::vector types; + for (size_t index = 0; index < AnfAlgo::GetOutputTensorNum(value_node); ++index) { + types.push_back(kTypeUnknown); + } + kernel_build_info_builder->SetOutputsDeviceType(types); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); + return new_value_node; +} } // namespace std::vector KernelGraph::outputs() const { auto graph_output = output(); @@ -231,7 +257,8 @@ CNodePtr KernelGraph::NewCNode(const std::vector &inputs) { auto cnode = FuncGraph::NewCNode(inputs); MS_EXCEPTION_IF_NULL(cnode); cnode->set_abstract(std::make_shared()); - // create kernel_info from new parameter + CreateKernelInfoFromNewParameter(cnode); + auto kernel_info = std::make_shared(); std::vector feature_map_input_indexs; // if the node only has the primitive(such as getNext) or the node's input has a feature map input @@ -257,6 +284,41 @@ CNodePtr KernelGraph::NewCNode(const std::vector &inputs) { return cnode; } +void KernelGraph::CreateKernelInfoFromNewParameter(const CNodePtr &cnode) { + if (!AnfAlgo::IsGraphKernel(cnode)) { + return; + } + auto func_graph = AnfAlgo::GetCNodeFuncGraphPtr(cnode); + MS_EXCEPTION_IF_NULL(func_graph); + + std::vector node_list; + std::vector input_list; + std::vector output_list; + kernel::GetValidKernelNodes(func_graph, &node_list, &input_list, &output_list); + for (auto &anf_node : node_list) { + MS_EXCEPTION_IF_NULL(anf_node); + auto kernel_info = std::make_shared(); + anf_node->set_kernel_info(kernel_info); + auto anf_cnode = anf_node->cast(); + 
MS_EXCEPTION_IF_NULL(anf_cnode); + for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_cnode); ++i) { + auto input_node = anf_cnode->input(i + 1); + MS_EXCEPTION_IF_NULL(input_node); + if (IsValueNode(input_node)) { + auto new_input_node = MakeValueNode(input_node); + if (new_input_node != nullptr) { + anf_cnode->set_input(i + 1, new_input_node); + } + } + } + } + for (auto &anf_node : input_list) { + MS_EXCEPTION_IF_NULL(anf_node); + auto kernel_info = std::make_shared(); + anf_node->set_kernel_info(kernel_info); + } +} + CNodePtr KernelGraph::NewCNode(const CNodePtr &cnode) { MS_EXCEPTION_IF_NULL(cnode); auto new_cnode = std::make_shared(*cnode); @@ -352,21 +414,7 @@ std::vector KernelGraph::SplitTupleValueNodeToNodeList(const ValueNo ValueNodePtr KernelGraph::NewValueNode(const ValueNodePtr &value_node) { MS_EXCEPTION_IF_NULL(value_node); - ValueNodePtr new_value_node = std::make_shared(value_node->value()); - new_value_node->set_abstract(value_node->abstract()); - // create kernel_info fo new value node - auto kernel_info = std::make_shared(); - kernel_info->SetFeatureMapFlag(false); - new_value_node->set_kernel_info(kernel_info); - // create kernel_build_info for new value node - auto kernel_build_info_builder = std::make_shared(); - // set the format of value_node to DEFAULT_FORMAT - auto output_tensor_num = AnfAlgo::GetOutputTensorNum(value_node); - kernel_build_info_builder->SetOutputsFormat(std::vector(output_tensor_num, kOpFormat_DEFAULT)); - // set value node initial device data type = infer data type - std::vector types = std::vector(output_tensor_num, kTypeUnknown); - kernel_build_info_builder->SetOutputsDeviceType(types); - AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); + auto new_value_node = MakeValueNode(value_node)->cast(); AnfAlgo::SetGraphId(graph_id_, new_value_node.get()); return new_value_node; } diff --git a/mindspore/ccsrc/session/kernel_graph.h b/mindspore/ccsrc/session/kernel_graph.h index 9954b5b1d0..3009ab0ce9 100644 --- a/mindspore/ccsrc/session/kernel_graph.h +++ b/mindspore/ccsrc/session/kernel_graph.h @@ -51,6 +51,7 @@ class KernelGraph : public FuncGraph { std::vector *MutableInputs() const { return inputs_.get(); } std::vector outputs() const; CNodePtr NewCNode(const std::vector &inputs) override; + void CreateKernelInfoFromNewParameter(const CNodePtr &cnode); CNodePtr NewCNode(const CNodePtr &cnode); ParameterPtr NewParameter(const ParameterPtr ¶meter = nullptr); ValueNodePtr NewValueNode(const ValueNodePtr &value_node = nullptr); diff --git a/mindspore/ccsrc/session/session_basic.cc b/mindspore/ccsrc/session/session_basic.cc index e5e58045cd..a78c2ac4d8 100644 --- a/mindspore/ccsrc/session/session_basic.cc +++ b/mindspore/ccsrc/session/session_basic.cc @@ -21,6 +21,7 @@ #include "pipeline/parse/data_converter.h" #include "ir/manager.h" #include "ir/param_value_py.h" +#include "kernel/common_utils.h" #include "operator/ops.h" #include "common/trans.h" #include "utils/context/ms_context.h" @@ -33,6 +34,7 @@ #include "common/utils.h" #include "ir/dtype.h" #include "ir/anf.h" +#include "ir/func_graph_cloner.h" namespace mindspore { namespace session { @@ -367,10 +369,17 @@ CNodePtr SessionBasic::CreateNewCNode(const CNodePtr &cnode, bool valid_input, K MS_EXCEPTION_IF_NULL(other_graph_cnode); *from_other_graph = false; // get primitive of old node + std::vector cnode_inputs; auto prim = AnfAlgo::GetCNodePrimitive(cnode); - MS_EXCEPTION_IF_NULL(prim); - // push attr to inputs[0] of new cnode - std::vector 
cnode_inputs = {std::make_shared(std::make_shared(*prim))}; + if (prim != nullptr) { + // push attr to inputs[0] of new cnode + cnode_inputs.push_back(std::make_shared(std::make_shared(*prim))); + } else { + auto fg = AnfAlgo::GetCNodeFuncGraphPtr(cnode); + MS_EXCEPTION_IF_NULL(fg); + auto new_fg = BasicClone(fg); + cnode_inputs.push_back(std::make_shared(new_fg)); + } // if has multiple depends,only select first depend as parameter for (size_t input_idx = 1; input_idx < cnode->inputs().size(); input_idx++) { auto anf = cnode->inputs()[input_idx]; diff --git a/mindspore/ccsrc/transform/convert.h b/mindspore/ccsrc/transform/convert.h index 8a63f00c6c..2f6c9bb0ad 100644 --- a/mindspore/ccsrc/transform/convert.h +++ b/mindspore/ccsrc/transform/convert.h @@ -102,22 +102,15 @@ class DfGraphConvertor { explicit DfGraphConvertor(const AnfGraphPtr &anf_graph) : anf_graph_(anf_graph), df_graph_(std::make_shared(anf_graph_->ToString())) { #if (!defined ENABLE_GE) || (defined ENABLE_INFER) - auto it_training = anf_graph->flags().find("training"); - if (it_training != anf_graph->flags().end()) { - training_ = it_training->second; - } else { - training_ = false; - } + training_ = anf_graph->has_flag("training"); #else training_ = ENABLE_TRAIN; #endif - auto it_distribute = anf_graph->flags().find("broadcast_flag"); - if (it_distribute != anf_graph->flags().end()) { + distribute_ = anf_graph->has_flag("broadcast_flag"); + if (anf_graph->has_flag("broadcast_flag")) { ConfigManager::GetInstance().set_parallel_strategy(ParallelStrategy::DISTRIBUTION); - distribute_ = it_distribute->second; } else { ConfigManager::GetInstance().set_parallel_strategy(ParallelStrategy::ONE_DEVICE); - distribute_ = false; } MS_LOG(INFO) << "Create DfGraphConvertor with training: " << training_ << ", distribute: " << distribute_; diff --git a/mindspore/ccsrc/utils/context/ms_context.cc b/mindspore/ccsrc/utils/context/ms_context.cc index f9f5fa1ef1..9f283319a7 100644 --- a/mindspore/ccsrc/utils/context/ms_context.cc +++ b/mindspore/ccsrc/utils/context/ms_context.cc @@ -84,6 +84,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { check_bprop_flag_ = false; max_device_memory_ = kDefaultMaxDeviceMemory; print_file_path_ = ""; + enable_graph_kernel_ = false; } std::shared_ptr MsContext::GetInstance() { diff --git a/mindspore/ccsrc/utils/context/ms_context.h b/mindspore/ccsrc/utils/context/ms_context.h index a1ab728bc7..a5f936d65c 100644 --- a/mindspore/ccsrc/utils/context/ms_context.h +++ b/mindspore/ccsrc/utils/context/ms_context.h @@ -157,6 +157,9 @@ class MsContext { float max_device_memory() const { return max_device_memory_; } void set_max_device_memory(float max_device_memory) { max_device_memory_ = max_device_memory; } + void set_enable_graph_kernel(bool enable_graph_kernel) { enable_graph_kernel_ = enable_graph_kernel; } + bool enable_graph_kernel() const { return enable_graph_kernel_; } + private: MsContext(const std::string &backend_policy, const std::string &target); void GetGeOptions(std::map *ge_options) const; @@ -199,6 +202,7 @@ class MsContext { bool check_bprop_flag_; float max_device_memory_; std::string print_file_path_; + bool enable_graph_kernel_; }; } // namespace mindspore diff --git a/mindspore/ccsrc/utils/graph_utils.h b/mindspore/ccsrc/utils/graph_utils.h index e2703a2877..93edda3e34 100644 --- a/mindspore/ccsrc/utils/graph_utils.h +++ b/mindspore/ccsrc/utils/graph_utils.h @@ -62,6 +62,10 @@ std::vector DeepLinkedGraphSearch(const AnfNodePtr &root, const Incl std::vector 
DeepScopedGraphSearchWithFilter(const AnfNodePtr &root, const IncludeFunc &include, const FilterFunc &filter); +class FuncGraphManager; +using FuncGraphManagerPtr = std::shared_ptr; +std::vector DeepUsersSearch(const AnfNodePtr &root, const IncludeFunc &include, + const FuncGraphManagerPtr &mng); std::vector TopoSort(const AnfNodePtr &root, const SuccFunc &succ = SuccIncoming, const IncludeFunc &include = AlwaysInclude); diff --git a/mindspore/ccsrc/utils/graph_utils_extends.cc b/mindspore/ccsrc/utils/graph_utils_extends.cc index 85f9986a0d..0740c24236 100644 --- a/mindspore/ccsrc/utils/graph_utils_extends.cc +++ b/mindspore/ccsrc/utils/graph_utils_extends.cc @@ -26,6 +26,7 @@ #include #include "ir/visitor.h" +#include "ir/manager.h" #include "ir/func_graph.h" #include "debug/label.h" #include "utils/log_adapter.h" @@ -161,6 +162,24 @@ class DeepLinkedGraphSearcher : public DeepFirstSearcher { void Visit(const ValueNodePtr &) override {} }; + +class DeepUsersSearcher : public DeepFirstSearcher { + public: + explicit DeepUsersSearcher(const IncludeFunc &include, const FuncGraphManagerPtr &mng) + : DeepFirstSearcher(include), mng_(mng) {} + ~DeepUsersSearcher() override = default; + + void Visit(const CNodePtr &cnode) override { + auto &users = mng_->node_users()[cnode]; + for (auto iter = users.begin(); iter != users.end(); ++iter) { + DeepFirstSearcher::Visit(iter->first); + } + } + void Visit(const ValueNodePtr &) override {} + + private: + FuncGraphManagerPtr mng_; +}; } // namespace // include for if expand the node the search, filter for if put the node to results. @@ -180,4 +199,9 @@ std::vector DeepUsedGraphSearch(const AnfNodePtr &root, const Includ std::vector DeepLinkedGraphSearch(const AnfNodePtr &root, const IncludeFunc &include) { return DeepLinkedGraphSearcher(include).Search(root); } + +std::vector DeepUsersSearch(const AnfNodePtr &root, const IncludeFunc &include, + const FuncGraphManagerPtr &mng) { + return DeepUsersSearcher(include, mng).Search(root); +} } // namespace mindspore diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h index 97ffd739bb..83e3404d7b 100644 --- a/mindspore/ccsrc/utils/utils.h +++ b/mindspore/ccsrc/utils/utils.h @@ -192,6 +192,9 @@ constexpr auto kAttrEventId = "event_id"; constexpr auto kAttrDynInput = "dynamic"; constexpr auto kAttrDynInputSizes = "dyn_input_sizes"; constexpr auto kAttrSrcFormat = "src_format"; +constexpr auto kAttrMultiples = "multiples"; +constexpr auto kAttrFixPrecision = "fix_precision"; +constexpr auto kAttrOutputPrecision = "output_precision"; constexpr auto kAttrOutputUsedNum = "output_used_num"; constexpr auto kAttrHasBias = "has_bias"; constexpr auto kAttrN = "n"; @@ -216,6 +219,7 @@ constexpr auto kAttrSplitDim = "split_dim"; constexpr auto kAttrNumSplit = "num_split"; constexpr auto kAttrOutputNum = "output_num"; constexpr auto kAttrSizeSplits = "size_splits"; +constexpr auto kAttrOutputDefault = "output_default"; // attr value constexpr auto kValueTargetSwitch = "target_switch"; diff --git a/mindspore/ccsrc/vm/segment_runner.cc b/mindspore/ccsrc/vm/segment_runner.cc index dcd62a548d..9b2ee51b3f 100644 --- a/mindspore/ccsrc/vm/segment_runner.cc +++ b/mindspore/ccsrc/vm/segment_runner.cc @@ -92,6 +92,8 @@ std::tuple TransformSegmentToAnfGr } else if (eqv.find(a) == eqv.end()) { inputs.push_back(a); eqv[a] = fg->add_parameter(); + eqv[a]->set_abstract(a->abstract()); + eqv[a]->set_kernel_info(a->kernel_info_ptr()); } return eqv[a]; @@ -107,15 +109,20 @@ std::tuple TransformSegmentToAnfGr if 
(inps.empty()) { MS_LOG(EXCEPTION) << "Input is empty"; } - if (!IsValueNode(inps[0])) { + if (!IsValueNode(inps[0]) && + !(IsValueNode(inps[0]) && + inps[0]->cast()->value()->cast()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL))) { MS_LOG(EXCEPTION) << "Input[0] Must be a Primitive valuenode"; } + auto fn = inps[0]; std::vector args{fn}; (void)std::transform(std::begin(inps) + 1, std::end(inps), std::back_inserter(args), ref); eqv[n] = fg->NewCNode(args); + eqv[n]->set_abstract(n->abstract()); + eqv[n]->set_kernel_info(n->kernel_info_ptr()); } std::vector eqv_keys; @@ -123,15 +130,18 @@ std::tuple TransformSegmentToAnfGr [](const std::pair &elem) -> AnfNodePtr { return elem.first; }); auto outputs = GetOutput(lst, lst[0]->func_graph()->manager()->node_users(), eqv_keys); - std::vector output_args; - output_args.push_back(NewValueNode(prim::kPrimMakeTuple)); - (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_args), - [&eqv](const AnfNodePtr &o) -> AnfNodePtr { return eqv[o]; }); - - // Set output for AnfGraph - auto fg_output = fg->NewCNode(output_args); + AnfNodePtr fg_output; + if (outputs.size() > 1) { + std::vector output_args; + output_args.push_back(NewValueNode(prim::kPrimMakeTuple)); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_args), + [&eqv](const AnfNodePtr &o) -> AnfNodePtr { return eqv[o]; }); + // Set output for AnfGraph + fg_output = fg->NewCNode(output_args); + } else { + fg_output = eqv[outputs[0]]; + } fg->set_output(fg_output); - return std::make_tuple(fg, inputs, outputs); } diff --git a/mindspore/ccsrc/vm/transform.cc b/mindspore/ccsrc/vm/transform.cc index 91aa974cdf..c1fba78be8 100644 --- a/mindspore/ccsrc/vm/transform.cc +++ b/mindspore/ccsrc/vm/transform.cc @@ -33,6 +33,7 @@ #include "utils/graph_utils.h" #include "utils/context/ms_context.h" #include "debug/trace.h" +#include "debug/anf_ir_dump.h" namespace mindspore { namespace compile { @@ -269,6 +270,14 @@ bool CompileGraph::IsCut(const AnfNodePtr &node) { } AnfNodePtr fn = inputs[0]; + MS_EXCEPTION_IF_NULL(fn); + if (IsValueNode(fn)) { + auto fg = GetValueNode(fn); + if (fg->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) { + return false; + } + } + if (!IsValueNode(fn)) { return true; } @@ -316,7 +325,6 @@ VectorRef CompileGraph::SplitNodes(const FuncGraphPtr &graph) { for (auto &node : nodes) { MS_EXCEPTION_IF_NULL(node); if (IsCut(node)) { - MS_LOG(DEBUG) << "Cut node:" << node->DebugString(10) << ", size:" << split.size(); if (split.size() != 0) { splits.push_back(split); } @@ -330,10 +338,8 @@ VectorRef CompileGraph::SplitNodes(const FuncGraphPtr &graph) { } last_target = cur_target; split.push_back(node); - MS_LOG(DEBUG) << "Insert node:" << node->DebugString(10) << ", size:" << split.size(); } } - MS_LOG(DEBUG) << "Split node size :" << splits.size(); return splits; } @@ -567,7 +573,6 @@ InstSet CompileGraph::GenMultiGraphsSinkInst(const FuncGraphPtr &graph) { InstSet CompileGraph::Run(const FuncGraphPtr &graph) { MS_EXCEPTION_IF_NULL(graph); - MS_LOG(DEBUG) << "Compile start graph: " << graph->ToString(); Reset(); PushParameters(graph); @@ -793,16 +798,11 @@ CompileGraphs::CompileGraphs(const BackendPtr &backend, const std::vectormanager(); - MS_EXCEPTION_IF_NULL(graph_manager); - FuncGraphSet graphs = graph_manager->func_graphs(); - for (auto &g : graphs) { - mapping_[g] = static_cast(insts_.size()); - if (transform_ != nullptr) { - InstSet insts = transform_->Run(g); - if (!insts.empty()) { - (void)insts_.insert(insts_.end(), 
insts.begin(), insts.end()); - } + mapping_[graph] = static_cast(insts_.size()); + if (transform_ != nullptr) { + InstSet insts = transform_->Run(graph); + if (!insts.empty()) { + (void)insts_.insert(insts_.end(), insts.begin(), insts.end()); } } MS_LOG(DEBUG) << "End"; @@ -847,8 +847,15 @@ FinalVMPtr CompileGraphs::CompileAndLink(const FuncGraphPtr &graph) { Reset(); MS_LOG(DEBUG) << "Begin parameter:" << graph->parameters().size(); - (void)WrapPrimitives(graph); - Compile(graph); + FuncGraphPtr prim_graph = WrapPrimitives(graph); + Compile(prim_graph); + MS_EXCEPTION_IF_NULL(prim_graph); + FuncGraphSet graphs = prim_graph->manager()->func_graphs(); + for (auto g : graphs) { + if (g != graph && g != nullptr && !(g->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL))) { + Compile(g); + } + } FinalVMPtr rt = Link(graph); Reset(); diff --git a/mindspore/context.py b/mindspore/context.py index 6c4d616cf1..ad601f8fab 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -56,7 +56,8 @@ def _make_directory(path): os.makedirs(path) real_path = path except PermissionError as e: - logger.error(f"No write permission on the directory `{path}, error = {e}") + logger.error( + f"No write permission on the directory `{path}, error = {e}") raise ValueError(f"No write permission on the directory `{path}`.") return real_path @@ -79,11 +80,13 @@ class _ThreadLocalInfo(threading.local): def reserve_class_name_in_scope(self, reserve_class_name_in_scope): """Sets whether to save the network class name in the scope.""" if not isinstance(reserve_class_name_in_scope, bool): - raise ValueError("Set reserve_class_name_in_scope value must be bool!") + raise ValueError( + "Set reserve_class_name_in_scope value must be bool!") self._reserve_class_name_in_scope = reserve_class_name_in_scope -_ContextRecord = namedtuple("_ContextRecord", ["is_pynative_mode", "switch_context_fn"]) +_ContextRecord = namedtuple( + "_ContextRecord", ["is_pynative_mode", "switch_context_fn"]) class _ContextSwitchInfo(threading.local): @@ -110,7 +113,8 @@ class _ContextSwitchInfo(threading.local): """ if isinstance(switch_context_fn, FunctionType): switch_context_fn() - self.context_stack.append(_ContextRecord(is_pynative, switch_context_fn)) + self.context_stack.append( + _ContextRecord(is_pynative, switch_context_fn)) def pop(self): self.context_stack.pop() @@ -194,7 +198,8 @@ class _Context: @save_graphs_path.setter def save_graphs_path(self, save_graphs_path): - self._context_handle.set_save_graphs_path(_make_directory(save_graphs_path)) + self._context_handle.set_save_graphs_path( + _make_directory(save_graphs_path)) @property def device_target(self): @@ -213,7 +218,8 @@ class _Context: @device_id.setter def device_id(self, device_id): if device_id < 0 or device_id > 4095: - raise ValueError("Device id must be in [0, 4095], but got {}".format(device_id)) + raise ValueError( + "Device id must be in [0, 4095], but got {}".format(device_id)) success = self._context_handle.set_device_id(device_id) if not success: raise RuntimeError("Device id set failed!!!") @@ -240,7 +246,8 @@ class _Context: @enable_auto_mixed_precision.setter def enable_auto_mixed_precision(self, enable_auto_mixed_precision): - self._context_handle.set_auto_mixed_precision_flag(enable_auto_mixed_precision) + self._context_handle.set_auto_mixed_precision_flag( + enable_auto_mixed_precision) @property def enable_reduce_precision(self): @@ -248,7 +255,8 @@ class _Context: @enable_reduce_precision.setter def enable_reduce_precision(self, enable_reduce_precision): - 
self._context_handle.set_enable_reduce_precision_flag(enable_reduce_precision) + self._context_handle.set_enable_reduce_precision_flag( + enable_reduce_precision) @property def enable_dump(self): @@ -280,12 +288,21 @@ class _Context: @profiling_options.setter def profiling_options(self, option): - options = ["training_trace", "task_trace", "task_trace:training_trace", "training_trace:task_trace", "op_trace"] + options = ["training_trace", "task_trace", + "task_trace:training_trace", "training_trace:task_trace", "op_trace"] if option not in options: raise ValueError("Profiling options must be in 'training_trace' 'task_trace' " "'task_trace:training_trace' 'training_trace:task_trace' or 'op_trace'.") self._context_handle.set_profiling_options(option) + @property + def enable_graph_kernel(self): + return self._context_handle.get_enable_graph_kernel() + + @enable_graph_kernel.setter + def enable_graph_kernel(self, graph_kernel_switch_): + self._context_handle.set_enable_graph_kernel(graph_kernel_switch_) + @property def reserve_class_name_in_scope(self): """Gets whether to save the network class name in the scope.""" @@ -303,13 +320,19 @@ class _Context: @variable_memory_max_size.setter def variable_memory_max_size(self, variable_memory_max_size): if not check_input_format(variable_memory_max_size): - raise ValueError("Context param variable_memory_max_size should be in correct format! Such as \"5GB\"") + raise ValueError( + "Context param variable_memory_max_size should be in correct format! Such as \"5GB\"") if int(variable_memory_max_size[:-2]) >= _DEVICE_APP_MEMORY_SIZE: - raise ValueError("Context param variable_memory_max_size should be less than 31GB.") - variable_memory_max_size_ = variable_memory_max_size[:-2] + " * 1024 * 1024 * 1024" - graph_memory_max_size = _DEVICE_APP_MEMORY_SIZE - int(variable_memory_max_size[:-2]) - graph_memory_max_size_ = str(graph_memory_max_size) + " * 1024 * 1024 * 1024" - self._context_handle.set_variable_memory_max_size(variable_memory_max_size_) + raise ValueError( + "Context param variable_memory_max_size should be less than 31GB.") + variable_memory_max_size_ = variable_memory_max_size[:- + 2] + " * 1024 * 1024 * 1024" + graph_memory_max_size = _DEVICE_APP_MEMORY_SIZE - \ + int(variable_memory_max_size[:-2]) + graph_memory_max_size_ = str( + graph_memory_max_size) + " * 1024 * 1024 * 1024" + self._context_handle.set_variable_memory_max_size( + variable_memory_max_size_) self._context_handle.set_graph_memory_max_size(graph_memory_max_size_) @property @@ -582,7 +605,8 @@ def get_context(attr_key): ValueError: If input key is not an attribute in context. """ if not hasattr(_context(), attr_key): - raise ValueError("Get context keyword %s is not recognized!" % attr_key) + raise ValueError( + "Get context keyword %s is not recognized!" % attr_key) return getattr(_context(), attr_key) @args_type_check(enable_mpi=bool) diff --git a/mindspore/nn/__init__.py b/mindspore/nn/__init__.py index f3f59edcbf..8d5e7d3b0a 100644 --- a/mindspore/nn/__init__.py +++ b/mindspore/nn/__init__.py @@ -18,14 +18,14 @@ Neural Networks Cells. Pre-defined building blocks or computing units to construct Neural Networks. """ from . 
import layer, loss, optim, metrics, wrap
-from .cell import Cell
+from .cell import Cell, GraphKernel
 from .layer import *
 from .loss import *
 from .optim import *
 from .metrics import *
 from .wrap import *
-__all__ = ["Cell"]
+__all__ = ["Cell", "GraphKernel"]
 __all__.extend(layer.__all__)
 __all__.extend(loss.__all__)
 __all__.extend(optim.__all__)
diff --git a/mindspore/nn/cell.py b/mindspore/nn/cell.py
index 65c1ce9548..c046c2e1bf 100755
--- a/mindspore/nn/cell.py
+++ b/mindspore/nn/cell.py
@@ -707,9 +707,6 @@ class Cell:
         return cells

     def add_flags(self, **flags):
-        for x in flags:
-            if not isinstance(flags[x], bool):
-                raise TypeError(f"Flags (f{x}) must be bool but {type(flags[x])}.")
         if not hasattr(self, "_mindspore_flags"):
             self._mindspore_flags = {}
         self._mindspore_flags.update({**flags})
@@ -820,3 +817,27 @@ class Cell:
         """
         self._backward_hook = HookBackward(fn, self.cls_name + "(" + str(id(self)) + ")")
         self.enable_hook = True
+
+class GraphKernel(Cell):
+    """
+    Base class for GraphKernel.
+
+    A `GraphKernel` is a composite of basic primitives that can be compiled into a fused kernel automatically when
+    context.set_context(enable_graph_kernel=True).
+
+    Examples:
+        >>> class Relu(GraphKernel):
+        >>>     def __init__(self):
+        >>>         super(Relu, self).__init__()
+        >>>         self.max = P.Maximum()
+        >>>
+        >>>     def construct(self, x):
+        >>>         return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x)
+    """
+    def __init__(self, auto_prefix=True, pips=None):
+        super(GraphKernel, self).__init__(auto_prefix, pips)
+        class_name = self.__class__.__name__
+        self.add_flags(graph_kernel=class_name)
+
+    def construct(self):
+        raise NotImplementedError
diff --git a/mindspore/nn/graph_kernels/__init__.py b/mindspore/nn/graph_kernels/__init__.py
new file mode 100644
index 0000000000..8128f2db60
--- /dev/null
+++ b/mindspore/nn/graph_kernels/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+GraphKernel.
+
+GraphKernel provides a unified style to express graph and kernel for users.
+It breaks the boundary between graph and kernel and provides more opportunities for compile optimization.
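With the GraphKernel base class above and the enable_graph_kernel context switch added in this change, user code is expected to look roughly like the sketch below. Illustrative only: it assumes a build with ENABLE_D and ENABLE_AKG, and FusedRelu is a made-up example name.

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P

# The new context switch added by this patch; AKG is only wired up for Ascend here.
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                    enable_graph_kernel=True)

class FusedRelu(nn.GraphKernel):
    """A composite of basic primitives, fused into a single kernel when graph kernel is enabled."""
    def __init__(self):
        super(FusedRelu, self).__init__()
        self.max = P.Maximum()

    def construct(self, x):
        return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x)

net = FusedRelu()
out = net(Tensor(np.array([-1.0, 2.0, -3.0], np.float32)))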
+""" +from .graph_kernels import MaximumGrad, MinimumGrad, AbsGrad, ApplyMomentum, BiasAdd, EqualCount, \ + ReduceMean, ReLU, SoftmaxCrossEntropyWithLogits, LayerNorm, LayerNormXBackprop, \ + LayerNormBetaGammaBackprop, LogSoftmax, Tanh, TanhGrad, Gelu, Softmax, BiasAddGrad, \ + LambUpdateWithLR, LambNextMV + +__all__ = ['MaximumGrad', 'MinimumGrad', 'AbsGrad', 'ApplyMomentum', 'BiasAdd', 'EqualCount', + 'ReduceMean', 'ReLU', 'SoftmaxCrossEntropyWithLogits', 'LayerNorm', + 'LayerNormXBackprop', 'LayerNormBetaGammaBackprop', 'LogSoftmax', 'Tanh', 'TanhGrad', + 'Gelu', 'Softmax', 'BiasAddGrad', 'LambUpdateWithLR', 'LambNextMV' + ] diff --git a/mindspore/nn/graph_kernels/graph_kernels.py b/mindspore/nn/graph_kernels/graph_kernels.py new file mode 100644 index 0000000000..21cc4f8710 --- /dev/null +++ b/mindspore/nn/graph_kernels/graph_kernels.py @@ -0,0 +1,1201 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Graph kernels. They are composites of basic primitives and can be compiled into +a fused kernel automaticly when context.set_context(enable_graph_kernel=True). +""" +from ...common import dtype as mstype +from ...ops import operations as P +from ...ops.primitive import PrimitiveWithInfer, prim_attr_register +from ...ops.composite import multitype_ops as C +from ...ops.operations import _grad_ops as G +from ..._checkparam import ParamValidator as validator +from ..cell import Cell, GraphKernel + + +class InplaceAssign(PrimitiveWithInfer): + """ + Inplace assign `Parameter` with a value. + + This primitive can only use in graph kernel. + + Inputs: + - **variable** (Parameter) - The `Parameter`. + - **value** (Tensor) - The value to assign. + - **depend** (Tensor) - The dependent tensor to keep this op connected in graph. + + Outputs: + Tensor, has the same type as original `variable`. + + Examples: + >>> def construct(self, x): + >>> val = x - 1.0 + >>> ret = x + 2.0 + >>> return InplaceAssign()(x, val, ret) + >>> x = Tensor([2.0], mindspore.float32) + >>> net = Net() + >>> net(x) + """ + @prim_attr_register + def __init__(self): + self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output']) + + def infer_shape(self, x, y, z): + return z + + def infer_dtype(self, x, y, z): + return z + + def get_bprop(self): + def bprop(x, y, z, out, dout): + return (x, C.zeros_like(y), dout) + return bprop + + +class MaximumGrad(GraphKernel): + """ + + Backprop function for Maximum operator. + + Inputs: + - **x** (Tensor) - The first input tensor of maximum. + - **y** (Tensor) - The second input tensor of maximum. + - **dout** (Tensor) - has the same shape as x and y, next operator's backprop output. 
+ + Outputs: + dx (Tensor): has the same shape as x and y, returns dout element if + `x >= y` returns true at the same position, or returns zero at that + position + dy (Tensor): has the same shape as x and y, dy = dout - dx + + Examples: + >>> layer = MaximumGrad() + >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) + """ + + def __init__(self, grad_x=True, grad_y=True): + super(MaximumGrad, self).__init__() + self.grad_x = grad_x + self.grad_y = grad_y + self.select = P.Select() + self.greater_equal = P.GreaterEqual() + self.zeros_like = P.ZerosLike() + self.sub = P.Sub() + + def construct(self, x, y, dout): + cmp_result = self.greater_equal(x, y) + dx = self.select(cmp_result, dout, self.zeros_like(dout)) + dy = dout - dx + + return dx, dy + + +class MinimumGrad(GraphKernel): + """ + Backprop function for Minimum operator. + + Compares x and y elementwise, dout should has the same shape with x and y. + + Inputs: + - **x** (Tensor) - The first input + - **y** (Tensor) - x and y should have same shape + - **dout** (Tensor) - Has the same shape as x and y, next operator's backprop output + + Outputs: + - dx (Tensor) - Has the same shape as x and y, returns dout element if + `x <= y` returns true at the same position, or returns zero at that + position + - dy (Tensor) - Has the same shape as x and y, dy = dout - dx + + Examples: + >>> layer = MinimumGrad() + >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) + """ + + def __init__(self, grad_x=True, grad_y=True): + super(MinimumGrad, self).__init__() + self.grad_x = grad_x + self.grad_y = grad_y + self.select = P.Select() + self.less_equal = P.LessEqual() + self.zeros_like = P.ZerosLike() + self.sub = P.Sub() + + def construct(self, x, y, dout): + cmp_result = self.less_equal(x, y) + dx = self.select(cmp_result, dout, self.zeros_like(dout)) + # dy = self.select(cmp_result, self.zeros_like(dout), dout) + dy = dout - dx + + return dx, dy + + +class AbsGrad(GraphKernel): + """ + Abs's backprop function. + + Inputs: + **input_x** (Tensor) - input data of this operator. + **dout** (Tensor) - output of the next operator's backprop function. + + Outputs: + Tensor, has the same shape as input_x. + + Examples: + >>> back = AbsGrad() + >>> output = back(Tensor([1, 2, 3]), Tensor([4, 5, 6])) + """ + + def __init__(self): + super(AbsGrad, self).__init__() + self.mul = P.Mul() + self.abs = P.Abs() + self.add = P.TensorAdd() + self.div = P.RealDiv() + self.round = P.Round() + + def construct(self, input_x, dout): + NUM_MAX = 32768 + mul_max = self.mul(input_x, P.Fill()(P.DType()(input_x), (1,), NUM_MAX)) + res_abs = self.abs(mul_max) + res_div = self.div(mul_max, res_abs) + res_round = self.round(res_div) + res = self.mul(res_round, dout) + return res + + +class ApplyMomentum(GraphKernel): + """ + Update parameter according to the ApplyMomentum algorithm. 
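A quick NumPy check of the mask identity that MaximumGrad and MinimumGrad above rely on: dx takes dout wherever the comparison holds, dy is recovered as dout - dx, so the two pieces always sum back to dout. NumPy and the sample values are used here only for illustration.

import numpy as np

x = np.array([1.0, 2.0, 3.0], np.float32)
y = np.array([3.0, 2.0, 1.0], np.float32)
dout = np.array([4.0, 5.0, 6.0], np.float32)

# MaximumGrad: route dout to x wherever x >= y; the remainder goes to y.
dx_max = np.where(x >= y, dout, 0.0)
dy_max = dout - dx_max

# MinimumGrad: same pattern with x <= y.
dx_min = np.where(x <= y, dout, 0.0)
dy_min = dout - dx_min

assert np.all(dx_max + dy_max == dout)
assert np.all(dx_min + dy_min == dout)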
+ + Inputs: + variable (Tensor): mutable tensor var + accumulation (Tensor): mutable tensor accum + learning_rate (float32): learning rate + gradient (float32): The gradient + momentum (float32): Momentum + + Outputs: updated accumulation and variable + """ + + def __init__(self, + use_nesterov=False, + use_locking=False, + gradient_scale=1.0): + super(ApplyMomentum, self).__init__() + self.gradient_scale = validator.check_type('gradient_scale', gradient_scale, [float]) + self.fake_output_assign_1 = InplaceAssign() + self.fake_output_assign_1.add_prim_attr("fake_output", True) + self.fake_output_assign_2 = InplaceAssign() + self.fake_output_assign_2.add_prim_attr("fake_output", True) + + def construct(self, variable, accumulation, learning_rate, gradient, momentum): + gradient = gradient * self.gradient_scale + momt_accumulation = accumulation * momentum + accumulation_inplace = momt_accumulation + gradient + + sum_gradient = accumulation_inplace * learning_rate + variable_inplace = variable - sum_gradient + + accumulation_inplace = self.fake_output_assign_1(accumulation, accumulation_inplace, accumulation_inplace) + variable_inplace = self.fake_output_assign_2(variable, variable_inplace, variable_inplace) + return accumulation_inplace, variable_inplace + + +class BiasAdd(GraphKernel): + """ + Return the sum of x and bias. + + Inputs: + x (Tensor): Tensor of input data. + bias (Tensor): The bias tensor. + + Output: + Tensor, the sum of x and bias. + + Example: + >>> layer = BiasGrad() + >>> output = BiasAdd(Tensor([1, 2, 3]), Tensor([1,])) + """ + + def __init__(self): + super(BiasAdd, self).__init__() + + def construct(self, x, bias): + shape = P.Shape()(x) + if len(shape) == 4: + bias_shape = (1, P.Shape()(bias)[0], 1, 1) # NCHW + else: + bias_shape = (1, P.Shape()(bias)[0]) + res = x + P.Reshape()(bias, bias_shape) + return res + +class BiasAddGrad(GraphKernel): + """ + Computes gradients of BiasAdd. + + Inputs: + x (Tensor): the gradients of bias add output. + + Output: + Tensor, the gradients of bias add input. + + Examples: + >>> dout = Tensor(np.ones(shape=[1, 2, 3, 4]), mindspore.float32) + >>> bias_add_grad = BiasAddGrad() + >>> dx = bias_add_grad(dout) + """ + def __init__(self): + super(BiasAddGrad, self).__init__() + + def construct(self, x): + shape_x = P.Shape()(x) + reduce_axis = [0] + for i in range(2, len(shape_x)): + reduce_axis.append(i) + + res = P.ReduceSum()(x, reduce_axis) + return res + + +class EqualCount(GraphKernel): + """ + Computes the number of the same elements of two tensors. + + The two input tensors should have same shape and data type. + + Inputs: + x (Tensor): the first input tensor. + y (Tensor): the second input tensor. + + Outputs: + Tensor, the type is same as input tensor and size as (1,). + + Examples: + >>> x = Tensor(np.array([1, 2, 3]), mindspore.int32) + >>> y = Tensor(np.array([1, 2, 4]), mindspore.int32) + >>> equal_count = EqualCount() + >>> equal_count(x, y) + """ + def __init__(self): + super(EqualCount, self).__init__() + + def construct(self, x, y): + equal_bool = P.Equal()(P.Cast()(x, mstype.float32), P.Cast()(y, mstype.float32)) + equal_count = P.Cast()(equal_bool, mstype.float16) + + axes = (0,) + res = P.ReduceSum()(equal_count, axes) + res = P.Cast()(res, P.DType()(x)) + return res + + +class ReduceMean(GraphKernel): + """ + Reduce a dimension of a tensor by averaging all elements in the dimension. + + The dtype of the tensor to be reduced is number. 
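Restated in plain NumPy (with gradient_scale left at its default of 1.0), the ApplyMomentum graph kernel above computes a decayed-and-incremented accumulator and a variable update, then uses InplaceAssign to write both back into the Parameters while keeping the ops inside the fused graph. The values below are hypothetical.

import numpy as np

var = np.array([1.0, 2.0], np.float32)
accum = np.array([0.5, 0.5], np.float32)
grad = np.array([0.2, 0.4], np.float32)
lr, momentum = 0.1, 0.9

new_accum = accum * momentum + grad   # momt_accumulation + gradient
new_var = var - new_accum * lr        # variable - sum_gradient

# In the cell, InplaceAssign(accum, new_accum, new_accum) and
# InplaceAssign(var, new_var, new_var) perform the in-place writes.
print(new_accum, new_var)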
+ + Args: + keep_dims (bool): If True, keep these reduced dimensions and the length is 1. + If False, don't keep these dimensions. Default : False. + + Inputs: + - **input_x** (Tensor[Number]) - The input tensor. + - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce. Default: (), reduce all dimensions. + Only constant value is allowed. + + Outputs: + Tensor, has the same dtype as the 'input_x'. + + - If axis is (), and keep_dims is false, + the output is a 0-D tensor representing the sum of all elements in the input tensor. + - If axis is int, set as 2, and keep_dims is false, + the shape of output is :math:`(x_1, x_3, ..., x_R)`. + - If axis is tuple(int), set as (2, 3), and keep_dims is false, + the shape of output is :math:`(x_1, x_4, ..., x_R)`. + + Examples: + >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> op = ReduceMean(keep_dims=True) + >>> output = op(input_x, 1) + """ + + def __init__(self, keep_dims=True): + super(ReduceMean, self).__init__() + self.keep_dims = validator.check_type('keep_dims', keep_dims, [bool]) + self.sum = P.ReduceSum(self.keep_dims) + + def construct(self, x, axis): + shape = P.Shape()(x) + value_num = 1 + for i in axis: + value_num *= shape[i] + + data_sum = self.sum(x, axis) + avg = 1.0 / P.Fill()(P.DType()(x), (1,), value_num) + res = data_sum * avg + return res + + +class ReLU(GraphKernel): + r""" + Computes ReLU(Rectified Linear Unit) of input tensor element-wise. + + It returns :math:`\max(x,\ 0)` element-wise. + + Inputs: + - **input_x** (Tensor) - The input tensor. + + Outputs: + Tensor, with the same type and shape as the `input_x`. + + Examples: + >>> input_x = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32) + >>> relu = ReLU() + >>> result = relu(input_x) + [[0, 4.0, 0.0], [2.0, 0.0, 9.0]] + """ + def __init__(self): + super(ReLU, self).__init__() + self.max = P.Maximum() + + def construct(self, x): + return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x) + + +class SoftmaxCrossEntropyWithLogits(GraphKernel): + r""" + Gets the softmax cross-entropy value between logits and labels which shoule be one-hot encoding. + + Note: + Sets input logits as `X`, input label as `Y`, output as `loss`. Then, + + .. math:: + p_{ij} = softmax(X_{ij}) = \frac{exp(x_i)}{\sum_{j = 0}^{N-1}\exp(x_j)} + + .. math:: + loss_{ij} = -\sum_j{Y_{ij} * ln(p_{ij})} + + Inputs: + - **logits** (Tensor) - Input logits, with shape :math:`(N, C)`. + - **labels** (Tensor) - Ground truth labels, with shape :math:`(N, C)`. + + Outputs: + Tuple of 2 Tensor, the loss shape is `(N,)`, and the dlogits with the same shape as `logits`. 
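The loss and gradient formulas above reduce to the usual numerically stable decomposition (subtract the row max, exponentiate, normalize), which is what the construct that follows builds out of basic primitives. A NumPy sketch of the same computation, using the values from the docstring example:

import numpy as np

logits = np.array([[2., 4., 1., 4., 5.], [2., 1., 2., 4., 3.]], np.float32)
labels = np.array([[0., 0., 0., 0., 1.], [0., 0., 0., 1., 0.]], np.float32)

shifted = logits - logits.max(axis=1, keepdims=True)            # data_sub
exp = np.exp(shifted)                                           # data_exp
softmax = exp / exp.sum(axis=1, keepdims=True)                  # data_div
log_softmax = shifted - np.log(exp.sum(axis=1, keepdims=True))  # data_log

loss = -(labels * log_softmax).sum(axis=1)   # shape (N,)
backprop = softmax - labels                  # same shape as logits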
+ + Examples: + >>> logits = Tensor([[2, 4, 1, 4, 5], [2, 1, 2, 4, 3]], mindspore.float32) + >>> labels = Tensor([[0, 0, 0, 0, 1], [0, 0, 0, 1, 0]], mindspore.float32) + >>> softmax_cross = SoftmaxCrossEntropyWithLogits() + >>> loss, backprop = softmax_cross(logits, labels) + """ + + def __init__(self): + super(SoftmaxCrossEntropyWithLogits, self).__init__() + self.max = P.ReduceMax(keep_dims=True) + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + + def construct(self, features, labels): + data_max = self.max(features, (1,)) + data_sub = features - data_max + data_exp = P.Exp()(data_sub) + data_sum = self.sum_keep_dims(data_exp, (1,)) + data_div = data_exp / data_sum + data_log_tmp = P.Log()(data_sum) + data_log = data_sub - data_log_tmp + data_mul = labels * data_log + data_muls = P.Neg()(data_mul) + loss = P.ReduceSum()(data_muls, (1,)) + backprop = data_div - labels + return loss, backprop + + def bprop(self, features, labels, out, dout): + grad = out[1] + grad = grad * P.ExpandDims()(dout[0], -1) + return grad, P.ZerosLike()(labels) + + +class LayerNormForward(GraphKernel): + """ Forward function of the LayerNorm operator. """ + def __init__(self, begin_norm_axis=1, begin_params_axis=1): + super(LayerNormForward, self).__init__() + self.begin_norm_axis = validator.check_type('begin_norm_axis', begin_norm_axis, [int]) + self.begin_params_axis = validator.check_type('begin_params_axis', begin_params_axis, [int]) + self.mul = P.Mul() + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + self.sub = P.Sub() + self.add = P.TensorAdd() + self.log = P.Log() + self.exp = P.Exp() + self.eps = P.Eps() + + def construct(self, input_x, input_gamma, input_beta): + shape_x = P.Shape()(input_x) + + # Calculate the scaling ratio of the average + begin_norm_axis = self.begin_norm_axis + if begin_norm_axis < 0: + begin_norm_axis += len(shape_x) + reduce_axis = () + for i in range(len(shape_x)): + if i > begin_norm_axis or i == begin_norm_axis: + reduce_axis = reduce_axis + (i,) + + reduce_elts = 1.0 + for i in reduce_axis: + reduce_elts *= shape_x[i] + mean_cof = 1.0 / reduce_elts + + # Calculate mean + mean_muls = self.mul(input_x, mean_cof) + mean = self.sum_keep_dims(mean_muls, reduce_axis) + + # Calculate variance + variance_sub = self.sub(input_x, mean) + variance_mul = self.mul(variance_sub, variance_sub) + variance_muls = self.mul(variance_mul, mean_cof) + variance = self.sum_keep_dims(variance_muls, reduce_axis) + + # Calculate normalize + normalize_sub = self.sub(input_x, mean) + epsilon = self.eps(input_x) + normalize_add = self.add(variance, epsilon) + normalize_log = self.log(normalize_add) + normalize_log_mul = self.mul(normalize_log, -0.5) + normalize_exp = self.exp(normalize_log_mul) + normalize_mul = self.mul(normalize_sub, normalize_exp) + + # Calculate scale and translate + if self.begin_params_axis == 0: + scale_mul = self.mul(input_gamma, normalize_mul) + res = self.add(scale_mul, input_beta) + else: + scale_mul = self.mul(input_gamma, normalize_mul) + res = self.add(scale_mul, input_beta) + + return res, mean, variance + + +class LayerNormXBackprop(GraphKernel): + r""" + Together with LayerNormBetaGammaBackprop, to supply the backprop + functionality for LayerNorm. + + Note: + Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, + input_gamma as :math:`\gamma`. Then, + .. 
math:: + \begin{array}{ll} \\ + \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ + \frac {\partial L} {\partial x_i} = + \frac{\gamma}{\sqrt{\sigma^2+\epsilon}} + ( \frac{\partial L}{\partial y_i} + - \frac{1}{m} \cdot \frac{\partial L}{\partial \beta} + - \frac{\hat{x_i}}{m} \cdot \frac{\partial L}{\partial \gamma}) + \end{array} + + Inputs: + - **dy**(Tensor) - The first item of the next operator's backprop's output. + - **input_x**(Tensor) - The first input of the forward function of LayerNorm. + - **variance**(Tensor) - The second input of the forward function of LayerNorm. + - **mean**(Tensor) - The third input of the forward function of LayerNorm. + - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. + + Outputs: + Tensor, the output of this operator, will be used as the first item of the result of + LayerNorm's backprop function, has the same shape and data type as 'input_x'. + + Examples: + >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> op = LayerNormXBackprop(keep_dims=False) + >>> output = op(dy, input_x, variance, mean, input_gamma) + """ + + def __init__(self): + super(LayerNormXBackprop, self).__init__() + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + self.log = P.Log() + self.exp = P.Exp() + self.eps = P.Eps() + + def construct(self, dy, input_x, variance, mean, input_gamma): + shape_x = P.Shape()(input_x) + shape_mean = P.Shape()(mean) + reduce_axis = () + flag = -1 + min_l = 0 + if len(shape_x) > len(shape_mean): + min_l = len(shape_x) + else: + min_l = len(shape_mean) + for i in range(min_l): + if (shape_x[i] != shape_mean[i]) and (flag == -1): + flag = i + if flag != -1: + for i in range(flag, len(shape_x)): + reduce_axis = reduce_axis + (i,) + else: + reduce_axis = reduce_axis + (len(shape_x) - 1,) + mean_num = 1.0 + for i in reduce_axis: + mean_num *= shape_x[i] + pd_xl = input_gamma * dy + epsilon = self.eps(input_x) + var_elta = variance + epsilon + var_elta_log = self.log(var_elta) + var_elta_mul = var_elta_log * -0.5 + var_elta_2 = P.Exp()(var_elta_mul) + pdvar1_mul = var_elta_2 * var_elta_2 + pd_var_1 = pdvar1_mul * var_elta_2 + sub_x_mean = input_x - mean + pdvar_mul1 = pd_xl * sub_x_mean + pdvar_sum = self.sum_keep_dims(pdvar_mul1, reduce_axis) + pdvar_mul3 = pdvar_sum * pd_var_1 + pd_var = pdvar_mul3 * -0.5 + pdmean1_sum = self.sum_keep_dims(pd_xl, reduce_axis) + pdmean1_mul = pdmean1_sum * var_elta_2 + pd_mean_1 = pdmean1_mul * -1.0 + pdmean2_mul1 = sub_x_mean * -2.0 + pdmean2_sum = self.sum_keep_dims(pdmean2_mul1, reduce_axis) + pdmean2_mul3 = pdmean2_sum * (1.0 / mean_num) + pd_mean_2 = pd_var * pdmean2_mul3 + pd_mean = pd_mean_2 + pd_mean_1 + pd_x_1 = var_elta_2 * pd_xl + pdx2_mul = pd_var * sub_x_mean + pd_x_2 = pdx2_mul * (2.0 * (1.0 / mean_num)) + pd_x_3 = pd_mean * (1.0 / mean_num) + pdx_add = pd_x_1 + pd_x_2 + pd_x = pdx_add + pd_x_3 + return pd_x + + +class LayerNormBetaGammaBackprop(GraphKernel): + r""" + Together with LayerNormXBackprop, to supply the backprop functionality for + LayerNorm. + Note: + Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, + input_gamma as :math:`\gamma`. Then, + .. 
math:: + \begin{array}{ll} \\ + \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ + \frac {\partial L} {\partial \beta} = + \sum_{i=1}^m \\frac{\\partial L}{\partial y_i} \\ + \frac {\partial L} {\partial \gamma} = + \sum_{i=1}^m \\frac{\partial L}{\partial y_i} \cdot \hat{x_i} + \end{array} + + Inputs: + - **dy**(Tensor) - The first item of the next operator's backprop's output. + - **input_x**(Tensor) - The first input of the forward function of LayerNorm. + - **variance**(Tensor) - The second input of the forward function of LayerNorm. + - **mean**(Tensor) - The third input of the forward function of LayerNorm. + - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. + + Outputs: + Tuple of 2 Tensor, the backprop outputs. + + - **pd_beta**(Tensor) - The first item of return value of this operator, will be used as + the second item of the LayerNorm's backprop function. + - **pd_gamma**(Tensor) - The second item of return value of this operator, will be used as + the third item of the LayerNorm's backprop function. + + Examples: + >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) + >>> op = LayerNormBetaGammaBackprop(keep_dims=False) + >>> pd_beta, pd_gamma = op(dy, input_x, variance, mean, input_gamma) + """ + def __init__(self): + super(LayerNormBetaGammaBackprop, self).__init__() + self.sum_not_keep_dims = P.ReduceSum(keep_dims=False) + self.log = P.Log() + self.exp = P.Exp() + self.eps = P.Eps() + + def construct(self, dy, input_x, variance, mean, shape_gamma): + shape_x = P.Shape()(input_x) + params_axis = () + + if len(shape_x) != len(shape_gamma): + sub = len(shape_x) - len(shape_gamma) + for i in range(sub): + params_axis = params_axis + (i,) + + pd_beta = self.sum_not_keep_dims(dy, params_axis) + epsilon = self.eps(input_x) + var_elta = variance + epsilon + var_elta_log = self.log(var_elta) + var_elta_mul = var_elta_log * -0.5 + var_elta_2 = P.Exp()(var_elta_mul) + sub_x_mean = input_x - mean + var_elta_2_cast = var_elta_2 + xl_mul = var_elta_2_cast * sub_x_mean + pdga_mul = dy * xl_mul + pd_gamma = self.sum_not_keep_dims(pdga_mul, params_axis) + return pd_beta, pd_gamma + + +class LogSoftmax(GraphKernel): + r""" + Log Softmax activation function. + + Applies the Log Softmax function to the input tensor on the specified axis. + Suppose a slice along the given aixs :math:`x` then for each element :math:`x_i` + the Log Softmax function is shown as follows: + + .. math:: + \text{output}(x_i) = \log \left(\frac{exp(x_i)} {\sum_{j = 0}^{N-1}\exp(x_j)}\right), + + where :math:`N` is the length of the Tensor. + + Args: + axis (int): The axis to do the Log softmax operation. Default: -1. + + Inputs: + logits (Tensor): The input of Log Softmax. + + Outputs: + Tensor, with the same type and shape as the logits. 
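One detail of the LayerNorm cells above worth calling out: the reciprocal standard deviation 1/sqrt(variance + epsilon) is built as exp(-0.5 * log(variance + epsilon)), presumably to stay within the simple elementwise primitives a graph kernel composes. A NumPy check of that identity (shapes and epsilon are arbitrary illustration values):

import numpy as np

x = np.random.randn(3, 4).astype(np.float32)
eps = 1e-7

mean = x.mean(axis=-1, keepdims=True)
var = ((x - mean) ** 2).mean(axis=-1, keepdims=True)

rstd_direct = 1.0 / np.sqrt(var + eps)
rstd_composed = np.exp(np.log(var + eps) * -0.5)   # normalize_exp in LayerNormForward

assert np.allclose(rstd_direct, rstd_composed, rtol=1e-5)
normalized = (x - mean) * rstd_composed            # normalize_mul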
+ + Examples: + >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) + >>> log_softmax = LogSoftmax() + >>> log_softmax(input_x) + [-4.4519143, -3.4519143, -2.4519143, -1.4519144, -0.4519144] + """ + + def __init__(self, axis=-1): + super(LogSoftmax, self).__init__() + self.axis = validator.check_type('axis', axis, [int]) + self.max_keep_dims = P.ReduceMax(keep_dims=True) + self.sub = P.Sub() + self.exp = P.Exp() + self.sum_keep_dims = P.ReduceSum(keep_dims=True) + self.log = P.Log() + self.mul = P.Mul() + + def construct(self, input_x): + data_max = self.max_keep_dims(input_x, (self.axis,)) + data_sub = self.sub(input_x, data_max) + + data_exp = self.exp(data_sub) + data_sum = self.sum_keep_dims(data_exp, (self.axis,)) + data_log = self.log(data_sum) + + res = self.sub(data_sub, data_log) + return res + + def bprop(self, input_x, out, dout): + input_x = out + input_dy = dout + + data_exp = self.exp(input_x) + data_sum = self.sum_keep_dims(input_dy, (self.axis,)) + data_softmax = self.mul(data_exp, data_sum) + + res = self.sub(input_dy, data_softmax) + return (res,) + + +class Tanh(GraphKernel): + r""" + Tanh activation function. + + Computes hyperbolic tangent of input element-wise. The Tanh function is defined as: + + .. math:: + tanh(x_i) = \frac{\exp(x_i) - \exp(-x_i)}{\exp(x_i) + \exp(-x_i)} = \frac{\exp(2x_i) - 1}{\exp(2x_i) + 1}, + + where :math:`x_i` is an element of the input Tensor. + + Inputs: + - **input_x** (Tensor) - The input of Tanh. + + Outputs: + Tensor, with the same type and shape as the input_x. + + Examples: + >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) + >>> tanh = Tanh() + >>> tanh(input_x) + [0.7615941, 0.9640276, 0.9950548, 0.9993293, 0.99990916] + """ + def __init__(self): + super(Tanh, self).__init__() + self.abs = P.Abs() + self.add = P.TensorAdd() + self.div = P.RealDiv() + self.mul = P.Mul() + self.mul_fp16 = P.Mul() + self.mul_fp16.add_prim_attr("output_precision", "float16") + self.exp = P.Exp() + + def construct(self, input_x): + input_abs = self.abs(input_x) + sign_flag = self.div(input_x, input_abs) + sign_flag_neg = self.mul(sign_flag, -1.0) + + power_val = self.mul(input_abs, -2.0) + exp_val = self.exp(power_val) + up_val = self.add(exp_val, -1.0) + down_val = self.add(exp_val, 1.0) + + div_val = self.div(up_val, down_val) + res = self.mul(sign_flag_neg, div_val) + return res + + def bprop(self, input_x, out, dout): + input_y = out + input_dy = dout + + data_square = self.mul(input_y, input_y) + data_mul = self.mul(data_square, -1.0) + anuminate = self.add(data_mul, 1.0) + res = self.mul_fp16(anuminate, input_dy) + + return (res,) + +class TanhGrad(GraphKernel): + """ + Backprop function of Tanh + + Mathematical calculating: + result = Tanh(out) + result = 1 - result * result + result = result * dout + Inputs: + out (Tensor): Tanh's output + dout (Tensor): next layer's backward function's output, has same shape as out + + Outputs: + result (Tensor): result of (1 - tanh(out)^2) * dout + + Examples: + >>> x_np = np.random.randn(5, 3, 6).astype(np.float16) + >>> dy_np = np.random.randn(5, 3, 6).astype(np.float16) + >>> x_ms = Tensor(x_np) + >>> dy_ms = Tensor(dy_np) + >>> tanh_grad = TanhGrad() + >>> out = tanh_grad(x_np, dy_np) + """ + def __init__(self): + super(TanhGrad, self).__init__() + self.add = P.TensorAdd() + self.mul = P.Mul() + self.mul_fp16 = P.Mul() + self.mul_fp16.add_prim_attr("output_precision", "float16") + + def construct(self, out, dout): + input_y = out + input_dy = dout + + data_square = 
self.mul(input_y, input_y) + data_mul = self.mul(data_square, -1.0) + anuminate = self.add(data_mul, 1.0) + res = self.mul_fp16(anuminate, input_dy) + + return res + +class Gelu(GraphKernel): + r""" + Gaussian Error Linear Units activation function. + + GeLU is described in the paper `Gaussian Error Linear Units (GELUs) `_. + And also please refer to `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. + `_. + + Defined as follows: + + .. math:: + \text{output} = 0.5 * x * (1 + erf(x / \sqrt{2})), + + where :math:`erf` is the "Gauss error function" . + + Inputs: + - **input_x** (Tensor) - Input to compute the Gelu. + + Outputs: + Tensor, with the same type and shape as input. + + Examples: + >>> tensor = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32) + >>> gelu = Gelu() + >>> result = gelu(tensor) + """ + + def __init__(self): + super(Gelu, self).__init__() + self.add = P.TensorAdd() + self.abs = P.Abs() + self.exp = P.Exp() + self.neg = P.Neg() + self.minimum = P.Minimum() + self.div = P.RealDiv() + self.mul = P.Mul() + self.CSVALUE = 0.044715 + self.CSVALUE_A = 1.59576912 + self.CSVALUE_5 = 0.3989422804 + self.CSVALUE_3B = 0.2140644488 + + def construct(self, input_x): + def _tanh_parameter_compute(data_x): + """ + compute the parameter of tanh: + return: result equal (x+0.044715*tf.pow(x,3)) + """ + mul_0 = self.mul(data_x, data_x) + pow_0 = self.mul(mul_0, data_x) + mul_1 = self.mul(pow_0, self.CSVALUE) + result = self.add(data_x, mul_1) + + return result + + tanh_parameter = _tanh_parameter_compute(input_x) + mul_0 = self.mul(tanh_parameter, 1.5957691) + + mul_0_min = self.minimum(mul_0, 0.0) + right_mul = self.exp(mul_0_min) + + mul_0_abs = self.abs(mul_0) + mul_0_abs_neg = self.mul(mul_0_abs, -1.0) + mul_0_abs_neg_exp = self.exp(mul_0_abs_neg) + + mul_0_abs_neg_exp_add = self.add(mul_0_abs_neg_exp, 1.0) + left_mul = self.div(input_x, mul_0_abs_neg_exp_add) + + result = self.mul(left_mul, right_mul) + return result + + def bprop(self, input_x, out, dout): + """ register backprop function for Gelu """ + data_x = input_x + data_gelu = out + data_dy = dout + + def _math_four_compute(data_x): + """ + return: math_four equal 2*(np(sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3))) + """ + datax_pow = data_x * data_x * data_x + datax_muls_c = self.mul(datax_pow, self.CSVALUE) + datax_addx = self.add(datax_muls_c, data_x) + datax_muls_s = self.mul(datax_addx, self.CSVALUE_A) + + return datax_muls_s + + # common part + math_four = _math_four_compute(data_x) + math_four_abs = self.abs(math_four) + math_four_abs_neg = self.mul(math_four_abs, -1.0) + math_four_abs_neg_exp = self.exp(math_four_abs_neg) + math_four_min = self.minimum(math_four, 0.0) + + # dividend part + datax_pow = self.mul(data_x, data_x) + datax_pow_mul = self.mul(datax_pow, self.CSVALUE_3B) + datax_pow_mul_add = self.add(datax_pow_mul, self.CSVALUE_A) + data_gelu_mul = self.mul(data_gelu, datax_pow_mul_add) + math_four_min_2 = self.mul(math_four_min, 2.0) + div_right = self.mul(data_gelu_mul, math_four_abs_neg_exp) + div_left = self.exp(math_four_min_2) + dividend = self.add(div_left, div_right) + + # divisor part + div_0 = self.add(math_four_abs_neg_exp, 1.0) + div_1 = self.exp(math_four_min) + divisor = self.mul(div_1, div_0) + res_grad = self.div(dividend, divisor) + + result = self.mul(res_grad, data_dy) + return (result,) + + +class Softmax(GraphKernel): + """ + Operator Softmax + .. 
math:: \frac{\exp(x - \max(x))}{\sum \exp(x - \max(x))} + + Args: + axis (Union[int, tuple[int]]): Axis along which the softmax normalization is applied. + + Inputs: + x (Tensor): input data for softmax + + Outputs: + output (Tensor): a tensor with the same shape as the input + + Examples: + >>> layer = Softmax(1) + >>> x = Tensor(np.array([[1.2, 2.1], [2.2, 3.2]]), mindspore.float32) + >>> output = layer(x) + """ + + def __init__(self, axis): + super(Softmax, self).__init__() + validator.check_type("axis", axis, [int, tuple]) + if isinstance(axis, int): + self.axis = (axis,) + else: + self.axis = axis + for item in self.axis: + validator.check_type("item of axis", item, [int]) + self.max = P.ReduceMax(keep_dims=True) + self.sub = P.Sub() + self.exp = P.Exp() + self.sum = P.ReduceSum(keep_dims=True) + self.mul = P.Mul() + + def construct(self, x): + max_x = self.max(x, self.axis) + data_sub = self.sub(x, max_x) + data_exp = self.exp(data_sub) + data_expsum = self.sum(data_exp, self.axis) + output = data_exp / data_expsum + return output + + def bprop(self, x, out, dout): + mul_res = self.mul(dout, out) + sum_res = self.sum(mul_res, self.axis) + sub_res = self.sub(dout, sum_res) + res = self.mul(sub_res, out) + return (res,) + + +class LayerNorm(Cell): + r""" + Applies Layer Normalization over a mini-batch of inputs. + + Layer normalization is widely used in recurrent neural networks. It applies + normalization over a mini-batch of inputs for each single training case as described + in the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`_. Unlike batch + normalization, layer normalization performs exactly the same computation at training and + testing times. It can be described using the following formula, and is applied per sample + across all channels and pixels rather than across the batch. + + .. math:: + y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + Args: + normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axes + `begin_norm_axis ... R - 1`. + begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions + `begin_norm_axis: rank(inputs)`; the value should be in [-1, rank(input)). Default: -1. + begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters + will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with + the normalized inputs accordingly; the value should be in [-1, rank(input)). Default: -1. + gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight. + The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', + 'he_uniform', etc. Default: 'ones'. + beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight. + The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', + 'he_uniform', etc. Default: 'zeros'. + + Inputs: + - **input_x** (Tensor) - The shape of `input_x` is :math:`(x_1, x_2, ..., x_R)`, + and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`. + + Outputs: + Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`. 
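The Softmax graph kernel above uses the usual max-shift for numerical stability, and its bprop relies on the identity dx = out * (dout - sum(out * dout)) along the softmax axis. A minimal NumPy sketch of both, for reference only (not part of the patch; the helper names are invented):

import numpy as np

def softmax_ref(x, axis=-1):
    shifted = x - x.max(axis=axis, keepdims=True)  # subtracting the max keeps exp() in range
    e = np.exp(shifted)
    return e / e.sum(axis=axis, keepdims=True)

def softmax_bprop_ref(out, dout, axis=-1):
    # Same identity as Softmax.bprop above: dx = out * (dout - sum(out * dout))
    return out * (dout - (out * dout).sum(axis=axis, keepdims=True))

x = np.random.randn(2, 5)
dout = np.random.randn(2, 5)
out = softmax_ref(x)
print(out.sum(axis=-1))                    # each row sums to 1
print(softmax_bprop_ref(out, dout).shape)  # (2, 5)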
+ + Examples: + >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) + >>> shape1 = x.shape()[1:] + >>> m = G.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) + >>> m(x) + """ + + def __init__(self, + begin_norm_axis=-1, + begin_params_axis=-1 + ): + super(LayerNorm, self).__init__() + self.begin_norm_axis = begin_norm_axis + self.begin_params_axis = begin_params_axis + self.layer_norm = LayerNormForward(begin_norm_axis, begin_params_axis) + self.layer_norm_x_grad = LayerNormXBackprop() + self.layer_norm_beta_gamma = LayerNormBetaGammaBackprop() + self.layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis) + + def construct(self, input_x, input_gamma, input_beta): + return self.layer_norm(input_x, input_gamma, input_beta) + + # case 1 + def bprop(self, input_x, input_gamma, input_beta, out, dout): + dx, d_gamma, d_beta = self.layer_norm_grad(input_x, dout[0], out[2], dout[1], input_gamma) + return dx, d_gamma, d_beta + + +class LambUpdateWithLR(GraphKernel): + r""" + Part of Lamb optimizer. + + .. math:: + s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se) + i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4 + + Inputs: + - **input0** (Tensor) - The first tensor to be computed. + - **input1** (Tensor) - The second tensor to be computed. + - **input2** (Tensor) - The third tensor to be computed. + - **input3** (Tensor) - The fourth tensor to be computed. + - **input4** (Tensor) - The fifth tensor to be computed. + - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result. + - **greater_y** (Tensor) - The seventh tensor to be computed. + - **select_e** (Tensor) - The eighth tensor to be computed. + - **minimum_y** (Tensor) - The ninth tensor to be computed. + + Outputs: + A fake output tensor. + + Examples: + >>> lamb_update = LambUpdateWithLR() + >>> i0 = np.random.normal(0, 1, [1, 16]).astype(np.float32) + >>> i1 = np.random.normal(0, 1, [1]).astype(np.float32) + >>> i2 = np.random.normal(0, 1, [1]).astype(np.float32) + >>> i3 = np.random.normal(0, 1, [1]).astype(np.float32) + >>> i4 = np.random.normal(0, 1, [1, 16]).astype(np.float32) + >>> i5 = np.random.normal(0, 1, [1, 16]).astype(np.float32) + >>> yg = np.random.normal(0, 1, [1]).astype(np.float32) + >>> se = np.random.normal(0, 1, [1]).astype(np.float32) + >>> ym = np.random.normal(0, 1, [1]).astype(np.float32) + >>> lamb_update(i0, i1, i2, i3, i4, i5, yg, se, ym) + + """ + + def __init__(self): + super(LambUpdateWithLR, self).__init__() + self.greater = P.Greater() + self.select = P.Select() + self.div = P.RealDiv() + self.min = P.Minimum() + self.max = P.Maximum() + self.mul = P.Mul() + self.sub = P.Sub() + self.fake_output_assign = InplaceAssign() + self.fake_output_assign.add_prim_attr("fake_output", True) + + def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y): + greater0 = self.greater(input0, greater_y) + greater1 = self.greater(input1, greater_y) + real_div0 = self.div(input1, input2) + select0 = self.select(greater0, real_div0, select_e) + select1 = self.select(greater1, select0, select_e) + min0 = self.min(select1, minimum_y) + max0 = self.max(min0, greater_y) + mul0 = self.mul(max0, input3) + mul1 = self.mul(mul0, input4) + sub0 = self.sub(input5, mul1) + sub0 = self.fake_output_assign(input5, sub0, sub0) + return sub0 + +class LambNextMV(GraphKernel): + r""" + Part of Lamb optimizer. + + .. 
math:: + rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i6} + rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i3} + y_2 = \frac{rd_0}{\sqrt{rd_1 + x3}} + x_2 \times i_7 + y_3 = \frac{rd_0}{\sqrt{rd_1} + x3} + i5 = i_8 \times i_5 + i_9 \times i_4 + i2 = x_0 \times i_2 + x_1 \times i_1 + + Inputs: + - **inputs1** (Tensor) - The first input tensor to be computed. + - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result. + - **inputs3** (Tensor) - The third input tensor to be computed. + - **inputs4** (Tensor) - The fourth input tensor to be computed. + - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result. + - **inputs6** (Tensor) - The sixth input tensor to be computed. + - **inputs7** (Tensor) - The seventh input tensor to be computed. + - **inputs8** (Tensor) - The eighth input tensor to be computed. + - **inputs9** (Tensor) - The ninth input tensor to be computed. + - **inputsx0** (Tensor) - The tenth input tensor to be computed. + - **inputsx1** (Tensor) - The eleventh input tensor to be computed. + - **inputsx2** (Tensor) - The twelfth input tensor to be computed. + - **inputsx3** (Tensor) - The thirteenth input tensor to be computed. + + Outputs: + Tuple of 2 Tensor. + + - **add3** (Tensor) - The shape is same as the shape after broadcasting, and the data type is + the one with high precision or high digits among the inputs. + - **realdiv4** (Tensor) - The shape is same as the shape after broadcasting, and the data type is + the one with high precision or high digits among the inputs. + + Examples: + >>> lamb_next_mv = LambNextMV() + >>> i1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i3 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i4 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i5 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i6 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i7 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i8 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> i9 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x0 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) + >>> x3 = Tensor(np.ones([1, 16]).astype(np.float32) * 1e-6) + >>> lamb_next_mv(i1, i2, i3, i4, i5, i6, i7, i8, i9, x0, x1, x2, x3) + + """ + + def __init__(self): + super(LambNextMV, self).__init__() + self.mul = P.Mul() + self.add = P.TensorAdd() + self.div = P.RealDiv() + self.sqrt = P.Sqrt() + self.rsqrt = P.Rsqrt() + self.fake_output_assign_1 = InplaceAssign() + self.fake_output_assign_1.add_prim_attr("fake_output", False) + self.fake_output_assign_2 = InplaceAssign() + self.fake_output_assign_2.add_prim_attr("fake_output", False) + + + def construct(self, input1, input2, input3, input4, input5, input6, input7, + input8, input9, inputx0, inputx1, inputx2, inputx3): + mul3 = self.mul(inputx1, input1) + mul2 = self.mul(inputx0, input2) + add1 = self.add(mul2, mul3) + realdiv1 = self.div(add1, input3) + add2 = self.add(realdiv1, inputx3) + sqrt0 = self.rsqrt(add2) + sqrt1 = self.sqrt(realdiv1) + add4 = self.add(sqrt1, inputx3) + mul1 = self.mul(input9, input4) + mul0 = self.mul(input8, input5) + add0 = self.add(mul0, mul1) + realdiv0 = self.div(add0, input6) 
+ realdiv2 = self.mul(realdiv0, sqrt0) + realdiv4 = self.div(realdiv0, add4) + mul4 = self.mul(inputx2, input7) + add3 = self.add(realdiv2, mul4) + + add3 = self.fake_output_assign_1(input5, add0, add3) + add3 = self.fake_output_assign_2(input2, add1, add3) + + return add3, realdiv4 diff --git a/mindspore/nn/layer/activation.py b/mindspore/nn/layer/activation.py index f20ad8692d..9b62639bb1 100644 --- a/mindspore/nn/layer/activation.py +++ b/mindspore/nn/layer/activation.py @@ -20,8 +20,10 @@ from mindspore.common.parameter import Parameter from mindspore.common.initializer import initializer from mindspore.common.tensor import Tensor from mindspore._extends import cell_attr_register +from mindspore.ops import _selected_ops from ..cell import Cell + __all__ = ['Softmax', 'LogSoftmax', 'ReLU', @@ -73,7 +75,7 @@ class Softmax(Cell): def __init__(self, axis=-1): super(Softmax, self).__init__() - self.softmax = P.Softmax(axis) + self.softmax = _selected_ops.Softmax(axis) def construct(self, x): return self.softmax(x) @@ -110,7 +112,7 @@ class LogSoftmax(Cell): def __init__(self, axis=-1): super(LogSoftmax, self).__init__() - self.log_softmax = P.LogSoftmax(axis) + self.log_softmax = _selected_ops.LogSoftmax(axis) def construct(self, x): return self.log_softmax(x) @@ -286,7 +288,7 @@ class Tanh(Cell): def __init__(self): super(Tanh, self).__init__() - self.tanh = P.Tanh() + self.tanh = _selected_ops.Tanh() def construct(self, x): return self.tanh(x) @@ -318,7 +320,7 @@ class GELU(Cell): def __init__(self): super(GELU, self).__init__() - self.gelu = P.Gelu() + self.gelu = _selected_ops.Gelu() def construct(self, x): return self.gelu(x) @@ -503,6 +505,7 @@ class LogSigmoid(Cell): [-3.1326166e-01, -1.2692806e-01, -4.8587345e-02] """ + def __init__(self): super(LogSigmoid, self).__init__() self.mul = P.Mul() diff --git a/mindspore/nn/layer/basic.py b/mindspore/nn/layer/basic.py index c050d35f8f..548fbcec1e 100644 --- a/mindspore/nn/layer/basic.py +++ b/mindspore/nn/layer/basic.py @@ -27,10 +27,12 @@ from mindspore.common.parameter import Parameter from mindspore._extends import cell_attr_register from mindspore.common.api import ms_function from mindspore import context +from mindspore.ops import _selected_ops from ..cell import Cell from .activation import get_activation from ..._checkparam import Validator as validator + __all__ = ['Dropout', 'Flatten', 'Dense', 'ClipByNorm', 'Norm', 'OneHot', 'Pad', 'Unfold'] class Dropout(Cell): @@ -74,6 +76,7 @@ class Dropout(Cell): >>> net = nn.Dropout(keep_prob=0.8) >>> net(x) """ + def __init__(self, keep_prob=0.5, seed0=0, seed1=0, dtype=mstype.float32): super(Dropout, self).__init__() if keep_prob <= 0 or keep_prob > 1: @@ -137,6 +140,7 @@ class Flatten(Cell): [[1.2 1.2 2.1 2.1] [2.2 2.2 3.2 3.2]] """ + def __init__(self): super(Flatten, self).__init__() @@ -212,7 +216,7 @@ class Dense(Cell): self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.matmul = P.MatMul(transpose_b=True) - self.bias_add = P.BiasAdd() + self.bias_add = _selected_ops.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None @@ -271,6 +275,7 @@ class ClipByNorm(Cell): >>> net(input, clip_norm) """ + def __init__(self): super(ClipByNorm, self).__init__() self.reduce_sum = P.ReduceSum(keep_dims=True) @@ -302,6 +307,7 @@ class ClipByNorm(Cell): intermediate = x else: intermediate = x * clip_norm + max_norm = self.max_op(l2norm, clip_norm) values_clip = self.cast(intermediate, mstype.float32) / 
self.expand_dims(max_norm, -1) values_clip = self.reshape(values_clip, self.shape(x)) @@ -330,6 +336,7 @@ class Norm(Cell): >>> input = Tensor(np.random.randint(0, 10, [4, 16]), mindspore.float32) >>> net(input) """ + def __init__(self, axis=(), keep_dims=False): super(Norm, self).__init__() self.axis = axis @@ -392,6 +399,7 @@ class OneHot(Cell): [0. 1.] [0. 0.]]] """ + def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, dtype=mstype.float32): super(OneHot, self).__init__() self.onehot = P.OneHot(axis) @@ -506,6 +514,7 @@ class Unfold(Cell): Tensor ([[[[1, 1] [1, 1]] [[1, 1], [1, 1]] [[1, 1] [1, 1]], [[1, 1], [1, 1]]]], shape=(1, 4, 2, 2), dtype=mstype.float16) """ + def __init__(self, ksizes, strides, rates, padding="valid"): super(Unfold, self).__init__() self.extract_image_patches = inner.ExtractImagePatches(ksizes, strides, rates, padding) diff --git a/mindspore/nn/layer/normalization.py b/mindspore/nn/layer/normalization.py index 744e87ece5..4c7ea9d4d6 100644 --- a/mindspore/nn/layer/normalization.py +++ b/mindspore/nn/layer/normalization.py @@ -24,9 +24,11 @@ from mindspore._extends import cell_attr_register from mindspore.communication.management import get_group_size, get_rank from mindspore.communication import management from mindspore._checkparam import check_int_positive +from mindspore.ops import _selected_ops from ..cell import Cell + __all__ = ['BatchNorm1d', 'BatchNorm2d', 'LayerNorm', 'GroupNorm', 'GlobalBatchNorm'] class _BatchNorm(Cell): @@ -116,12 +118,11 @@ class _BatchNorm(Cell): "local_rank_size is {}".format(group_size, get_group_size())) if len(world_rank) % group_size != 0: raise ValueError("please make your group size correct.") - world_rank_list = zip(*(iter(world_rank),) *group_size) + world_rank_list = zip(*(iter(world_rank),) * group_size) group_list = [list(i) for i in world_rank_list] return group_list - def _global_sync(self, x, axes, re_shape): """calculate global batch normalization output""" x_mean = self.reduce_mean(x, axes) @@ -188,15 +189,19 @@ class _BatchNorm(Cell): return 'num_features={}, eps={}, momentum={}, gamma={}, beta={}, moving_mean={}, moving_variance={}'.format( self.num_features, self.eps, self.momentum, self.gamma, self.beta, self.moving_mean, self.moving_variance) + @constexpr def _channel_check(channel, num_channel): if channel != num_channel: raise ValueError("the input channel is not equal with num_channel") + @constexpr def _shape_check(in_shape): if len(in_shape) != 4: raise ValueError("The input must has 4 dims") + + @constexpr def _shape_infer(x_shape, num_feature): """global batch normalization shape and axes infer""" @@ -208,6 +213,7 @@ def _shape_infer(x_shape, num_feature): re_shape = (1, num_feature) return axes, re_shape + class BatchNorm1d(_BatchNorm): r""" Batch normalization layer over a 2D input. 
@@ -257,6 +263,7 @@ class BatchNorm1d(_BatchNorm): >>> input = Tensor(np.random.randint(0, 255, [3, 16]), mindspore.float32) >>> net(input) """ + def __init__(self, num_features, eps=1e-5, @@ -276,6 +283,7 @@ class BatchNorm1d(_BatchNorm): moving_mean_init, moving_var_init, use_batch_statistics) + def _check_data_dim(self, x): if x.dim() != 2: pass @@ -330,6 +338,7 @@ class BatchNorm2d(_BatchNorm): >>> input = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), mindspore.float32) >>> net(input) """ + def __init__(self, num_features, eps=1e-5, @@ -349,6 +358,7 @@ class BatchNorm2d(_BatchNorm): moving_mean_init, moving_var_init, use_batch_statistics) + def _check_data_dim(self, x): if x.dim() != 4: pass @@ -404,6 +414,7 @@ class GlobalBatchNorm(_BatchNorm): >>> input = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), mindspore.float32) >>> global_bn_op(input) """ + def __init__(self, num_features, eps=1e-5, @@ -428,10 +439,12 @@ class GlobalBatchNorm(_BatchNorm): self.group = check_int_positive(device_num_each_group) if self.group <= 1: raise ValueError("the number of group must be greater than 1.") + def _check_data_dim(self, x): if x.dim == 0: pass + class LayerNorm(Cell): r""" Applies Layer Normalization over a mini-batch of inputs. @@ -475,6 +488,7 @@ class LayerNorm(Cell): >>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) >>> m(x) """ + def __init__(self, normalized_shape, begin_norm_axis=-1, @@ -495,8 +509,8 @@ class LayerNorm(Cell): gamma_init, normalized_shape), name="gamma") self.beta = Parameter(initializer( beta_init, normalized_shape), name="beta") - self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis, - epsilon=self.epsilon) + self.layer_norm = _selected_ops.LayerNorm(begin_norm_axis=self.begin_norm_axis, + begin_params_axis=self.begin_params_axis) def construct(self, input_x): y, _, _ = self.layer_norm(input_x, self.gamma, self.beta) @@ -508,6 +522,7 @@ class LayerNorm(Cell): self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta) return s + class GroupNorm(Cell): r""" Group Normalization over a mini-batch of inputs. 
@@ -544,6 +559,7 @@ class GroupNorm(Cell): >>> x = Tensor(np.ones([1, 64, 256, 256], np.float32)) >>> goup_norm_op(x) """ + def __init__(self, num_groups, num_channels, eps=1e-05, affine=True, gamma_init='ones', beta_init='zeros'): super(GroupNorm, self).__init__() self.num_groups = check_int_positive(num_groups) diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py index 426f111bb2..4639229c41 100644 --- a/mindspore/nn/loss/loss.py +++ b/mindspore/nn/loss/loss.py @@ -18,6 +18,7 @@ from mindspore.common.tensor import Tensor from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore.ops.primitive import constexpr +from mindspore.ops import _selected_ops from mindspore.nn.cell import Cell from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel @@ -44,7 +45,7 @@ class _Loss(Cell): if reduction == 'none': self.reduce = False - self.reduce_mean = P.ReduceMean() + self.reduce_mean = _selected_ops.ReduceMean() self.reduce_sum = P.ReduceSum() def get_axis(self, x): @@ -245,11 +246,11 @@ class SoftmaxCrossEntropyWithLogits(_Loss): super(SoftmaxCrossEntropyWithLogits, self).__init__(reduction) self.is_grad = is_grad self.sparse = sparse - validator.check_integer("num_classes", num_classes, 1, Rel.GT, self.cls_name) - validator.check_number_range("smooth_factor", smooth_factor, 0, 1, Rel.INC_BOTH, self.cls_name) + validator.check_number_range( + "smooth_factor", smooth_factor, 0, 1, Rel.INC_BOTH, self.cls_name) self.smooth_factor = smooth_factor self.num_classes = num_classes - self.softmax_cross_entropy = P.SoftmaxCrossEntropyWithLogits() + self.softmax_cross_entropy = _selected_ops.SoftmaxCrossEntropyWithLogits() self.one_hot = P.OneHot() self.on_value = Tensor(1.0 - self.smooth_factor, mstype.float32) self.off_value = Tensor(1.0 * self.smooth_factor / (self.num_classes - 1), mstype.float32) diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index ab914da08e..832b35d66f 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -14,6 +14,7 @@ # ============================================================================ """lamb""" import numpy as np +from mindspore import context from mindspore.common import dtype as mstype from mindspore.common.initializer import initializer from mindspore.ops import operations as P @@ -25,13 +26,15 @@ from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from .optimizer import Optimizer from .. import layer +from .. 
import graph_kernels as G num_one = Tensor(np.ones([1]), mstype.float32) _lamb_opt = C.MultitypeFuncGraph("lamb_opt") -@_lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", - "Tensor", "Bool") + +@_lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", + "Tensor", "Tensor", "Tensor", "Tensor", "Bool") def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, param, m, v, gradient, decay_flag): """ @@ -72,9 +75,11 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, para v_fp32 = op_cast(v, mstype.float32) gradient_fp32 = op_cast(gradient, mstype.float32) - next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(num_one, mstype.float32) - beta1, gradient_fp32) + next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(num_one, + mstype.float32) - beta1, gradient_fp32) - next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32)) + next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(num_one, + mstype.float32) - beta2, op_square(gradient_fp32)) next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow(beta1, op_cast(global_step + num_one, mstype.float32))) @@ -83,7 +88,8 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, para w_norm = op_norm(param_fp32) g_norm = op_norm(gradient_fp32) - g_norm_hat = op_norm(op_mul(next_mm, op_rsqrt(next_vv + eps)) + weight_decay_tensor * param_fp32) + g_norm_hat = op_norm(op_mul(next_mm, op_rsqrt( + next_vv + eps)) + weight_decay_tensor * param_fp32) zeros = F.zeros_like(w_norm) ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0) trust_ratio = op_select( @@ -108,6 +114,70 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, para return next_v +lamb_opt_graph_kernel = C.MultitypeFuncGraph("lamb_opt_graph_kernel") + + +@lamb_opt_graph_kernel.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", + "Tensor", "Tensor", "Tensor", "Tensor", "Bool") +def _update_run_op_graph_kernel(beta1, beta2, eps, lr, weight_decay_tensor, + global_step, param, m, v, gradient, decay_flag): + """ + Update parameters. + + Args: + beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). + beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). + eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0. + lr (Tensor): Learning rate. + weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0. + global_step (Tensor): Global step. + param (Tensor): Parameters. + m (Tensor): m value of parameters. + v (Tensor): v value of parameters. + gradient (Tensor): Gradient of parameters. + decay_flag (bool): Specifies whether param update with weight decay. + + Returns: + Tensor, the new value of v after updating. 
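For reference, here is the computation that the graph-kernel branch below assembles out of LambNextMV and LambUpdateWithLR, written in plain NumPy. It is a sketch under the argument mapping used at the call sites (`step` stands for `global_step + 1`, `eps` plays the role of `x3`, and the trust ratio is clamped to [0, 10]); `lamb_step_ref` is an invented name and this is not part of the patch.

import numpy as np

def lamb_step_ref(param, m, v, grad, lr, beta1, beta2, eps, weight_decay, step, decay_flag=True):
    next_m = beta1 * m + (1.0 - beta1) * grad            # LambNextMV: add0 (new m)
    next_v = beta2 * v + (1.0 - beta2) * grad * grad     # LambNextMV: add1 (new v)
    next_mm = next_m / (1.0 - beta1 ** step)             # bias correction
    next_vv = next_v / (1.0 - beta2 ** step)

    add3 = next_mm / np.sqrt(next_vv + eps) + weight_decay * param  # first LambNextMV output
    update = next_mm / (np.sqrt(next_vv) + eps)                     # second LambNextMV output
    if decay_flag:
        update = update + weight_decay * param

    w_norm = np.linalg.norm(param)
    g_norm = np.linalg.norm(grad)
    g_norm_hat = np.linalg.norm(add3)

    # LambUpdateWithLR: trust ratio, used only when both norms are positive, clamped to [0, 10]
    trust_ratio = w_norm / g_norm_hat if (g_norm > 0.0 and w_norm > 0.0) else 1.0
    trust_ratio = float(np.clip(trust_ratio, 0.0, 10.0))
    next_param = param - trust_ratio * lr * update
    return next_param, next_m, next_v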
+ """ + op_mul = P.Mul() + op_square = P.Square() + op_cast = P.Cast() + op_shape = P.Shape() + op_pow = P.Pow() + op_norm = layer.Norm() + op_fill = P.Fill() + op_dtype = P.DType() + + param_fp32 = op_cast(param, mstype.float32) + gradient_fp32 = op_cast(gradient, mstype.float32) + + i6_ex = op_cast(global_step + num_one, mstype.float32) + i9 = op_cast(num_one, mstype.float32) - beta1 + x1 = op_cast(num_one, mstype.float32) - beta2 + i6 = op_cast(num_one, mstype.float32) - op_pow(beta1, i6_ex) + i3 = op_cast(num_one, mstype.float32) - op_pow(beta2, i6_ex) + i1 = op_square(gradient_fp32) + add3, update = G.LambNextMV()(i1, v, i3, gradient, m, i6, param, beta1, + i9, beta2, x1, weight_decay_tensor, eps) + + if decay_flag: + update = update + op_mul(weight_decay_tensor, param_fp32) + + w_norm = op_norm(param_fp32) + g_norm = op_norm(gradient_fp32) + g_norm_hat = op_norm(add3) + + zeros = F.zeros_like(w_norm) + ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0) + tens = op_fill(op_dtype(w_norm), op_shape(w_norm), 10.0) + + next_param = G.LambUpdateWithLR()(g_norm, w_norm, g_norm_hat, lr, update, + param, zeros, ones, tens) + next_v = F.control_depend(add3, next_param) + return next_v + + def _check_param_value(decay_steps, warmup_steps, start_learning_rate, end_learning_rate, power, beta1, beta2, eps, weight_decay, prim_name): """Check the type of inputs.""" @@ -124,11 +194,16 @@ def _check_param_value(decay_steps, warmup_steps, start_learning_rate, validator.check_value_type("beta1", beta1, [float], prim_name) validator.check_value_type("beta2", beta2, [float], prim_name) validator.check_value_type("eps", eps, [float], prim_name) - validator.check_value_type("weight_dacay", weight_decay, [float], prim_name) - validator.check_number_range("beta1", beta1, 0.0, 1.0, Rel.INC_NEITHER, prim_name) - validator.check_number_range("beta2", beta2, 0.0, 1.0, Rel.INC_NEITHER, prim_name) - validator.check_number_range("eps", eps, 0.0, float("inf"), Rel.INC_NEITHER, prim_name) - validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, prim_name) + validator.check_value_type( + "weight_dacay", weight_decay, [float], prim_name) + validator.check_number_range( + "beta1", beta1, 0.0, 1.0, Rel.INC_NEITHER, prim_name) + validator.check_number_range( + "beta2", beta2, 0.0, 1.0, Rel.INC_NEITHER, prim_name) + validator.check_number_range( + "eps", eps, 0.0, float("inf"), Rel.INC_NEITHER, prim_name) + validator.check_number_range( + "weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, prim_name) class Lamb(Optimizer): @@ -186,7 +261,8 @@ class Lamb(Optimizer): decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()): super(Lamb, self).__init__(0.0, params) if self.is_group: - raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") + raise RuntimeError( + f"The {self.cls_name} optimizer cannot support group setting.") _check_param_value(decay_steps, warmup_steps, start_learning_rate, end_learning_rate, power, beta1, beta2, eps, weight_decay, self.cls_name) @@ -198,14 +274,18 @@ class Lamb(Optimizer): if warmup_steps > 0: self.warmup_flag = True self.decay_steps = Tensor(np.array([decay_steps]).astype(np.float32)) - self.start_learning_rate = Tensor(np.array([start_learning_rate]).astype(np.float32)) - self.end_learning_rate = Tensor(np.array([end_learning_rate]).astype(np.float32)) - self.diff_learning_rate = Tensor(np.array([start_learning_rate - end_learning_rate]).astype(np.float32)) + 
self.start_learning_rate = Tensor( + np.array([start_learning_rate]).astype(np.float32)) + self.end_learning_rate = Tensor( + np.array([end_learning_rate]).astype(np.float32)) + self.diff_learning_rate = Tensor( + np.array([start_learning_rate - end_learning_rate]).astype(np.float32)) self.power = power self.beta1 = Tensor(np.array([beta1]).astype(np.float32)) self.beta2 = Tensor(np.array([beta2]).astype(np.float32)) self.eps = Tensor(np.array([eps]).astype(np.float32)) - self.weight_decay_tensor = Tensor(np.array([weight_decay]).astype(np.float32)) + self.weight_decay_tensor = Tensor( + np.array([weight_decay]).astype(np.float32)) self.params = self.parameters self.moments1 = self.params.clone(prefix="lamb_m", init='zeros') self.moments2 = self.params.clone(prefix="lamb_v", init='zeros') @@ -217,19 +297,29 @@ class Lamb(Optimizer): self.greater = P.Greater() self.one = Tensor(np.array([1.0]).astype(np.float32)) self.cast = P.Cast() + self.enable_graph_kernel = context.get_context("enable_graph_kernel") def construct(self, gradients): step = self.min(self.global_step, self.decay_steps) p = step / self.decay_steps - lr = self.diff_learning_rate * self.pow(self.one - p, self.power) + self.end_learning_rate + lr = self.diff_learning_rate * \ + self.pow(self.one - p, self.power) + self.end_learning_rate if self.warmup_flag: warmup_percent = self.global_step / self.warmup_steps warmup_lr = self.start_learning_rate * warmup_percent - is_warmup = self.cast(self.greater(self.warmup_steps, self.global_step), mstype.float32) + is_warmup = self.cast(self.greater( + self.warmup_steps, self.global_step), mstype.float32) lr = (self.one - is_warmup) * lr + is_warmup * warmup_lr - updated_velocity = self.hyper_map(F.partial(_lamb_opt, self.beta1, self.beta2, self.eps, lr, - self.weight_decay_tensor, self.global_step), - self.params, self.moments1, self.moments2, gradients, self.decay_flag) + if self.enable_graph_kernel: + updated_velocity = self.hyper_map(F.partial(lamb_opt_graph_kernel, + self.beta1, self.beta2, self.eps, lr, + self.weight_decay_tensor, self.global_step), + self.params, self.moments1, self.moments2, gradients, self.decay_flag) + else: + updated_velocity = self.hyper_map(F.partial(_lamb_opt, + self.beta1, self.beta2, self.eps, lr, + self.weight_decay_tensor, self.global_step), + self.params, self.moments1, self.moments2, gradients, self.decay_flag) added_global_step = self.global_step + self.one F.control_depend(lr, added_global_step) diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py index d93f38510d..ebdc5d86bf 100755 --- a/mindspore/nn/optim/momentum.py +++ b/mindspore/nn/optim/momentum.py @@ -13,7 +13,8 @@ # limitations under the License. 
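The `_selected_ops` swaps throughout this patch (Softmax, LogSoftmax, Tanh, Gelu, BiasAdd, ReduceMean, LayerNorm, SoftmaxCrossEntropyWithLogits above, and ApplyMomentum in the momentum.py hunk here) all follow one idea: use the graph-kernel expansion when it is enabled, otherwise fall back to the regular primitive. The module itself is not shown in this patch, so the following is only a hypothetical sketch of such a selector; `_select_op` and the usage lines are invented, and only `context.get_context("enable_graph_kernel")` is taken from the lamb.py hunk above.

from mindspore import context

def _select_op(primitive_cls, graph_kernel_cls):
    # Hypothetical selector: build the graph-kernel variant only when the
    # "enable_graph_kernel" context flag is set, mirroring the check in lamb.py.
    def _factory(*args, **kwargs):
        if context.get_context("enable_graph_kernel"):
            return graph_kernel_cls(*args, **kwargs)
        return primitive_cls(*args, **kwargs)
    return _factory

# Hypothetical usage, mirroring the call sites in this patch:
#   Softmax = _select_op(P.Softmax, graph_kernels.Softmax)
#   self.softmax = Softmax(axis)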
# ============================================================================ """momentum""" -from mindspore.ops import functional as F, composite as C, operations as P +from mindspore.ops import functional as F, composite as C +from mindspore.ops import _selected_ops from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor import mindspore.common.dtype as mstype @@ -120,7 +121,7 @@ class Momentum(Optimizer): self.use_nesterov = check_bool(use_nesterov) self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = C.HyperMap() - self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov) + self.opt = _selected_ops.ApplyMomentum(use_nesterov=self.use_nesterov) def construct(self, gradients): params = self.params diff --git a/mindspore/ops/_grad/grad_math_ops.py b/mindspore/ops/_grad/grad_math_ops.py index fdcda730c3..2a8a4fb03b 100755 --- a/mindspore/ops/_grad/grad_math_ops.py +++ b/mindspore/ops/_grad/grad_math_ops.py @@ -17,6 +17,7 @@ from functools import reduce import numpy as np +from mindspore.ops import _selected_grad_ops as SG from .. import functional as F from .. import operations as P from ..operations import _grad_ops as G @@ -26,6 +27,7 @@ from .grad_base import bprop_getters from ..primitive import constexpr from ..composite.multitype_ops import _constexpr_utils as const_utils + shape_op = P.Shape() reduce_sum = P.ReduceSum() reshape = P.Reshape() @@ -468,7 +470,7 @@ def get_bprop_expm1(self): @bprop_getters.register(P.Minimum) def get_bprop_minimum(self): """Grad definition for `Minimum` operation.""" - input_grad = G.MinimumGrad() + input_grad = SG.MinimumGrad() def bprop(x, y, out, dout): dx, dy = input_grad(x, y, dout) @@ -480,7 +482,7 @@ def get_bprop_minimum(self): @bprop_getters.register(P.Maximum) def get_bprop_maximum(self): """Grad definition for `Maximum` operation.""" - input_grad = G.MaximumGrad() + input_grad = SG.MaximumGrad() def bprop(x, y, out, dout): dx, dy = input_grad(x, y, dout) @@ -910,7 +912,7 @@ def get_bprop_cosh(self): @bprop_getters.register(P.Abs) def get_bprop_abs(self): """Grad definition for `Abs` operation.""" - abs_grad = G.AbsGrad() + abs_grad = SG.AbsGrad() def bprop(x, out, dout): dx = abs_grad(x, dout) diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index 036d7ddec8..13fb89b23f 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -14,6 +14,7 @@ # ============================================================================ """Define the grad rules of neural network related operations.""" +from mindspore.ops import _selected_grad_ops as SG from .grad_base import bprop_getters from .. import functional as F from .. import operations as P @@ -23,10 +24,11 @@ from ..operations import _inner_ops as inner from ... 
import context + @bprop_getters.register(P.BiasAdd) def get_bprop_bias_add(self): """Grad definition for `BiasAdd` operation.""" - bias_grad = G.BiasAddGrad() + bias_grad = SG.BiasAddGrad() def bprop(x, w, out, dout): return dout, bias_grad(dout) @@ -303,7 +305,6 @@ def get_bprop_softmax(self): sub = P.Sub() mul = P.Mul() axis = self.axis - def bprop(x, out, dout): dx = mul(out, sub(dout, sum_func(mul(out, dout), axis))) return (dx,) @@ -338,10 +339,10 @@ def get_bprop_softplus(self): @bprop_getters.register(P.Tanh) def get_bprop_tanh(self): """Grad definition for `Tanh` operation.""" - logsoftmax_grad = G.TanhGrad() + tanh_grad = SG.TanhGrad() def bprop(x, out, dout): - dx = logsoftmax_grad(out, dout) + dx = tanh_grad(out, dout) return (dx,) return bprop @@ -404,7 +405,8 @@ def get_bprop_layer_norm(self): layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis) def bprop(x, gamma, beta, out, dout): - dx, d_gamma, d_beta = layer_norm_grad(x, dout[0], out[2], out[1], gamma) + dx, d_gamma, d_beta = layer_norm_grad( + x, dout[0], out[2], out[1], gamma) return dx, d_gamma, d_beta return bprop diff --git a/mindspore/ops/_op_impl/akg/__init__.py b/mindspore/ops/_op_impl/akg/__init__.py index e69de29bb2..f38b99f5e4 100644 --- a/mindspore/ops/_op_impl/akg/__init__.py +++ b/mindspore/ops/_op_impl/akg/__init__.py @@ -0,0 +1,88 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""autodiff ops""" +from .abs import _abs_akg +from .add_n import _add_n_akg +from .add import _add_akg +from .apply_momentum import _apply_momentum_akg +from .assign import _assign_akg +from .inplace_assign import _inplace_assign_akg +from .assign_add import _assign_add_akg +from .bias_add_grad import _bias_add_grad_akg +from .bias_add import _bias_add_akg +from .cast import _cast_akg +from .clear_zero import _clear_zero_akg +from .conv_bn1 import _conv_bn1_akg +from .conv2d_backprop_filter import _conv2d_backprop_filter_akg +from .conv2d_backprop_input import _conv2d_backprop_input_akg +from .conv2d import _conv2d_akg +from .div import _div_akg +from .equal_count import _equal_count_akg +from .exp import _exp_akg +from .five2four import _five2four_akg +from .four2five import _four2five_akg +from .fused_batch_norm_grad import _fused_batch_norm_grad_akg +from .fused_batch_norm_infer import _fused_batch_norm_infer_akg +from .fused_batch_norm import _fused_batch_norm_akg +from .fused_bn1_grad import _bn1_grad_akg +from .fused_bn1 import _fused_bn1_akg +from .fused_bn2_grad import _bn2_grad_akg +from .fused_bn2 import _fused_bn2_akg +from .fused_bn3_grad import _bn3_grad_akg +from .fused_bn3 import _fused_bn3_akg +from .gather_v2 import _gather_v2_akg +from .less import _less_akg +from .log import _log_akg +from .matmul import _matmul_akg +from .max_pool_grad_with_argmax import _max_pool_grad_with_argmax_akg +from .max_pool_with_argmax import _max_pool_with_argmax_akg +from .max import _max_akg +from .maximum import _maximum_akg +from .mean_grad import _mean_grad_akg +from .mean import _mean_akg +from .minimum import _minimum_akg +from .mul import _mul_akg +from .neg import _neg_akg +from .one_hot import _one_hot_akg +from .pow import _power_akg +from .real_div import _real_div_akg +from .reciprocal import _reciprocal_akg +from .reduce_max import _reduce_max_akg +from .reduce_mean import _reduce_mean_akg +from .reduce_sum import _reduce_sum_akg +from .relu_grad import _relu_grad_akg +from .relu import _relu_akg +from .reshape import _reshape_akg +from .round import _round_akg +from .rsqrt import _rsqrt_akg +from .select import _select_akg +from .softmax import _softmax_akg +from .sparse_softmax_cross_entropy_with_logits import _sparse_softmax_cross_entropy_with_logits_akg +from .sqrt import _sqrt_akg +from .strided_slice import _strided_slice_akg +from .sub import _sub_akg +from .sum import _sum_akg +from .tile import _tile_akg +from .zeros_like import _zeros_like_akg +from .argmax import _argmax_akg +from .floordiv import _floor_div_akg +from .equal import _equal_akg +from .greater_equal import _greater_equal_akg +from .less_equal import _less_equal_akg +from .expand_dims import _expand_dims_akg +from .greater import _greater_akg +from .equiv_format import _equiv_format_akg +from . import gpu diff --git a/mindspore/ops/_op_impl/akg/abs.py b/mindspore/ops/_op_impl/akg/abs.py new file mode 100644 index 0000000000..8c08f405da --- /dev/null +++ b/mindspore/ops/_op_impl/akg/abs.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Abs op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Abs", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _abs_akg(): + """Abs AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/add.py b/mindspore/ops/_op_impl/akg/add.py new file mode 100644 index 0000000000..60544ea1c7 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/add.py @@ -0,0 +1,72 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
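The AKG registrations above and below all share the same JSON layout: an op name, imply_type "AutoDiff", a fusion type, optional attributes, and per-port parallel "dtype"/"format" lists whose i-th entries together describe one supported combination. A small standalone check of that structure, as an illustration only (not part of the patch; `supported_combos` is an invented helper):

import json

def supported_combos(op_info_json):
    info = json.loads(op_info_json)
    combos = {}
    for port in info.get("inputs", []) + info.get("outputs", []):
        # dtype[i] pairs with format[i]; the two lists must stay index-aligned.
        assert len(port["dtype"]) == len(port["format"]), port["name"]
        combos[port["name"]] = list(zip(port["dtype"], port["format"]))
    return info["op_name"], combos

# Feeding the Abs registration above would yield pairs such as
# ("float16", "DefaultFormat") ... ("float32", "FRACTAL_NZ") for both "x" and "output".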
+# ============================================================================ + +"""TensorAdd op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "TensorAdd", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _add_akg(): + """TensorAdd AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/add_n.py b/mindspore/ops/_op_impl/akg/add_n.py new file mode 100644 index 0000000000..53320f752e --- /dev/null +++ b/mindspore/ops/_op_impl/akg/add_n.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""AddN op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "AddN", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32", "float16", "float32", + "float16","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0", "FracZ", "FracZ", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "dynamic", + "name": "inputs" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32", "float16", "float32", + "float16","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0", "FracZ", "FracZ", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _add_n_akg(): + """AddN AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/apply_momentum.py b/mindspore/ops/_op_impl/akg/apply_momentum.py new file mode 100644 index 0000000000..7160571882 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/apply_momentum.py @@ -0,0 +1,103 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ApplyMomentum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ApplyMomentum", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "use_nesterov", + "param_type": "optional", + "type": "bool" + }, + { + "name": "gradient_scale", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "variable" + }, + { + "index": 1, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "accumulation" + }, + { + "index": 2, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","DefaultFormat" + ], + "name": "learning_rate" + }, + { + "index": 3, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "gradient" + }, + { + "index": 4, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","DefaultFormat" + ], + "name": "momentum" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32","float32","float32" + ], + "format": [ + "DefaultFormat","NC1HWC0","FracZ" + ], + "name": "output" + } + ] +}""") +def _apply_momentum_akg(): + """ApplyMomentum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/argmax.py b/mindspore/ops/_op_impl/akg/argmax.py new file mode 100644 index 0000000000..b04862cbeb --- /dev/null +++ b/mindspore/ops/_op_impl/akg/argmax.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Argmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Argmax", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "axis", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _argmax_akg(): + """Argmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/assign.py b/mindspore/ops/_op_impl/akg/assign.py new file mode 100644 index 0000000000..e7c5a082bd --- /dev/null +++ b/mindspore/ops/_op_impl/akg/assign.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Assign op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Assign", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "ref" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "value" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "output" + } + ] +}""") +def _assign_akg(): + """Assign AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/assign_add.py b/mindspore/ops/_op_impl/akg/assign_add.py new file mode 100644 index 0000000000..7d0d345764 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/assign_add.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""AssignAdd op"""
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "AssignAdd",
+    "imply_type": "AutoDiff",
+    "fusion_type": "ELEMWISE",
+    "attr": [
+
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "ref"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "value"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "output"
+        }
+    ]
+}""")
+def _assign_add_akg():
+    """AssignAdd AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/bias_add.py b/mindspore/ops/_op_impl/akg/bias_add.py
new file mode 100644
index 0000000000..74f2bf7bcf
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/bias_add.py
@@ -0,0 +1,68 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""BiasAdd op"""
+
+from mindspore.ops.op_info_register import op_info_register
+
+@op_info_register("""{
+    "op_name": "BiasAdd",
+    "imply_type": "AutoDiff",
+    "fusion_type": "ELEMWISE",
+    "attr": [
+        {
+            "name": "data_format",
+            "param_type": "optional",
+            "type": "listStr"
+        }
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float16","float32","float16","float32","float16","float32"
+            ],
+            "format": [
+                "NHWC","NHWC","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat"
+            ],
+            "name": "x"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "float16","float32","float16","float32","float16","float32"
+            ],
+            "format": [
+                "NHWC","NHWC","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat"
+            ],
+            "name": "b"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float16","float32","float16","float32","float16","float32"
+            ],
+            "format": [
+                "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat"
+            ],
+            "name": "output"
+        }
+    ]
+}""")
+def _bias_add_akg():
+    """BiasAdd AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/bias_add_grad.py b/mindspore/ops/_op_impl/akg/bias_add_grad.py
new file mode 100644
index 0000000000..7726af6692
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/bias_add_grad.py
@@ -0,0 +1,58 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BiasAddGrad op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "BiasAddGrad", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32","float16","float32" + ], + "format": [ + "NHWC","NHWC","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat" + ], + "name": "dout" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16","float32","float16","float32","float16","float32" + ], + "format": [ + "DefaultFormat","DefaultFormat","NC1HWC0","NC1HWC0","DefaultFormat","DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _bias_add_grad_akg(): + """BiasAddGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/cast.py b/mindspore/ops/_op_impl/akg/cast.py new file mode 100644 index 0000000000..a78d4d87e4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/cast.py @@ -0,0 +1,74 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Cast op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Cast", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "dst_type", + "param_type": "required", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "bool", "bool", + "float16", "float32", "int32", "int32", + "bool", + "float16", "float32", "bool", "bool", + "float16", "float32", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", + "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32", "float16", "int32", "float16", + "int32", "int32", "float16", "float32", + "float32", + "float32", "float16", "int32", "float32", + "float32", "float16", "int32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", "DefaultFormat", "DefaultFormat", "DefaultFormat", + "DefaultFormat", + "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _cast_akg(): + """Cast AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/clear_zero.py b/mindspore/ops/_op_impl/akg/clear_zero.py new file mode 100644 index 0000000000..38bf35044f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/clear_zero.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""ClearZero op"""
+
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "ClearZero",
+    "imply_type": "AutoDiff",
+    "fusion_type": "ELEMWISE",
+    "attr": [
+        {
+            "name": "pad_mod",
+            "param_type": "optional",
+            "type": "string"
+        },
+        {
+            "name": "window",
+            "param_type": "optional",
+            "type": "int"
+        },
+        {
+            "name": "pad",
+            "param_type": "optional",
+            "type": "int"
+        },
+        {
+            "name": "stride",
+            "param_type": "optional",
+            "type": "int"
+        }
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "x"
+        }
+    ],
+    "outputs": [
+    ]
+}""")
+def _clear_zero_akg():
+    """ClearZero AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/conv2d.py b/mindspore/ops/_op_impl/akg/conv2d.py
new file mode 100644
index 0000000000..709aca7001
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/conv2d.py
@@ -0,0 +1,88 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Conv2D op"""
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "Conv2D",
+    "imply_type": "AutoDiff",
+    "fusion_type": "CONVLUTION",
+    "attr": [
+        {
+            "name": "x_shape",
+            "param_type": "required",
+            "type": "listInt"
+        },
+        {
+            "name": "w_shape",
+            "param_type": "required",
+            "type": "listInt"
+        },
+        {
+            "name": "pad_list",
+            "param_type": "required",
+            "type": "listInt"
+        },
+        {
+            "name": "stride",
+            "param_type": "optional",
+            "type": "int"
+        },
+        {
+            "name": "dilation",
+            "param_type": "optional",
+            "type": "int"
+        }
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float16"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "x"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "float16"
+            ],
+            "format": [
+                "FracZ"
+            ],
+            "name": "w"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float16"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "output"
+        }
+    ]
+}""")
+def _conv2d_akg():
+    """Conv2D AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/conv2d_backprop_filter.py b/mindspore/ops/_op_impl/akg/conv2d_backprop_filter.py
new file mode 100644
index 0000000000..1e4e4f1a1e
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/conv2d_backprop_filter.py
@@ -0,0 +1,88 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Conv2DBackpropFilter op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Conv2DBackpropFilter", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "input_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "filter_sizes", + "param_type": "required", + "type": "listInt" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad_list", + "param_type": "required", + "type": "listInt" + }, + { + "name": "dilation", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "out_backprop" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "input" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "FracZ" + ], + "name": "output" + } + ] +}""") +def _conv2d_backprop_filter_akg(): + """Conv2DBackpropFilter AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/conv2d_backprop_input.py b/mindspore/ops/_op_impl/akg/conv2d_backprop_input.py new file mode 100644 index 0000000000..52c7f2e7b3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/conv2d_backprop_input.py @@ -0,0 +1,88 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Conv2DBackpropInput op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Conv2DBackpropInput", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "input_sizes", + "param_type": "required", + "type": "listInt" + }, + { + "name": "filter_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad_list", + "param_type": "required", + "type": "listInt" + }, + { + "name": "dilation", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "out_backprop" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "FracZ" + ], + "name": "filter" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _conv2d_backprop_input_akg(): + """Conv2DBackpropInput AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/conv_bn1.py b/mindspore/ops/_op_impl/akg/conv_bn1.py new file mode 100644 index 0000000000..118c94e6fc --- /dev/null +++ b/mindspore/ops/_op_impl/akg/conv_bn1.py @@ -0,0 +1,108 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ConvBN1 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ConvBN1", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "x_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "w_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "pad_list", + "param_type": "required", + "type": "listInt" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + }, + { + "name": "dilation", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "FracZ" + ], + "name": "w" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "conv_res_16" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "var_part" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + } + ] +}""") +def _conv_bn1_akg(): + """ConvBN1 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/div.py b/mindspore/ops/_op_impl/akg/div.py new file mode 100644 index 0000000000..56cdcca868 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/div.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Div op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Div", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _div_akg(): + """Div AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/equal.py b/mindspore/ops/_op_impl/akg/equal.py new file mode 100644 index 0000000000..35874c62bb --- /dev/null +++ b/mindspore/ops/_op_impl/akg/equal.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Equal op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Equal", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _equal_akg(): + """Equal AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/equal_count.py b/mindspore/ops/_op_impl/akg/equal_count.py new file mode 100644 index 0000000000..9c575db7b3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/equal_count.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""EqualCount op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "EqualCount", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _equal_count_akg(): + """EqualCount AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/equiv_format.py b/mindspore/ops/_op_impl/akg/equiv_format.py new file mode 100644 index 0000000000..111451b15c --- /dev/null +++ b/mindspore/ops/_op_impl/akg/equiv_format.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""EquivFormat op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "EquivFormat", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "FRACTAL_NZ", "FRACTAL_NZ", "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _equiv_format_akg(): + """EquivFormat AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/exp.py b/mindspore/ops/_op_impl/akg/exp.py new file mode 100644 index 0000000000..273b3348a4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/exp.py @@ -0,0 +1,59 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Exp op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Exp", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _exp_akg(): + """Exp AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/expand_dims.py b/mindspore/ops/_op_impl/akg/expand_dims.py new file mode 100644 index 0000000000..9e1b18153a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/expand_dims.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ExpandDims op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ExpandDims", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "y" + } + ] +}""") +def _expand_dims_akg(): + """ExpandDims AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/five2four.py b/mindspore/ops/_op_impl/akg/five2four.py new file mode 100644 index 0000000000..1dac2c3628 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/five2four.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Five2Four op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Five2Four", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "shape4d", + "param_type": "required", + "type": "listInt" + }, + { + "name": "dstType", + "param_type": "required", + "type": "str" + }, + { + "name": "output_format", + "param_type": "required", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16","float16","float16","float32","float16","float32" + ], + "format": [ + "NC1HWC0","NC1HWC0","NC1HWC0","NC1HWC0","NC1HWC0","NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16","float16","float32","float32","float32","float32" + ], + "format": [ + "DefaultFormat","NHWC","DefaultFormat","DefaultFormat","NHWC","NHWC" + ], + "name": "output" + } + ] +}""") +def _five2four_akg(): + """Five2Four AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/floordiv.py b/mindspore/ops/_op_impl/akg/floordiv.py new file mode 100644 index 0000000000..99e577b4be --- /dev/null +++ b/mindspore/ops/_op_impl/akg/floordiv.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FloorDiv op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FloorDiv", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _floor_div_akg(): + """FloorDiv AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/four2five.py b/mindspore/ops/_op_impl/akg/four2five.py new file mode 100644 index 0000000000..01b6f85715 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/four2five.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Four2Five op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Four2Five", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + }, + { + "name": "dst_type", + "param_type": "required", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float32", "float16","float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NHWC", "NHWC", "NHWC" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16", "float32", "float16", "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _four2five_akg(): + """Four2Five AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_batch_norm.py b/mindspore/ops/_op_impl/akg/fused_batch_norm.py new file mode 100644 index 0000000000..5ce9839328 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_batch_norm.py @@ -0,0 +1,149 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""FusedBatchNorm op""" + +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBatchNorm", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "momentum", + "param_type": "optional", + "type": "float" + }, + { + "name": "epsilon", + "param_type": "optional", + "type": "float" + }, + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "scale" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "b" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "variance" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "y" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_mean" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_variance" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "save_mean" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "save_inv_variance" + } + ] +}""") +def _fused_batch_norm_akg(): + """FusedBatchNorm AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_batch_norm_grad.py b/mindspore/ops/_op_impl/akg/fused_batch_norm_grad.py new file mode 100644 index 0000000000..9191548f73 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_batch_norm_grad.py @@ -0,0 +1,119 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""FusedBatchNormGrad op"""
+
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "FusedBatchNormGrad",
+    "imply_type": "AutoDiff",
+    "fusion_type": "OPAQUE",
+    "attr": [
+        {
+            "name": "data_format",
+            "param_type": "optional",
+            "type": "listStr"
+        }
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "dy"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "x"
+        },
+        {
+            "index": 2,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "scale"
+        },
+        {
+            "index": 3,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "save_mean"
+        },
+        {
+            "index": 4,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "save_inv_variance"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "dx"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "bn_scale"
+        },
+        {
+            "index": 2,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "bn_bias"
+        }
+    ]
+}""")
+def _fused_batch_norm_grad_akg():
+    """FusedBatchNormGrad AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/fused_batch_norm_infer.py b/mindspore/ops/_op_impl/akg/fused_batch_norm_infer.py
new file mode 100644
index 0000000000..1e7743fa8f
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/fused_batch_norm_infer.py
@@ -0,0 +1,109 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +"""FusedBatchNormInfer op""" + +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBatchNormInfer", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "momentum", + "param_type": "optional", + "type": "float" + }, + { + "name": "epsilon", + "param_type": "optional", + "type": "float" + }, + { + "name": "data_format", + "param_type": "optional", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "scale" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "b" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + }, + { + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "variance" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "y" + } + ] +}""") +def _fused_batch_norm_infer_akg(): + """FusedBatchNormInfer AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn1.py b/mindspore/ops/_op_impl/akg/fused_bn1.py new file mode 100644 index 0000000000..fdaa673f25 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn1.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""FusedBN1 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBN1", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "data" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _fused_bn1_akg(): + """FusedBN1 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn1_grad.py b/mindspore/ops/_op_impl/akg/fused_bn1_grad.py new file mode 100644 index 0000000000..8de6796d6f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn1_grad.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BNGrad1 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "BNGrad1", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dy" + }, + { + "index": 1, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "data" + },{ + "index": 2, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "mean" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + }, + { + "index": 2, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _bn1_grad_akg(): + """BNGrad1 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn2.py b/mindspore/ops/_op_impl/akg/fused_bn2.py new file mode 100644 index 0000000000..e26a5ad8a0 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn2.py @@ -0,0 +1,108 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""FusedBN2 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBN2", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "momentum", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "var_part" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_mean" + }, + { + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "running_var" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _fused_bn2_akg(): + """FusedBN2 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn2_grad.py b/mindspore/ops/_op_impl/akg/fused_bn2_grad.py new file mode 100644 index 0000000000..e29a9177b6 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn2_grad.py @@ -0,0 +1,132 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""BNGrad2 op"""
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "BNGrad2",
+    "imply_type": "AutoDiff",
+    "fusion_type": "COMMREDUCE",
+    "attr": [
+        {
+            "name": "eps",
+            "param_type": "optional",
+            "type": "float"
+        },
+        {
+            "name": "data_shape",
+            "param_type": "optional",
+            "type": "listInt"
+        }
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "dgamma_red_hw"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "dbeta_red_hw"
+        },{
+            "index": 2,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "variance"
+        },
+        {
+            "index": 3,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "gamma"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "output"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "output"
+        },
+        {
+            "index": 2,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "output"
+        },
+        {
+            "index": 3,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "output"
+        },
+        {
+            "index": 4,
+            "dtype": [
+                "float32"
+            ],
+            "format": [
+                "NC1HWC0"
+            ],
+            "name": "output"
+        }
+    ]
+}""")
+def _bn2_grad_akg():
+    """BNGrad2 AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/fused_bn3.py b/mindspore/ops/_op_impl/akg/fused_bn3.py
new file mode 100644
index 0000000000..74f3f652f3
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/fused_bn3.py
@@ -0,0 +1,95 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +"""FusedBN3 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "FusedBN3", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "eps", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "data" + }, + { + "index": 1, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "mean" + },{ + "index": 2, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "variance" + },{ + "index": 3, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "gamma" + },{ + "index": 4, + "dtype": [ + "float32" + ], + "format": [ + "NC1HWC0" + ], + "name": "beta" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _fused_bn3_akg(): + """FusedBN3 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/fused_bn3_grad.py b/mindspore/ops/_op_impl/akg/fused_bn3_grad.py new file mode 100644 index 0000000000..5ffc57a68e --- /dev/null +++ b/mindspore/ops/_op_impl/akg/fused_bn3_grad.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""BNGrad3 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "BNGrad3", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dy" + }, + { + "index": 1, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "rs" + },{ + "index": 2, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dgamma_dx" + }, + { + "index": 3, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "dbeta_dx" + }, + { + "index": 4, + "dtype": [ + "float32", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "data_minus_mean" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _bn3_grad_akg(): + """BNGrad3 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/gather_v2.py b/mindspore/ops/_op_impl/akg/gather_v2.py new file mode 100644 index 0000000000..84ab7eb669 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/gather_v2.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""GatherV2 op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "GatherV2", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "axis", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "params" + }, + { + "index": 1, + "dtype": [ + "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "indices" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _gather_v2_akg(): + """GatherV2 AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/greater.py b/mindspore/ops/_op_impl/akg/greater.py new file mode 100644 index 0000000000..941946163a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/greater.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Greater op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Greater", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16", "float32", "float32" + ], + "format": [ + "DefaultFormat", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float16", "float32", "float32" + ], + "format": [ + "DefaultFormat", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _greater_akg(): + """Greater AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/greater_equal.py b/mindspore/ops/_op_impl/akg/greater_equal.py new file mode 100644 index 0000000000..11642baa86 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/greater_equal.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""GreaterEqual op"""
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "GreaterEqual",
+    "imply_type": "AutoDiff",
+    "fusion_type": "ELEMWISE",
+    "attr": [
+
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "x"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "y"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "bool", "bool", "bool", "bool", "bool", "bool"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "output"
+        }
+    ]
+}""")
+def _greater_equal_akg():
+    """GreaterEqual AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/inplace_assign.py b/mindspore/ops/_op_impl/akg/inplace_assign.py
new file mode 100644
index 0000000000..1cc40abe9b
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/inplace_assign.py
@@ -0,0 +1,78 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +"""InplaceAssign op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "InplaceAssign", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "fake_output", + "param_type": "optional", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "y" + }, + { + "index": 2, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "z" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", "FracZ", "FracZ", "FracZ" + ], + "name": "output" + } + ] +}""") +def _inplace_assign_akg(): + """InplaceAssign AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/less.py b/mindspore/ops/_op_impl/akg/less.py new file mode 100644 index 0000000000..499ed2e8fc --- /dev/null +++ b/mindspore/ops/_op_impl/akg/less.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""Less op"""
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "Less",
+    "imply_type": "AutoDiff",
+    "fusion_type": "ELEMWISE",
+    "attr": [
+
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "float16", "float16"
+            ],
+            "format": [
+                "DefaultFormat", "NC1HWC0"
+            ],
+            "name": "x"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "float16", "float16"
+            ],
+            "format": [
+                "DefaultFormat", "NC1HWC0"
+            ],
+            "name": "y"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "bool", "bool"
+            ],
+            "format": [
+                "DefaultFormat", "NC1HWC0"
+            ],
+            "name": "output"
+        }
+    ]
+}""")
+def _less_akg():
+    """Less AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/less_equal.py b/mindspore/ops/_op_impl/akg/less_equal.py
new file mode 100644
index 0000000000..97fbdec090
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/less_equal.py
@@ -0,0 +1,64 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""LessEqual op"""
+from mindspore.ops.op_info_register import op_info_register
+
+
+@op_info_register("""{
+    "op_name": "LessEqual",
+    "imply_type": "AutoDiff",
+    "fusion_type": "ELEMWISE",
+    "attr": [
+
+    ],
+    "inputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "x"
+        },
+        {
+            "index": 1,
+            "dtype": [
+                "int32", "float16", "float32", "int32", "float16", "float32"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "y"
+        }
+    ],
+    "outputs": [
+        {
+            "index": 0,
+            "dtype": [
+                "bool", "bool", "bool", "bool", "bool", "bool"
+            ],
+            "format": [
+                "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0"
+            ],
+            "name": "output"
+        }
+    ]
+}""")
+def _less_equal_akg():
+    """LessEqual AutoDiff register"""
+    return
diff --git a/mindspore/ops/_op_impl/akg/log.py b/mindspore/ops/_op_impl/akg/log.py
new file mode 100644
index 0000000000..526538d17d
--- /dev/null
+++ b/mindspore/ops/_op_impl/akg/log.py
@@ -0,0 +1,55 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================ + +"""Log op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Log", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _log_akg(): + """Log AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/matmul.py b/mindspore/ops/_op_impl/akg/matmul.py new file mode 100644 index 0000000000..084ba754fa --- /dev/null +++ b/mindspore/ops/_op_impl/akg/matmul.py @@ -0,0 +1,73 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""MatMul op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "MatMul", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "transpose_a", + "param_type": "optional", + "type": "bool" + }, + { + "name": "transpose_b", + "param_type": "optional", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "x1" + }, + { + "index": 1, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "x2" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _matmul_akg(): + """MatMul AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/max.py b/mindspore/ops/_op_impl/akg/max.py new file mode 100644 index 0000000000..21fd4ef9c4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/max.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Max op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Max", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _max_akg(): + """Max AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/max_pool_grad_with_argmax.py b/mindspore/ops/_op_impl/akg/max_pool_grad_with_argmax.py new file mode 100644 index 0000000000..4adad3eb88 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/max_pool_grad_with_argmax.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""MaxPoolGradWithArgmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "MaxPoolGradWithArgmax", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "pad_mode", + "param_type": "optional", + "type": "str" + }, + { + "name": "window", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad", + "param_type": "optional", + "type": "int" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat" + ], + "name": "argmax" + }, + { + "index": 2, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "grad" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32" + ], + "format": [ + "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _max_pool_grad_with_argmax_akg(): + """MaxPoolGradWithArgmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/max_pool_with_argmax.py b/mindspore/ops/_op_impl/akg/max_pool_with_argmax.py new file mode 100644 index 0000000000..3ae36d4793 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/max_pool_with_argmax.py @@ -0,0 +1,83 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""MaxPoolWithArgmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "MaxPoolWithArgmax", + "imply_type": "AutoDiff", + "fusion_type": "CONVLUTION", + "attr": [ + { + "name": "pad_mode", + "param_type": "optional", + "type": "str" + }, + { + "name": "window", + "param_type": "optional", + "type": "int" + }, + { + "name": "pad", + "param_type": "optional", + "type": "int" + }, + { + "name": "stride", + "param_type": "optional", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16" + ], + "format": [ + "NC1HWC0" + ], + "name": "output" + }, + { + "index": 1, + "dtype": [ + "float16" + ], + "format": [ + "DefaultFormat" + ], + "name": "argmax" + } + ] +}""") +def _max_pool_with_argmax_akg(): + """MaxPoolWithArgmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/maximum.py b/mindspore/ops/_op_impl/akg/maximum.py new file mode 100644 index 0000000000..8d8de5270a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/maximum.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Maximum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Maximum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _maximum_akg(): + """Maximum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/mean.py b/mindspore/ops/_op_impl/akg/mean.py new file mode 100644 index 0000000000..0b49e76865 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/mean.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""SimpleMean op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "SimpleMean", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _mean_akg(): + """SimpleMean AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/mean_grad.py b/mindspore/ops/_op_impl/akg/mean_grad.py new file mode 100644 index 0000000000..3b8379d1f0 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/mean_grad.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""SimpleMeanGrad op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "SimpleMeanGrad", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "input_shape", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "HEAD" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _mean_grad_akg(): + """SimpleMeanGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/minimum.py b/mindspore/ops/_op_impl/akg/minimum.py new file mode 100644 index 0000000000..759df2085f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/minimum.py @@ -0,0 +1,70 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Minimum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Minimum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _minimum_akg(): + """Minimum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/mul.py b/mindspore/ops/_op_impl/akg/mul.py new file mode 100644 index 0000000000..ab02c2d89e --- /dev/null +++ b/mindspore/ops/_op_impl/akg/mul.py @@ -0,0 +1,86 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Mul op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Mul", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "x_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "y_shape", + "param_type": "required", + "type": "listInt" + }, + { + "name": "data_format", + "param_type": "required", + "type": "listStr" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "FracZ", "FracZ", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "FracZ", "FracZ", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "FracZ", "FracZ", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _mul_akg(): + """Mul AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/neg.py b/mindspore/ops/_op_impl/akg/neg.py new file mode 100644 index 0000000000..bc00d60271 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/neg.py @@ -0,0 +1,59 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Neg op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Neg", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32", + "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _neg_akg(): + """Neg AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/one_hot.py b/mindspore/ops/_op_impl/akg/one_hot.py new file mode 100644 index 0000000000..c5034dbbd4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/one_hot.py @@ -0,0 +1,83 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""OneHot op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "OneHot", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "depth", + "param_type": "required", + "type": "int" + }, + { + "name": "axis", + "param_type": "required", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "int32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "indices" + }, + { + "index": 1, + "dtype": [ + "int32", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "on_value" + }, + { + "index": 2, + "dtype": [ + "int32", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "off_value" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _one_hot_akg(): + """OneHot AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/pow.py b/mindspore/ops/_op_impl/akg/pow.py new file mode 100644 index 0000000000..d782968c05 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/pow.py @@ -0,0 +1,65 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Pow op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Pow", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "power" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _power_akg(): + """Pow AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/real_div.py b/mindspore/ops/_op_impl/akg/real_div.py new file mode 100644 index 0000000000..9fa37a24e3 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/real_div.py @@ -0,0 +1,72 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""RealDiv op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "RealDiv", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _real_div_akg(): + """RealDiv AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reciprocal.py b/mindspore/ops/_op_impl/akg/reciprocal.py new file mode 100644 index 0000000000..9fd7cc40b4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reciprocal.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Reciprocal op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Reciprocal", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reciprocal_akg(): + """Reciprocal AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reduce_max.py b/mindspore/ops/_op_impl/akg/reduce_max.py new file mode 100644 index 0000000000..b9db8ea83a --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reduce_max.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ReduceMax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReduceMax", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16" + ], + "format": [ + "DefaultFormat", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float16" + ], + "format": [ + "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reduce_max_akg(): + """ReduceMax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reduce_mean.py b/mindspore/ops/_op_impl/akg/reduce_mean.py new file mode 100644 index 0000000000..0a4ffdf221 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reduce_mean.py @@ -0,0 +1,63 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ReduceMean op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReduceMean", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reduce_mean_akg(): + """ReduceMean AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reduce_sum.py b/mindspore/ops/_op_impl/akg/reduce_sum.py new file mode 100644 index 0000000000..20d091ac76 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reduce_sum.py @@ -0,0 +1,73 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ReduceSum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReduceSum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keep_dims", + "param_type": "required", + "type": "bool" + }, + { + "name": "atomic_add", + "param_type": "optional", + "type": "str" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _reduce_sum_akg(): + """ReduceSum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/relu.py b/mindspore/ops/_op_impl/akg/relu.py new file mode 100644 index 0000000000..b32725f885 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/relu.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ReLU op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReLU", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _relu_akg(): + """ReLU AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/relu_grad.py b/mindspore/ops/_op_impl/akg/relu_grad.py new file mode 100644 index 0000000000..c785b750fe --- /dev/null +++ b/mindspore/ops/_op_impl/akg/relu_grad.py @@ -0,0 +1,64 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""ReluGrad op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ReluGrad", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "y_backprop" + }, + { + "index": 1, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _relu_grad_akg(): + """ReluGrad AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/reshape.py b/mindspore/ops/_op_impl/akg/reshape.py new file mode 100644 index 0000000000..d200b66fa2 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/reshape.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Reshape op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Reshape", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "shape", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "tensor" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _reshape_akg(): + """Reshape AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/round.py b/mindspore/ops/_op_impl/akg/round.py new file mode 100644 index 0000000000..0625c3ceda --- /dev/null +++ b/mindspore/ops/_op_impl/akg/round.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Round op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Round", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _round_akg(): + """Round AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/rsqrt.py b/mindspore/ops/_op_impl/akg/rsqrt.py new file mode 100644 index 0000000000..9264864f91 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/rsqrt.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Rsqrt op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Rsqrt", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _rsqrt_akg(): + """Rsqrt AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/select.py b/mindspore/ops/_op_impl/akg/select.py new file mode 100644 index 0000000000..006c6a5444 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/select.py @@ -0,0 +1,76 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Select op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Select", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "bool", "bool", "bool", "bool", "bool", "bool" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "condition" + }, + { + "index": 1, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 2, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "int32", "float16", "int32", "float32", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "DefaultFormat", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _select_akg(): + """Select AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/softmax.py b/mindspore/ops/_op_impl/akg/softmax.py new file mode 100644 index 0000000000..a41c2aef36 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/softmax.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Softmax op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Softmax", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _softmax_akg(): + """Softmax AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sparse_softmax_cross_entropy_with_logits.py b/mindspore/ops/_op_impl/akg/sparse_softmax_cross_entropy_with_logits.py new file mode 100644 index 0000000000..e9e828f312 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sparse_softmax_cross_entropy_with_logits.py @@ -0,0 +1,73 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""SparseSoftmaxCrossEntropyWithLogits op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "SparseSoftmaxCrossEntropyWithLogits", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "is_grad", + "param_type": "optional", + "type": "bool" + }, + { + "name": "sens", + "param_type": "optional", + "type": "float" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "DefaultFormat" + ], + "name": "features" + }, + { + "index": 1, + "dtype": [ + "int32" + ], + "format": [ + "DefaultFormat" + ], + "name": "labels" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float32" + ], + "format": [ + "DefaultFormat" + ], + "name": "output" + } + ] +}""") +def _sparse_softmax_cross_entropy_with_logits_akg(): + """SparseSoftmaxCrossEntropyWithLogits AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sqrt.py b/mindspore/ops/_op_impl/akg/sqrt.py new file mode 100644 index 0000000000..fcaa84b3d4 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sqrt.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Sqrt op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Sqrt", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _sqrt_akg(): + """Sqrt AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/strided_slice.py b/mindspore/ops/_op_impl/akg/strided_slice.py new file mode 100644 index 0000000000..bdbd8dfc2f --- /dev/null +++ b/mindspore/ops/_op_impl/akg/strided_slice.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""StridedSlice op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "StridedSlice", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "begin", + "param_type": "required", + "type": "listInt" + }, + { + "name": "end", + "param_type": "required", + "type": "listInt" + }, + { + "name": "strides", + "param_type": "required", + "type": "listInt" + }, + { + "name": "begin_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "end_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "ellipsis_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "new_axis_mask", + "param_type": "required", + "type": "int" + }, + { + "name": "shrink_axis_mask", + "param_type": "required", + "type": "int" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _strided_slice_akg(): + """StridedSlice AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sub.py b/mindspore/ops/_op_impl/akg/sub.py new file mode 100644 index 0000000000..846aa280bb --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sub.py @@ -0,0 +1,72 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Sub op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Sub", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + }, + { + "index": 1, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "y" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "int32", "float16", "float32", "int32", "float16", "float32", + "int32", "float16", "float32", "int32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0", + "FracZ", "FracZ", "FracZ", "FRACTAL_NZ", "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _sub_akg(): + """Sub AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/sum.py b/mindspore/ops/_op_impl/akg/sum.py new file mode 100644 index 0000000000..501b387b25 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/sum.py @@ -0,0 +1,68 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Sum op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Sum", + "imply_type": "AutoDiff", + "fusion_type": "COMMREDUCE", + "attr": [ + { + "name": "axis", + "param_type": "required", + "type": "listInt" + }, + { + "name": "keepdims", + "param_type": "required", + "type": "bool" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "param_type": "required", + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32", + "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", + "FRACTAL_NZ", "FRACTAL_NZ" + ], + "name": "output" + } + ] +}""") +def _sum_akg(): + """Sum AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/tile.py b/mindspore/ops/_op_impl/akg/tile.py new file mode 100644 index 0000000000..bd13978fe7 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/tile.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tile op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "Tile", + "imply_type": "AutoDiff", + "fusion_type": "OPAQUE", + "attr": [ + { + "name": "multiples", + "param_type": "required", + "type": "listInt" + } + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "int32", "float16", "float32", "int32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _tile_akg(): + """Tile AutoDiff register""" + return diff --git a/mindspore/ops/_op_impl/akg/zeros_like.py b/mindspore/ops/_op_impl/akg/zeros_like.py new file mode 100644 index 0000000000..a02ece22d7 --- /dev/null +++ b/mindspore/ops/_op_impl/akg/zeros_like.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""ZerosLike op""" +from mindspore.ops.op_info_register import op_info_register + + +@op_info_register("""{ + "op_name": "ZerosLike", + "imply_type": "AutoDiff", + "fusion_type": "ELEMWISE", + "attr": [ + + ], + "inputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "x" + } + ], + "outputs": [ + { + "index": 0, + "dtype": [ + "float16", "float32", "float16", "float32" + ], + "format": [ + "DefaultFormat", "DefaultFormat", "NC1HWC0", "NC1HWC0" + ], + "name": "output" + } + ] +}""") +def _zeros_like_akg(): + """ZerosLike AutoDiff register""" + return diff --git a/mindspore/ops/_selected_grad_ops.py b/mindspore/ops/_selected_grad_ops.py new file mode 100644 index 0000000000..5da1d53abf --- /dev/null +++ b/mindspore/ops/_selected_grad_ops.py @@ -0,0 +1,50 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" resolved grad ops """ +from mindspore.ops.op_selector import new_ops_selector + +op_selector = new_ops_selector( + "mindspore.ops.operations._grad_ops", "mindspore.nn.graph_kernels") + + +@op_selector +class MaximumGrad: + def __call__(self, *args): + pass + + +@op_selector +class MinimumGrad: + def __call__(self, *args): + pass + + +@op_selector +class AbsGrad: + def __call__(self, *args): + pass + + +@op_selector +class BiasAddGrad: + def __call__(self, *args): + pass + + +@op_selector +class TanhGrad: + def __call__(self, *args): + pass diff --git a/mindspore/ops/_selected_ops.py b/mindspore/ops/_selected_ops.py new file mode 100644 index 0000000000..5e125025c9 --- /dev/null +++ b/mindspore/ops/_selected_ops.py @@ -0,0 +1,108 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+""" resolve ops """
+from mindspore.ops.op_selector import new_ops_selector
+
+op_selector = new_ops_selector(
+    "mindspore.ops.operations", "mindspore.nn.graph_kernels")
+opt_selector = new_ops_selector(
+    "mindspore.nn.optim", "mindspore.nn.graph_kernels")
+nn_selector = new_ops_selector(
+    "mindspore.nn", "mindspore.nn.graph_kernels")
+
+
+@nn_selector
+class BatchNorm2d:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class ReLU:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class ReduceMean:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class BiasAdd:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class FusedBatchNorm:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class ApplyMomentum:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class SoftmaxCrossEntropyWithLogits:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class LogSoftmax:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class Tanh:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class Gelu:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class LayerNorm:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class Softmax:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class LambUpdateWithLR:
+    def __call__(self, *args):
+        pass
+
+
+@op_selector
+class LambNextMV:
+    def __call__(self, *args):
+        pass
diff --git a/mindspore/ops/op_selector.py b/mindspore/ops/op_selector.py
new file mode 100644
index 0000000000..bdd00ac7f1
--- /dev/null
+++ b/mindspore/ops/op_selector.py
@@ -0,0 +1,120 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+A factory that creates op selector instances, used to switch a decorated class
+between the two op types: GraphKernel or Primitive.
+"""
+import importlib
+import inspect
+from mindspore import context
+
+
+class _OpSelector:
+    """
+    A helper class used to choose between different implementations of an operator.
+
+    When an instance of this class is called, it returns the right operator
+    according to context['enable_graph_kernel'] and the `op_type` keyword
+    argument. The returned operator is either a GraphKernel op or a Primitive op.
+
+    Args:
+        op (class): an empty class whose class name is the operator name
+        config_optype (str): operator type, which must be either 'GraphKernel'
+            or 'Primitive'
+        primitive_pkg (str): package name of the Primitive operator
+        graph_kernel_pkg (str): package name of the GraphKernel operator
+
+    Examples:
+        >>> class A: pass
+        >>> selected_op = _OpSelector(A, "GraphKernel",
+        >>>                           "primitive.ops.pkg", "graph_kernel.ops.pkg")
+        >>> # selected_op() will call graph_kernel.ops.pkg.A() when graph kernel is enabled
+    """
+    GRAPH_KERNEL = "GraphKernel"
+    PRIMITIVE = "Primitive"
+    DEFAULT_OP_TYPE = PRIMITIVE
+    KW_STR = "op_type"
+
+    def __init__(self, op, config_optype, primitive_pkg, graph_kernel_pkg):
+        self.op_name = op.__name__
+        self.config_optype = config_optype
+        self.graph_kernel_pkg = graph_kernel_pkg
+        self.primitive_pkg = primitive_pkg
+
+    def __call__(self, *args, **kwargs):
+        _op_type = _OpSelector.DEFAULT_OP_TYPE
+        if context.get_context("enable_graph_kernel"):
+            if _OpSelector.KW_STR in kwargs:
+                _op_type = kwargs.get(_OpSelector.KW_STR)
+                kwargs.pop(_OpSelector.KW_STR, None)
+            elif self.config_optype is not None:
+                _op_type = self.config_optype
+        if _op_type == _OpSelector.GRAPH_KERNEL:
+            pkg = self.graph_kernel_pkg
+        else:
+            pkg = self.primitive_pkg
+        op = getattr(importlib.import_module(pkg, __package__), self.op_name)
+        return op(*args, **kwargs)
+
+
+def new_ops_selector(primitive_pkg, graph_kernel_pkg):
+    """
+    A factory method that returns an op selector.
+
+    When the GraphKernel switch is on
+    (`context.get_context('enable_graph_kernel') == True`), the op type is resolved in this order:
+    (1). the extra parameter passed at the call site, `op_type='Primitive'` or `op_type='GraphKernel'`
+    (2). the parameter passed to the op selector, like `@op_selector('Primitive')` or
+        `@op_selector('GraphKernel')`
+    (3). otherwise the default op type, which is PRIMITIVE
+    That is, (1) has the highest priority and (3) the lowest.
+    If the GraphKernel switch is off, the op type is always PRIMITIVE.
+
+    Args:
+        primitive_pkg (str): primitive op's package name
+        graph_kernel_pkg (str): graph kernel op's package name
+
+    Returns:
+        An op selector, which controls what operator is actually called.
+ + Examples: + >>> op_selector = new_ops_selector("primitive_pkg.some.path", + >>> "graph_kernel_pkg.some.path") + >>> @op_selector + >>> class ReduceSum: pass + """ + + def op_selector(cls_or_optype): + + _primitive_pkg = primitive_pkg + _graph_kernel_pkg = graph_kernel_pkg + + def direct_op_type(): + darg = None + if cls_or_optype is None: + pass + elif not inspect.isclass(cls_or_optype): + darg = cls_or_optype + return darg + + if direct_op_type() is not None: + def deco_cls(_real_cls): + return _OpSelector(_real_cls, direct_op_type(), _primitive_pkg, _graph_kernel_pkg) + return deco_cls + + return _OpSelector(cls_or_optype, direct_op_type(), _primitive_pkg, _graph_kernel_pkg) + + return op_selector diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index 57ffd969c1..901db32c46 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -51,7 +51,7 @@ from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AssignAdd, AssignSub, Atan2 NPUGetFloatStatus, Pow, RealDiv, IsNan, IsInf, IsFinite, FloatStatus, Reciprocal, CumSum, HistogramFixedWidth, Sin, Sqrt, Rsqrt, BesselI0e, BesselI1e, - Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh, Cosh, Sinh) + Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh, Cosh, Sinh, Eps) from .random_ops import (RandomChoiceWithMask, Normal) from .nn_ops import (LSTM, SGD, Adam, SparseApplyAdam, SparseApplyLazyAdam, ApplyMomentum, BatchNorm, @@ -282,6 +282,7 @@ __all__ = [ "Sign", "LARSUpdate", "Round", + "Eps", "ApplyFtrl", "SpaceToBatch", "SparseApplyFtrl", diff --git a/mindspore/ops/operations/_grad_ops.py b/mindspore/ops/operations/_grad_ops.py index e964ba272c..24177bceeb 100644 --- a/mindspore/ops/operations/_grad_ops.py +++ b/mindspore/ops/operations/_grad_ops.py @@ -405,6 +405,33 @@ class FusedBatchNormGrad(Primitive): def __call__(self, dy, x, scale, save_mean, save_inv_variance): raise NotImplementedError +class BNTrainingReduceGrad(PrimitiveWithInfer): + """Gradients of FusedBatchNorm operation.""" + + @prim_attr_register + def __init__(self, epsilon=0.0001): + _inputs = ['grads', 'x', 'diff_scale', 'diff_offset', 'scale', 'batch_mean', 'batch_variance'] + self.init_prim_io_names(inputs=_inputs, outputs=['y']) + + def infer_shape(self, grads, x, diff_scale, diff_offset, scale, batch_mean, batch_variance): + return grads + + def infer_dtype(self, grads, x, diff_scale, diff_offset, scale, batch_mean, batch_variance): + return grads + +class BNTrainingUpdateGrad(PrimitiveWithInfer): + """Gradients of FusedBatchNorm operation.""" + + @prim_attr_register + def __init__(self, epsilon=0.0001): + self.init_prim_io_names(inputs=['grads', 'x', 'batch_mean', 'batch_variance'], + outputs=['diff_scale', 'diff_offset']) + + def infer_shape(self, grads, x, batch_mean, batch_variance): + return (batch_mean, batch_variance) + + def infer_dtype(self, grads, x, batch_mean, batch_variance): + return (batch_mean, batch_variance) class GeluGrad(PrimitiveWithInfer): """Gradients of Gelu operation.""" diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py index d7298e2099..395d3c509c 100644 --- a/mindspore/ops/operations/array_ops.py +++ b/mindspore/ops/operations/array_ops.py @@ -83,12 +83,17 @@ class ExpandDims(PrimitiveWithInfer): axis_v = axis['value'] rank = len(x_shape) validator.check_int_range('axis', axis_v, -rank - 1, rank, Rel.INC_BOTH, self.name) + value = None + if x['value'] is not None: + value = x['value'].asnumpy() + value = 
np.expand_dims(value, axis_v) + value = Tensor(value) if axis_v < 0: axis_v = rank + 1 + axis_v x_shape.insert(axis_v, 1) out = {'shape': x_shape, 'dtype': x['dtype'], - 'value': None} + 'value': value} return out @@ -1661,6 +1666,7 @@ class Select(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init""" + self.init_prim_io_names(inputs=['condition', 'x', 'y'], outputs=['output']) def infer_shape(self, cond_shape, x_shape, y_shape): if cond_shape != x_shape or x_shape != y_shape: @@ -1676,6 +1682,16 @@ class Select(PrimitiveWithInfer): raise TypeError('\'%s\' the x_type %s must be the same as y_type %s.' % (self.name, x_type, y_type)) return x_type + def infer_value(self, cond, x, y): + if cond is not None and x is not None and y is not None: + cond = cond.asnumpy() + x = x.asnumpy() + y = y.asnumpy() + out = np.where(cond, x, y) + return Tensor(out) + return None + + class StridedSlice(PrimitiveWithInfer): r""" @@ -2472,8 +2488,7 @@ class SpaceToBatch(PrimitiveWithInfer): validator.check_integer('rank of input_x', len(x_shape), 4, Rel.EQ, self.name) out_shape = copy.deepcopy(x_shape) for i in range(2): - padded = out_shape[i + 2] + self.paddings[i][0] + \ - self.paddings[i][1] + padded = out_shape[i + 2] + self.paddings[i][0] + self.paddings[i][1] if padded % self.block_size != 0: raise ValueError(f'For \'{self.name}\' padded[{i}] {padded} should be divisible by ' f'block_size {self.block_size}') diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py index 8bba03f251..f66bea0be2 100644 --- a/mindspore/ops/operations/math_ops.py +++ b/mindspore/ops/operations/math_ops.py @@ -15,6 +15,7 @@ """Operators for math.""" +import copy import numpy as np from ... import context from ..._c_expression import signature_rw as sig_rw @@ -142,6 +143,15 @@ class TensorAdd(_MathBinaryOp): [5,7,9] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = x + y + out = np.array(out, x.dtype) + return Tensor(out) + return None + class AssignAdd(PrimitiveWithInfer): """ @@ -255,6 +265,7 @@ class _Reduce(PrimitiveWithInfer): return output def do_infer(self, input_x, axis, valid_dtype=mstype.number_type): + """ return meta infos of input parameters """ axis_v = axis['value'] input_shp = input_x['shape'] args = {'input_x': input_x['dtype']} @@ -263,9 +274,26 @@ class _Reduce(PrimitiveWithInfer): if axis_v is None: raise ValueError(f"For {self.name}, axis must be const.") input_shp = _infer_shape_reduce(input_shp, axis_v, self.keep_dims, self.name) + value = None + if input_x['value'] is not None: + prim_map = { + 'ReduceSum': np.sum, + 'ReduceMax': np.max, + 'ReduceMin': np.min, + } + np_reduce_func = prim_map.get(self.name, None) + + if np_reduce_func is not None: + value = input_x['value'].asnumpy() + if not axis_v: + axis_v = [i for i in range(len(input_x['shape']))] + axis_v = tuple(axis_v) + value = np_reduce_func(value, axis_v, keepdims=self.keep_dims) + value = np.array(value) + value = Tensor(value) return {'shape': input_shp, 'dtype': input_x['dtype'], - 'value': None} + 'value': value} def __infer__(self, input_x, axis): return self.do_infer(input_x, axis) @@ -334,6 +362,12 @@ class ReduceSum(_Reduce): >>> output = op(input_x, 1) """ + @prim_attr_register + def __init__(self, keep_dims=False): + """init ReduceSum""" + super(ReduceSum, self).__init__(keep_dims) + self.__setattr_flag__ = True + class ReduceAll(_Reduce): """ @@ -403,6 +437,12 @@ class ReduceMax(_Reduce): >>> output = op(input_x, 1) 
""" + @prim_attr_register + def __init__(self, keep_dims=False): + """ReduceMax""" + super(ReduceMax, self).__init__(keep_dims) + self.__setattr_flag__ = True + class ReduceMin(_Reduce): """ @@ -743,6 +783,20 @@ class AddN(PrimitiveWithInfer): validator.check_tensor_type_same(args, mstype.number_type + (mstype.bool_,), cls_name) return inputs[0] + def infer_value(self, inputs): + if inputs is None: + return None + + for x in inputs: + if x is None: + return None + + added = copy.deepcopy(inputs[0].asnumpy()) + for x in inputs[1:]: + added += x.asnumpy() + out = np.array(added, inputs[0].asnumpy().dtype) + return Tensor(out) + class Neg(PrimitiveWithInfer): """ @@ -773,6 +827,13 @@ class Neg(PrimitiveWithInfer): validator.check_tensor_type_same({"input_x": input_x}, mstype.number_type, self.name) return input_x + def infer_value(self, input_x): + if input_x is not None: + input_x = input_x.asnumpy() + return Tensor(-input_x) + + return None + class InplaceAdd(PrimitiveWithInfer): """ @@ -920,6 +981,15 @@ class Sub(_MathBinaryOp): [-3, -3, -3] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = x - y + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Mul(_MathBinaryOp): """ @@ -978,6 +1048,7 @@ class Square(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Square""" + self.init_prim_io_names(inputs=['input_x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -986,6 +1057,14 @@ class Square(PrimitiveWithInfer): validator.check_tensor_type_same({"x": x_type}, mstype.number_type, self.name) return x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = x * x + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Rsqrt(PrimitiveWithInfer): """ @@ -1007,6 +1086,7 @@ class Rsqrt(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Rsqrt""" + self.init_prim_io_names(inputs=['x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -1015,6 +1095,14 @@ class Rsqrt(PrimitiveWithInfer): validator.check_tensor_type_same({"x": x_type}, mstype.number_type, self.name) return x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = 1.0 / np.sqrt(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Sqrt(PrimitiveWithInfer): """ @@ -1036,6 +1124,7 @@ class Sqrt(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Sqrt""" + self.init_prim_io_names(inputs=['x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -1044,6 +1133,14 @@ class Sqrt(PrimitiveWithInfer): validator.check_tensor_type_same({"x": x_type}, mstype.number_type, self.name) return x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = np.sqrt(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Reciprocal(PrimitiveWithInfer): """ @@ -1074,6 +1171,14 @@ class Reciprocal(PrimitiveWithInfer): validator.check_subclass("x", x, mstype.tensor, self.name) return x + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = 1.0 / x + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Pow(_MathBinaryOp): """ @@ -1109,6 +1214,15 @@ class Pow(_MathBinaryOp): [1.0, 16.0, 64.0] """ + def infer_value(self, x, power): + if x is not None and power is not None: + x = x.asnumpy() + power = power.asnumpy() + out = np.power(x, power) + out = np.array(out, x.dtype) + return Tensor(out) + return 
None + class Exp(PrimitiveWithInfer): """ @@ -1139,6 +1253,14 @@ class Exp(PrimitiveWithInfer): validator.check_subclass("x", x_type, mstype.tensor, self.name) return x_type + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = np.exp(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Expm1(PrimitiveWithInfer): """ @@ -1242,6 +1364,14 @@ class Log(PrimitiveWithInfer): validator.check_subclass("x", x, mstype.tensor, self.name) return x + def infer_value(self, x): + if x is not None: + x = x.asnumpy() + out = np.log(x) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Log1p(PrimitiveWithInfer): """ @@ -1360,6 +1490,15 @@ class Minimum(_MathBinaryOp): [1.0, 2.0, 3.0] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.minimum(x, y) + out = np.array(out, x.dtype) + return Tensor(out) + return None + class Maximum(_MathBinaryOp): """ @@ -1389,6 +1528,14 @@ class Maximum(_MathBinaryOp): [4.0, 5.0, 6.0] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.maximum(x, y) + out = np.array(out, x.dtype) + return Tensor(out) + return None class RealDiv(_MathBinaryOp): """ @@ -1923,6 +2070,13 @@ class Greater(_LogicBinaryOp): >>> greater(input_x, input_y) [False, True, False] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.greater(x, y)) + return Tensor(out) + return None class GreaterEqual(_LogicBinaryOp): @@ -1951,6 +2105,13 @@ class GreaterEqual(_LogicBinaryOp): >>> greater_equal(input_x, input_y) [True, True, False] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.greater_equal(x, y)) + return Tensor(out) + return None class Less(_LogicBinaryOp): @@ -1979,6 +2140,13 @@ class Less(_LogicBinaryOp): >>> less(input_x, input_y) [False, False, True] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.less(x, y)) + return Tensor(out) + return None class LessEqual(_LogicBinaryOp): @@ -2007,6 +2175,13 @@ class LessEqual(_LogicBinaryOp): >>> less_equal(input_x, input_y) [True, False, True] """ + def infer_value(self, x, y): + if x is not None and y is not None: + x = x.asnumpy() + y = y.asnumpy() + out = np.array(np.less_equal(x, y)) + return Tensor(out) + return None class LogicalNot(PrimitiveWithInfer): @@ -2517,6 +2692,7 @@ class Abs(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Abs""" + self.init_prim_io_names(inputs=['input_x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -2587,7 +2763,8 @@ class Round(PrimitiveWithInfer): @prim_attr_register def __init__(self): - pass + """init Round""" + self.init_prim_io_names(inputs=['input_x'], outputs=['output']) def infer_shape(self, x_shape): return x_shape @@ -2679,7 +2856,6 @@ class Atan2(_MathBinaryOp): [[0. 
0.7853982]] """ - class SquareSumAll(PrimitiveWithInfer): """ Returns square sum all of a tensor element-wise @@ -2705,6 +2881,7 @@ class SquareSumAll(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init SquareSumAll""" + def infer_shape(self, x_shape, y_shape): validator.check("x1_shape", x_shape, "x2_shape", y_shape, Rel.EQ, self.name) return [], [] @@ -2891,3 +3068,41 @@ class Invert(PrimitiveWithInfer): def infer_dtype(self, x_dtype): validator.check_tensor_type_same({'x_dtype': x_dtype}, [mstype.int16, mstype.uint16], self.name) return x_dtype + + +class Eps(PrimitiveWithInfer): + """ + Creates a tensor filled with `input_x` dtype minimum val. + + Inputs: + - **input_x** (Tensor) - Input tensor. + + Outputs: + Tensor, has the same type and shape as `input_x`, but filled with `input_x` dtype minimum val. + + Examples: + >>> out = P.Eps()(input_x) + """ + + @prim_attr_register + def __init__(self): + """init Eps""" + self.init_prim_io_names(inputs=['input_x'], outputs=['y']) + + def __infer__(self, input_x): + valid_types = [mstype.float16, mstype.float32] + validator.check_tensor_type_same({'input_x': input_x['dtype']}, valid_types, self.name) + + x_nptype = mstype.dtype_to_nptype(input_x['dtype'].element_type()) + if x_nptype == np.float16: + min_val = 2 ** (-14) + else: + min_val = 2 ** (-16) + + res = np.full(input_x['shape'], min_val, x_nptype) + out = { + 'value': Tensor(res), + 'shape': input_x['shape'], + 'dtype': input_x['dtype'], + } + return out diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index a5c1684fce..a1659e139d 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -830,9 +830,9 @@ class Conv2D(PrimitiveWithInfer): pad_top, pad_bottom, pad_left, pad_right = self.pad, self.pad, self.pad, self.pad h_out = 1 + (x_shape[2] + 2 * self.pad - kernel_size_h - (kernel_size_h - 1) * (dilation_h - 1)) \ - / stride_h + / stride_h w_out = 1 + (x_shape[3] + 2 * self.pad - kernel_size_w - (kernel_size_w - 1) * (dilation_w - 1)) \ - / stride_w + / stride_w h_out = math.floor(h_out) w_out = math.floor(w_out) @@ -953,9 +953,9 @@ class DepthwiseConv2dNative(PrimitiveWithInfer): pad_top, pad_bottom, pad_left, pad_right = self.pad, self.pad, self.pad, self.pad h_out = 1 + (x_shape[2] + 2 * self.pad - kernel_size_h - (kernel_size_h - 1) * (dilation_h - 1)) \ - / stride_h + / stride_h w_out = 1 + (x_shape[3] + 2 * self.pad - kernel_size_w - (kernel_size_w - 1) * (dilation_w - 1)) \ - / stride_w + / stride_w h_out = math.floor(h_out) w_out = math.floor(w_out) diff --git a/mindspore/ops/operations/other_ops.py b/mindspore/ops/operations/other_ops.py index 1f22c4caac..74c6080ab4 100644 --- a/mindspore/ops/operations/other_ops.py +++ b/mindspore/ops/operations/other_ops.py @@ -53,7 +53,7 @@ class Assign(PrimitiveWithInfer): ) @prim_attr_register def __init__(self): - pass + self.init_prim_io_names(inputs=['ref', 'value'], outputs=['output']) def infer_shape(self, variable, value): return variable diff --git a/model_zoo/bert/src/bert_for_pre_training.py b/model_zoo/bert/src/bert_for_pre_training.py index 976f1a3c43..5e014f02ba 100644 --- a/model_zoo/bert/src/bert_for_pre_training.py +++ b/model_zoo/bert/src/bert_for_pre_training.py @@ -27,6 +27,7 @@ from mindspore.nn.wrap.grad_reducer import DistributedGradReducer from mindspore.train.parallel_utils import ParallelMode from mindspore.communication.management import get_group_size from mindspore import context +from mindspore.ops import _selected_ops from 
.bert_model import BertModel GRADIENT_CLIP_TYPE = 1 @@ -130,7 +131,7 @@ class GetNextSentenceOutput(nn.Cell): """ def __init__(self, config): super(GetNextSentenceOutput, self).__init__() - self.log_softmax = P.LogSoftmax() + self.log_softmax = _selected_ops.LogSoftmax() self.weight_init = TruncatedNormal(config.initializer_range) self.dense = nn.Dense(config.hidden_size, 2, weight_init=self.weight_init, has_bias=True).to_float(config.compute_type) diff --git a/tests/st/ops/ascend/test_fused_batchnorm.py b/tests/st/ops/ascend/test_fused_batchnorm.py deleted file mode 100644 index 59e2df67de..0000000000 --- a/tests/st/ops/ascend/test_fused_batchnorm.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2019 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -import numpy as np - -import mindspore.context as context -import mindspore.nn as nn -from mindspore import Tensor -from mindspore.common.initializer import initializer -from mindspore.common.parameter import Parameter -from mindspore.ops import operations as P - -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") - - -class Net(nn.Cell): - def __init__(self): - super(Net, self).__init__() - self.bn = P.FusedBatchNorm() - self.scale = Parameter(initializer('ones', [64]), name='scale') - self.b = Parameter(initializer('zeros', [64]), name='b') - self.mean = Parameter(initializer('ones', [64]), name='mean') - self.variance = Parameter(initializer('zeros', [64]), name='variance') - - def construct(self, x): - return self.bn(x, self.scale, self.b, self.mean, self.variance)[0] - - -def test_net(): - x = np.random.randn(1, 64, 112, 112).astype(np.float32) - # mean = np.random.randn(1,16,1,1).astype(np.float32) - # variance = np.random.randn(1,16,1,1).astype(np.float32) - fusedBn = Net() - output = fusedBn(Tensor(x)) - print("***********x*********") - print(x) - - print("***********output y*********") - print(output.asnumpy()) diff --git a/tests/st/tbe_networks/resnet_cifar.py b/tests/st/tbe_networks/resnet_cifar.py index 6b3b75a63c..cf9eb59400 100644 --- a/tests/st/tbe_networks/resnet_cifar.py +++ b/tests/st/tbe_networks/resnet_cifar.py @@ -1,4 +1,4 @@ -# Copyright 2019 Huawei Technologies Co., Ltd +# Copyright 2020 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,20 +19,20 @@ import numpy as np from resnet import resnet50 import mindspore.common.dtype as mstype +import mindspore.ops.functional as F +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train.serialization import load_checkpoint, load_param_into_net import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.vision.c_transforms as vision import mindspore.nn as nn -import mindspore.ops.functional as F from mindspore import Tensor from mindspore import context from mindspore.communication.management import init from mindspore.nn.optim.momentum import Momentum from mindspore.ops import operations as P from mindspore.parallel._auto_parallel_context import auto_parallel_context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor from mindspore.train.model import Model, ParallelMode -from mindspore.train.serialization import load_checkpoint, load_param_into_net random.seed(1) np.random.seed(1) @@ -62,12 +62,12 @@ def create_dataset(repeat_num=1, training=True): data_dir = data_home + "/cifar-10-batches-bin" if not training: data_dir = data_home + "/cifar-10-verify-bin" - data_set = ds.Cifar10Dataset(data_dir) + data_set = ds.Cifar10Dataset(data_dir, num_samples=32) if args_opt.run_distribute: rank_id = int(os.getenv('RANK_ID')) rank_size = int(os.getenv('RANK_SIZE')) - data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id) + data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id, num_samples=32) resize_height = 224 resize_width = 224 @@ -140,8 +140,9 @@ if __name__ == '__main__': batch_num = dataset.get_dataset_size() config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, keep_checkpoint_max=10) ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10", directory="./", config=config_ck) + time_cb = TimeMonitor(data_size=batch_num) loss_cb = LossMonitor() - model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb]) + model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb, time_cb]) if args_opt.do_eval: if args_opt.checkpoint_path: diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 840a66ad20..13f961fa24 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -91,6 +91,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "../../../mindspore/ccsrc/device/kernel_info.cc" "../../../mindspore/ccsrc/device/ascend/profiling/*.cc" "../../../mindspore/ccsrc/device/ascend/kernel_select_ascend.cc" + "../../../mindspore/ccsrc/device/ascend/kernel_select_graph_kernel.cc" "../../../mindspore/ccsrc/device/convert_tensor_utils.cc" "../../../mindspore/ccsrc/device/ascend/kernel_build_ascend.cc" "../../../mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc" diff --git a/tests/ut/cpp/optimizer/lib_test.cc b/tests/ut/cpp/optimizer/lib_test.cc index 037bcd75d1..ed4497f9a5 100644 --- a/tests/ut/cpp/optimizer/lib_test.cc +++ b/tests/ut/cpp/optimizer/lib_test.cc @@ -583,6 +583,5 @@ TEST_F(TestOptLib, test_adjust_allreduce_mul_add) { ASSERT_TRUE(CheckOpt(before2l, after2, patterns)); ASSERT_TRUE(CheckOpt(before2r, after2, patterns)); } - } // namespace opt } // namespace mindspore diff --git a/tests/ut/cpp/pre_activate/ascend/format_type/insert_cast_test.cc b/tests/ut/cpp/pre_activate/ascend/format_type/insert_cast_test.cc index 2da100af93..317eace6c6 100644 --- 
a/tests/ut/cpp/pre_activate/ascend/format_type/insert_cast_test.cc +++ b/tests/ut/cpp/pre_activate/ascend/format_type/insert_cast_test.cc @@ -60,7 +60,7 @@ TEST_F(TestHWInsertCast, test_insert_cast_op_for_single_output) { builder.SetOutputsDeviceType({kFloat16->type_id()}); builder.SetFusionType(kernel::FusionType::ELEMWISE); builder.SetProcessor(kernel::Processor::AICORE); - builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder.SetKernelType(KernelType::AKG_KERNEL); kernel::KernelBuildInfo::KernelBuildInfoBuilder builder1; builder1.SetInputsFormat({"NC1HWC0"}); builder1.SetInputsDeviceType({kFloat32->type_id()}); @@ -68,7 +68,7 @@ TEST_F(TestHWInsertCast, test_insert_cast_op_for_single_output) { builder1.SetOutputsDeviceType({kFloat32->type_id()}); builder1.SetFusionType(kernel::FusionType::ELEMWISE); builder1.SetProcessor(kernel::Processor::AICORE); - builder1.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder1.SetKernelType(KernelType::AKG_KERNEL); auto node_list = TopoSort(func_graph->get_return()); for (auto& node : node_list) { if (node == nullptr) { @@ -122,7 +122,7 @@ TEST_F(TestHWInsertCast, test_insert_cast_op_for_multiple_output) { builder1.SetOutputsDeviceType({kFloat32->type_id()}); builder1.SetFusionType(kernel::FusionType::ELEMWISE); builder1.SetProcessor(kernel::Processor::AICORE); - builder1.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder1.SetKernelType(KernelType::AKG_KERNEL); auto node_list = TopoSort(func_graph->get_return()); for (auto& node : node_list) { if (node == nullptr) { diff --git a/tests/ut/cpp/pre_activate/pass/allreduce_fusion_test.cc b/tests/ut/cpp/pre_activate/pass/allreduce_fusion_test.cc index 077a9f0723..69a330614e 100644 --- a/tests/ut/cpp/pre_activate/pass/allreduce_fusion_test.cc +++ b/tests/ut/cpp/pre_activate/pass/allreduce_fusion_test.cc @@ -56,7 +56,7 @@ TEST_F(TestHWAllReduceFusion, test_fusion_all) { builder.SetOutputsDeviceType({kFloat32->type_id()}); builder.SetFusionType(kernel::FusionType::ELEMWISE); builder.SetProcessor(kernel::Processor::AICORE); - builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder.SetKernelType(KernelType::AKG_KERNEL); auto node_list = TopoSort(func_graph->get_return()); for (auto &node : node_list) { if (node == nullptr) { @@ -97,7 +97,7 @@ TEST_F(TestHWAllReduceFusion, test_fusion_group) { builder.SetOutputsDeviceType({kFloat32->type_id()}); builder.SetFusionType(kernel::FusionType::ELEMWISE); builder.SetProcessor(kernel::Processor::AICORE); - builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder.SetKernelType(KernelType::AKG_KERNEL); auto node_list = TopoSort(func_graph->get_return()); for (auto &node : node_list) { if (node == nullptr) { @@ -138,7 +138,7 @@ TEST_F(TestHWAllReduceFusion, test_fusion_op) { builder.SetOutputsDeviceType({kFloat32->type_id()}); builder.SetFusionType(kernel::FusionType::ELEMWISE); builder.SetProcessor(kernel::Processor::AICORE); - builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder.SetKernelType(KernelType::AKG_KERNEL); auto node_list = TopoSort(func_graph->get_return()); int count = 0; for (auto &node : node_list) { @@ -195,7 +195,7 @@ TEST_F(TestHWAllReduceFusion, test_fusion_sorted) { builder.SetOutputsDeviceType({kFloat32->type_id()}); builder.SetFusionType(kernel::FusionType::ELEMWISE); builder.SetProcessor(kernel::Processor::AICORE); - builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL); + builder.SetKernelType(KernelType::AKG_KERNEL); auto node_list = TopoSort(func_graph->get_return()); for (auto &node : node_list) { if (node == 
nullptr) { diff --git a/tests/ut/cpp/session/anf_runtime_algorithm_test.cc b/tests/ut/cpp/session/anf_runtime_algorithm_test.cc index 9ff8123004..2ea2453381 100644 --- a/tests/ut/cpp/session/anf_runtime_algorithm_test.cc +++ b/tests/ut/cpp/session/anf_runtime_algorithm_test.cc @@ -645,9 +645,9 @@ TEST_F(AnfRuntimeAlgorithmTest, GetKernelType) { auto d_kernel_info = add->kernel_info(); MS_EXCEPTION_IF_NULL(d_kernel_info); KernelBuildInfoBuilder builder; - builder.SetKernelType(AUTO_DIFF_KERNEL); + builder.SetKernelType(AKG_KERNEL); d_kernel_info->set_select_kernel_build_info(builder.Build()); - EXPECT_EQ(AnfAlgo::GetKernelType(add), AUTO_DIFF_KERNEL); + EXPECT_EQ(AnfAlgo::GetKernelType(add), AKG_KERNEL); EXPECT_THROW(AnfAlgo::GetKernelType(nullptr), std::runtime_error); }
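
The two sketches below illustrate how pieces introduced by this patch are meant to be used. They are illustrative usage sketches only: they assume a MindSpore build that includes this change, an `enable_graph_kernel` switch accepted by `context.set_context`, and a `mindspore.nn.graph_kernels` package that actually provides the GraphKernel variants of the selected ops.

from mindspore import context
from mindspore.ops import _selected_ops

# Graph kernel switch off: the selector always resolves to the Primitive op,
# i.e. mindspore.ops.operations.Softmax.
context.set_context(enable_graph_kernel=False)
softmax = _selected_ops.Softmax()

# Graph kernel switch on: the default is still the Primitive op, because the
# classes in _selected_ops.py use a bare @op_selector (config_optype is None);
# callers opt in per call site through the `op_type` keyword.
context.set_context(enable_graph_kernel=True)
softmax_gk = _selected_ops.Softmax(op_type="GraphKernel")    # mindspore.nn.graph_kernels.Softmax
softmax_prim = _selected_ops.Softmax(op_type="Primitive")    # mindspore.ops.operations.Softmax

The `infer_value` methods added to the math primitives all follow one pattern: fold the op at graph-construction time only when every input is a known constant, and return None otherwise so the node is left for the backend. A plain-NumPy mirror of that pattern, with the Tensor wrapping omitted and `Sub` as the example:

import numpy as np

def sub_infer_value(x, y):
    # Constant folding is only possible when both inputs are known constants.
    if x is None or y is None:
        return None
    x = np.asarray(x)
    y = np.asarray(y)
    # Keep the dtype of the first input, as the patched Sub.infer_value does.
    return np.array(x - y, x.dtype)

print(sub_infer_value(np.float32([3, 5]), np.float32([1, 2])))  # [2. 3.]
print(sub_infer_value(None, np.float32([1, 2])))                # None: no folding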