!22731 GraphKernel supports CPU

Merge pull request !22731 from DeshiChen/0901_graphkernel_cpu
2021-09-30 09:36:35 +00:00 · 2021-09-30 09:36:35 +00:00 · 06b0beced7
parent 989a640308 32ecd8ee79
commit 06b0beced7
35 changed files with 608 additions and 89 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -47,7 +47,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(PYBIND11_CPP_STANDARD -std=c++17)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPTION_CXX_FLAGS}")

-if(ENABLE_AKG AND (ENABLE_D OR ENABLE_GPU))
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_subdirectory("${CMAKE_SOURCE_DIR}/akg")
 endif()

--- a/build.sh
+++ b/build.sh
@ -45,7 +45,7 @@ update_submodule()
  cd "${BASEPATH}/graphengine"
  git submodule update --init metadef
  cd "${BASEPATH}"
-  if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then
+  if [[ "X$ENABLE_AKG" = "Xon" ]]; then
      git submodule update --init --recursive akg
  fi
 }
@ -57,7 +57,6 @@ build_exit()
    exit 1
 }

-
 make_clean()
 {
  echo "enable make clean"
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@ -151,3 +151,7 @@ endif()
 if(ENABLE_CPU AND NOT WIN32)
    add_compile_definitions(ENABLE_ARMOUR)
 endif()
+
+# Add the directory-scoped ENABLE_AKG compile definition when the ENABLE_AKG option is set
+# and the target system name matches "Linux".
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(ENABLE_AKG)
+endif()
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@ -291,7 +291,7 @@ install(
    COMPONENT mindspore
 )

-if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
    set (AKG_PATH ${BUILD_PATH}/mindspore/akg)
    file(REMOVE_RECURSE ${AKG_PATH}/_akg)
    file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
--- a/cmake/package_win.cmake
+++ b/cmake/package_win.cmake
@ -187,20 +187,6 @@ install(
  COMPONENT mindspore
 )

-if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
-  set (AKG_PATH ${CMAKE_SOURCE_DIR}/build/mindspore/akg)
-  file(REMOVE_RECURSE ${AKG_PATH}/_akg)
-  file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
-  file(TOUCH ${AKG_PATH}/_akg/__init__.py)
-  install(DIRECTORY "${AKG_PATH}/akg" DESTINATION "${AKG_PATH}/_akg")
-  install(
-    DIRECTORY
-    ${AKG_PATH}/_akg
-    DESTINATION ${INSTALL_PY_DIR}/
-    COMPONENT mindspore
-  )
-endif()
-
 if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset)
  install(
    DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/dataset
--- a/mindspore/_extends/graph_kernel/model/graph_split.py
+++ b/mindspore/_extends/graph_kernel/model/graph_split.py
@ -1180,11 +1180,131 @@ class GraphSplitAscend(GraphSplitByPattern):
            _fuse_once(fuse_func)


+class GraphSplitCpu(GraphSplitByPattern):
+    """Graph splitter for the CPU target."""
+    # Upper bounds on len(area.ops) checked by the broadcast / reduce fusion rules below.
+    BORADCAST_FUSE_DEPTH = 20
+    REDUCE_FUSE_DEPTH = 20
+
+    def get_default_mode(self, op):
+        """Get default mode in CPU: MODE_BASIC for reshape ops, MODE_COMPOSITE for everything else."""
+        pattern = PrimLib.iter_type(op)
+        return self.Area.MODE_BASIC if pattern == PrimLib.RESHAPE else self.Area.MODE_COMPOSITE
+
+    def pattern_fuse(self, fuse_func=None):
+        """
+        Fuse Areas by pattern.
+
+        Every rule below takes a dominant area `dom` and returns either None (rule does not apply)
+        or a tuple (areas, flag). In these rules the flag is True when the candidate areas were
+        taken from dom.in_relations and False when they were taken from dom.out_relations.
+        NOTE(review): how the flag is consumed is decided by self.fuse / fuse_func, which are
+        not visible here.
+
+        Args:
+            fuse_func: optional callable applied once to each rule; when None, the rules are
+                applied repeatedly through self.fuse until nothing changes.
+        """
+        def _reshape(dom):
+            # Applies to RESHAPE areas only. Picks the neighbour with the smallest pattern;
+            # an input neighbour replaces the chosen output neighbour only if its pattern is
+            # strictly smaller.
+            if dom.pattern != PrimLib.RESHAPE:
+                return None
+            min_area, forward_fuse = None, False
+            for a, _ in dom.out_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and dom.check_acyclic(a) and \
+                        (min_area is None or a.pattern < min_area.pattern):
+                    min_area = a
+            for a, _ in dom.in_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and a.check_acyclic(dom) and \
+                        len(dom.ops[0].inputs[0].to_ops) == 1 and not a.is_output and \
+                        (min_area is None or a.pattern < min_area.pattern):
+                    min_area, forward_fuse = a, True
+            return ([min_area], forward_fuse) if min_area else None
+
+        def _elemwise_depth(dom):
+            # Applies when dom is ELEMWISE/BROADCAST with exactly one input area, and that area
+            # has exactly one output relation, an ELEMWISE relation to dom and the same output shape.
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or len(dom.in_relations) != 1:
+                return None
+            a, r = list(dom.in_relations.items())[0]
+            if a.pattern > PrimLib.BROADCAST or len(a.out_relations) != 1 or r != PrimLib.ELEMWISE or \
+                    a.dom_op().output.shape != dom.dom_op().output.shape:
+                return None
+            return [a], True
+
+        def _elemwise_width(dom):
+            # Collects every input area that is at most BROADCAST, related element-wise,
+            # acyclic with dom and has the same output shape.
+            # NOTE(review): returns an empty list (not None) when no input qualifies.
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST):
+                return None
+            fused = []
+            for a, r in dom.in_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and r == PrimLib.ELEMWISE and a.check_acyclic(dom) and \
+                        a.dom_op().output.shape == dom.dom_op().output.shape:
+                    fused.append(a)
+            return fused, True
+
+        def _broadcast_pat_exclude(dom, a, r):
+            # Returns True when output area `a` (reached through relation `r`) is rejected
+            # by the broadcast rules; a REDUCE area is held to the stricter ELEMWISE limits.
+            if a.pattern == PrimLib.REDUCE:
+                return dom.pattern > PrimLib.ELEMWISE or r > PrimLib.ELEMWISE
+            return a.pattern > PrimLib.REDUCE or r > PrimLib.BROADCAST
+
+        def _broadcast_depth(dom):
+            # Applies when dom has a single output area, is not a graph output and is within
+            # the depth limit, and that output area has dom as its only input.
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or len(dom.out_relations) != 1 or \
+                    dom.is_output or len(dom.ops) > self.BORADCAST_FUSE_DEPTH:
+                return None
+            a, r = list(dom.out_relations.items())[0]
+            if _broadcast_pat_exclude(dom, a, r) or len(a.in_relations) != 1:
+                return None
+            return [a], False
+
+        def _broadcast_width(dom):
+            # All-or-nothing: every output area must be accepted, acyclic with dom and share
+            # the output shape of the first accepted area, otherwise the rule does not apply.
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or \
+                    dom.is_output or len(dom.ops) > self.BORADCAST_FUSE_DEPTH:
+                return None
+            fused = []
+            for a, r in dom.out_relations.items():
+                if _broadcast_pat_exclude(dom, a, r) or not dom.check_acyclic(a) or \
+                        (fused and fused[0].dom_op().output.shape != a.dom_op().output.shape):
+                    return None
+                fused.append(a)
+            return fused, False
+
+        def _reduce_pat_exclude(_, a, r):
+            # Returns True when input area `a` is rejected by the reduce rules: it holds too
+            # many ops, is more complex than ELEMWISE, or is related through BROADCAST or
+            # anything above REDUCE. The first argument is unused.
+            if len(a.ops) > self.REDUCE_FUSE_DEPTH:
+                return True
+            return a.pattern > PrimLib.ELEMWISE or r > PrimLib.REDUCE or r == PrimLib.BROADCAST
+
+        def _reduce_depth(dom):
+            # Applies when a REDUCE area has a single accepted input area whose only output is dom.
+            if dom.pattern != PrimLib.REDUCE or len(dom.in_relations) != 1:
+                return None
+            a, r = list(dom.in_relations.items())[0]
+            if _reduce_pat_exclude(dom, a, r) or len(a.out_relations) != 1:
+                return None
+            return [a], True
+
+        def _reduce_width(dom):
+            # Collects every accepted, acyclic input area of a REDUCE area.
+            # NOTE(review): returns an empty list (not None) when no input qualifies.
+            if dom.pattern != PrimLib.REDUCE:
+                return None
+            fused = []
+            for a, r in dom.in_relations.items():
+                if not _reduce_pat_exclude(dom, a, r) and a.check_acyclic(dom):
+                    fused.append(a)
+            return fused, True
+
+        def _fuse_loop():
+            # Apply all rules in a fixed order and repeat until one full pass changes nothing.
+            # `self.fuse(...) or changed` keeps the call on the left so no rule is skipped
+            # by short-circuit evaluation.
+            changed = True
+            while changed:
+                changed = False
+                changed = self.fuse(_reshape) or changed
+                changed = self.fuse(_elemwise_depth) or changed
+                changed = self.fuse(_elemwise_width) or changed
+                changed = self.fuse(_reduce_depth) or changed
+                changed = self.fuse(_reduce_width) or changed
+                changed = self.fuse(_broadcast_depth) or changed
+                changed = self.fuse(_broadcast_width) or changed
+
+        def _fuse_once(fuse_func):
+            # Try the rules in order and stop at the first one for which fuse_func returns
+            # a truthy value (the `or` chain short-circuits).
+            if fuse_func(_reshape) or fuse_func(_elemwise_depth) or fuse_func(_elemwise_width) or \
+                    fuse_func(_reduce_depth) or fuse_func(_reduce_width) or fuse_func(_broadcast_depth) or \
+                    fuse_func(_broadcast_width):
+                return
+
+        if fuse_func is None:
+            _fuse_loop()
+        else:
+            _fuse_once(fuse_func)
+
+
 def split(graph, target, flags):
    """Split graph"""
    result = None
    if target == "cuda":
        result = GraphSplitGpu(graph, flags).split()
-    else:
+    elif target == "aicore":
        result = GraphSplitAscend(graph, flags).split()
+    else:
+        result = GraphSplitCpu(graph, flags).split()
    return result
--- a/mindspore/_extends/graph_kernel/model/model_builder.py
+++ b/mindspore/_extends/graph_kernel/model/model_builder.py
@ -132,7 +132,7 @@ class CompositeGraph:
                return dict()
            attr = {}
            for a in op['attr']:
-                if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'):
+                if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin', 'Argmax', 'Argmin'):
                    attr['reduce_axis'] = a['value']
                else:
                    attr[a['name']] = a['value']
--- a/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py
+++ b/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py
@ -33,7 +33,7 @@ def copy_json(pid_path, ppid_path):
        shutil.move(os.path.join(pid_path, json_file), ppid_path)


-def _compile_akg_task_gpu(json_strs, attrs):
+def _compile_akg_task_default(json_strs, attrs):
    """
    compile func called in single process

@ -110,16 +110,14 @@ class AkgProcess:
        if self.argc == 0:
            raise ValueError("json must be not null")
        args = [(arg, attrs) for arg in self.args]
-        if self.platform == "GPU":
-            with Pool(processes=self.process_num) as pool:
-                res = pool.starmap_async(_compile_akg_task_gpu, args)
-                res.get(timeout=self.wait_time)
-        elif self.platform == "ASCEND":
+        if self.platform == "ASCEND":
            with Pool(processes=self.process_num) as pool:
                res = pool.starmap_async(_compile_akg_task_ascend, args)
                res.get(timeout=self.wait_time)
        else:
-            raise ValueError("The value of 'platform' must be 'GPU' or 'ASCEND'.")
+            with Pool(processes=self.process_num) as pool:
+                res = pool.starmap_async(_compile_akg_task_default, args)
+                res.get(timeout=self.wait_time)
        return True

    def accept_json(self, json):
--- a/mindspore/_extends/remote/kernel_build_server_akg.py
+++ b/mindspore/_extends/remote/kernel_build_server_akg.py
@ -12,22 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""kernel build server for gpu"""
+"""kernel build server for akg kernels"""
 import sys
 import warnings
 from mindspore._extends.remote.kernel_build_server import Messager, get_logger, AkgBuilder


-class GpuMessager(Messager):
+class AkgMessager(Messager):
    '''
-    GPU Messager
+    Default Messager for akg kernels.
    It works as a server, communicating with c++ client.
    '''

    def __init__(self, fdin, fdout):
        super().__init__(fdin, fdout)
-        get_logger().info("[TRACE] GPU Messager init...")
-        self.akg_builder = AkgBuilder("GPU")
+        get_logger().info("[TRACE] Akg Messager init...")
+        self.akg_builder = AkgBuilder("default")

    def handle(self):
        """
@ -42,7 +42,7 @@ class GpuMessager(Messager):
            self.exit()

    def exit(self):
-        get_logger().info("[TRACE] GPU Messager Exit...")
+        get_logger().info("[TRACE] Akg Messager Exit...")
        exit()


@ -51,5 +51,5 @@ if __name__ == '__main__':
    if len(sys.argv) != 3:
        raise Exception('Incorrect argv: {}'.format(sys.argv))
    get_logger().debug(f"[TRACE] argv: {str(sys.argv)}")
-    messager = GpuMessager(int(sys.argv[1]), int(sys.argv[2]))
+    messager = AkgMessager(int(sys.argv[1]), int(sys.argv[2]))
    messager.run()
--- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
@ -13,12 +13,6 @@ if(ENABLE_D)
    file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "kernel_query.cc"
        "kernel_fusion.cc"
-        "akg/akg_kernel_build.cc"
-        "akg/ascend/*.cc"
-        "akg/akg_kernel_json_generator.cc"
-        "akg/akg_kernel_json_decoder.cc"
-        "akg/akg_kernel_attrs_process.cc"
-        "akg/akg_kernel_metadata.cc"
        "tbe/*.cc"
        "host/*.cc"
        "aicpu/*.cc"
@ -95,11 +89,6 @@ endif()
 if(ENABLE_GPU)
    file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "gpu/*.cu"
-        "akg/akg_kernel_build.cc"
-        "akg/gpu/*.cc"
-        "akg/akg_kernel_json_generator.cc"
-        "akg/akg_kernel_json_decoder.cc"
-        "akg/akg_kernel_attrs_process.cc"
    )

    file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
@ -122,7 +111,35 @@ if(ENABLE_GPU)
    # add_library(_mindspore_kernel_cuda_obj OBJECT ${CUDA_SRC_LIST})
 endif()

-set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST}
+# AKG kernel-compiler sources. The common files are collected whenever AKG is enabled on Linux;
+# backend-specific files are appended only for the backends that are enabled.
+# CMAKE_SYSTEM_NAME is passed by name (not as ${...}) so if() dereferences it exactly once.
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+    file(GLOB_RECURSE AKG_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+        "akg/akg_kernel_build.cc"
+        "akg/akg_kernel_json_generator.cc"
+        "akg/akg_kernel_json_decoder.cc"
+        "akg/akg_kernel_attrs_process.cc"
+    )
+    if(ENABLE_GPU)
+        file(GLOB_RECURSE AKG_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+            "akg/gpu/*.cc"
+        )
+        list(APPEND AKG_SRC_LIST ${AKG_GPU_SRC_LIST})
+    endif()
+    if(ENABLE_D)
+        # akg_kernel_metadata.cc is only collected together with the Ascend sources.
+        file(GLOB_RECURSE AKG_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+            "akg/ascend/*.cc"
+            "akg/akg_kernel_metadata.cc"
+        )
+        list(APPEND AKG_SRC_LIST ${AKG_D_SRC_LIST})
+    endif()
+    if(ENABLE_CPU)
+        file(GLOB_RECURSE AKG_CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+            "akg/cpu/*.cc"
+        )
+        list(APPEND AKG_SRC_LIST ${AKG_CPU_SRC_LIST})
+    endif()
+endif()
+
+set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST}
    PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL)
 add_library(_mindspore_backend_kernel_compiler_obj OBJECT ${KERNEL_SRC_LIST} ${CPU_SRC_LIST}
-    ${GPU_SRC_LIST} ${D_SRC_LIST} ${QUANTUM_SRC_LIST})
+    ${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST} ${QUANTUM_SRC_LIST})
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc
@ -16,6 +16,7 @@

 #include "backend/kernel_compiler/akg/akg_kernel_build.h"

+#include <sys/shm.h>
 #include <stdio.h>
 #include <errno.h>
 #include <fcntl.h>
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h
@ -17,8 +17,6 @@
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_

-#include <sys/shm.h>
-
 #include <string>
 #include <utility>
 #include <vector>
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc
@ -0,0 +1,49 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h"
+#include <Python.h>
+#include <vector>
+#include <memory>
+#include <string>
+#include "backend/kernel_compiler/common_utils.h"
+#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h"
+#include "utils/ms_utils.h"
+#include "backend/session/anf_runtime_algorithm.h"
+
+namespace mindspore {
+namespace kernel {
+// Forwards to SearchCache with the CPU processor tag (kProcessorCpu).
+KernelPackPtr AkgCpuKernelBuilder::AkgSearchCache(const std::string &kernel_name) {
+  return SearchCache(kernel_name, kProcessorCpu);
+}
+
+// Forwards to InsertCache with the CPU processor tag (kProcessorCpu).
+KernelPackPtr AkgCpuKernelBuilder::AkgInsertCache(const std::string &kernel_name) {
+  return InsertCache(kernel_name, kProcessorCpu);
+}
+
+// Creates a CpuKernelMod for kernel_pack, copies the input/output size lists from
+// json_generator into it and attaches it to anf_node. The workspace size list is not set here.
+void AkgCpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
+                                          const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
+  auto kernel_mod_ptr = std::make_shared<CpuKernelMod>(kernel_pack);
+  kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
+  kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
+  AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
+}
+
+// Forwards to kernel::SaveJsonInfo, using the path reported by the KernelMeta singleton.
+void AkgCpuKernelBuilder::AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) {
+  kernel::SaveJsonInfo(kernel_name, kernel_json, kernel::KernelMeta::GetInstance()->kernel_meta_path());
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h
@ -0,0 +1,39 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
+#include <string>
+#include "backend/kernel_compiler/akg/akg_kernel_build.h"
+#include "base/base.h"
+
+namespace mindspore {
+namespace kernel {
+// CPU specialization of AkgKernelBuilder: overrides the client, cache, kernel-mod and
+// json-saving hooks of the base class.
+class AkgCpuKernelBuilder : public AkgKernelBuilder {
+ public:
+  AkgCpuKernelBuilder() = default;
+  ~AkgCpuKernelBuilder() = default;
+
+  // Returns the address of the AkgKernelBuildClient singleton.
+  kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); }
+  KernelPackPtr AkgSearchCache(const std::string &kernel_name) override;
+  KernelPackPtr AkgInsertCache(const std::string &kernel_name) override;
+  // Builds a CpuKernelMod from kernel_pack and attaches it to anf_node.
+  void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator,
+                       const AnfNodePtr &anf_node) override;
+  void AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) override;
+};
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc
@ -0,0 +1,143 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h"
+
+#include <dlfcn.h>
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include "nlohmann/json.hpp"
+#include "backend/kernel_compiler/common_utils.h"
+#include "common/thread_pool.h"
+#include "utils/ms_utils.h"
+#include "mindspore/ccsrc/debug/common.h"
+
+namespace mindspore {
+namespace kernel {
+namespace {
+using AkgParallelLambda = int (*)(int task_id, int num_task, void *cdata);
+// Parallel-launch callback handed to AKG kernels: runs `flambda` once per worker on the
+// MindSpore thread pool and blocks until all workers have finished. Always returns 0.
+int AkgLaunchFunc(AkgParallelLambda flambda, void *cdata, int num_task) {
+  // Use at most one worker per task and at most the pool's synchronous thread count.
+  const size_t worker_count =
+    std::min(mindspore::common::ThreadPool::GetInstance().GetSyncRunThreadNum(), static_cast<size_t>(num_task));
+  std::vector<mindspore::common::Task> tasks;
+  for (size_t worker_id = 0; worker_id < worker_count; ++worker_id) {
+    // Each task calls the lambda with its own index and the total number of workers.
+    tasks.emplace_back([flambda, cdata, worker_count, worker_id]() {
+      flambda(worker_id, worker_count, cdata);
+      return mindspore::common::SUCCESS;
+    });
+  }
+  mindspore::common::ThreadPool::GetInstance().SyncRun(tasks);
+  return 0;
+}
+
+// Callback table whose address is appended to the kernel's argument array in CpuKernelMod::Launch.
+// Member order is kept as-is because the struct is handed to the kernel as a raw pointer.
+struct AkgCallBack {
+  void *parallel_launch_func;    // type-erased pointer to AkgLaunchFunc
+  void *(*malloc_func)(size_t);  // points at malloc
+  void (*free_func)(void *);     // points at free
+
+  AkgCallBack()
+      : parallel_launch_func(reinterpret_cast<void *>(&AkgLaunchFunc)), malloc_func(&malloc), free_func(&free) {}
+  ~AkgCallBack() = default;
+};
+}  // namespace
+CpuKernelManagerPtr CpuKernelMod::kernelmanager_ = std::make_shared<CpuKernelManager>();
+
+// Closes every shared-object handle stored in the function cache.
+CpuKernelManager::~CpuKernelManager() {
+  for (const auto &entry : cpu_func_map_) {
+    void *so_handle = entry.second.second;
+    if (so_handle == nullptr) {
+      continue;
+    }
+    (void)dlclose(so_handle);
+  }
+}
+
+// Returns the cached function pointer for kernel_name, or nullptr if it is not cached.
+// Takes no lock itself.
+void *CpuKernelManager::SearchFunc(const std::string &kernel_name) const {
+  const auto found = cpu_func_map_.find(kernel_name);
+  return found != cpu_func_map_.end() ? found->second.first : nullptr;
+}
+
+// Same lookup as SearchFunc, performed while holding mutex_ in shared mode.
+void *CpuKernelManager::SearchFuncWithSharedLock(const std::string &kernel_name) const {
+  std::shared_lock<std::shared_mutex> read_guard(mutex_);
+  return SearchFunc(kernel_name);
+}
+
+// Returns the entry point of the kernel named kernel_name, loading its shared object on first use.
+// The result is cached together with the dlopen handle, which the destructor closes.
+// Returns nullptr (after logging an error) if the shared object cannot be opened or does not
+// export the symbol.
+void *CpuKernelManager::GetFunction(const std::string &kernel_name) {
+  // Fast path: concurrent readers only need the shared lock.
+  if (auto func = SearchFuncWithSharedLock(kernel_name); func != nullptr) {
+    return func;
+  }
+  std::unique_lock lock(mutex_);
+  // Search cache again between setting unique lock and calling "dlopen", to make sure that
+  // only one thread can call "dlopen" and insert handle to the cache for a new kernel_name.
+  // To avoid that several nodes (with the same kernel_name) open the same "so" by dlopen,
+  // but only cache it once, then the "dlclose" will be called only once, causing resource leak.
+  if (auto func = SearchFunc(kernel_name); func != nullptr) {
+    return func;
+  }
+  // The shared object's file name is the kernel name cut at its last "_kernel" occurrence
+  // (or the whole name when there is none).
+  std::string fn;
+  auto pos = kernel_name.rfind("_kernel");
+  if (pos != std::string::npos) {
+    fn = kernel_name.substr(0, pos);
+  } else {
+    fn = kernel_name;
+  }
+  std::string fn_so = kCpuKernelMeta + fn + ".so";
+  auto handle = dlopen(fn_so.c_str(), RTLD_LAZY | RTLD_LOCAL);
+  if (handle == nullptr) {
+    MS_LOG(ERROR) << "Load " << fn_so << " failed. kernel: " << kernel_name;
+    return nullptr;
+  }
+  auto launch_func = dlsym(handle, kernel_name.c_str());
+  if (launch_func == nullptr) {
+    MS_LOG(ERROR) << "Undefined symbol " << kernel_name << " in " << fn_so;
+    // The handle is not cached on this path, so the destructor would never close it.
+    (void)dlclose(handle);
+    return nullptr;
+  }
+  cpu_func_map_[kernel_name] = std::make_pair(launch_func, handle);
+  return launch_func;
+}
+
+// Runs the kernel described by kernel_pack_.
+// The unnamed workspace argument is ignored and stream_ptr is not referenced in the body.
+// Returns false when the kernel function cannot be resolved, true once the kernel has been called.
+bool CpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+                          const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  // The kernel name is read from the "kernelName" field of the kernel's JSON description.
+  auto js = nlohmann::json::parse(kernel_pack_->GetJson()->contents,
+                                  kernel_pack_->GetJson()->contents + kernel_pack_->GetJson()->len);
+  std::string kernel_name = js["kernelName"];
+  auto launch_func = kernelmanager_->GetFunction(kernel_name);
+  if (launch_func == nullptr) {
+    MS_LOG(ERROR) << "GetFunction failed. kernel: " << kernel_name;
+    return false;
+  }
+  // Argument array passed to the kernel: input addresses, then output addresses,
+  // then a pointer to the callback table.
+  std::vector<void *> runtimeargs;
+  (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs),
+                       [](const AddressPtr &input) -> void * { return input->addr; });
+  (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
+                       [](const AddressPtr &output) -> void * { return output->addr; });
+  // akg_callback is a local object, so its address is only valid during the call below.
+  AkgCallBack akg_callback;
+  runtimeargs.emplace_back(reinterpret_cast<void *>(&akg_callback));
+  using AkgCpuKernelFunction = void (*)(void *);
+  reinterpret_cast<AkgCpuKernelFunction>(launch_func)(reinterpret_cast<void *>(runtimeargs.data()));
+  return true;
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h
@ -0,0 +1,73 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_
+#include <string>
+#include <vector>
+#include <memory>
+#include <utility>
+#include <unordered_map>
+#include <mutex>
+#include <shared_mutex>
+#include "backend/kernel_compiler/kernel.h"
+
+namespace mindspore {
+namespace kernel {
+// Loads kernel entry points from shared objects via dlopen/dlsym and caches them by kernel name.
+class CpuKernelManager {
+ public:
+  CpuKernelManager() = default;
+  // Calls dlclose on every cached shared-object handle.
+  ~CpuKernelManager();
+
+  // Returns the entry point for kernel_name, loading its shared object on first use;
+  // returns nullptr if loading or symbol lookup fails.
+  void *GetFunction(const std::string &kernel_name);
+
+ private:
+  // Cache lookup that takes no lock itself; returns nullptr when kernel_name is not cached.
+  void *SearchFunc(const std::string &kernel_name) const;
+  // Cache lookup performed while holding mutex_ in shared mode.
+  void *SearchFuncWithSharedLock(const std::string &kernel_name) const;
+
+  // cache the kernel function: kernel_name -> {kernel_func, so_handle}
+  std::unordered_map<std::string, std::pair<void *, void *>> cpu_func_map_;
+  // Locked in shared mode for lookups and exclusively while a new kernel is loaded and inserted.
+  mutable std::shared_mutex mutex_;
+};
+using CpuKernelManagerPtr = std::shared_ptr<CpuKernelManager>;
+
+// KernelMod implementation that launches a kernel resolved through the shared CpuKernelManager.
+class CpuKernelMod : public KernelMod {
+ public:
+  explicit CpuKernelMod(const KernelPackPtr &kp) : kernel_pack_(kp) {}
+  ~CpuKernelMod() = default;
+
+  // Setters copy the given size list into the corresponding member.
+  void SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; }
+  void SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; }
+  void SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; }
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+  // Calls the kernel with the input and output addresses; the second (workspace) argument is unnamed.
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+
+  // Static member: one kernel manager (and therefore one function cache) shared by all instances.
+  static CpuKernelManagerPtr kernelmanager_;
+
+ private:
+  KernelPackPtr kernel_pack_;
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;  // workspace is not used in cpu kernel.
+};
+
+using CpuKernelModPtr = std::shared_ptr<CpuKernelMod>;
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h
@ -27,7 +27,7 @@ class AkgGpuKernelBuilder : public AkgKernelBuilder {
  AkgGpuKernelBuilder() = default;
  ~AkgGpuKernelBuilder() = default;

-  kernel::KernelBuildClient *GetClient() override { return &(kernel::GpuKernelBuildClient::Instance()); }
+  kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); }
  KernelPackPtr AkgSearchCache(const std::string &kernel_name) override;
  KernelPackPtr AkgInsertCache(const std::string &kernel_name) override;
  void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator,
--- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc
@ -157,16 +157,21 @@ FusionType GetFusionTypeByName(const std::string &name) {
 }

 void KernelMeta::Initialize() {
-  kernel_meta_path_ = std::string(kGpuKernelMeta) + "/";
+  if (GetStrProcessorFromContext() == kProcessorCpu) {
+    kernel_meta_path_ = std::string(kCpuKernelMeta);
+  } else {
+    kernel_meta_path_ = std::string(kGpuKernelMeta) + "/";

 #if defined(_WIN32) || defined(_WIN64)
-  auto ret = mkdir(kernel_meta_path_.c_str());
+    auto ret = mkdir(kernel_meta_path_.c_str());
 #else
-  auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU);
+    auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU);
 #endif
-  if (ret != 0) {
-    MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later";
+    if (ret != 0) {
+      MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later";
+    }
  }
+
  initialized_ = true;
 }

@ -238,6 +243,8 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro
  std::string kernel_json;
  if (processor == kProcessorAiCore || processor == kProcessorAiCpu) {
    kernel_json = kCceKernelMeta;
+  } else if (processor == kProcessorCpu) {
+    kernel_json = kCpuKernelMeta;
  } else {
    kernel_json = bin_map->kernel_meta_path();
  }
@ -872,6 +879,8 @@ Processor GetProcessorFromContext() {
    processor = kernel::Processor::CUDA;
  } else if (device_info == kAscendDevice) {
    processor = kernel::Processor::AICORE;
+  } else if (device_info == kCPUDevice) {
+    processor = kernel::Processor::CPU;
  }
  return processor;
 }
@ -883,6 +892,8 @@ std::string GetStrProcessorFromContext() {
    str_processor = kernel::kProcessorCuda;
  } else if (processor == kernel::Processor::AICORE) {
    str_processor = kernel::kProcessorAiCore;
+  } else if (processor == kernel::Processor::CPU) {
+    str_processor = kernel::kProcessorCpu;
  }
  return str_processor;
 }
--- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h
@ -34,10 +34,12 @@
 namespace mindspore {
 namespace kernel {
 constexpr auto kCceKernelMeta = "./kernel_meta/";
+constexpr auto kCpuKernelMeta = "./kernel_meta/";
 constexpr auto kGpuKernelMeta = "./cuda_meta";
 constexpr auto kProcessorAiCore = "aicore";
 constexpr auto kProcessorAiCpu = "aicpu";
 constexpr auto kProcessorCuda = "cuda";
+constexpr auto kProcessorCpu = "cpu";
 constexpr auto kProcessorUnknown = "unknown";
 constexpr auto kJsonSuffix = ".json";
 constexpr auto kInfoSuffix = ".info";
--- a/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc
@ -1,5 +1,5 @@
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -99,6 +99,14 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string &
  (void)kernel_json.seekg(0, std::ios::beg);
  (void)kernel_json.read(json_->contents, SizeToLong(json_->len));

+  if (processor == kProcessorCpu) {
+    std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".so";
+    if (!CheckHash(json_f, bin_f, js)) {
+      return false;
+    }
+    return true;
+  }
+
  if (processor == kProcessorCuda) {
    std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".ptx";
    std::ifstream kernelbin(bin_f);
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel.h
@ -107,6 +107,7 @@ enum Processor {
  AICORE = 0,
  AICPU,
  CUDA,
+  CPU,
 };

 struct FlexArray {
--- a/mindspore/ccsrc/backend/optimizer/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/optimizer/CMakeLists.txt
@ -13,8 +13,6 @@ endif()
 if(ENABLE_D OR ENABLE_ACL)
    file(GLOB_RECURSE _D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "ascend/*.cc"
-        "graph_kernel/*.cc"
-        "graph_kernel/model/*.cc"
    )
    list(APPEND _PREACTIVATE_SRC_LIST ${_D_SRC_LIST})
 endif()
@ -22,8 +20,6 @@ endif()
 if(ENABLE_GPU)
    file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "gpu/*.cc"
-        "graph_kernel/*.cc"
-        "graph_kernel/model/*.cc"
    )
    list(APPEND _PREACTIVATE_SRC_LIST ${_GPU_SRC_LIST})
 endif()
@ -43,6 +39,13 @@ if(ENABLE_CPU)
    list(APPEND _PREACTIVATE_SRC_LIST ${_CPU_SRC_LIST})
 endif()

+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+    # GraphKernel passes are collected whenever AKG is enabled on Linux, regardless of the device backend.
+    # GLOB_RECURSE walks subdirectories, so sources below graph_kernel/ (e.g. model/) are included as well.
+    file(GLOB_RECURSE _GK_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "graph_kernel/*.cc")
+    list(APPEND _PREACTIVATE_SRC_LIST ${_GK_SRC_LIST})
+endif()
+
 set_property(SOURCE ${_PREACTIVATE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
  SUBMODULE_ID=mindspore::SubModuleId::SM_PRE_ACT)
 add_library(_mindspore_backend_optimizer_obj OBJECT ${_PREACTIVATE_SRC_LIST})
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc
@ -38,6 +38,8 @@
 #include "runtime/device/ascend/kernel_select_ascend.h"
 #elif ENABLE_GPU
 #include "runtime/device/gpu/kernel_info_setter.h"
+#elif ENABLE_CPU
+#include "runtime/device/cpu/kernel_select_cpu.h"
 #endif

 namespace mindspore::graphkernel {
@ -608,6 +610,9 @@ void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) {
 #elif ENABLE_GPU
  cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
  device::gpu::SetKernelInfo(cnode, kernel_type);
+#elif ENABLE_CPU
+  cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
+  device::cpu::SetKernelInfo(cnode);
 #endif
 }

--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc
@ -121,7 +121,7 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const {
  pm->AddPass(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);

  // Universal arithmetic simplify
-  pm->AddPass(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu);
+  pm->AddPass(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu);

  // Common subexpression elimination
  pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_2);
@ -158,7 +158,7 @@ PassManagerPtr GraphKernelOptimizer::Split() const {
 PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
  auto pm = std::make_shared<GraphKernelPassManager>(4, "highlevelopt2");
  // Enable atomic add
-  pm->AddPass(std::make_shared<AtomicCleanInsertter>(), OptLevel_2);
+  pm->AddPass(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend);

  // Enable atomic add for stitch nodes.
  auto level = GetPassLevelByFlag(context::GraphKernelFlags::GetInstance().enable_stitch_fusion);
@ -170,8 +170,8 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
  pm->AddPass(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);

  // Enable tsa and uss
-  pm->AddPass(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1);
-  pm->AddPass(std::make_shared<UssAtomicAdd>(), OptLevel_1);
+  pm->AddPass(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
+  pm->AddPass(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);

  return pm;
 }
@ -204,6 +204,7 @@ void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
  MS_EXCEPTION_IF_NULL(context_ptr);
  is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
  is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
+  is_cpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice);

  auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
  optimizer->AddPassManager(PreProcess());
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h
@ -46,6 +46,7 @@ class GraphKernelOptimizer {

  bool is_gpu{false};
  bool is_ascend{false};
+  bool is_cpu{false};
 };

 void GraphKernelOptimize(const KernelGraphPtr &kernel_graph);
--- a/mindspore/ccsrc/backend/session/cpu_session.cc
+++ b/mindspore/ccsrc/backend/session/cpu_session.cc
@ -21,14 +21,17 @@
 #include "ir/anf.h"
 #include "utils/ms_utils.h"
 #include "utils/trace_base.h"
+#include "utils/context/graph_kernel_flags.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "runtime/device/kernel_runtime.h"
+#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 #include "runtime/device/cpu/kernel_select_cpu.h"
 #include "backend/optimizer/common/optimizer.h"
 #include "backend/optimizer/common/pass_manager.h"
 #include "backend/optimizer/cpu/insert_cast_cpu.h"
 #include "backend/optimizer/cpu/insert_format_transform_op.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
 #include "backend/optimizer/pass/replace_node_by_proxy.h"
 #include "backend/optimizer/pass/erase_visit_attr.h"
 #include "debug/anf_ir_dump.h"
@ -102,6 +105,16 @@ void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  kernel_graph->SetExecOrderByDefault();
 }

+void CPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
+#ifdef ENABLE_AKG
+  // Graph kernel optimization runs only when its flag is enabled; exec order is reset afterwards.
+  if (context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
+    graphkernel::GraphKernelOptimize(kernel_graph);
+    kernel_graph->SetExecOrderByDefault();
+  }
+#endif
+}
+
 GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  auto graph_id = graph_sum_;
  auto graph = ConstructKernelGraph(lst, outputs);
@ -112,6 +125,7 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
  MS_LOG(INFO) << "Set kernel info end";
  Optimize(graph);
  FinalOptimize(graph);
+  GraphKernelOptimize(graph);
  MS_LOG(INFO) << "Build kernel";
  BuildKernel(graph.get());
  // Remove reorder after PS feature finish adapting push/pull in auto_monad.
@ -352,10 +366,20 @@ void KernelNotSupportException(const AnfNodePtr &kernel_node) {
 void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto &kernel_nodes = kernel_graph->execution_order();
+  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
+  MS_EXCEPTION_IF_NULL(bin_map);
+  std::vector<AnfNodePtr> akg_nodes;
  for (const auto &kernel_node : kernel_nodes) {
    MS_EXCEPTION_IF_NULL(kernel_node);
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    MS_LOG(INFO) << "Cpu building operator[" << kernel_name << "].";
+    if (session::AnfRuntimeAlgorithm::GetKernelType(kernel_node) == KernelType::AKG_KERNEL) {
+      if (!bin_map->initialized()) {
+        bin_map->Initialize();
+      }
+      akg_nodes.push_back(kernel_node);
+      continue;
+    }
    std::shared_ptr<kernel::CPUKernel> cpu_kernel =
      kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
    if (cpu_kernel == nullptr) {
@ -369,6 +393,10 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
    AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());
    MS_LOG(INFO) << "Cpu build success operator[" << kernel_name << "].";
  }
+#ifdef ENABLE_AKG
+  kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
+  (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes);
+#endif
 }
 }  // namespace session
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/session/cpu_session.h
+++ b/mindspore/ccsrc/backend/session/cpu_session.h
@ -42,6 +42,7 @@ class CPUSession : public SessionBasic {
                        VectorRef *const outputs) override;
  void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
  ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) override;
+  void GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph);
  void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
  KernelGraphPtr BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                             const std::vector<tensor::TensorPtr> &input_tensors,
--- a/mindspore/ccsrc/backend/session/kernel_build_client.h
+++ b/mindspore/ccsrc/backend/session/kernel_build_client.h
@ -256,7 +256,7 @@ class AscendKernelBuildClient : public KernelBuildClient {
  ~AscendKernelBuildClient() override { Close(); }
 };

-class GpuKernelBuildClient : public KernelBuildClient {
+class AkgKernelBuildClient : public KernelBuildClient {
 public:
  // Server configure
  constexpr inline static auto kGetPathScript =
@ -264,15 +264,15 @@ class GpuKernelBuildClient : public KernelBuildClient {
    "\""
    "import pkgutil;"
    "path = pkgutil"
-    ".get_loader(\\\"mindspore._extends.remote.kernel_build_server_gpu\\\")"  // Server module name
+    ".get_loader(\\\"mindspore._extends.remote.kernel_build_server_akg\\\")"  // Server module name
    ".get_filename();"
    "print('[~]' + path)"
    "\"";

-  constexpr inline static auto kServerScript = "kernel_build_server_gpu.py";
+  constexpr inline static auto kServerScript = "kernel_build_server_akg.py";

-  static GpuKernelBuildClient &Instance() {
-    static GpuKernelBuildClient instance;
+  static AkgKernelBuildClient &Instance() {
+    static AkgKernelBuildClient instance;
    return instance;
  }

@ -283,15 +283,15 @@ class GpuKernelBuildClient : public KernelBuildClient {
    return GetScriptFilePath(env, kGetPathScript, kServerScript);
  }

-  GpuKernelBuildClient(const GpuKernelBuildClient &) = delete;
-  GpuKernelBuildClient &operator=(const GpuKernelBuildClient &) = delete;
+  AkgKernelBuildClient(const AkgKernelBuildClient &) = delete;
+  AkgKernelBuildClient &operator=(const AkgKernelBuildClient &) = delete;

-  GpuKernelBuildClient(GpuKernelBuildClient &&) = delete;
-  GpuKernelBuildClient &operator=(GpuKernelBuildClient &&) = delete;
+  AkgKernelBuildClient(AkgKernelBuildClient &&) = delete;
+  AkgKernelBuildClient &operator=(AkgKernelBuildClient &&) = delete;

 private:
-  GpuKernelBuildClient() { Open(); }
-  ~GpuKernelBuildClient() override { Close(); }
+  AkgKernelBuildClient() { Open(); }
+  ~AkgKernelBuildClient() override { Close(); }
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc
+++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc
@ -18,10 +18,12 @@
 #include <string>
 #include "runtime/device/cpu/cpu_device_address.h"
 #include "runtime/device/cpu/cpu_memory_manager.h"
+#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 #include "backend/kernel_compiler/kernel_build_info.h"
 #include "runtime/device/cpu/kernel_select_cpu.h"
 #include "utils/trace_base.h"
+#include "utils/context/graph_kernel_flags.h"
 #include "backend/optimizer/common/optimizer.h"
 #include "backend/optimizer/common/pass_manager.h"
 #include "backend/optimizer/common/common_backend_optimization.h"
@ -29,6 +31,8 @@
 #include "backend/optimizer/cpu/insert_format_transform_op.h"
 #include "backend/optimizer/pass/replace_node_by_proxy.h"
 #include "backend/optimizer/pass/erase_visit_attr.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
+#include "backend/session/anf_runtime_algorithm.h"
 #include "profiler/device/cpu/cpu_profiling.h"
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
@ -113,6 +117,14 @@ void CPUDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {

  // Run final optimization.
  opt::CommonFinalOptimization(graph);
+
+#ifdef ENABLE_AKG
+  // Run graph kernel fusion optimization
+  if (context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
+    graphkernel::GraphKernelOptimize(graph);
+    graph->SetExecOrderByDefault();
+  }
+#endif
 }

 void CPUDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
@ -173,11 +185,21 @@ void CPUDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const
 }

 void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
+  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
+  MS_EXCEPTION_IF_NULL(bin_map);
+  std::vector<AnfNodePtr> akg_nodes;
  for (const auto &node : nodes) {
    MS_EXCEPTION_IF_NULL(node);
    if (AnfAlgo::IsControlOpExecInBackend(node)) {
      continue;
    }
+    if (session::AnfRuntimeAlgorithm::GetKernelType(node) == KernelType::AKG_KERNEL) {
+      if (!bin_map->initialized()) {
+        bin_map->Initialize();
+      }
+      akg_nodes.push_back(node);
+      continue;
+    }
    std::string kernel_name = AnfAlgo::GetCNodeName(node);
    std::shared_ptr<kernel::CPUKernel> cpu_kernel = kernel::CPUKernelFactory::GetInstance().Create(kernel_name, node);
    if (!cpu_kernel) {
@ -195,6 +217,10 @@ void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
    cpu_kernel->Init(node);
    AnfAlgo::SetKernelMod(cpu_kernel, node.get());
  }
+#ifdef ENABLE_AKG
+  kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
+  (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes);
+#endif
 }

 void CPUDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const {
@ -212,8 +238,6 @@ bool CPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<Ad
  MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
-  auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
-  MS_EXCEPTION_IF_NULL(cpu_kernel_mod);

 #ifdef PLATFORM_86
  // Some CPU kernels need set the flush zero mode to improve performance.
@ -226,6 +250,8 @@ bool CPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<Ad
  // Some CPU kernels can't initialize kernel and launch kernel in different thread, so reinitialize the kernels before
  // launch.
  if (kOpNotSupportMultiThreadExecList.find(AnfAlgo::GetCNodeName(kernel)) != kOpNotSupportMultiThreadExecList.end()) {
+    auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
+    MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
    cpu_kernel_mod->InitKernel(kernel);
  }
 #ifndef ENABLE_SECURITY
--- a/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
+++ b/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
@ -172,7 +172,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
  FlagRegister reg(flag_map);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
+  bool is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);

  // Set opt_level first, some flags' default value depends on it.
  // Default optimization level is level 2 when enable graphkernel
@ -192,7 +192,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma

  // Integer flags
  reg.AddFlag("online_tuning", &online_tuning);
-  reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_gpu ? OpLevel_MAX : OpLevel_0);
+  reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_ascend ? OpLevel_0 : OpLevel_MAX);

  // String flags
  reg.AddFlag("repository_path", &repository_path);
--- a/mindspore/context.py
+++ b/mindspore/context.py
@ -489,8 +489,8 @@ def _check_target_specific_cfgs(device, arg_key):
    device_cfgs = {
        'enable_dump': ['Ascend'],
        'save_dump_path': ['Ascend'],
-        'enable_graph_kernel': ['Ascend', 'GPU'],
-        'graph_kernel_flags': ['Ascend', 'GPU'],
+        'enable_graph_kernel': ['Ascend', 'GPU', 'CPU'],
+        'graph_kernel_flags': ['Ascend', 'GPU', 'CPU'],
        'enable_reduce_precision': ['Ascend'],
        'enable_profiling': ['Ascend'],
        'profiling_options': ['Ascend'],
--- a/scripts/build/build_mindspore.sh
+++ b/scripts/build/build_mindspore.sh
@ -79,8 +79,11 @@ build_mindspore()
    if [[ "X$USE_GLOG" = "Xon" ]]; then
        CMAKE_ARGS="${CMAKE_ARGS} -DUSE_GLOG=ON"
    fi
-    if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then
+    if [[ "X$ENABLE_AKG" = "Xon" ]]; then
        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_AKG=ON"
+        if [[ "X$ENABLE_CPU" = "Xon" && "X$ENABLE_D" != "Xon" && "X$ENABLE_GPU" != "Xon" ]]; then
+            CMAKE_ARGS="${CMAKE_ARGS} -DUSE_LLVM=ON"
+        fi
    fi
    if [[ "X$ENABLE_ACL" = "Xon" ]]; then
        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ACL=ON"
--- a/scripts/build/default_options.sh
+++ b/scripts/build/default_options.sh
@ -44,7 +44,7 @@ init_default_options()
  export LITE_PLATFORM=""
  export LITE_ENABLE_AAR="off"
  export USE_GLOG="on"
-  export ENABLE_AKG="on"
+  export ENABLE_AKG="off"
  export ENABLE_ACL="off"
  export ENABLE_D="off"
  export ENABLE_DEBUGGER="on"
--- a/scripts/build/parse_device.sh
+++ b/scripts/build/parse_device.sh
@ -40,6 +40,7 @@ parse_device()
      exit 1
    fi
    export CUDA_VERSION="$DEVICE_VERSION"
+    export ENABLE_AKG="on"
  elif [[ "X$DEVICE" == "Xd" || "X$DEVICE" == "Xascend" ]]; then
    # version default 910
    if [[ "X$DEVICE_VERSION" == "X" ]]; then
@ -54,6 +55,7 @@ parse_device()
      export ENABLE_ACL="on"
      ENABLE_CPU="on"
      export ENABLE_MPI="on"
+      export ENABLE_AKG="on"
    else
      echo "Invalid value ${DEVICE_VERSION} for option -V"
      usage
--- a/scripts/build/usage.sh
+++ b/scripts/build/usage.sh
@ -21,7 +21,7 @@ usage()
  echo "Usage:"
  echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t ut|st] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
  echo "              [-a on|off] [-p on|off] [-i] [-R] [-D on|off] [-j[n]] [-e gpu|ascend|cpu] \\"
-  echo "              [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\"
+  echo "              [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K on|off] \\"
  echo "              [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\"
  echo "              [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\"
  echo "              [-L Tensor-RT path] [-y on|off]  \\"
@ -52,7 +52,7 @@ usage()
  echo "    -V Specify the device version, if -e gpu, default CUDA 10.1, if -e ascend, default Ascend 910"
  echo "    -I Enable compiling mindspore lite for arm64, arm32 or x86_64, default disable mindspore lite compilation"
  echo "    -A Enable compiling mindspore lite aar package, option: on/off, default: off"
-  echo "    -K Compile with AKG, default on"
+  echo "    -K Compile with AKG, default on if -e gpu or -e ascend, else default off"
  echo "    -B Enable debugger, default on"
  echo "    -E Enable IBVERBS for parameter server, default off"
  echo "    -l Compile with python dependency, default on"