diff --git a/CMakeLists.txt b/CMakeLists.txt
index 098b8af2383..3dd84ca95d0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,7 +47,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(PYBIND11_CPP_STANDARD -std=c++17)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPTION_CXX_FLAGS}")
 
-if(ENABLE_AKG AND (ENABLE_D OR ENABLE_GPU))
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_subdirectory("${CMAKE_SOURCE_DIR}/akg")
 endif()
 
diff --git a/build.sh b/build.sh
index af83835b99d..5fa6113cc75 100755
--- a/build.sh
+++ b/build.sh
@@ -45,7 +45,7 @@ update_submodule()
   cd "${BASEPATH}/graphengine"
   git submodule update --init metadef
   cd "${BASEPATH}"
-  if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then
+  if [[ "X$ENABLE_AKG" = "Xon" ]]; then
     git submodule update --init --recursive akg
   fi
 }
@@ -57,7 +57,6 @@ build_exit()
     exit 1
 }
 
-
 make_clean()
 {
   echo "enable make clean"
diff --git a/cmake/options.cmake b/cmake/options.cmake
index 59d5861c5ed..cfdf946b0ff 100644
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@@ -151,3 +151,7 @@ endif()
 if(ENABLE_CPU AND NOT WIN32)
     add_compile_definitions(ENABLE_ARMOUR)
 endif()
+
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(ENABLE_AKG)
+endif()
diff --git a/cmake/package.cmake b/cmake/package.cmake
index 1fb3227d1f9..6f5e3594797 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -291,7 +291,7 @@ install(
     COMPONENT mindspore
 )
 
-if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
     set (AKG_PATH ${BUILD_PATH}/mindspore/akg)
     file(REMOVE_RECURSE ${AKG_PATH}/_akg)
     file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
diff --git a/cmake/package_win.cmake b/cmake/package_win.cmake
index aaac79b921a..a2cfa701d40 100644
--- a/cmake/package_win.cmake
+++ b/cmake/package_win.cmake
@@ -187,20 +187,6 @@ install(
     COMPONENT mindspore
 )
 
-if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
-    set (AKG_PATH ${CMAKE_SOURCE_DIR}/build/mindspore/akg)
-    file(REMOVE_RECURSE ${AKG_PATH}/_akg)
-    file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
-    file(TOUCH ${AKG_PATH}/_akg/__init__.py)
-    install(DIRECTORY "${AKG_PATH}/akg" DESTINATION "${AKG_PATH}/_akg")
-    install(
-        DIRECTORY
-            ${AKG_PATH}/_akg
-        DESTINATION ${INSTALL_PY_DIR}/
-        COMPONENT mindspore
-    )
-endif()
-
 if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset)
     install(
         DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/dataset
diff --git a/mindspore/_extends/graph_kernel/model/graph_split.py b/mindspore/_extends/graph_kernel/model/graph_split.py
index 745b4382963..1b5c5e9983d 100644
--- a/mindspore/_extends/graph_kernel/model/graph_split.py
+++ b/mindspore/_extends/graph_kernel/model/graph_split.py
@@ -1180,11 +1180,131 @@ class GraphSplitAscend(GraphSplitByPattern):
             _fuse_once(fuse_func)
 
 
+class GraphSplitCpu(GraphSplitByPattern):
+    """Graph splitter"""
+    BROADCAST_FUSE_DEPTH = 20
+    REDUCE_FUSE_DEPTH = 20
+
+    def get_default_mode(self, op):
+        """Get default mode in CPU"""
+        pattern = PrimLib.iter_type(op)
+        return self.Area.MODE_BASIC if pattern == PrimLib.RESHAPE else self.Area.MODE_COMPOSITE
+
+    def pattern_fuse(self, fuse_func=None):
+        """fuse Areas by pattern"""
+        def _reshape(dom):
+            if dom.pattern != PrimLib.RESHAPE:
+                return None
+            min_area, forward_fuse = None, False
+            for a, _ in dom.out_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and dom.check_acyclic(a) and \
+                        (min_area is None or a.pattern < min_area.pattern):
+                    min_area = a
+            for a, _ in dom.in_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and a.check_acyclic(dom) and \
+                        len(dom.ops[0].inputs[0].to_ops) == 1 and not a.is_output and \
+                        (min_area is None or a.pattern < min_area.pattern):
+                    min_area, forward_fuse = a, True
+            return ([min_area], forward_fuse) if min_area else None
+
+        def _elemwise_depth(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or len(dom.in_relations) != 1:
+                return None
+            a, r = list(dom.in_relations.items())[0]
+            if a.pattern > PrimLib.BROADCAST or len(a.out_relations) != 1 or r != PrimLib.ELEMWISE or \
+                    a.dom_op().output.shape != dom.dom_op().output.shape:
+                return None
+            return [a], True
+
+        def _elemwise_width(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST):
+                return None
+            fused = []
+            for a, r in dom.in_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and r == PrimLib.ELEMWISE and a.check_acyclic(dom) and \
+                        a.dom_op().output.shape == dom.dom_op().output.shape:
+                    fused.append(a)
+            return fused, True
+
+        def _broadcast_pat_exclude(dom, a, r):
+            if a.pattern == PrimLib.REDUCE:
+                return dom.pattern > PrimLib.ELEMWISE or r > PrimLib.ELEMWISE
+            return a.pattern > PrimLib.REDUCE or r > PrimLib.BROADCAST
+
+        def _broadcast_depth(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or len(dom.out_relations) != 1 or \
+                    dom.is_output or len(dom.ops) > self.BROADCAST_FUSE_DEPTH:
+                return None
+            a, r = list(dom.out_relations.items())[0]
+            if _broadcast_pat_exclude(dom, a, r) or len(a.in_relations) != 1:
+                return None
+            return [a], False
+
+        def _broadcast_width(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or \
+                    dom.is_output or len(dom.ops) > self.BROADCAST_FUSE_DEPTH:
+                return None
+            fused = []
+            for a, r in dom.out_relations.items():
+                if _broadcast_pat_exclude(dom, a, r) or not dom.check_acyclic(a) or \
+                        (fused and fused[0].dom_op().output.shape != a.dom_op().output.shape):
+                    return None
+                fused.append(a)
+            return fused, False
+
+        def _reduce_pat_exclude(_, a, r):
+            if len(a.ops) > self.REDUCE_FUSE_DEPTH:
+                return True
+            return a.pattern > PrimLib.ELEMWISE or r > PrimLib.REDUCE or r == PrimLib.BROADCAST
+
+        def _reduce_depth(dom):
+            if dom.pattern != PrimLib.REDUCE or len(dom.in_relations) != 1:
+                return None
+            a, r = list(dom.in_relations.items())[0]
+            if _reduce_pat_exclude(dom, a, r) or len(a.out_relations) != 1:
+                return None
+            return [a], True
+
+        def _reduce_width(dom):
+            if dom.pattern != PrimLib.REDUCE:
+                return None
+            fused = []
+            for a, r in dom.in_relations.items():
+                if not _reduce_pat_exclude(dom, a, r) and a.check_acyclic(dom):
+                    fused.append(a)
+            return fused, True
+
+        def _fuse_loop():
+            changed = True
+            while changed:
+                changed = False
+                changed = self.fuse(_reshape) or changed
+                changed = self.fuse(_elemwise_depth) or changed
+                changed = self.fuse(_elemwise_width) or changed
+                changed = self.fuse(_reduce_depth) or changed
+                changed = self.fuse(_reduce_width) or changed
+                changed = self.fuse(_broadcast_depth) or changed
+                changed = self.fuse(_broadcast_width) or changed
+
+        def _fuse_once(fuse_func):
+            if fuse_func(_reshape) or fuse_func(_elemwise_depth) or fuse_func(_elemwise_width) or \
+                    fuse_func(_reduce_depth) or fuse_func(_reduce_width) or fuse_func(_broadcast_depth) or \
+                    fuse_func(_broadcast_width):
+                return
+
+        if fuse_func is None:
+            _fuse_loop()
+        else:
+            _fuse_once(fuse_func)
+
+
 def split(graph, target, flags):
     """Split graph"""
     result = None
     if target == "cuda":
         result = GraphSplitGpu(graph, flags).split()
-    else:
+    elif target == "aicore":
         result = GraphSplitAscend(graph, flags).split()
+    else:
+        result =
GraphSplitCpu(graph, flags).split() return result diff --git a/mindspore/_extends/graph_kernel/model/model_builder.py b/mindspore/_extends/graph_kernel/model/model_builder.py index e23efd54992..e36c1fc5fd5 100644 --- a/mindspore/_extends/graph_kernel/model/model_builder.py +++ b/mindspore/_extends/graph_kernel/model/model_builder.py @@ -132,7 +132,7 @@ class CompositeGraph: return dict() attr = {} for a in op['attr']: - if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'): + if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin', 'Argmax', 'Argmin'): attr['reduce_axis'] = a['value'] else: attr[a['name']] = a['value'] diff --git a/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py b/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py index d3f0bbf1641..096ee8f005f 100644 --- a/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py +++ b/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py @@ -33,7 +33,7 @@ def copy_json(pid_path, ppid_path): shutil.move(os.path.join(pid_path, json_file), ppid_path) -def _compile_akg_task_gpu(json_strs, attrs): +def _compile_akg_task_default(json_strs, attrs): """ compile func called in single process @@ -110,16 +110,14 @@ class AkgProcess: if self.argc == 0: raise ValueError("json must be not null") args = [(arg, attrs) for arg in self.args] - if self.platform == "GPU": - with Pool(processes=self.process_num) as pool: - res = pool.starmap_async(_compile_akg_task_gpu, args) - res.get(timeout=self.wait_time) - elif self.platform == "ASCEND": + if self.platform == "ASCEND": with Pool(processes=self.process_num) as pool: res = pool.starmap_async(_compile_akg_task_ascend, args) res.get(timeout=self.wait_time) else: - raise ValueError("The value of 'platform' must be 'GPU' or 'ASCEND'.") + with Pool(processes=self.process_num) as pool: + res = pool.starmap_async(_compile_akg_task_default, args) + res.get(timeout=self.wait_time) return True def accept_json(self, json): diff --git a/mindspore/_extends/remote/kernel_build_server_gpu.py b/mindspore/_extends/remote/kernel_build_server_akg.py similarity index 81% rename from mindspore/_extends/remote/kernel_build_server_gpu.py rename to mindspore/_extends/remote/kernel_build_server_akg.py index c951508f226..bd1ee1fd924 100644 --- a/mindspore/_extends/remote/kernel_build_server_gpu.py +++ b/mindspore/_extends/remote/kernel_build_server_akg.py @@ -12,22 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""kernel build server for gpu""" +"""kernel build server for akg kernels""" import sys import warnings from mindspore._extends.remote.kernel_build_server import Messager, get_logger, AkgBuilder -class GpuMessager(Messager): +class AkgMessager(Messager): ''' - GPU Messager + Default Messager for akg kernels. It works as a server, communicating with c++ client. 
''' def __init__(self, fdin, fdout): super().__init__(fdin, fdout) - get_logger().info("[TRACE] GPU Messager init...") - self.akg_builder = AkgBuilder("GPU") + get_logger().info("[TRACE] Akg Messager init...") + self.akg_builder = AkgBuilder("default") def handle(self): """ @@ -42,7 +42,7 @@ class GpuMessager(Messager): self.exit() def exit(self): - get_logger().info("[TRACE] GPU Messager Exit...") + get_logger().info("[TRACE] Akg Messager Exit...") exit() @@ -51,5 +51,5 @@ if __name__ == '__main__': if len(sys.argv) != 3: raise Exception('Incorrect argv: {}'.format(sys.argv)) get_logger().debug(f"[TRACE] argv: {str(sys.argv)}") - messager = GpuMessager(int(sys.argv[1]), int(sys.argv[2])) + messager = AkgMessager(int(sys.argv[1]), int(sys.argv[2])) messager.run() diff --git a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt index bfc8f66bae6..3ed92e71e51 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt +++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt @@ -13,12 +13,6 @@ if(ENABLE_D) file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "kernel_query.cc" "kernel_fusion.cc" - "akg/akg_kernel_build.cc" - "akg/ascend/*.cc" - "akg/akg_kernel_json_generator.cc" - "akg/akg_kernel_json_decoder.cc" - "akg/akg_kernel_attrs_process.cc" - "akg/akg_kernel_metadata.cc" "tbe/*.cc" "host/*.cc" "aicpu/*.cc" @@ -95,11 +89,6 @@ endif() if(ENABLE_GPU) file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cu" - "akg/akg_kernel_build.cc" - "akg/gpu/*.cc" - "akg/akg_kernel_json_generator.cc" - "akg/akg_kernel_json_decoder.cc" - "akg/akg_kernel_attrs_process.cc" ) file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc") @@ -122,7 +111,35 @@ if(ENABLE_GPU) # add_library(_mindspore_kernel_cuda_obj OBJECT ${CUDA_SRC_LIST}) endif() -set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST} +if(ENABLE_AKG AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux") + file(GLOB_RECURSE AKG_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/akg_kernel_build.cc" + "akg/akg_kernel_json_generator.cc" + "akg/akg_kernel_json_decoder.cc" + "akg/akg_kernel_attrs_process.cc" + ) + if(ENABLE_GPU) + file(GLOB_RECURSE AKG_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/gpu/*.cc" + ) + list(APPEND AKG_SRC_LIST ${AKG_GPU_SRC_LIST}) + endif() + if(ENABLE_D) + file(GLOB_RECURSE AKG_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/ascend/*.cc" + "akg/akg_kernel_metadata.cc" + ) + list(APPEND AKG_SRC_LIST ${AKG_D_SRC_LIST}) + endif() + if(ENABLE_CPU) + file(GLOB_RECURSE AKG_CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/cpu/*.cc" + ) + list(APPEND AKG_SRC_LIST ${AKG_CPU_SRC_LIST}) + endif() +endif() + +set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL) add_library(_mindspore_backend_kernel_compiler_obj OBJECT ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} - ${GPU_SRC_LIST} ${D_SRC_LIST} ${QUANTUM_SRC_LIST}) + ${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST} ${QUANTUM_SRC_LIST}) diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc index 33a352ecda4..2d3975e8d22 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc @@ -16,6 +16,7 @@ #include 
"backend/kernel_compiler/akg/akg_kernel_build.h" +#include #include #include #include diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h index 50c477234a6..0507b4aa143 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h @@ -17,8 +17,6 @@ #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ -#include - #include #include #include diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc new file mode 100644 index 00000000000..1b7d49c505b --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc @@ -0,0 +1,49 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h" +#include +#include +#include +#include +#include "backend/kernel_compiler/common_utils.h" +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h" +#include "utils/ms_utils.h" +#include "backend/session/anf_runtime_algorithm.h" + +namespace mindspore { +namespace kernel { +KernelPackPtr AkgCpuKernelBuilder::AkgSearchCache(const std::string &kernel_name) { + return SearchCache(kernel_name, kProcessorCpu); +} + +KernelPackPtr AkgCpuKernelBuilder::AkgInsertCache(const std::string &kernel_name) { + return InsertCache(kernel_name, kProcessorCpu); +} + +void AkgCpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack, + const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) { + auto kernel_mod_ptr = std::make_shared(kernel_pack); + kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); +} + +void AkgCpuKernelBuilder::AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) { + kernel::SaveJsonInfo(kernel_name, kernel_json, kernel::KernelMeta::GetInstance()->kernel_meta_path()); +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h new file mode 100644 index 00000000000..695f997c2cc --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
+#include
+#include "backend/kernel_compiler/akg/akg_kernel_build.h"
+#include "base/base.h"
+
+namespace mindspore {
+namespace kernel {
+class AkgCpuKernelBuilder : public AkgKernelBuilder {
+ public:
+  AkgCpuKernelBuilder() = default;
+  ~AkgCpuKernelBuilder() = default;
+
+  kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); }
+  KernelPackPtr AkgSearchCache(const std::string &kernel_name) override;
+  KernelPackPtr AkgInsertCache(const std::string &kernel_name) override;
+  void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator,
+                       const AnfNodePtr &anf_node) override;
+  void AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) override;
+};
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc
new file mode 100644
index 00000000000..911cd6992e6
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc
@@ -0,0 +1,143 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h" + +#include +#include +#include +#include +#include "nlohmann/json.hpp" +#include "backend/kernel_compiler/common_utils.h" +#include "common/thread_pool.h" +#include "utils/ms_utils.h" +#include "mindspore/ccsrc/debug/common.h" + +namespace mindspore { +namespace kernel { +namespace { +using AkgParallelLambda = int (*)(int task_id, int num_task, void *cdata); +int AkgLaunchFunc(AkgParallelLambda flambda, void *cdata, int num_task) { + size_t num_workers = + std::min(mindspore::common::ThreadPool::GetInstance().GetSyncRunThreadNum(), static_cast(num_task)); + std::vector tasks; + size_t thread_index = 0; + while (thread_index < num_workers) { + auto block = [&, thread_index]() { + flambda(thread_index, num_workers, cdata); + return mindspore::common::SUCCESS; + }; + tasks.emplace_back(block); + thread_index++; + } + mindspore::common::ThreadPool::GetInstance().SyncRun(tasks); + return 0; +} + +struct AkgCallBack { + void *parallel_launch_func; + void *(*malloc_func)(size_t); + void (*free_func)(void *); + + AkgCallBack() { + parallel_launch_func = reinterpret_cast(&AkgLaunchFunc); + malloc_func = &malloc; + free_func = &free; + } + ~AkgCallBack() = default; +}; +} // namespace +CpuKernelManagerPtr CpuKernelMod::kernelmanager_ = std::make_shared(); + +CpuKernelManager::~CpuKernelManager() { + for (auto &cpu_func_pair : cpu_func_map_) { + if (cpu_func_pair.second.second != nullptr) { + (void)dlclose(cpu_func_pair.second.second); + } + } +} + +void *CpuKernelManager::SearchFunc(const std::string &kernel_name) const { + auto iter = cpu_func_map_.find(kernel_name); + if (iter == cpu_func_map_.end()) { + return nullptr; + } else { + return iter->second.first; + } +} + +void *CpuKernelManager::SearchFuncWithSharedLock(const std::string &kernel_name) const { + std::shared_lock lock(mutex_); + return SearchFunc(kernel_name); +} + +void *CpuKernelManager::GetFunction(const std::string &kernel_name) { + if (auto func = SearchFuncWithSharedLock(kernel_name); func != nullptr) { + return func; + } + std::unique_lock lock(mutex_); + // Search cache again between setting unique lock and calling "dlopen", to make sure that + // only one thread can call "dlopen" and insert handle to the cache for a new kernel_name. + // To avoid that several nodes (with the same kernel_name) open the same "so" by dlopen, + // but only cache it once, then the "dlclose" will be called only once, causing resource leak. + if (auto func = SearchFunc(kernel_name); func != nullptr) { + return func; + } + std::string fn; + auto it = kernel_name.rfind("_kernel"); + if (it < kernel_name.size()) { + fn = kernel_name.substr(0, it); + } else { + fn = kernel_name; + } + std::string fn_so = kCpuKernelMeta + fn + ".so"; + auto handle = dlopen(fn_so.c_str(), RTLD_LAZY | RTLD_LOCAL); + if (handle == nullptr) { + MS_LOG(ERROR) << "Load " << fn_so << " failed. 
kernel: " << kernel_name; + return nullptr; + } + auto launch_func = dlsym(handle, kernel_name.c_str()); + if (launch_func == nullptr) { + MS_LOG(ERROR) << "Undefined symbol " << kernel_name << " in " << fn_so; + return nullptr; + } + cpu_func_map_[kernel_name] = std::make_pair(launch_func, handle); + return launch_func; +} + +bool CpuKernelMod::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) { + auto js = nlohmann::json::parse(kernel_pack_->GetJson()->contents, + kernel_pack_->GetJson()->contents + kernel_pack_->GetJson()->len); + std::string kernel_name = js["kernelName"]; + auto launch_func = kernelmanager_->GetFunction(kernel_name); + if (launch_func == nullptr) { + MS_LOG(ERROR) << "GetFunction failed. kernel: " << kernel_name; + return false; + } + std::vector runtimeargs; + (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs), + [](const AddressPtr &input) -> void * { return input->addr; }); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs), + [](const AddressPtr &output) -> void * { return output->addr; }); + AkgCallBack akg_callback; + runtimeargs.emplace_back(reinterpret_cast(&akg_callback)); + using AkgCpuKernelFunction = void (*)(void *); + reinterpret_cast(launch_func)(reinterpret_cast(runtimeargs.data())); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h new file mode 100644 index 00000000000..71cf82405f9 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h @@ -0,0 +1,73 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_ +#include +#include +#include +#include +#include +#include +#include +#include "backend/kernel_compiler/kernel.h" + +namespace mindspore { +namespace kernel { +class CpuKernelManager { + public: + CpuKernelManager() = default; + ~CpuKernelManager(); + + void *GetFunction(const std::string &kernel_name); + + private: + void *SearchFunc(const std::string &kernel_name) const; + void *SearchFuncWithSharedLock(const std::string &kernel_name) const; + + // cache the kernel function: kernel_name -> {kernel_func, so_handle} + std::unordered_map> cpu_func_map_; + mutable std::shared_mutex mutex_; +}; +using CpuKernelManagerPtr = std::shared_ptr; + +class CpuKernelMod : public KernelMod { + public: + explicit CpuKernelMod(const KernelPackPtr &kp) : kernel_pack_(kp) {} + ~CpuKernelMod() = default; + + void SetInputSizeList(const std::vector &size_list) { input_size_list_ = size_list; } + void SetOutputSizeList(const std::vector &size_list) { output_size_list_ = size_list; } + void SetWorkspaceSizeList(const std::vector &size_list) { workspace_size_list_ = size_list; } + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + bool Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) override; + + static CpuKernelManagerPtr kernelmanager_; + + private: + KernelPackPtr kernel_pack_; + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; // workspace is not used in cpu kernel. 
+}; + +using CpuKernelModPtr = std::shared_ptr; +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h index 5453aea7157..50ad8f1970c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h @@ -27,7 +27,7 @@ class AkgGpuKernelBuilder : public AkgKernelBuilder { AkgGpuKernelBuilder() = default; ~AkgGpuKernelBuilder() = default; - kernel::KernelBuildClient *GetClient() override { return &(kernel::GpuKernelBuildClient::Instance()); } + kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); } KernelPackPtr AkgSearchCache(const std::string &kernel_name) override; KernelPackPtr AkgInsertCache(const std::string &kernel_name) override; void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator, diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc index 426889507c5..2eaa33f40d4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc @@ -157,16 +157,21 @@ FusionType GetFusionTypeByName(const std::string &name) { } void KernelMeta::Initialize() { - kernel_meta_path_ = std::string(kGpuKernelMeta) + "/"; + if (GetStrProcessorFromContext() == kProcessorCpu) { + kernel_meta_path_ = std::string(kCpuKernelMeta); + } else { + kernel_meta_path_ = std::string(kGpuKernelMeta) + "/"; #if defined(_WIN32) || defined(_WIN64) - auto ret = mkdir(kernel_meta_path_.c_str()); + auto ret = mkdir(kernel_meta_path_.c_str()); #else - auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU); + auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU); #endif - if (ret != 0) { - MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later"; + if (ret != 0) { + MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later"; + } } + initialized_ = true; } @@ -238,6 +243,8 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro std::string kernel_json; if (processor == kProcessorAiCore || processor == kProcessorAiCpu) { kernel_json = kCceKernelMeta; + } else if (processor == kProcessorCpu) { + kernel_json = kCpuKernelMeta; } else { kernel_json = bin_map->kernel_meta_path(); } @@ -872,6 +879,8 @@ Processor GetProcessorFromContext() { processor = kernel::Processor::CUDA; } else if (device_info == kAscendDevice) { processor = kernel::Processor::AICORE; + } else if (device_info == kCPUDevice) { + processor = kernel::Processor::CPU; } return processor; } @@ -883,6 +892,8 @@ std::string GetStrProcessorFromContext() { str_processor = kernel::kProcessorCuda; } else if (processor == kernel::Processor::AICORE) { str_processor = kernel::kProcessorAiCore; + } else if (processor == kernel::Processor::CPU) { + str_processor = kernel::kProcessorCpu; } return str_processor; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h index 7eee55693b6..e29e9040007 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h @@ -34,10 +34,12 @@ namespace mindspore { 
namespace kernel { constexpr auto kCceKernelMeta = "./kernel_meta/"; +constexpr auto kCpuKernelMeta = "./kernel_meta/"; constexpr auto kGpuKernelMeta = "./cuda_meta"; constexpr auto kProcessorAiCore = "aicore"; constexpr auto kProcessorAiCpu = "aicpu"; constexpr auto kProcessorCuda = "cuda"; +constexpr auto kProcessorCpu = "cpu"; constexpr auto kProcessorUnknown = "unknown"; constexpr auto kJsonSuffix = ".json"; constexpr auto kInfoSuffix = ".info"; diff --git a/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc b/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc index 0319ec04995..63a211fb87c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -99,6 +99,14 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string & (void)kernel_json.seekg(0, std::ios::beg); (void)kernel_json.read(json_->contents, SizeToLong(json_->len)); + if (processor == kProcessorCpu) { + std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".so"; + if (!CheckHash(json_f, bin_f, js)) { + return false; + } + return true; + } + if (processor == kProcessorCuda) { std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".ptx"; std::ifstream kernelbin(bin_f); diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel.h b/mindspore/ccsrc/backend/kernel_compiler/kernel.h index e727a4884e4..50c8753f2e5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/kernel.h @@ -107,6 +107,7 @@ enum Processor { AICORE = 0, AICPU, CUDA, + CPU, }; struct FlexArray { diff --git a/mindspore/ccsrc/backend/optimizer/CMakeLists.txt b/mindspore/ccsrc/backend/optimizer/CMakeLists.txt index 52a365ade73..1f43c2099fb 100644 --- a/mindspore/ccsrc/backend/optimizer/CMakeLists.txt +++ b/mindspore/ccsrc/backend/optimizer/CMakeLists.txt @@ -13,8 +13,6 @@ endif() if(ENABLE_D OR ENABLE_ACL) file(GLOB_RECURSE _D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend/*.cc" - "graph_kernel/*.cc" - "graph_kernel/model/*.cc" ) list(APPEND _PREACTIVATE_SRC_LIST ${_D_SRC_LIST}) endif() @@ -22,8 +20,6 @@ endif() if(ENABLE_GPU) file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc" - "graph_kernel/*.cc" - "graph_kernel/model/*.cc" ) list(APPEND _PREACTIVATE_SRC_LIST ${_GPU_SRC_LIST}) endif() @@ -43,6 +39,13 @@ if(ENABLE_CPU) list(APPEND _PREACTIVATE_SRC_LIST ${_CPU_SRC_LIST}) endif() +if(ENABLE_AKG AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux") + file(GLOB_RECURSE _GK_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "graph_kernel/*.cc" + ) + list(APPEND _PREACTIVATE_SRC_LIST ${_GK_SRC_LIST}) +endif() + set_property(SOURCE ${_PREACTIVATE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PRE_ACT) add_library(_mindspore_backend_optimizer_obj OBJECT ${_PREACTIVATE_SRC_LIST}) diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc index 232875e9b93..0ffb2172ec0 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc @@ -38,6 +38,8 @@ #include 
"runtime/device/ascend/kernel_select_ascend.h" #elif ENABLE_GPU #include "runtime/device/gpu/kernel_info_setter.h" +#elif ENABLE_CPU +#include "runtime/device/cpu/kernel_select_cpu.h" #endif namespace mindspore::graphkernel { @@ -608,6 +610,9 @@ void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) { #elif ENABLE_GPU cnode->set_kernel_info(std::make_shared()); device::gpu::SetKernelInfo(cnode, kernel_type); +#elif ENABLE_CPU + cnode->set_kernel_info(std::make_shared()); + device::cpu::SetKernelInfo(cnode); #endif } diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc index 7c5611371d3..f82adb5715e 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc @@ -121,7 +121,7 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const { pm->AddPass(std::make_shared(), OptLevel_1, is_gpu); // Universal arithmetic simplify - pm->AddPass(std::make_shared(), OptLevel_2, is_gpu); + pm->AddPass(std::make_shared(), OptLevel_2, is_gpu || is_cpu); // Common subexpression elimination pm->AddPass(std::make_shared(), OptLevel_2); @@ -158,7 +158,7 @@ PassManagerPtr GraphKernelOptimizer::Split() const { PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const { auto pm = std::make_shared(4, "highlevelopt2"); // Enable atomic add - pm->AddPass(std::make_shared(), OptLevel_2); + pm->AddPass(std::make_shared(), OptLevel_2, is_gpu || is_ascend); // Enable atomic add for stitch nodes. auto level = GetPassLevelByFlag(context::GraphKernelFlags::GetInstance().enable_stitch_fusion); @@ -170,8 +170,8 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const { pm->AddPass(std::make_shared(), level_low_precision, is_ascend); // Enable tsa and uss - pm->AddPass(std::make_shared(), OptLevel_1); - pm->AddPass(std::make_shared(), OptLevel_1); + pm->AddPass(std::make_shared(), OptLevel_1, is_gpu); + pm->AddPass(std::make_shared(), OptLevel_1, is_gpu); return pm; } @@ -204,6 +204,7 @@ void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) { MS_EXCEPTION_IF_NULL(context_ptr); is_gpu = (context_ptr->get_param(MS_CTX_DEVICE_TARGET) == kGPUDevice); is_ascend = (context_ptr->get_param(MS_CTX_DEVICE_TARGET) == kAscendDevice); + is_cpu = (context_ptr->get_param(MS_CTX_DEVICE_TARGET) == kCPUDevice); auto optimizer = std::make_shared("graph_kernel_optimizer"); optimizer->AddPassManager(PreProcess()); diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h index 59927c17e90..291574c70c2 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h @@ -46,6 +46,7 @@ class GraphKernelOptimizer { bool is_gpu{false}; bool is_ascend{false}; + bool is_cpu{false}; }; void GraphKernelOptimize(const KernelGraphPtr &kernel_graph); diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc index fe95ad3be0f..e2bfd76b9b8 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.cc +++ b/mindspore/ccsrc/backend/session/cpu_session.cc @@ -21,14 +21,17 @@ #include "ir/anf.h" #include "utils/ms_utils.h" #include "utils/trace_base.h" +#include "utils/context/graph_kernel_flags.h" #include "backend/session/anf_runtime_algorithm.h" 
#include "runtime/device/kernel_runtime.h" +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" #include "runtime/device/cpu/kernel_select_cpu.h" #include "backend/optimizer/common/optimizer.h" #include "backend/optimizer/common/pass_manager.h" #include "backend/optimizer/cpu/insert_cast_cpu.h" #include "backend/optimizer/cpu/insert_format_transform_op.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" #include "backend/optimizer/pass/replace_node_by_proxy.h" #include "backend/optimizer/pass/erase_visit_attr.h" #include "debug/anf_ir_dump.h" @@ -102,6 +105,16 @@ void CPUSession::Optimize(const std::shared_ptr &kernel_graph) { kernel_graph->SetExecOrderByDefault(); } +void CPUSession::GraphKernelOptimize(const std::shared_ptr &kernel_graph) { +#ifdef ENABLE_AKG + if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) { + return; + } + graphkernel::GraphKernelOptimize(kernel_graph); + kernel_graph->SetExecOrderByDefault(); +#endif +} + GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { auto graph_id = graph_sum_; auto graph = ConstructKernelGraph(lst, outputs); @@ -112,6 +125,7 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr MS_LOG(INFO) << "Set kernel info end"; Optimize(graph); FinalOptimize(graph); + GraphKernelOptimize(graph); MS_LOG(INFO) << "Build kernel"; BuildKernel(graph.get()); // Remove reorder after PS feature finish adapting push/pull in auto_monad. @@ -352,10 +366,20 @@ void KernelNotSupportException(const AnfNodePtr &kernel_node) { void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto &kernel_nodes = kernel_graph->execution_order(); + kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); + MS_EXCEPTION_IF_NULL(bin_map); + std::vector akg_nodes; for (const auto &kernel_node : kernel_nodes) { MS_EXCEPTION_IF_NULL(kernel_node); std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); MS_LOG(INFO) << "Cpu building operator[" << kernel_name << "]."; + if (session::AnfRuntimeAlgorithm::GetKernelType(kernel_node) == KernelType::AKG_KERNEL) { + if (!bin_map->initialized()) { + bin_map->Initialize(); + } + akg_nodes.push_back(kernel_node); + continue; + } std::shared_ptr cpu_kernel = kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node); if (cpu_kernel == nullptr) { @@ -369,6 +393,10 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get()); MS_LOG(INFO) << "Cpu build success operator[" << kernel_name << "]."; } +#ifdef ENABLE_AKG + kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder; + (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes); +#endif } } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/backend/session/cpu_session.h b/mindspore/ccsrc/backend/session/cpu_session.h index 90f6066407a..e969d3ab75e 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.h +++ b/mindspore/ccsrc/backend/session/cpu_session.h @@ -42,6 +42,7 @@ class CPUSession : public SessionBasic { VectorRef *const outputs) override; void ExecuteGraph(const std::shared_ptr &kernel_graph) override; ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) override; + void GraphKernelOptimize(const std::shared_ptr &kernel_graph); void Optimize(const std::shared_ptr &kernel_graph); KernelGraphPtr BuildOpImpl(const 
OpRunInfo &op_run_info, const GraphInfo &graph_info, const std::vector &input_tensors, diff --git a/mindspore/ccsrc/backend/session/kernel_build_client.h b/mindspore/ccsrc/backend/session/kernel_build_client.h index 864de7ba0f3..9c4a15c6248 100644 --- a/mindspore/ccsrc/backend/session/kernel_build_client.h +++ b/mindspore/ccsrc/backend/session/kernel_build_client.h @@ -256,7 +256,7 @@ class AscendKernelBuildClient : public KernelBuildClient { ~AscendKernelBuildClient() override { Close(); } }; -class GpuKernelBuildClient : public KernelBuildClient { +class AkgKernelBuildClient : public KernelBuildClient { public: // Server configure constexpr inline static auto kGetPathScript = @@ -264,15 +264,15 @@ class GpuKernelBuildClient : public KernelBuildClient { "\"" "import pkgutil;" "path = pkgutil" - ".get_loader(\\\"mindspore._extends.remote.kernel_build_server_gpu\\\")" // Server module name + ".get_loader(\\\"mindspore._extends.remote.kernel_build_server_akg\\\")" // Server module name ".get_filename();" "print('[~]' + path)" "\""; - constexpr inline static auto kServerScript = "kernel_build_server_gpu.py"; + constexpr inline static auto kServerScript = "kernel_build_server_akg.py"; - static GpuKernelBuildClient &Instance() { - static GpuKernelBuildClient instance; + static AkgKernelBuildClient &Instance() { + static AkgKernelBuildClient instance; return instance; } @@ -283,15 +283,15 @@ class GpuKernelBuildClient : public KernelBuildClient { return GetScriptFilePath(env, kGetPathScript, kServerScript); } - GpuKernelBuildClient(const GpuKernelBuildClient &) = delete; - GpuKernelBuildClient &operator=(const GpuKernelBuildClient &) = delete; + AkgKernelBuildClient(const AkgKernelBuildClient &) = delete; + AkgKernelBuildClient &operator=(const AkgKernelBuildClient &) = delete; - GpuKernelBuildClient(GpuKernelBuildClient &&) = delete; - GpuKernelBuildClient &operator=(GpuKernelBuildClient &&) = delete; + AkgKernelBuildClient(AkgKernelBuildClient &&) = delete; + AkgKernelBuildClient &operator=(AkgKernelBuildClient &&) = delete; private: - GpuKernelBuildClient() { Open(); } - ~GpuKernelBuildClient() override { Close(); } + AkgKernelBuildClient() { Open(); } + ~AkgKernelBuildClient() override { Close(); } }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc index a749fb01974..458b42d7104 100644 --- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc @@ -18,10 +18,12 @@ #include #include "runtime/device/cpu/cpu_device_address.h" #include "runtime/device/cpu/cpu_memory_manager.h" +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" #include "backend/kernel_compiler/kernel_build_info.h" #include "runtime/device/cpu/kernel_select_cpu.h" #include "utils/trace_base.h" +#include "utils/context/graph_kernel_flags.h" #include "backend/optimizer/common/optimizer.h" #include "backend/optimizer/common/pass_manager.h" #include "backend/optimizer/common/common_backend_optimization.h" @@ -29,6 +31,8 @@ #include "backend/optimizer/cpu/insert_format_transform_op.h" #include "backend/optimizer/pass/replace_node_by_proxy.h" #include "backend/optimizer/pass/erase_visit_attr.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" +#include "backend/session/anf_runtime_algorithm.h" #include "profiler/device/cpu/cpu_profiling.h" 
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
 #endif
@@ -105,6 +109,14 @@ void CPUDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
 
   // Run final optimization.
   opt::CommonFinalOptimization(graph);
+
+#ifdef ENABLE_AKG
+  // Run graph kernel fusion optimization
+  if (context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
+    graphkernel::GraphKernelOptimize(graph);
+    graph->SetExecOrderByDefault();
+  }
+#endif
 }
 
 void CPUDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
@@ -165,11 +177,21 @@ void CPUDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const
 }
 
 void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
+  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
+  MS_EXCEPTION_IF_NULL(bin_map);
+  std::vector<AnfNodePtr> akg_nodes;
   for (const auto &node : nodes) {
     MS_EXCEPTION_IF_NULL(node);
     if (AnfAlgo::IsControlOpExecInBackend(node)) {
      continue;
     }
+    if (session::AnfRuntimeAlgorithm::GetKernelType(node) == KernelType::AKG_KERNEL) {
+      if (!bin_map->initialized()) {
+        bin_map->Initialize();
+      }
+      akg_nodes.push_back(node);
+      continue;
+    }
     std::string kernel_name = AnfAlgo::GetCNodeName(node);
     std::shared_ptr<kernel::CPUKernel> cpu_kernel = kernel::CPUKernelFactory::GetInstance().Create(kernel_name, node);
     if (!cpu_kernel) {
@@ -179,6 +201,10 @@ void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
     cpu_kernel->Init(node);
     AnfAlgo::SetKernelMod(cpu_kernel, node.get());
   }
+#ifdef ENABLE_AKG
+  kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
+  (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes);
+#endif
 }
 
 void CPUDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const {
@@ -196,12 +222,12 @@ bool CPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vectorfullname_with_scope();
   auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
   MS_EXCEPTION_IF_NULL(kernel_mod);
-  auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
-  MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
   // Some CPU kernels can't initialize kernel and launch kernel in different thread, so reinitialize the kernels before
   // launch.
   if (kOpNotSupportMultiThreadExecList.find(AnfAlgo::GetCNodeName(kernel)) != kOpNotSupportMultiThreadExecList.end()) {
+    auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
+    MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
     cpu_kernel_mod->InitKernel(kernel);
   }
 #ifndef ENABLE_SECURITY
diff --git a/mindspore/ccsrc/utils/context/graph_kernel_flags.cc b/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
index 438b606169a..6555df521f0 100644
--- a/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
+++ b/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
@@ -172,7 +172,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
   FlagRegister reg(flag_map);
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
+  bool is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
 
   // Set opt_level first, some flags' default value depends on it.
   // Default optimization level is level 2 when enable graphkernel
@@ -192,7 +192,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
 
   // Integer flags
   reg.AddFlag("online_tuning", &online_tuning);
-  reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_gpu ? OpLevel_MAX : OpLevel_0);
+  reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_ascend ? 
OpLevel_0 : OpLevel_MAX); // String flags reg.AddFlag("repository_path", &repository_path); diff --git a/mindspore/context.py b/mindspore/context.py index 33afbd754a1..33ff95af303 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -489,8 +489,8 @@ def _check_target_specific_cfgs(device, arg_key): device_cfgs = { 'enable_dump': ['Ascend'], 'save_dump_path': ['Ascend'], - 'enable_graph_kernel': ['Ascend', 'GPU'], - 'graph_kernel_flags': ['Ascend', 'GPU'], + 'enable_graph_kernel': ['Ascend', 'GPU', 'CPU'], + 'graph_kernel_flags': ['Ascend', 'GPU', 'CPU'], 'enable_reduce_precision': ['Ascend'], 'enable_profiling': ['Ascend'], 'profiling_options': ['Ascend'], diff --git a/scripts/build/build_mindspore.sh b/scripts/build/build_mindspore.sh index f08980dffb4..ecee047a97e 100755 --- a/scripts/build/build_mindspore.sh +++ b/scripts/build/build_mindspore.sh @@ -79,8 +79,11 @@ build_mindspore() if [[ "X$USE_GLOG" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DUSE_GLOG=ON" fi - if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then + if [[ "X$ENABLE_AKG" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_AKG=ON" + if [[ "X$ENABLE_CPU" = "Xon" && "X$ENABLE_D" != "Xon" && "X$ENABLE_GPU" != "Xon" ]]; then + CMAKE_ARGS="${CMAKE_ARGS} -DUSE_LLVM=ON" + fi fi if [[ "X$ENABLE_ACL" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ACL=ON" @@ -107,4 +110,4 @@ build_mindspore() fi cmake --build . --target package ${CMAKE_VERBOSE} -j$THREAD_NUM echo "success building mindspore project!" -} \ No newline at end of file +} diff --git a/scripts/build/default_options.sh b/scripts/build/default_options.sh index 2e12c6592e2..f2bfb38b794 100755 --- a/scripts/build/default_options.sh +++ b/scripts/build/default_options.sh @@ -44,7 +44,7 @@ init_default_options() export LITE_PLATFORM="" export LITE_ENABLE_AAR="off" export USE_GLOG="on" - export ENABLE_AKG="on" + export ENABLE_AKG="off" export ENABLE_ACL="off" export ENABLE_D="off" export ENABLE_DEBUGGER="on" @@ -63,4 +63,4 @@ init_default_options() export USER_ENABLE_DUMP_IR=false export USER_ENABLE_DEBUGGER=false export ENABLE_SYM_FILE="off" -} \ No newline at end of file +} diff --git a/scripts/build/parse_device.sh b/scripts/build/parse_device.sh index f3d233f458f..327cc186584 100755 --- a/scripts/build/parse_device.sh +++ b/scripts/build/parse_device.sh @@ -40,6 +40,7 @@ parse_device() exit 1 fi export CUDA_VERSION="$DEVICE_VERSION" + export ENABLE_AKG="on" elif [[ "X$DEVICE" == "Xd" || "X$DEVICE" == "Xascend" ]]; then # version default 910 if [[ "X$DEVICE_VERSION" == "X" ]]; then @@ -54,6 +55,7 @@ parse_device() export ENABLE_ACL="on" ENABLE_CPU="on" export ENABLE_MPI="on" + export ENABLE_AKG="on" else echo "Invalid value ${DEVICE_VERSION} for option -V" usage @@ -68,4 +70,4 @@ parse_device() usage exit 1 fi -} \ No newline at end of file +} diff --git a/scripts/build/usage.sh b/scripts/build/usage.sh index c802b8f1401..e50eab2aaca 100755 --- a/scripts/build/usage.sh +++ b/scripts/build/usage.sh @@ -21,7 +21,7 @@ usage() echo "Usage:" echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t ut|st] [-g on|off] [-h] [-b ge] [-m infer|train] \\" echo " [-a on|off] [-p on|off] [-i] [-R] [-D on|off] [-j[n]] [-e gpu|ascend|cpu] \\" - echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\" + echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K on|off] \\" echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\" echo " [-A 
on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\" echo " [-L Tensor-RT path] [-y on|off] \\" @@ -52,7 +52,7 @@ usage() echo " -V Specify the device version, if -e gpu, default CUDA 10.1, if -e ascend, default Ascend 910" echo " -I Enable compiling mindspore lite for arm64, arm32 or x86_64, default disable mindspore lite compilation" echo " -A Enable compiling mindspore lite aar package, option: on/off, default: off" - echo " -K Compile with AKG, default on" + echo " -K Compile with AKG, default on if -e gpu or -e ascend, else default off" echo " -B Enable debugger, default on" echo " -E Enable IBVERBS for parameter server, default off" echo " -l Compile with python dependency, default on" @@ -62,4 +62,4 @@ usage() echo " -H Enable hidden" echo " -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking" echo " -y Compile the symbol table switch and save the symbol table to the directory output" -} \ No newline at end of file +}
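
Usage note: with the context.py hunk above, 'enable_graph_kernel' and 'graph_kernel_flags' are accepted when device_target is "CPU", so the feature can be switched on from the Python API. A minimal sketch, assuming a Linux build compiled with AKG (e.g. `bash build.sh -e cpu -K on`); the tiny network is only an illustrative elementwise-plus-reduce pattern of the kind GraphSplitCpu fuses:

    import numpy as np
    from mindspore import context, nn, ops, Tensor

    # Graph kernel fusion only takes effect in graph mode.
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU",
                        enable_graph_kernel=True)

    class FusableNet(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reduce_sum = ops.ReduceSum()

        def construct(self, x, y):
            # x * y + y is elementwise; the following ReduceSum gives the
            # elementwise+reduce pattern the CPU splitter can fuse.
            return self.reduce_sum(x * y + y, (1,))

    net = FusableNet()
    out = net(Tensor(np.ones((4, 8), np.float32)),
              Tensor(np.ones((4, 8), np.float32)))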
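Note on kernel loading: CpuKernelManager::GetFunction maps a kernel name to a shared object under kCpuKernelMeta ("./kernel_meta/"), dropping everything from the last "_kernel" substring onward before appending ".so". A Python mirror of that name-to-path rule, for intuition only (the helper name is illustrative, not part of the patch):

    def kernel_so_path(kernel_name, meta_dir="./kernel_meta/"):
        # Mirror of GetFunction: strip from the last "_kernel" onward,
        # then map the remaining stem to its shared object.
        idx = kernel_name.rfind("_kernel")
        stem = kernel_name[:idx] if idx != -1 else kernel_name
        return meta_dir + stem + ".so"

The symbol looked up via dlsym is still the full kernel_name, so one .so can carry several entry points.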
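Note on the launch ABI: CpuKernelMod::Launch hands the dlopen'ed kernel a single flat pointer array — input addresses, then output addresses, then a trailing AkgCallBack whose first slot is AkgLaunchFunc, so the generated code can call back to parallelize its outermost loop. A rough Python model of AkgLaunchFunc's work partitioning (the function name and fixed pool size are assumptions for illustration, not part of the patch):

    from multiprocessing.pool import ThreadPool

    def akg_launch(flambda, cdata, num_task, pool_threads=8):
        # Like AkgLaunchFunc: cap workers at min(pool size, task count),
        # then hand every worker its index and the total worker count.
        num_workers = min(pool_threads, num_task)
        with ThreadPool(num_workers) as pool:
            pool.starmap(flambda,
                         [(i, num_workers, cdata) for i in range(num_workers)])
        return 0

    # The generated kernel body receives (task_id, num_task, cdata) and
    # processes its slice, e.g.: akg_launch(kernel_body, payload, 16)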