diff --git a/CMakeLists.txt b/CMakeLists.txt
index 098b8af2383..3dd84ca95d0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,7 +47,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(PYBIND11_CPP_STANDARD -std=c++17)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPTION_CXX_FLAGS}")
 
-if(ENABLE_AKG AND (ENABLE_D OR ENABLE_GPU))
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_subdirectory("${CMAKE_SOURCE_DIR}/akg")
 endif()
 
diff --git a/build.sh b/build.sh
index af83835b99d..5fa6113cc75 100755
--- a/build.sh
+++ b/build.sh
@@ -45,7 +45,7 @@ update_submodule()
   cd "${BASEPATH}/graphengine"
   git submodule update --init metadef
   cd "${BASEPATH}"
-  if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then
+  if [[ "X$ENABLE_AKG" = "Xon" ]]; then
     git submodule update --init --recursive akg
   fi
 }
@@ -57,7 +57,6 @@ build_exit()
     exit 1
 }
 
-
 make_clean()
 {
   echo "enable make clean"
diff --git a/cmake/options.cmake b/cmake/options.cmake
index 59d5861c5ed..cfdf946b0ff 100644
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@@ -151,3 +151,7 @@ endif()
 if(ENABLE_CPU AND NOT WIN32)
     add_compile_definitions(ENABLE_ARMOUR)
 endif()
+
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(ENABLE_AKG)
+endif()
diff --git a/cmake/package.cmake b/cmake/package.cmake
index 1fb3227d1f9..6f5e3594797 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -291,7 +291,7 @@ install(
     COMPONENT mindspore
 )
 
-if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
+if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
     set (AKG_PATH ${BUILD_PATH}/mindspore/akg)
     file(REMOVE_RECURSE ${AKG_PATH}/_akg)
     file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
diff --git a/cmake/package_win.cmake b/cmake/package_win.cmake
index aaac79b921a..a2cfa701d40 100644
--- a/cmake/package_win.cmake
+++ b/cmake/package_win.cmake
@@ -187,20 +187,6 @@ install(
     COMPONENT mindspore
 )
 
-if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
-    set (AKG_PATH ${CMAKE_SOURCE_DIR}/build/mindspore/akg)
-    file(REMOVE_RECURSE ${AKG_PATH}/_akg)
-    file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
-    file(TOUCH ${AKG_PATH}/_akg/__init__.py)
-    install(DIRECTORY "${AKG_PATH}/akg" DESTINATION "${AKG_PATH}/_akg")
-    install(
-        DIRECTORY
-            ${AKG_PATH}/_akg
-        DESTINATION ${INSTALL_PY_DIR}/
-        COMPONENT mindspore
-    )
-endif()
-
 if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset)
     install(
         DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/dataset
diff --git a/mindspore/_extends/graph_kernel/model/graph_split.py b/mindspore/_extends/graph_kernel/model/graph_split.py
index 745b4382963..1b5c5e9983d 100644
--- a/mindspore/_extends/graph_kernel/model/graph_split.py
+++ b/mindspore/_extends/graph_kernel/model/graph_split.py
@@ -1180,11 +1180,131 @@ class GraphSplitAscend(GraphSplitByPattern):
             _fuse_once(fuse_func)
 
 
+class GraphSplitCpu(GraphSplitByPattern):
+    """Graph splitter"""
+    BROADCAST_FUSE_DEPTH = 20
+    REDUCE_FUSE_DEPTH = 20
+
+    def get_default_mode(self, op):
+        """Get default mode in CPU"""
+        pattern = PrimLib.iter_type(op)
+        return self.Area.MODE_BASIC if pattern == PrimLib.RESHAPE else self.Area.MODE_COMPOSITE
+
+    def pattern_fuse(self, fuse_func=None):
+        """fuse Areas by pattern"""
+        def _reshape(dom):
+            if dom.pattern != PrimLib.RESHAPE:
+                return None
+            min_area, forward_fuse = None, False
+            for a, _ in dom.out_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and dom.check_acyclic(a) and \
+                        (min_area is None or a.pattern < min_area.pattern):
+                    min_area = a
+            for a, _ in dom.in_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and a.check_acyclic(dom) and \
+                        len(dom.ops[0].inputs[0].to_ops) == 1 and not a.is_output and \
+                        (min_area is None or a.pattern < min_area.pattern):
+                    min_area, forward_fuse = a, True
+            return ([min_area], forward_fuse) if min_area else None
+
+        def _elemwise_depth(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or len(dom.in_relations) != 1:
+                return None
+            a, r = list(dom.in_relations.items())[0]
+            if a.pattern > PrimLib.BROADCAST or len(a.out_relations) != 1 or r != PrimLib.ELEMWISE or \
+                    a.dom_op().output.shape != dom.dom_op().output.shape:
+                return None
+            return [a], True
+
+        def _elemwise_width(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST):
+                return None
+            fused = []
+            for a, r in dom.in_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and r == PrimLib.ELEMWISE and a.check_acyclic(dom) and \
+                        a.dom_op().output.shape == dom.dom_op().output.shape:
+                    fused.append(a)
+            return fused, True
+
+        def _broadcast_pat_exclude(dom, a, r):
+            if a.pattern == PrimLib.REDUCE:
+                return dom.pattern > PrimLib.ELEMWISE or r > PrimLib.ELEMWISE
+            return a.pattern > PrimLib.REDUCE or r > PrimLib.BROADCAST
+
+        def _broadcast_depth(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or len(dom.out_relations) != 1 or \
+                    dom.is_output or len(dom.ops) > self.BROADCAST_FUSE_DEPTH:
+                return None
+            a, r = list(dom.out_relations.items())[0]
+            if _broadcast_pat_exclude(dom, a, r) or len(a.in_relations) != 1:
+                return None
+            return [a], False
+
+        def _broadcast_width(dom):
+            if dom.pattern not in (PrimLib.ELEMWISE, PrimLib.BROADCAST) or \
+                    dom.is_output or len(dom.ops) > self.BROADCAST_FUSE_DEPTH:
+                return None
+            fused = []
+            for a, r in dom.out_relations.items():
+                if _broadcast_pat_exclude(dom, a, r) or not dom.check_acyclic(a) or \
+                        (fused and fused[0].dom_op().output.shape != a.dom_op().output.shape):
+                    return None
+                fused.append(a)
+            return fused, False
+
+        def _reduce_pat_exclude(_, a, r):
+            if len(a.ops) > self.REDUCE_FUSE_DEPTH:
+                return True
+            return a.pattern > PrimLib.ELEMWISE or r > PrimLib.REDUCE or r == PrimLib.BROADCAST
+
+        def _reduce_depth(dom):
+            if dom.pattern != PrimLib.REDUCE or len(dom.in_relations) != 1:
+                return None
+            a, r = list(dom.in_relations.items())[0]
+            if _reduce_pat_exclude(dom, a, r) or len(a.out_relations) != 1:
+                return None
+            return [a], True
+
+        def _reduce_width(dom):
+            if dom.pattern != PrimLib.REDUCE:
+                return None
+            fused = []
+            for a, r in dom.in_relations.items():
+                if not _reduce_pat_exclude(dom, a, r) and a.check_acyclic(dom):
+                    fused.append(a)
+            return fused, True
+
+        def _fuse_loop():
+            changed = True
+            while changed:
+                changed = False
+                changed = self.fuse(_reshape) or changed
+                changed = self.fuse(_elemwise_depth) or changed
+                changed = self.fuse(_elemwise_width) or changed
+                changed = self.fuse(_reduce_depth) or changed
+                changed = self.fuse(_reduce_width) or changed
+                changed = self.fuse(_broadcast_depth) or changed
+                changed = self.fuse(_broadcast_width) or changed
+
+        def _fuse_once(fuse_func):
+            if fuse_func(_reshape) or fuse_func(_elemwise_depth) or fuse_func(_elemwise_width) or \
+                    fuse_func(_reduce_depth) or fuse_func(_reduce_width) or fuse_func(_broadcast_depth) or \
+                    fuse_func(_broadcast_width):
+                return
+
+        if fuse_func is None:
+            _fuse_loop()
+        else:
+            _fuse_once(fuse_func)
+
+
 def split(graph, target, flags):
     """Split graph"""
     result = None
     if target == "cuda":
         result = GraphSplitGpu(graph, flags).split()
-    else:
+    elif target == "aicore":
         result = GraphSplitAscend(graph, flags).split()
+    else:
+        result =
GraphSplitCpu(graph, flags).split() return result diff --git a/mindspore/_extends/graph_kernel/model/model_builder.py b/mindspore/_extends/graph_kernel/model/model_builder.py index e23efd54992..e36c1fc5fd5 100644 --- a/mindspore/_extends/graph_kernel/model/model_builder.py +++ b/mindspore/_extends/graph_kernel/model/model_builder.py @@ -132,7 +132,7 @@ class CompositeGraph: return dict() attr = {} for a in op['attr']: - if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'): + if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin', 'Argmax', 'Argmin'): attr['reduce_axis'] = a['value'] else: attr[a['name']] = a['value'] diff --git a/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py b/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py index d3f0bbf1641..096ee8f005f 100644 --- a/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py +++ b/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py @@ -33,7 +33,7 @@ def copy_json(pid_path, ppid_path): shutil.move(os.path.join(pid_path, json_file), ppid_path) -def _compile_akg_task_gpu(json_strs, attrs): +def _compile_akg_task_default(json_strs, attrs): """ compile func called in single process @@ -110,16 +110,14 @@ class AkgProcess: if self.argc == 0: raise ValueError("json must be not null") args = [(arg, attrs) for arg in self.args] - if self.platform == "GPU": - with Pool(processes=self.process_num) as pool: - res = pool.starmap_async(_compile_akg_task_gpu, args) - res.get(timeout=self.wait_time) - elif self.platform == "ASCEND": + if self.platform == "ASCEND": with Pool(processes=self.process_num) as pool: res = pool.starmap_async(_compile_akg_task_ascend, args) res.get(timeout=self.wait_time) else: - raise ValueError("The value of 'platform' must be 'GPU' or 'ASCEND'.") + with Pool(processes=self.process_num) as pool: + res = pool.starmap_async(_compile_akg_task_default, args) + res.get(timeout=self.wait_time) return True def accept_json(self, json): diff --git a/mindspore/_extends/remote/kernel_build_server_gpu.py b/mindspore/_extends/remote/kernel_build_server_akg.py similarity index 81% rename from mindspore/_extends/remote/kernel_build_server_gpu.py rename to mindspore/_extends/remote/kernel_build_server_akg.py index c951508f226..bd1ee1fd924 100644 --- a/mindspore/_extends/remote/kernel_build_server_gpu.py +++ b/mindspore/_extends/remote/kernel_build_server_akg.py @@ -12,22 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""kernel build server for gpu""" +"""kernel build server for akg kernels""" import sys import warnings from mindspore._extends.remote.kernel_build_server import Messager, get_logger, AkgBuilder -class GpuMessager(Messager): +class AkgMessager(Messager): ''' - GPU Messager + Default Messager for akg kernels. It works as a server, communicating with c++ client. 
''' def __init__(self, fdin, fdout): super().__init__(fdin, fdout) - get_logger().info("[TRACE] GPU Messager init...") - self.akg_builder = AkgBuilder("GPU") + get_logger().info("[TRACE] Akg Messager init...") + self.akg_builder = AkgBuilder("default") def handle(self): """ @@ -42,7 +42,7 @@ class GpuMessager(Messager): self.exit() def exit(self): - get_logger().info("[TRACE] GPU Messager Exit...") + get_logger().info("[TRACE] Akg Messager Exit...") exit() @@ -51,5 +51,5 @@ if __name__ == '__main__': if len(sys.argv) != 3: raise Exception('Incorrect argv: {}'.format(sys.argv)) get_logger().debug(f"[TRACE] argv: {str(sys.argv)}") - messager = GpuMessager(int(sys.argv[1]), int(sys.argv[2])) + messager = AkgMessager(int(sys.argv[1]), int(sys.argv[2])) messager.run() diff --git a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt index bfc8f66bae6..3ed92e71e51 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt +++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt @@ -13,12 +13,6 @@ if(ENABLE_D) file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "kernel_query.cc" "kernel_fusion.cc" - "akg/akg_kernel_build.cc" - "akg/ascend/*.cc" - "akg/akg_kernel_json_generator.cc" - "akg/akg_kernel_json_decoder.cc" - "akg/akg_kernel_attrs_process.cc" - "akg/akg_kernel_metadata.cc" "tbe/*.cc" "host/*.cc" "aicpu/*.cc" @@ -95,11 +89,6 @@ endif() if(ENABLE_GPU) file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cu" - "akg/akg_kernel_build.cc" - "akg/gpu/*.cc" - "akg/akg_kernel_json_generator.cc" - "akg/akg_kernel_json_decoder.cc" - "akg/akg_kernel_attrs_process.cc" ) file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc") @@ -122,7 +111,35 @@ if(ENABLE_GPU) # add_library(_mindspore_kernel_cuda_obj OBJECT ${CUDA_SRC_LIST}) endif() -set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST} +if(ENABLE_AKG AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux") + file(GLOB_RECURSE AKG_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/akg_kernel_build.cc" + "akg/akg_kernel_json_generator.cc" + "akg/akg_kernel_json_decoder.cc" + "akg/akg_kernel_attrs_process.cc" + ) + if(ENABLE_GPU) + file(GLOB_RECURSE AKG_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/gpu/*.cc" + ) + list(APPEND AKG_SRC_LIST ${AKG_GPU_SRC_LIST}) + endif() + if(ENABLE_D) + file(GLOB_RECURSE AKG_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/ascend/*.cc" + "akg/akg_kernel_metadata.cc" + ) + list(APPEND AKG_SRC_LIST ${AKG_D_SRC_LIST}) + endif() + if(ENABLE_CPU) + file(GLOB_RECURSE AKG_CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "akg/cpu/*.cc" + ) + list(APPEND AKG_SRC_LIST ${AKG_CPU_SRC_LIST}) + endif() +endif() + +set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL) add_library(_mindspore_backend_kernel_compiler_obj OBJECT ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} - ${GPU_SRC_LIST} ${D_SRC_LIST} ${QUANTUM_SRC_LIST}) + ${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST} ${QUANTUM_SRC_LIST}) diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc index 33a352ecda4..2d3975e8d22 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc @@ -16,6 +16,7 @@ #include 
"backend/kernel_compiler/akg/akg_kernel_build.h" +#include #include #include #include diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h index 50c477234a6..0507b4aa143 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h @@ -17,8 +17,6 @@ #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ -#include - #include #include #include diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc new file mode 100644 index 00000000000..1b7d49c505b --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc @@ -0,0 +1,49 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h" +#include +#include +#include +#include +#include "backend/kernel_compiler/common_utils.h" +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h" +#include "utils/ms_utils.h" +#include "backend/session/anf_runtime_algorithm.h" + +namespace mindspore { +namespace kernel { +KernelPackPtr AkgCpuKernelBuilder::AkgSearchCache(const std::string &kernel_name) { + return SearchCache(kernel_name, kProcessorCpu); +} + +KernelPackPtr AkgCpuKernelBuilder::AkgInsertCache(const std::string &kernel_name) { + return InsertCache(kernel_name, kProcessorCpu); +} + +void AkgCpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack, + const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) { + auto kernel_mod_ptr = std::make_shared(kernel_pack); + kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list()); + kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list()); + AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get()); +} + +void AkgCpuKernelBuilder::AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) { + kernel::SaveJsonInfo(kernel_name, kernel_json, kernel::KernelMeta::GetInstance()->kernel_meta_path()); +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h new file mode 100644 index 00000000000..695f997c2cc --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
+#include
+#include "backend/kernel_compiler/akg/akg_kernel_build.h"
+#include "base/base.h"
+
+namespace mindspore {
+namespace kernel {
+class AkgCpuKernelBuilder : public AkgKernelBuilder {
+ public:
+  AkgCpuKernelBuilder() = default;
+  ~AkgCpuKernelBuilder() = default;
+
+  kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); }
+  KernelPackPtr AkgSearchCache(const std::string &kernel_name) override;
+  KernelPackPtr AkgInsertCache(const std::string &kernel_name) override;
+  void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator,
+                       const AnfNodePtr &anf_node) override;
+  void AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) override;
+};
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc
new file mode 100644
index 00000000000..911cd6992e6
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc
@@ -0,0 +1,143 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h" + +#include +#include +#include +#include +#include "nlohmann/json.hpp" +#include "backend/kernel_compiler/common_utils.h" +#include "common/thread_pool.h" +#include "utils/ms_utils.h" +#include "mindspore/ccsrc/debug/common.h" + +namespace mindspore { +namespace kernel { +namespace { +using AkgParallelLambda = int (*)(int task_id, int num_task, void *cdata); +int AkgLaunchFunc(AkgParallelLambda flambda, void *cdata, int num_task) { + size_t num_workers = + std::min(mindspore::common::ThreadPool::GetInstance().GetSyncRunThreadNum(), static_cast(num_task)); + std::vector tasks; + size_t thread_index = 0; + while (thread_index < num_workers) { + auto block = [&, thread_index]() { + flambda(thread_index, num_workers, cdata); + return mindspore::common::SUCCESS; + }; + tasks.emplace_back(block); + thread_index++; + } + mindspore::common::ThreadPool::GetInstance().SyncRun(tasks); + return 0; +} + +struct AkgCallBack { + void *parallel_launch_func; + void *(*malloc_func)(size_t); + void (*free_func)(void *); + + AkgCallBack() { + parallel_launch_func = reinterpret_cast(&AkgLaunchFunc); + malloc_func = &malloc; + free_func = &free; + } + ~AkgCallBack() = default; +}; +} // namespace +CpuKernelManagerPtr CpuKernelMod::kernelmanager_ = std::make_shared(); + +CpuKernelManager::~CpuKernelManager() { + for (auto &cpu_func_pair : cpu_func_map_) { + if (cpu_func_pair.second.second != nullptr) { + (void)dlclose(cpu_func_pair.second.second); + } + } +} + +void *CpuKernelManager::SearchFunc(const std::string &kernel_name) const { + auto iter = cpu_func_map_.find(kernel_name); + if (iter == cpu_func_map_.end()) { + return nullptr; + } else { + return iter->second.first; + } +} + +void *CpuKernelManager::SearchFuncWithSharedLock(const std::string &kernel_name) const { + std::shared_lock lock(mutex_); + return SearchFunc(kernel_name); +} + +void *CpuKernelManager::GetFunction(const std::string &kernel_name) { + if (auto func = SearchFuncWithSharedLock(kernel_name); func != nullptr) { + return func; + } + std::unique_lock lock(mutex_); + // Search cache again between setting unique lock and calling "dlopen", to make sure that + // only one thread can call "dlopen" and insert handle to the cache for a new kernel_name. + // To avoid that several nodes (with the same kernel_name) open the same "so" by dlopen, + // but only cache it once, then the "dlclose" will be called only once, causing resource leak. + if (auto func = SearchFunc(kernel_name); func != nullptr) { + return func; + } + std::string fn; + auto it = kernel_name.rfind("_kernel"); + if (it < kernel_name.size()) { + fn = kernel_name.substr(0, it); + } else { + fn = kernel_name; + } + std::string fn_so = kCpuKernelMeta + fn + ".so"; + auto handle = dlopen(fn_so.c_str(), RTLD_LAZY | RTLD_LOCAL); + if (handle == nullptr) { + MS_LOG(ERROR) << "Load " << fn_so << " failed. 
kernel: " << kernel_name; + return nullptr; + } + auto launch_func = dlsym(handle, kernel_name.c_str()); + if (launch_func == nullptr) { + MS_LOG(ERROR) << "Undefined symbol " << kernel_name << " in " << fn_so; + return nullptr; + } + cpu_func_map_[kernel_name] = std::make_pair(launch_func, handle); + return launch_func; +} + +bool CpuKernelMod::Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) { + auto js = nlohmann::json::parse(kernel_pack_->GetJson()->contents, + kernel_pack_->GetJson()->contents + kernel_pack_->GetJson()->len); + std::string kernel_name = js["kernelName"]; + auto launch_func = kernelmanager_->GetFunction(kernel_name); + if (launch_func == nullptr) { + MS_LOG(ERROR) << "GetFunction failed. kernel: " << kernel_name; + return false; + } + std::vector runtimeargs; + (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs), + [](const AddressPtr &input) -> void * { return input->addr; }); + (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs), + [](const AddressPtr &output) -> void * { return output->addr; }); + AkgCallBack akg_callback; + runtimeargs.emplace_back(reinterpret_cast(&akg_callback)); + using AkgCpuKernelFunction = void (*)(void *); + reinterpret_cast(launch_func)(reinterpret_cast(runtimeargs.data())); + return true; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h new file mode 100644 index 00000000000..71cf82405f9 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h @@ -0,0 +1,73 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_ +#include +#include +#include +#include +#include +#include +#include +#include "backend/kernel_compiler/kernel.h" + +namespace mindspore { +namespace kernel { +class CpuKernelManager { + public: + CpuKernelManager() = default; + ~CpuKernelManager(); + + void *GetFunction(const std::string &kernel_name); + + private: + void *SearchFunc(const std::string &kernel_name) const; + void *SearchFuncWithSharedLock(const std::string &kernel_name) const; + + // cache the kernel function: kernel_name -> {kernel_func, so_handle} + std::unordered_map> cpu_func_map_; + mutable std::shared_mutex mutex_; +}; +using CpuKernelManagerPtr = std::shared_ptr; + +class CpuKernelMod : public KernelMod { + public: + explicit CpuKernelMod(const KernelPackPtr &kp) : kernel_pack_(kp) {} + ~CpuKernelMod() = default; + + void SetInputSizeList(const std::vector &size_list) { input_size_list_ = size_list; } + void SetOutputSizeList(const std::vector &size_list) { output_size_list_ = size_list; } + void SetWorkspaceSizeList(const std::vector &size_list) { workspace_size_list_ = size_list; } + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + bool Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) override; + + static CpuKernelManagerPtr kernelmanager_; + + private: + KernelPackPtr kernel_pack_; + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; // workspace is not used in cpu kernel. 
+}; + +using CpuKernelModPtr = std::shared_ptr; +} // namespace kernel +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h index 5453aea7157..50ad8f1970c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.h @@ -27,7 +27,7 @@ class AkgGpuKernelBuilder : public AkgKernelBuilder { AkgGpuKernelBuilder() = default; ~AkgGpuKernelBuilder() = default; - kernel::KernelBuildClient *GetClient() override { return &(kernel::GpuKernelBuildClient::Instance()); } + kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); } KernelPackPtr AkgSearchCache(const std::string &kernel_name) override; KernelPackPtr AkgInsertCache(const std::string &kernel_name) override; void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator, diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc index 426889507c5..2eaa33f40d4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc @@ -157,16 +157,21 @@ FusionType GetFusionTypeByName(const std::string &name) { } void KernelMeta::Initialize() { - kernel_meta_path_ = std::string(kGpuKernelMeta) + "/"; + if (GetStrProcessorFromContext() == kProcessorCpu) { + kernel_meta_path_ = std::string(kCpuKernelMeta); + } else { + kernel_meta_path_ = std::string(kGpuKernelMeta) + "/"; #if defined(_WIN32) || defined(_WIN64) - auto ret = mkdir(kernel_meta_path_.c_str()); + auto ret = mkdir(kernel_meta_path_.c_str()); #else - auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU); + auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU); #endif - if (ret != 0) { - MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later"; + if (ret != 0) { + MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later"; + } } + initialized_ = true; } @@ -238,6 +243,8 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro std::string kernel_json; if (processor == kProcessorAiCore || processor == kProcessorAiCpu) { kernel_json = kCceKernelMeta; + } else if (processor == kProcessorCpu) { + kernel_json = kCpuKernelMeta; } else { kernel_json = bin_map->kernel_meta_path(); } @@ -872,6 +879,8 @@ Processor GetProcessorFromContext() { processor = kernel::Processor::CUDA; } else if (device_info == kAscendDevice) { processor = kernel::Processor::AICORE; + } else if (device_info == kCPUDevice) { + processor = kernel::Processor::CPU; } return processor; } @@ -883,6 +892,8 @@ std::string GetStrProcessorFromContext() { str_processor = kernel::kProcessorCuda; } else if (processor == kernel::Processor::AICORE) { str_processor = kernel::kProcessorAiCore; + } else if (processor == kernel::Processor::CPU) { + str_processor = kernel::kProcessorCpu; } return str_processor; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h index 7eee55693b6..e29e9040007 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h @@ -34,10 +34,12 @@ namespace mindspore { 
namespace kernel { constexpr auto kCceKernelMeta = "./kernel_meta/"; +constexpr auto kCpuKernelMeta = "./kernel_meta/"; constexpr auto kGpuKernelMeta = "./cuda_meta"; constexpr auto kProcessorAiCore = "aicore"; constexpr auto kProcessorAiCpu = "aicpu"; constexpr auto kProcessorCuda = "cuda"; +constexpr auto kProcessorCpu = "cpu"; constexpr auto kProcessorUnknown = "unknown"; constexpr auto kJsonSuffix = ".json"; constexpr auto kInfoSuffix = ".info"; diff --git a/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc b/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc index 0319ec04995..63a211fb87c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -99,6 +99,14 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string & (void)kernel_json.seekg(0, std::ios::beg); (void)kernel_json.read(json_->contents, SizeToLong(json_->len)); + if (processor == kProcessorCpu) { + std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".so"; + if (!CheckHash(json_f, bin_f, js)) { + return false; + } + return true; + } + if (processor == kProcessorCuda) { std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".ptx"; std::ifstream kernelbin(bin_f); diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel.h b/mindspore/ccsrc/backend/kernel_compiler/kernel.h index e727a4884e4..50c8753f2e5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/kernel.h @@ -107,6 +107,7 @@ enum Processor { AICORE = 0, AICPU, CUDA, + CPU, }; struct FlexArray { diff --git a/mindspore/ccsrc/backend/optimizer/CMakeLists.txt b/mindspore/ccsrc/backend/optimizer/CMakeLists.txt index 52a365ade73..1f43c2099fb 100644 --- a/mindspore/ccsrc/backend/optimizer/CMakeLists.txt +++ b/mindspore/ccsrc/backend/optimizer/CMakeLists.txt @@ -13,8 +13,6 @@ endif() if(ENABLE_D OR ENABLE_ACL) file(GLOB_RECURSE _D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend/*.cc" - "graph_kernel/*.cc" - "graph_kernel/model/*.cc" ) list(APPEND _PREACTIVATE_SRC_LIST ${_D_SRC_LIST}) endif() @@ -22,8 +20,6 @@ endif() if(ENABLE_GPU) file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc" - "graph_kernel/*.cc" - "graph_kernel/model/*.cc" ) list(APPEND _PREACTIVATE_SRC_LIST ${_GPU_SRC_LIST}) endif() @@ -43,6 +39,13 @@ if(ENABLE_CPU) list(APPEND _PREACTIVATE_SRC_LIST ${_CPU_SRC_LIST}) endif() +if(ENABLE_AKG AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux") + file(GLOB_RECURSE _GK_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} + "graph_kernel/*.cc" + ) + list(APPEND _PREACTIVATE_SRC_LIST ${_GK_SRC_LIST}) +endif() + set_property(SOURCE ${_PREACTIVATE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PRE_ACT) add_library(_mindspore_backend_optimizer_obj OBJECT ${_PREACTIVATE_SRC_LIST}) diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc index 232875e9b93..0ffb2172ec0 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc @@ -38,6 +38,8 @@ #include 
"runtime/device/ascend/kernel_select_ascend.h" #elif ENABLE_GPU #include "runtime/device/gpu/kernel_info_setter.h" +#elif ENABLE_CPU +#include "runtime/device/cpu/kernel_select_cpu.h" #endif namespace mindspore::graphkernel { @@ -608,6 +610,9 @@ void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) { #elif ENABLE_GPU cnode->set_kernel_info(std::make_shared()); device::gpu::SetKernelInfo(cnode, kernel_type); +#elif ENABLE_CPU + cnode->set_kernel_info(std::make_shared()); + device::cpu::SetKernelInfo(cnode); #endif } diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc index 7c5611371d3..f82adb5715e 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc @@ -121,7 +121,7 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const { pm->AddPass(std::make_shared(), OptLevel_1, is_gpu); // Universal arithmetic simplify - pm->AddPass(std::make_shared(), OptLevel_2, is_gpu); + pm->AddPass(std::make_shared(), OptLevel_2, is_gpu || is_cpu); // Common subexpression elimination pm->AddPass(std::make_shared(), OptLevel_2); @@ -158,7 +158,7 @@ PassManagerPtr GraphKernelOptimizer::Split() const { PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const { auto pm = std::make_shared(4, "highlevelopt2"); // Enable atomic add - pm->AddPass(std::make_shared(), OptLevel_2); + pm->AddPass(std::make_shared(), OptLevel_2, is_gpu || is_ascend); // Enable atomic add for stitch nodes. auto level = GetPassLevelByFlag(context::GraphKernelFlags::GetInstance().enable_stitch_fusion); @@ -170,8 +170,8 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const { pm->AddPass(std::make_shared(), level_low_precision, is_ascend); // Enable tsa and uss - pm->AddPass(std::make_shared(), OptLevel_1); - pm->AddPass(std::make_shared(), OptLevel_1); + pm->AddPass(std::make_shared(), OptLevel_1, is_gpu); + pm->AddPass(std::make_shared(), OptLevel_1, is_gpu); return pm; } @@ -204,6 +204,7 @@ void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) { MS_EXCEPTION_IF_NULL(context_ptr); is_gpu = (context_ptr->get_param(MS_CTX_DEVICE_TARGET) == kGPUDevice); is_ascend = (context_ptr->get_param(MS_CTX_DEVICE_TARGET) == kAscendDevice); + is_cpu = (context_ptr->get_param(MS_CTX_DEVICE_TARGET) == kCPUDevice); auto optimizer = std::make_shared("graph_kernel_optimizer"); optimizer->AddPassManager(PreProcess()); diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h index 59927c17e90..291574c70c2 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h @@ -46,6 +46,7 @@ class GraphKernelOptimizer { bool is_gpu{false}; bool is_ascend{false}; + bool is_cpu{false}; }; void GraphKernelOptimize(const KernelGraphPtr &kernel_graph); diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc index fe95ad3be0f..e2bfd76b9b8 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.cc +++ b/mindspore/ccsrc/backend/session/cpu_session.cc @@ -21,14 +21,17 @@ #include "ir/anf.h" #include "utils/ms_utils.h" #include "utils/trace_base.h" +#include "utils/context/graph_kernel_flags.h" #include "backend/session/anf_runtime_algorithm.h" 
#include "runtime/device/kernel_runtime.h" +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" #include "runtime/device/cpu/kernel_select_cpu.h" #include "backend/optimizer/common/optimizer.h" #include "backend/optimizer/common/pass_manager.h" #include "backend/optimizer/cpu/insert_cast_cpu.h" #include "backend/optimizer/cpu/insert_format_transform_op.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" #include "backend/optimizer/pass/replace_node_by_proxy.h" #include "backend/optimizer/pass/erase_visit_attr.h" #include "debug/anf_ir_dump.h" @@ -102,6 +105,16 @@ void CPUSession::Optimize(const std::shared_ptr &kernel_graph) { kernel_graph->SetExecOrderByDefault(); } +void CPUSession::GraphKernelOptimize(const std::shared_ptr &kernel_graph) { +#ifdef ENABLE_AKG + if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) { + return; + } + graphkernel::GraphKernelOptimize(kernel_graph); + kernel_graph->SetExecOrderByDefault(); +#endif +} + GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { auto graph_id = graph_sum_; auto graph = ConstructKernelGraph(lst, outputs); @@ -112,6 +125,7 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr MS_LOG(INFO) << "Set kernel info end"; Optimize(graph); FinalOptimize(graph); + GraphKernelOptimize(graph); MS_LOG(INFO) << "Build kernel"; BuildKernel(graph.get()); // Remove reorder after PS feature finish adapting push/pull in auto_monad. @@ -352,10 +366,20 @@ void KernelNotSupportException(const AnfNodePtr &kernel_node) { void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto &kernel_nodes = kernel_graph->execution_order(); + kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); + MS_EXCEPTION_IF_NULL(bin_map); + std::vector akg_nodes; for (const auto &kernel_node : kernel_nodes) { MS_EXCEPTION_IF_NULL(kernel_node); std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); MS_LOG(INFO) << "Cpu building operator[" << kernel_name << "]."; + if (session::AnfRuntimeAlgorithm::GetKernelType(kernel_node) == KernelType::AKG_KERNEL) { + if (!bin_map->initialized()) { + bin_map->Initialize(); + } + akg_nodes.push_back(kernel_node); + continue; + } std::shared_ptr cpu_kernel = kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node); if (cpu_kernel == nullptr) { @@ -369,6 +393,10 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get()); MS_LOG(INFO) << "Cpu build success operator[" << kernel_name << "]."; } +#ifdef ENABLE_AKG + kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder; + (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes); +#endif } } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/backend/session/cpu_session.h b/mindspore/ccsrc/backend/session/cpu_session.h index 90f6066407a..e969d3ab75e 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.h +++ b/mindspore/ccsrc/backend/session/cpu_session.h @@ -42,6 +42,7 @@ class CPUSession : public SessionBasic { VectorRef *const outputs) override; void ExecuteGraph(const std::shared_ptr &kernel_graph) override; ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) override; + void GraphKernelOptimize(const std::shared_ptr &kernel_graph); void Optimize(const std::shared_ptr &kernel_graph); KernelGraphPtr BuildOpImpl(const 
OpRunInfo &op_run_info, const GraphInfo &graph_info, const std::vector &input_tensors, diff --git a/mindspore/ccsrc/backend/session/kernel_build_client.h b/mindspore/ccsrc/backend/session/kernel_build_client.h index 864de7ba0f3..9c4a15c6248 100644 --- a/mindspore/ccsrc/backend/session/kernel_build_client.h +++ b/mindspore/ccsrc/backend/session/kernel_build_client.h @@ -256,7 +256,7 @@ class AscendKernelBuildClient : public KernelBuildClient { ~AscendKernelBuildClient() override { Close(); } }; -class GpuKernelBuildClient : public KernelBuildClient { +class AkgKernelBuildClient : public KernelBuildClient { public: // Server configure constexpr inline static auto kGetPathScript = @@ -264,15 +264,15 @@ class GpuKernelBuildClient : public KernelBuildClient { "\"" "import pkgutil;" "path = pkgutil" - ".get_loader(\\\"mindspore._extends.remote.kernel_build_server_gpu\\\")" // Server module name + ".get_loader(\\\"mindspore._extends.remote.kernel_build_server_akg\\\")" // Server module name ".get_filename();" "print('[~]' + path)" "\""; - constexpr inline static auto kServerScript = "kernel_build_server_gpu.py"; + constexpr inline static auto kServerScript = "kernel_build_server_akg.py"; - static GpuKernelBuildClient &Instance() { - static GpuKernelBuildClient instance; + static AkgKernelBuildClient &Instance() { + static AkgKernelBuildClient instance; return instance; } @@ -283,15 +283,15 @@ class GpuKernelBuildClient : public KernelBuildClient { return GetScriptFilePath(env, kGetPathScript, kServerScript); } - GpuKernelBuildClient(const GpuKernelBuildClient &) = delete; - GpuKernelBuildClient &operator=(const GpuKernelBuildClient &) = delete; + AkgKernelBuildClient(const AkgKernelBuildClient &) = delete; + AkgKernelBuildClient &operator=(const AkgKernelBuildClient &) = delete; - GpuKernelBuildClient(GpuKernelBuildClient &&) = delete; - GpuKernelBuildClient &operator=(GpuKernelBuildClient &&) = delete; + AkgKernelBuildClient(AkgKernelBuildClient &&) = delete; + AkgKernelBuildClient &operator=(AkgKernelBuildClient &&) = delete; private: - GpuKernelBuildClient() { Open(); } - ~GpuKernelBuildClient() override { Close(); } + AkgKernelBuildClient() { Open(); } + ~AkgKernelBuildClient() override { Close(); } }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc index a749fb01974..458b42d7104 100644 --- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc @@ -18,10 +18,12 @@ #include #include "runtime/device/cpu/cpu_device_address.h" #include "runtime/device/cpu/cpu_memory_manager.h" +#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" #include "backend/kernel_compiler/kernel_build_info.h" #include "runtime/device/cpu/kernel_select_cpu.h" #include "utils/trace_base.h" +#include "utils/context/graph_kernel_flags.h" #include "backend/optimizer/common/optimizer.h" #include "backend/optimizer/common/pass_manager.h" #include "backend/optimizer/common/common_backend_optimization.h" @@ -29,6 +31,8 @@ #include "backend/optimizer/cpu/insert_format_transform_op.h" #include "backend/optimizer/pass/replace_node_by_proxy.h" #include "backend/optimizer/pass/erase_visit_attr.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" +#include "backend/session/anf_runtime_algorithm.h" #include "profiler/device/cpu/cpu_profiling.h" 
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
 #endif
@@ -105,6 +109,14 @@ void CPUDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
 
   // Run final optimization.
   opt::CommonFinalOptimization(graph);
+
+#ifdef ENABLE_AKG
+  // Run graph kernel fusion optimization
+  if (context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
+    graphkernel::GraphKernelOptimize(graph);
+    graph->SetExecOrderByDefault();
+  }
+#endif
 }
 
 void CPUDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
@@ -165,11 +177,21 @@ void CPUDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const
 }
 
 void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
+  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
+  MS_EXCEPTION_IF_NULL(bin_map);
+  std::vector<AnfNodePtr> akg_nodes;
   for (const auto &node : nodes) {
     MS_EXCEPTION_IF_NULL(node);
     if (AnfAlgo::IsControlOpExecInBackend(node)) {
      continue;
     }
+    if (session::AnfRuntimeAlgorithm::GetKernelType(node) == KernelType::AKG_KERNEL) {
+      if (!bin_map->initialized()) {
+        bin_map->Initialize();
+      }
+      akg_nodes.push_back(node);
+      continue;
+    }
     std::string kernel_name = AnfAlgo::GetCNodeName(node);
     std::shared_ptr<kernel::CPUKernel> cpu_kernel = kernel::CPUKernelFactory::GetInstance().Create(kernel_name, node);
     if (!cpu_kernel) {
@@ -179,6 +201,10 @@ void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
     cpu_kernel->Init(node);
     AnfAlgo::SetKernelMod(cpu_kernel, node.get());
   }
+#ifdef ENABLE_AKG
+  kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
+  (void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes);
+#endif
 }
 
 void CPUDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const {
@@ -196,12 +222,12 @@ bool CPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vectorfullname_with_scope();
   auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
   MS_EXCEPTION_IF_NULL(kernel_mod);
-  auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
-  MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
   // Some CPU kernels can't initialize kernel and launch kernel in different thread, so reinitialize the kernels before
   // launch.
   if (kOpNotSupportMultiThreadExecList.find(AnfAlgo::GetCNodeName(kernel)) != kOpNotSupportMultiThreadExecList.end()) {
+    auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
+    MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
     cpu_kernel_mod->InitKernel(kernel);
   }
 #ifndef ENABLE_SECURITY
diff --git a/mindspore/ccsrc/utils/context/graph_kernel_flags.cc b/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
index 438b606169a..6555df521f0 100644
--- a/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
+++ b/mindspore/ccsrc/utils/context/graph_kernel_flags.cc
@@ -172,7 +172,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
   FlagRegister reg(flag_map);
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
-  bool is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
+  bool is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
 
   // Set opt_level first, some flags' default value depends on it.
   // Default optimization level is level 2 when enable graphkernel
@@ -192,7 +192,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
 
   // Integer flags
   reg.AddFlag("online_tuning", &online_tuning);
-  reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_gpu ? OpLevel_MAX : OpLevel_0);
+  reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_ascend ? 
OpLevel_0 : OpLevel_MAX); // String flags reg.AddFlag("repository_path", &repository_path); diff --git a/mindspore/context.py b/mindspore/context.py index 33afbd754a1..33ff95af303 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -489,8 +489,8 @@ def _check_target_specific_cfgs(device, arg_key): device_cfgs = { 'enable_dump': ['Ascend'], 'save_dump_path': ['Ascend'], - 'enable_graph_kernel': ['Ascend', 'GPU'], - 'graph_kernel_flags': ['Ascend', 'GPU'], + 'enable_graph_kernel': ['Ascend', 'GPU', 'CPU'], + 'graph_kernel_flags': ['Ascend', 'GPU', 'CPU'], 'enable_reduce_precision': ['Ascend'], 'enable_profiling': ['Ascend'], 'profiling_options': ['Ascend'], diff --git a/scripts/build/build_mindspore.sh b/scripts/build/build_mindspore.sh index f08980dffb4..ecee047a97e 100755 --- a/scripts/build/build_mindspore.sh +++ b/scripts/build/build_mindspore.sh @@ -79,8 +79,11 @@ build_mindspore() if [[ "X$USE_GLOG" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DUSE_GLOG=ON" fi - if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then + if [[ "X$ENABLE_AKG" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_AKG=ON" + if [[ "X$ENABLE_CPU" = "Xon" && "X$ENABLE_D" != "Xon" && "X$ENABLE_GPU" != "Xon" ]]; then + CMAKE_ARGS="${CMAKE_ARGS} -DUSE_LLVM=ON" + fi fi if [[ "X$ENABLE_ACL" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ACL=ON" @@ -107,4 +110,4 @@ build_mindspore() fi cmake --build . --target package ${CMAKE_VERBOSE} -j$THREAD_NUM echo "success building mindspore project!" -} \ No newline at end of file +} diff --git a/scripts/build/default_options.sh b/scripts/build/default_options.sh index 2e12c6592e2..f2bfb38b794 100755 --- a/scripts/build/default_options.sh +++ b/scripts/build/default_options.sh @@ -44,7 +44,7 @@ init_default_options() export LITE_PLATFORM="" export LITE_ENABLE_AAR="off" export USE_GLOG="on" - export ENABLE_AKG="on" + export ENABLE_AKG="off" export ENABLE_ACL="off" export ENABLE_D="off" export ENABLE_DEBUGGER="on" @@ -63,4 +63,4 @@ init_default_options() export USER_ENABLE_DUMP_IR=false export USER_ENABLE_DEBUGGER=false export ENABLE_SYM_FILE="off" -} \ No newline at end of file +} diff --git a/scripts/build/parse_device.sh b/scripts/build/parse_device.sh index f3d233f458f..327cc186584 100755 --- a/scripts/build/parse_device.sh +++ b/scripts/build/parse_device.sh @@ -40,6 +40,7 @@ parse_device() exit 1 fi export CUDA_VERSION="$DEVICE_VERSION" + export ENABLE_AKG="on" elif [[ "X$DEVICE" == "Xd" || "X$DEVICE" == "Xascend" ]]; then # version default 910 if [[ "X$DEVICE_VERSION" == "X" ]]; then @@ -54,6 +55,7 @@ parse_device() export ENABLE_ACL="on" ENABLE_CPU="on" export ENABLE_MPI="on" + export ENABLE_AKG="on" else echo "Invalid value ${DEVICE_VERSION} for option -V" usage @@ -68,4 +70,4 @@ parse_device() usage exit 1 fi -} \ No newline at end of file +} diff --git a/scripts/build/usage.sh b/scripts/build/usage.sh index c802b8f1401..e50eab2aaca 100755 --- a/scripts/build/usage.sh +++ b/scripts/build/usage.sh @@ -21,7 +21,7 @@ usage() echo "Usage:" echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t ut|st] [-g on|off] [-h] [-b ge] [-m infer|train] \\" echo " [-a on|off] [-p on|off] [-i] [-R] [-D on|off] [-j[n]] [-e gpu|ascend|cpu] \\" - echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\" + echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K on|off] \\" echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\" echo " [-A 
on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\" echo " [-L Tensor-RT path] [-y on|off] \\" @@ -52,7 +52,7 @@ usage() echo " -V Specify the device version, if -e gpu, default CUDA 10.1, if -e ascend, default Ascend 910" echo " -I Enable compiling mindspore lite for arm64, arm32 or x86_64, default disable mindspore lite compilation" echo " -A Enable compiling mindspore lite aar package, option: on/off, default: off" - echo " -K Compile with AKG, default on" + echo " -K Compile with AKG, default on if -e gpu or -e ascend, else default off" echo " -B Enable debugger, default on" echo " -E Enable IBVERBS for parameter server, default off" echo " -l Compile with python dependency, default on" @@ -62,4 +62,4 @@ usage() echo " -H Enable hidden" echo " -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking" echo " -y Compile the symbol table switch and save the symbol table to the directory output" -} \ No newline at end of file +}
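
Usage note: with the context.py hunk above, 'enable_graph_kernel' and 'graph_kernel_flags' are accepted when device_target is "CPU", so the feature can be switched on from the Python API. A minimal sketch, assuming a Linux build compiled with AKG (e.g. `bash build.sh -e cpu -K on`); the tiny network is only an illustrative elementwise-plus-reduce pattern of the kind GraphSplitCpu fuses:

    import numpy as np
    from mindspore import context, nn, ops, Tensor

    # Graph kernel fusion only takes effect in graph mode.
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU",
                        enable_graph_kernel=True)

    class FusableNet(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reduce_sum = ops.ReduceSum()

        def construct(self, x, y):
            # x * y + y is elementwise; the following ReduceSum gives the
            # elementwise+reduce pattern the CPU splitter can fuse.
            return self.reduce_sum(x * y + y, (1,))

    net = FusableNet()
    out = net(Tensor(np.ones((4, 8), np.float32)),
              Tensor(np.ones((4, 8), np.float32)))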
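Note on kernel loading: CpuKernelManager::GetFunction maps a kernel name to a shared object under kCpuKernelMeta ("./kernel_meta/"), dropping everything from the last "_kernel" substring onward before appending ".so". A Python mirror of that name-to-path rule, for intuition only (the helper name is illustrative, not part of the patch):

    def kernel_so_path(kernel_name, meta_dir="./kernel_meta/"):
        # Mirror of GetFunction: strip from the last "_kernel" onward,
        # then map the remaining stem to its shared object.
        idx = kernel_name.rfind("_kernel")
        stem = kernel_name[:idx] if idx != -1 else kernel_name
        return meta_dir + stem + ".so"

The symbol looked up via dlsym is still the full kernel_name, so one .so can carry several entry points.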
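Note on the launch ABI: CpuKernelMod::Launch hands the dlopen'ed kernel a single flat pointer array — input addresses, then output addresses, then a trailing AkgCallBack whose first slot is AkgLaunchFunc, so the generated code can call back to parallelize its outermost loop. A rough Python model of AkgLaunchFunc's work partitioning (the function name and fixed pool size are assumptions for illustration, not part of the patch):

    from multiprocessing.pool import ThreadPool

    def akg_launch(flambda, cdata, num_task, pool_threads=8):
        # Like AkgLaunchFunc: cap workers at min(pool size, task count),
        # then hand every worker its index and the total worker count.
        num_workers = min(pool_threads, num_task)
        with ThreadPool(num_workers) as pool:
            pool.starmap(flambda,
                         [(i, num_workers, cdata) for i in range(num_workers)])
        return 0

    # The generated kernel body receives (task_id, num_task, cdata) and
    # processes its slice, e.g.: akg_launch(kernel_body, payload, 16)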