!22731 GraphKernel supports CPU
Merge pull request !22731 from DeshiChen/0901_graphkernel_cpu
This commit is contained in:
commit
06b0beced7
|
@ -47,7 +47,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
|||
set(PYBIND11_CPP_STANDARD -std=c++17)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPTION_CXX_FLAGS}")
|
||||
|
||||
if(ENABLE_AKG AND (ENABLE_D OR ENABLE_GPU))
|
||||
if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
add_subdirectory("${CMAKE_SOURCE_DIR}/akg")
|
||||
endif()
|
||||
|
||||
|
|
3
build.sh
3
build.sh
|
@ -45,7 +45,7 @@ update_submodule()
|
|||
cd "${BASEPATH}/graphengine"
|
||||
git submodule update --init metadef
|
||||
cd "${BASEPATH}"
|
||||
if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then
|
||||
if [[ "X$ENABLE_AKG" = "Xon" ]]; then
|
||||
git submodule update --init --recursive akg
|
||||
fi
|
||||
}
|
||||
|
@ -57,7 +57,6 @@ build_exit()
|
|||
exit 1
|
||||
}
|
||||
|
||||
|
||||
make_clean()
|
||||
{
|
||||
echo "enable make clean"
|
||||
|
|
|
@ -151,3 +151,7 @@ endif()
|
|||
if(ENABLE_CPU AND NOT WIN32)
|
||||
add_compile_definitions(ENABLE_ARMOUR)
|
||||
endif()
|
||||
|
||||
# Add the ENABLE_AKG compile definition when the ENABLE_AKG option is set and
# CMAKE_SYSTEM_NAME matches "Linux".
if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
add_compile_definitions(ENABLE_AKG)
|
||||
endif()
|
||||
|
|
|
@ -291,7 +291,7 @@ install(
|
|||
COMPONENT mindspore
|
||||
)
|
||||
|
||||
if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
|
||||
if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
set (AKG_PATH ${BUILD_PATH}/mindspore/akg)
|
||||
file(REMOVE_RECURSE ${AKG_PATH}/_akg)
|
||||
file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
|
||||
|
|
|
@ -187,20 +187,6 @@ install(
|
|||
COMPONENT mindspore
|
||||
)
|
||||
|
||||
if((ENABLE_D OR ENABLE_GPU) AND ENABLE_AKG)
|
||||
set (AKG_PATH ${CMAKE_SOURCE_DIR}/build/mindspore/akg)
|
||||
file(REMOVE_RECURSE ${AKG_PATH}/_akg)
|
||||
file(MAKE_DIRECTORY ${AKG_PATH}/_akg)
|
||||
file(TOUCH ${AKG_PATH}/_akg/__init__.py)
|
||||
install(DIRECTORY "${AKG_PATH}/akg" DESTINATION "${AKG_PATH}/_akg")
|
||||
install(
|
||||
DIRECTORY
|
||||
${AKG_PATH}/_akg
|
||||
DESTINATION ${INSTALL_PY_DIR}/
|
||||
COMPONENT mindspore
|
||||
)
|
||||
endif()
|
||||
|
||||
if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset)
|
||||
install(
|
||||
DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/dataset
|
||||
|
|
|
@ -1180,11 +1180,131 @@ class GraphSplitAscend(GraphSplitByPattern):
|
|||
_fuse_once(fuse_func)
|
||||
|
||||
|
||||
class GraphSplitCpu(GraphSplitByPattern):
    """Pattern-based graph splitter for the CPU target."""
    # NOTE: the attribute name keeps its original spelling ("BORADCAST").
    BORADCAST_FUSE_DEPTH = 20
    REDUCE_FUSE_DEPTH = 20

    def get_default_mode(self, op):
        """Return MODE_BASIC for reshape ops and MODE_COMPOSITE for every other op."""
        if PrimLib.iter_type(op) == PrimLib.RESHAPE:
            return self.Area.MODE_BASIC
        return self.Area.MODE_COMPOSITE

    def pattern_fuse(self, fuse_func=None):
        """
        Fuse Areas using the CPU fusion rules.

        Each rule takes a dominant area and returns None (nothing to fuse) or a
        tuple (areas_to_fuse, is_forward). When `fuse_func` is None the rules are
        applied through `self.fuse` until no rule reports a change; otherwise
        `fuse_func` is called with the rules, in order, until one returns a true value.
        """
        elemwise_like = (PrimLib.ELEMWISE, PrimLib.BROADCAST)

        def _reshape(dom):
            # Attach a reshape area to the neighbour with the smallest pattern value.
            if dom.pattern != PrimLib.RESHAPE:
                return None
            target, forward = None, False
            for succ, _ in dom.out_relations.items():
                if succ.pattern <= PrimLib.BROADCAST and dom.check_acyclic(succ):
                    if target is None or succ.pattern < target.pattern:
                        target = succ
            for pred, _ in dom.in_relations.items():
                fusible = pred.pattern <= PrimLib.BROADCAST and pred.check_acyclic(dom) and \
                    len(dom.ops[0].inputs[0].to_ops) == 1 and not pred.is_output
                if fusible and (target is None or pred.pattern < target.pattern):
                    target, forward = pred, True
            if not target:
                return None
            return [target], forward

        def _elemwise_depth(dom):
            # Single-input chain: fuse the only producer when shapes match.
            if dom.pattern not in elemwise_like or len(dom.in_relations) != 1:
                return None
            pred, rel = next(iter(dom.in_relations.items()))
            if pred.pattern > PrimLib.BROADCAST or len(pred.out_relations) != 1 or rel != PrimLib.ELEMWISE or \
                    pred.dom_op().output.shape != dom.dom_op().output.shape:
                return None
            return [pred], True

        def _elemwise_width(dom):
            # Fuse every elementwise-related producer whose output shape equals dom's.
            if dom.pattern not in elemwise_like:
                return None
            producers = [
                pred for pred, rel in dom.in_relations.items()
                if pred.pattern <= PrimLib.BROADCAST and rel == PrimLib.ELEMWISE and pred.check_acyclic(dom)
                and pred.dom_op().output.shape == dom.dom_op().output.shape
            ]
            return producers, True

        def _broadcast_pat_exclude(dom, area, rel):
            # True when `area` must not be fused below `dom` by the broadcast rules.
            if area.pattern == PrimLib.REDUCE:
                return dom.pattern > PrimLib.ELEMWISE or rel > PrimLib.ELEMWISE
            return area.pattern > PrimLib.REDUCE or rel > PrimLib.BROADCAST

        def _broadcast_depth(dom):
            # Single-output chain: fuse dom into its only consumer.
            if dom.pattern not in elemwise_like or len(dom.out_relations) != 1 or \
                    dom.is_output or len(dom.ops) > self.BORADCAST_FUSE_DEPTH:
                return None
            succ, rel = next(iter(dom.out_relations.items()))
            if _broadcast_pat_exclude(dom, succ, rel) or len(succ.in_relations) != 1:
                return None
            return [succ], False

        def _broadcast_width(dom):
            # Fuse dom into all of its consumers, or into none of them.
            if dom.pattern not in elemwise_like or \
                    dom.is_output or len(dom.ops) > self.BORADCAST_FUSE_DEPTH:
                return None
            consumers = []
            for succ, rel in dom.out_relations.items():
                if _broadcast_pat_exclude(dom, succ, rel) or not dom.check_acyclic(succ) or \
                        (consumers and consumers[0].dom_op().output.shape != succ.dom_op().output.shape):
                    return None
                consumers.append(succ)
            return consumers, False

        def _reduce_pat_exclude(_, area, rel):
            # True when `area` must not be fused into a reduce area.
            return len(area.ops) > self.REDUCE_FUSE_DEPTH or \
                area.pattern > PrimLib.ELEMWISE or rel > PrimLib.REDUCE or rel == PrimLib.BROADCAST

        def _reduce_depth(dom):
            # Single-input chain: fuse the only producer into the reduce area.
            if dom.pattern != PrimLib.REDUCE or len(dom.in_relations) != 1:
                return None
            pred, rel = next(iter(dom.in_relations.items()))
            if _reduce_pat_exclude(dom, pred, rel) or len(pred.out_relations) != 1:
                return None
            return [pred], True

        def _reduce_width(dom):
            # Fuse every admissible producer into the reduce area.
            if dom.pattern != PrimLib.REDUCE:
                return None
            producers = [
                pred for pred, rel in dom.in_relations.items()
                if not _reduce_pat_exclude(dom, pred, rel) and pred.check_acyclic(dom)
            ]
            return producers, True

        # Rules in the order they are tried.
        rules = (_reshape, _elemwise_depth, _elemwise_width, _reduce_depth, _reduce_width,
                 _broadcast_depth, _broadcast_width)

        def _fuse_loop():
            # Run every rule each round; repeat while any rule reports a change.
            progressed = True
            while progressed:
                progressed = False
                for rule in rules:
                    if self.fuse(rule):
                        progressed = True

        def _fuse_once(fuse_func):
            # Stop at the first rule for which fuse_func returns a true value.
            for rule in rules:
                if fuse_func(rule):
                    return

        if fuse_func is None:
            _fuse_loop()
        else:
            _fuse_once(fuse_func)
|
||||
|
||||
|
||||
def split(graph, target, flags):
|
||||
"""Split graph"""
|
||||
result = None
|
||||
if target == "cuda":
|
||||
result = GraphSplitGpu(graph, flags).split()
|
||||
else:
|
||||
elif target == "aicore":
|
||||
result = GraphSplitAscend(graph, flags).split()
|
||||
else:
|
||||
result = GraphSplitCpu(graph, flags).split()
|
||||
return result
|
||||
|
|
|
@ -132,7 +132,7 @@ class CompositeGraph:
|
|||
return dict()
|
||||
attr = {}
|
||||
for a in op['attr']:
|
||||
if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'):
|
||||
if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin', 'Argmax', 'Argmin'):
|
||||
attr['reduce_axis'] = a['value']
|
||||
else:
|
||||
attr[a['name']] = a['value']
|
||||
|
|
|
@ -33,7 +33,7 @@ def copy_json(pid_path, ppid_path):
|
|||
shutil.move(os.path.join(pid_path, json_file), ppid_path)
|
||||
|
||||
|
||||
def _compile_akg_task_gpu(json_strs, attrs):
|
||||
def _compile_akg_task_default(json_strs, attrs):
|
||||
"""
|
||||
compile func called in single process
|
||||
|
||||
|
@ -110,16 +110,14 @@ class AkgProcess:
|
|||
if self.argc == 0:
|
||||
raise ValueError("json must be not null")
|
||||
args = [(arg, attrs) for arg in self.args]
|
||||
if self.platform == "GPU":
|
||||
with Pool(processes=self.process_num) as pool:
|
||||
res = pool.starmap_async(_compile_akg_task_gpu, args)
|
||||
res.get(timeout=self.wait_time)
|
||||
elif self.platform == "ASCEND":
|
||||
if self.platform == "ASCEND":
|
||||
with Pool(processes=self.process_num) as pool:
|
||||
res = pool.starmap_async(_compile_akg_task_ascend, args)
|
||||
res.get(timeout=self.wait_time)
|
||||
else:
|
||||
raise ValueError("The value of 'platform' must be 'GPU' or 'ASCEND'.")
|
||||
with Pool(processes=self.process_num) as pool:
|
||||
res = pool.starmap_async(_compile_akg_task_default, args)
|
||||
res.get(timeout=self.wait_time)
|
||||
return True
|
||||
|
||||
def accept_json(self, json):
|
||||
|
|
|
@ -12,22 +12,22 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""kernel build server for gpu"""
|
||||
"""kernel build server for akg kernels"""
|
||||
import sys
|
||||
import warnings
|
||||
from mindspore._extends.remote.kernel_build_server import Messager, get_logger, AkgBuilder
|
||||
|
||||
|
||||
class GpuMessager(Messager):
|
||||
class AkgMessager(Messager):
|
||||
'''
|
||||
GPU Messager
|
||||
Default Messager for akg kernels.
|
||||
It works as a server, communicating with c++ client.
|
||||
'''
|
||||
|
||||
def __init__(self, fdin, fdout):
|
||||
super().__init__(fdin, fdout)
|
||||
get_logger().info("[TRACE] GPU Messager init...")
|
||||
self.akg_builder = AkgBuilder("GPU")
|
||||
get_logger().info("[TRACE] Akg Messager init...")
|
||||
self.akg_builder = AkgBuilder("default")
|
||||
|
||||
def handle(self):
|
||||
"""
|
||||
|
@ -42,7 +42,7 @@ class GpuMessager(Messager):
|
|||
self.exit()
|
||||
|
||||
def exit(self):
|
||||
get_logger().info("[TRACE] GPU Messager Exit...")
|
||||
get_logger().info("[TRACE] Akg Messager Exit...")
|
||||
exit()
|
||||
|
||||
|
||||
|
@ -51,5 +51,5 @@ if __name__ == '__main__':
|
|||
if len(sys.argv) != 3:
|
||||
raise Exception('Incorrect argv: {}'.format(sys.argv))
|
||||
get_logger().debug(f"[TRACE] argv: {str(sys.argv)}")
|
||||
messager = GpuMessager(int(sys.argv[1]), int(sys.argv[2]))
|
||||
messager = AkgMessager(int(sys.argv[1]), int(sys.argv[2]))
|
||||
messager.run()
|
|
@ -13,12 +13,6 @@ if(ENABLE_D)
|
|||
file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
"kernel_query.cc"
|
||||
"kernel_fusion.cc"
|
||||
"akg/akg_kernel_build.cc"
|
||||
"akg/ascend/*.cc"
|
||||
"akg/akg_kernel_json_generator.cc"
|
||||
"akg/akg_kernel_json_decoder.cc"
|
||||
"akg/akg_kernel_attrs_process.cc"
|
||||
"akg/akg_kernel_metadata.cc"
|
||||
"tbe/*.cc"
|
||||
"host/*.cc"
|
||||
"aicpu/*.cc"
|
||||
|
@ -95,11 +89,6 @@ endif()
|
|||
if(ENABLE_GPU)
|
||||
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
"gpu/*.cu"
|
||||
"akg/akg_kernel_build.cc"
|
||||
"akg/gpu/*.cc"
|
||||
"akg/akg_kernel_json_generator.cc"
|
||||
"akg/akg_kernel_json_decoder.cc"
|
||||
"akg/akg_kernel_attrs_process.cc"
|
||||
)
|
||||
|
||||
file(GLOB_RECURSE GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
|
||||
|
@ -122,7 +111,35 @@ if(ENABLE_GPU)
|
|||
# add_library(_mindspore_kernel_cuda_obj OBJECT ${CUDA_SRC_LIST})
|
||||
endif()
|
||||
|
||||
set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST}
|
||||
# Collect the AKG kernel sources into AKG_SRC_LIST when ENABLE_AKG is set and the
# system name matches "Linux". The common AKG files are always included;
# backend-specific files are appended for each enabled backend (GPU, D, CPU).
# CMAKE_SYSTEM_NAME is passed by name so that if() dereferences it exactly once.
if(ENABLE_AKG AND CMAKE_SYSTEM_NAME MATCHES "Linux")
    file(GLOB_RECURSE AKG_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "akg/akg_kernel_build.cc"
        "akg/akg_kernel_json_generator.cc"
        "akg/akg_kernel_json_decoder.cc"
        "akg/akg_kernel_attrs_process.cc"
    )
    if(ENABLE_GPU)
        file(GLOB_RECURSE AKG_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
            "akg/gpu/*.cc"
        )
        list(APPEND AKG_SRC_LIST ${AKG_GPU_SRC_LIST})
    endif()
    if(ENABLE_D)
        file(GLOB_RECURSE AKG_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
            "akg/ascend/*.cc"
            "akg/akg_kernel_metadata.cc"
        )
        list(APPEND AKG_SRC_LIST ${AKG_D_SRC_LIST})
    endif()
    if(ENABLE_CPU)
        file(GLOB_RECURSE AKG_CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
            "akg/cpu/*.cc"
        )
        list(APPEND AKG_SRC_LIST ${AKG_CPU_SRC_LIST})
    endif()
endif()
|
||||
|
||||
set_property(SOURCE ${KERNEL_SRC_LIST} ${CPU_SRC_LIST} ${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST}
|
||||
PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_KERNEL)
|
||||
add_library(_mindspore_backend_kernel_compiler_obj OBJECT ${KERNEL_SRC_LIST} ${CPU_SRC_LIST}
|
||||
${GPU_SRC_LIST} ${D_SRC_LIST} ${QUANTUM_SRC_LIST})
|
||||
${GPU_SRC_LIST} ${D_SRC_LIST} ${AKG_SRC_LIST} ${QUANTUM_SRC_LIST})
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
|
||||
|
||||
#include <sys/shm.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
|
||||
|
||||
#include <sys/shm.h>
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h"
|
||||
#include <Python.h>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/common_utils.h"
|
||||
#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
KernelPackPtr AkgCpuKernelBuilder::AkgSearchCache(const std::string &kernel_name) {
|
||||
return SearchCache(kernel_name, kProcessorCpu);
|
||||
}
|
||||
|
||||
KernelPackPtr AkgCpuKernelBuilder::AkgInsertCache(const std::string &kernel_name) {
|
||||
return InsertCache(kernel_name, kProcessorCpu);
|
||||
}
|
||||
|
||||
void AkgCpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
|
||||
const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
|
||||
auto kernel_mod_ptr = std::make_shared<CpuKernelMod>(kernel_pack);
|
||||
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
|
||||
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
|
||||
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
|
||||
}
|
||||
|
||||
void AkgCpuKernelBuilder::AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) {
|
||||
kernel::SaveJsonInfo(kernel_name, kernel_json, kernel::KernelMeta::GetInstance()->kernel_meta_path());
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/akg/akg_kernel_build.h"
|
||||
#include "base/base.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class AkgCpuKernelBuilder : public AkgKernelBuilder {
|
||||
public:
|
||||
AkgCpuKernelBuilder() = default;
|
||||
~AkgCpuKernelBuilder() = default;
|
||||
|
||||
kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); }
|
||||
KernelPackPtr AkgSearchCache(const std::string &kernel_name) override;
|
||||
KernelPackPtr AkgInsertCache(const std::string &kernel_name) override;
|
||||
void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator,
|
||||
const AnfNodePtr &anf_node) override;
|
||||
void AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) override;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_BUILD_H_
|
|
@ -0,0 +1,143 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "backend/kernel_compiler/common_utils.h"
|
||||
#include "common/thread_pool.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "mindspore/ccsrc/debug/common.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
using AkgParallelLambda = int (*)(int task_id, int num_task, void *cdata);
|
||||
int AkgLaunchFunc(AkgParallelLambda flambda, void *cdata, int num_task) {
|
||||
size_t num_workers =
|
||||
std::min(mindspore::common::ThreadPool::GetInstance().GetSyncRunThreadNum(), static_cast<size_t>(num_task));
|
||||
std::vector<mindspore::common::Task> tasks;
|
||||
size_t thread_index = 0;
|
||||
while (thread_index < num_workers) {
|
||||
auto block = [&, thread_index]() {
|
||||
flambda(thread_index, num_workers, cdata);
|
||||
return mindspore::common::SUCCESS;
|
||||
};
|
||||
tasks.emplace_back(block);
|
||||
thread_index++;
|
||||
}
|
||||
mindspore::common::ThreadPool::GetInstance().SyncRun(tasks);
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct AkgCallBack {
|
||||
void *parallel_launch_func;
|
||||
void *(*malloc_func)(size_t);
|
||||
void (*free_func)(void *);
|
||||
|
||||
AkgCallBack() {
|
||||
parallel_launch_func = reinterpret_cast<void *>(&AkgLaunchFunc);
|
||||
malloc_func = &malloc;
|
||||
free_func = &free;
|
||||
}
|
||||
~AkgCallBack() = default;
|
||||
};
|
||||
} // namespace
|
||||
CpuKernelManagerPtr CpuKernelMod::kernelmanager_ = std::make_shared<CpuKernelManager>();
|
||||
|
||||
// Closes every cached shared-object handle.
CpuKernelManager::~CpuKernelManager() {
  for (auto &entry : cpu_func_map_) {
    void *so_handle = entry.second.second;
    if (so_handle == nullptr) {
      continue;
    }
    (void)dlclose(so_handle);
  }
}
|
||||
|
||||
void *CpuKernelManager::SearchFunc(const std::string &kernel_name) const {
|
||||
auto iter = cpu_func_map_.find(kernel_name);
|
||||
if (iter == cpu_func_map_.end()) {
|
||||
return nullptr;
|
||||
} else {
|
||||
return iter->second.first;
|
||||
}
|
||||
}
|
||||
|
||||
void *CpuKernelManager::SearchFuncWithSharedLock(const std::string &kernel_name) const {
|
||||
std::shared_lock lock(mutex_);
|
||||
return SearchFunc(kernel_name);
|
||||
}
|
||||
|
||||
void *CpuKernelManager::GetFunction(const std::string &kernel_name) {
|
||||
if (auto func = SearchFuncWithSharedLock(kernel_name); func != nullptr) {
|
||||
return func;
|
||||
}
|
||||
std::unique_lock lock(mutex_);
|
||||
// Search cache again between setting unique lock and calling "dlopen", to make sure that
|
||||
// only one thread can call "dlopen" and insert handle to the cache for a new kernel_name.
|
||||
// To avoid that several nodes (with the same kernel_name) open the same "so" by dlopen,
|
||||
// but only cache it once, then the "dlclose" will be called only once, causing resource leak.
|
||||
if (auto func = SearchFunc(kernel_name); func != nullptr) {
|
||||
return func;
|
||||
}
|
||||
std::string fn;
|
||||
auto it = kernel_name.rfind("_kernel");
|
||||
if (it < kernel_name.size()) {
|
||||
fn = kernel_name.substr(0, it);
|
||||
} else {
|
||||
fn = kernel_name;
|
||||
}
|
||||
std::string fn_so = kCpuKernelMeta + fn + ".so";
|
||||
auto handle = dlopen(fn_so.c_str(), RTLD_LAZY | RTLD_LOCAL);
|
||||
if (handle == nullptr) {
|
||||
MS_LOG(ERROR) << "Load " << fn_so << " failed. kernel: " << kernel_name;
|
||||
return nullptr;
|
||||
}
|
||||
auto launch_func = dlsym(handle, kernel_name.c_str());
|
||||
if (launch_func == nullptr) {
|
||||
MS_LOG(ERROR) << "Undefined symbol " << kernel_name << " in " << fn_so;
|
||||
return nullptr;
|
||||
}
|
||||
cpu_func_map_[kernel_name] = std::make_pair(launch_func, handle);
|
||||
return launch_func;
|
||||
}
|
||||
|
||||
bool CpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
auto js = nlohmann::json::parse(kernel_pack_->GetJson()->contents,
|
||||
kernel_pack_->GetJson()->contents + kernel_pack_->GetJson()->len);
|
||||
std::string kernel_name = js["kernelName"];
|
||||
auto launch_func = kernelmanager_->GetFunction(kernel_name);
|
||||
if (launch_func == nullptr) {
|
||||
MS_LOG(ERROR) << "GetFunction failed. kernel: " << kernel_name;
|
||||
return false;
|
||||
}
|
||||
std::vector<void *> runtimeargs;
|
||||
(void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs),
|
||||
[](const AddressPtr &input) -> void * { return input->addr; });
|
||||
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
|
||||
[](const AddressPtr &output) -> void * { return output->addr; });
|
||||
AkgCallBack akg_callback;
|
||||
runtimeargs.emplace_back(reinterpret_cast<void *>(&akg_callback));
|
||||
using AkgCpuKernelFunction = void (*)(void *);
|
||||
reinterpret_cast<AkgCpuKernelFunction>(launch_func)(reinterpret_cast<void *>(runtimeargs.data()));
|
||||
return true;
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,73 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
#include <mutex>
|
||||
#include <shared_mutex>
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Loads AKG CPU kernel shared objects with dlopen and caches the resolved entry points.
// Cached handles are closed in the destructor.
class CpuKernelManager {
 public:
  CpuKernelManager() = default;
  ~CpuKernelManager();

  // Returns the entry point for kernel_name, loading the shared object if it is not cached.
  // Returns nullptr on failure.
  void *GetFunction(const std::string &kernel_name);

 private:
  // Cache lookup without locking.
  void *SearchFunc(const std::string &kernel_name) const;
  // Cache lookup while holding mutex_ in shared mode.
  void *SearchFuncWithSharedLock(const std::string &kernel_name) const;

  // kernel_name -> {kernel function pointer, shared-object handle}
  std::unordered_map<std::string, std::pair<void *, void *>> cpu_func_map_;
  // Guards cpu_func_map_; mutable so the const lookup can lock it.
  mutable std::shared_mutex mutex_;
};
|
||||
using CpuKernelManagerPtr = std::shared_ptr<CpuKernelManager>;
|
||||
|
||||
class CpuKernelMod : public KernelMod {
|
||||
public:
|
||||
explicit CpuKernelMod(const KernelPackPtr &kp) : kernel_pack_(kp) {}
|
||||
~CpuKernelMod() = default;
|
||||
|
||||
void SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; }
|
||||
void SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; }
|
||||
void SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; }
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
|
||||
|
||||
static CpuKernelManagerPtr kernelmanager_;
|
||||
|
||||
private:
|
||||
KernelPackPtr kernel_pack_;
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_; // workspace is not used in cpu kernel.
|
||||
};
|
||||
|
||||
using CpuKernelModPtr = std::shared_ptr<CpuKernelMod>;
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_CPU_AKG_CPU_KERNEL_MOD_H_
|
|
@ -27,7 +27,7 @@ class AkgGpuKernelBuilder : public AkgKernelBuilder {
|
|||
AkgGpuKernelBuilder() = default;
|
||||
~AkgGpuKernelBuilder() = default;
|
||||
|
||||
kernel::KernelBuildClient *GetClient() override { return &(kernel::GpuKernelBuildClient::Instance()); }
|
||||
kernel::KernelBuildClient *GetClient() override { return &(kernel::AkgKernelBuildClient::Instance()); }
|
||||
KernelPackPtr AkgSearchCache(const std::string &kernel_name) override;
|
||||
KernelPackPtr AkgInsertCache(const std::string &kernel_name) override;
|
||||
void AkgSetKernelMod(const KernelPackPtr &kernel_pack, const AkgKernelJsonGenerator &json_generator,
|
||||
|
|
|
@ -157,16 +157,21 @@ FusionType GetFusionTypeByName(const std::string &name) {
|
|||
}
|
||||
|
||||
void KernelMeta::Initialize() {
|
||||
kernel_meta_path_ = std::string(kGpuKernelMeta) + "/";
|
||||
if (GetStrProcessorFromContext() == kProcessorCpu) {
|
||||
kernel_meta_path_ = std::string(kCpuKernelMeta);
|
||||
} else {
|
||||
kernel_meta_path_ = std::string(kGpuKernelMeta) + "/";
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
auto ret = mkdir(kernel_meta_path_.c_str());
|
||||
auto ret = mkdir(kernel_meta_path_.c_str());
|
||||
#else
|
||||
auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU);
|
||||
auto ret = mkdir(kernel_meta_path_.c_str(), S_IRWXG | S_IRWXU);
|
||||
#endif
|
||||
if (ret != 0) {
|
||||
MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later";
|
||||
if (ret != 0) {
|
||||
MS_LOG(INFO) << "kernel dir [" << kernel_meta_path_ << "], will be created later";
|
||||
}
|
||||
}
|
||||
|
||||
initialized_ = true;
|
||||
}
|
||||
|
||||
|
@ -238,6 +243,8 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro
|
|||
std::string kernel_json;
|
||||
if (processor == kProcessorAiCore || processor == kProcessorAiCpu) {
|
||||
kernel_json = kCceKernelMeta;
|
||||
} else if (processor == kProcessorCpu) {
|
||||
kernel_json = kCpuKernelMeta;
|
||||
} else {
|
||||
kernel_json = bin_map->kernel_meta_path();
|
||||
}
|
||||
|
@ -872,6 +879,8 @@ Processor GetProcessorFromContext() {
|
|||
processor = kernel::Processor::CUDA;
|
||||
} else if (device_info == kAscendDevice) {
|
||||
processor = kernel::Processor::AICORE;
|
||||
} else if (device_info == kCPUDevice) {
|
||||
processor = kernel::Processor::CPU;
|
||||
}
|
||||
return processor;
|
||||
}
|
||||
|
@ -883,6 +892,8 @@ std::string GetStrProcessorFromContext() {
|
|||
str_processor = kernel::kProcessorCuda;
|
||||
} else if (processor == kernel::Processor::AICORE) {
|
||||
str_processor = kernel::kProcessorAiCore;
|
||||
} else if (processor == kernel::Processor::CPU) {
|
||||
str_processor = kernel::kProcessorCpu;
|
||||
}
|
||||
return str_processor;
|
||||
}
|
||||
|
|
|
@ -34,10 +34,12 @@
|
|||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr auto kCceKernelMeta = "./kernel_meta/";
|
||||
constexpr auto kCpuKernelMeta = "./kernel_meta/";
|
||||
constexpr auto kGpuKernelMeta = "./cuda_meta";
|
||||
constexpr auto kProcessorAiCore = "aicore";
|
||||
constexpr auto kProcessorAiCpu = "aicpu";
|
||||
constexpr auto kProcessorCuda = "cuda";
|
||||
constexpr auto kProcessorCpu = "cpu";
|
||||
constexpr auto kProcessorUnknown = "unknown";
|
||||
constexpr auto kJsonSuffix = ".json";
|
||||
constexpr auto kInfoSuffix = ".info";
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2019-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -99,6 +99,14 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string &
|
|||
(void)kernel_json.seekg(0, std::ios::beg);
|
||||
(void)kernel_json.read(json_->contents, SizeToLong(json_->len));
|
||||
|
||||
if (processor == kProcessorCpu) {
|
||||
std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".so";
|
||||
if (!CheckHash(json_f, bin_f, js)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
if (processor == kProcessorCuda) {
|
||||
std::string bin_f = json_f.substr(0, json_f.length() - 5) + ".ptx";
|
||||
std::ifstream kernelbin(bin_f);
|
||||
|
|
|
@ -107,6 +107,7 @@ enum Processor {
|
|||
AICORE = 0,
|
||||
AICPU,
|
||||
CUDA,
|
||||
CPU,
|
||||
};
|
||||
|
||||
struct FlexArray {
|
||||
|
|
|
@ -13,8 +13,6 @@ endif()
|
|||
if(ENABLE_D OR ENABLE_ACL)
|
||||
file(GLOB_RECURSE _D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
"ascend/*.cc"
|
||||
"graph_kernel/*.cc"
|
||||
"graph_kernel/model/*.cc"
|
||||
)
|
||||
list(APPEND _PREACTIVATE_SRC_LIST ${_D_SRC_LIST})
|
||||
endif()
|
||||
|
@ -22,8 +20,6 @@ endif()
|
|||
if(ENABLE_GPU)
|
||||
file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
"gpu/*.cc"
|
||||
"graph_kernel/*.cc"
|
||||
"graph_kernel/model/*.cc"
|
||||
)
|
||||
list(APPEND _PREACTIVATE_SRC_LIST ${_GPU_SRC_LIST})
|
||||
endif()
|
||||
|
@ -43,6 +39,13 @@ if(ENABLE_CPU)
|
|||
list(APPEND _PREACTIVATE_SRC_LIST ${_CPU_SRC_LIST})
|
||||
endif()
|
||||
|
||||
if(ENABLE_AKG AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
file(GLOB_RECURSE _GK_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
"graph_kernel/*.cc"
|
||||
)
|
||||
list(APPEND _PREACTIVATE_SRC_LIST ${_GK_SRC_LIST})
|
||||
endif()
|
||||
|
||||
set_property(SOURCE ${_PREACTIVATE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
|
||||
SUBMODULE_ID=mindspore::SubModuleId::SM_PRE_ACT)
|
||||
add_library(_mindspore_backend_optimizer_obj OBJECT ${_PREACTIVATE_SRC_LIST})
|
||||
|
|
|
@ -38,6 +38,8 @@
|
|||
#include "runtime/device/ascend/kernel_select_ascend.h"
|
||||
#elif ENABLE_GPU
|
||||
#include "runtime/device/gpu/kernel_info_setter.h"
|
||||
#elif ENABLE_CPU
|
||||
#include "runtime/device/cpu/kernel_select_cpu.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore::graphkernel {
|
||||
|
@ -608,6 +610,9 @@ void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) {
|
|||
#elif ENABLE_GPU
|
||||
cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
|
||||
device::gpu::SetKernelInfo(cnode, kernel_type);
|
||||
#elif ENABLE_CPU
|
||||
cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
|
||||
device::cpu::SetKernelInfo(cnode);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -121,7 +121,7 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const {
|
|||
pm->AddPass(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);
|
||||
|
||||
// Universal arithmetic simplify
|
||||
pm->AddPass(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu);
|
||||
pm->AddPass(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu);
|
||||
|
||||
// Common subexpression elimination
|
||||
pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_2);
|
||||
|
@ -158,7 +158,7 @@ PassManagerPtr GraphKernelOptimizer::Split() const {
|
|||
PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
|
||||
auto pm = std::make_shared<GraphKernelPassManager>(4, "highlevelopt2");
|
||||
// Enable atomic add
|
||||
pm->AddPass(std::make_shared<AtomicCleanInsertter>(), OptLevel_2);
|
||||
pm->AddPass(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend);
|
||||
|
||||
// Enable atomic add for stitch nodes.
|
||||
auto level = GetPassLevelByFlag(context::GraphKernelFlags::GetInstance().enable_stitch_fusion);
|
||||
|
@ -170,8 +170,8 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
|
|||
pm->AddPass(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);
|
||||
|
||||
// Enable tsa and uss
|
||||
pm->AddPass(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1);
|
||||
pm->AddPass(std::make_shared<UssAtomicAdd>(), OptLevel_1);
|
||||
pm->AddPass(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
|
||||
pm->AddPass(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);
|
||||
|
||||
return pm;
|
||||
}
|
||||
|
@ -204,6 +204,7 @@ void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
|
|||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
|
||||
is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
|
||||
is_cpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice);
|
||||
|
||||
auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
|
||||
optimizer->AddPassManager(PreProcess());
|
||||
|
|
|
@ -46,6 +46,7 @@ class GraphKernelOptimizer {
|
|||
|
||||
bool is_gpu{false};
|
||||
bool is_ascend{false};
|
||||
bool is_cpu{false};
|
||||
};
|
||||
|
||||
void GraphKernelOptimize(const KernelGraphPtr &kernel_graph);
|
||||
|
|
|
@ -21,14 +21,17 @@
|
|||
#include "ir/anf.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "utils/trace_base.h"
|
||||
#include "utils/context/graph_kernel_flags.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "runtime/device/kernel_runtime.h"
|
||||
#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
#include "runtime/device/cpu/kernel_select_cpu.h"
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
#include "backend/optimizer/common/pass_manager.h"
|
||||
#include "backend/optimizer/cpu/insert_cast_cpu.h"
|
||||
#include "backend/optimizer/cpu/insert_format_transform_op.h"
|
||||
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
|
||||
#include "backend/optimizer/pass/replace_node_by_proxy.h"
|
||||
#include "backend/optimizer/pass/erase_visit_attr.h"
|
||||
#include "debug/anf_ir_dump.h"
|
||||
|
@ -102,6 +105,16 @@ void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
|
|||
kernel_graph->SetExecOrderByDefault();
|
||||
}
|
||||
|
||||
void CPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
|
||||
#ifdef ENABLE_AKG
|
||||
if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
|
||||
return;
|
||||
}
|
||||
graphkernel::GraphKernelOptimize(kernel_graph);
|
||||
kernel_graph->SetExecOrderByDefault();
|
||||
#endif
|
||||
}
|
||||
|
||||
GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
|
||||
auto graph_id = graph_sum_;
|
||||
auto graph = ConstructKernelGraph(lst, outputs);
|
||||
|
@ -112,6 +125,7 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
|
|||
MS_LOG(INFO) << "Set kernel info end";
|
||||
Optimize(graph);
|
||||
FinalOptimize(graph);
|
||||
GraphKernelOptimize(graph);
|
||||
MS_LOG(INFO) << "Build kernel";
|
||||
BuildKernel(graph.get());
|
||||
// Remove reorder after PS feature finish adapting push/pull in auto_monad.
|
||||
|
@ -352,10 +366,20 @@ void KernelNotSupportException(const AnfNodePtr &kernel_node) {
|
|||
void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
auto &kernel_nodes = kernel_graph->execution_order();
|
||||
kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(bin_map);
|
||||
std::vector<AnfNodePtr> akg_nodes;
|
||||
for (const auto &kernel_node : kernel_nodes) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_LOG(INFO) << "Cpu building operator[" << kernel_name << "].";
|
||||
if (session::AnfRuntimeAlgorithm::GetKernelType(kernel_node) == KernelType::AKG_KERNEL) {
|
||||
if (!bin_map->initialized()) {
|
||||
bin_map->Initialize();
|
||||
}
|
||||
akg_nodes.push_back(kernel_node);
|
||||
continue;
|
||||
}
|
||||
std::shared_ptr<kernel::CPUKernel> cpu_kernel =
|
||||
kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
|
||||
if (cpu_kernel == nullptr) {
|
||||
|
@ -369,6 +393,10 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
|
|||
AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());
|
||||
MS_LOG(INFO) << "Cpu build success operator[" << kernel_name << "].";
|
||||
}
|
||||
#ifdef ENABLE_AKG
|
||||
kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
|
||||
(void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes);
|
||||
#endif
|
||||
}
|
||||
} // namespace session
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -42,6 +42,7 @@ class CPUSession : public SessionBasic {
|
|||
VectorRef *const outputs) override;
|
||||
void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
|
||||
ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) override;
|
||||
void GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph);
|
||||
void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
|
||||
KernelGraphPtr BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
|
||||
const std::vector<tensor::TensorPtr> &input_tensors,
|
||||
|
|
|
@ -256,7 +256,7 @@ class AscendKernelBuildClient : public KernelBuildClient {
|
|||
~AscendKernelBuildClient() override { Close(); }
|
||||
};
|
||||
|
||||
class GpuKernelBuildClient : public KernelBuildClient {
|
||||
class AkgKernelBuildClient : public KernelBuildClient {
|
||||
public:
|
||||
// Server configure
|
||||
constexpr inline static auto kGetPathScript =
|
||||
|
@ -264,15 +264,15 @@ class GpuKernelBuildClient : public KernelBuildClient {
|
|||
"\""
|
||||
"import pkgutil;"
|
||||
"path = pkgutil"
|
||||
".get_loader(\\\"mindspore._extends.remote.kernel_build_server_gpu\\\")" // Server module name
|
||||
".get_loader(\\\"mindspore._extends.remote.kernel_build_server_akg\\\")" // Server module name
|
||||
".get_filename();"
|
||||
"print('[~]' + path)"
|
||||
"\"";
|
||||
|
||||
constexpr inline static auto kServerScript = "kernel_build_server_gpu.py";
|
||||
constexpr inline static auto kServerScript = "kernel_build_server_akg.py";
|
||||
|
||||
static GpuKernelBuildClient &Instance() {
|
||||
static GpuKernelBuildClient instance;
|
||||
static AkgKernelBuildClient &Instance() {
|
||||
static AkgKernelBuildClient instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
|
@ -283,15 +283,15 @@ class GpuKernelBuildClient : public KernelBuildClient {
|
|||
return GetScriptFilePath(env, kGetPathScript, kServerScript);
|
||||
}
|
||||
|
||||
GpuKernelBuildClient(const GpuKernelBuildClient &) = delete;
|
||||
GpuKernelBuildClient &operator=(const GpuKernelBuildClient &) = delete;
|
||||
AkgKernelBuildClient(const AkgKernelBuildClient &) = delete;
|
||||
AkgKernelBuildClient &operator=(const AkgKernelBuildClient &) = delete;
|
||||
|
||||
GpuKernelBuildClient(GpuKernelBuildClient &&) = delete;
|
||||
GpuKernelBuildClient &operator=(GpuKernelBuildClient &&) = delete;
|
||||
AkgKernelBuildClient(AkgKernelBuildClient &&) = delete;
|
||||
AkgKernelBuildClient &operator=(AkgKernelBuildClient &&) = delete;
|
||||
|
||||
private:
|
||||
GpuKernelBuildClient() { Open(); }
|
||||
~GpuKernelBuildClient() override { Close(); }
|
||||
AkgKernelBuildClient() { Open(); }
|
||||
~AkgKernelBuildClient() override { Close(); }
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -18,10 +18,12 @@
|
|||
#include <string>
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "runtime/device/cpu/cpu_memory_manager.h"
|
||||
#include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
#include "backend/kernel_compiler/kernel_build_info.h"
|
||||
#include "runtime/device/cpu/kernel_select_cpu.h"
|
||||
#include "utils/trace_base.h"
|
||||
#include "utils/context/graph_kernel_flags.h"
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
#include "backend/optimizer/common/pass_manager.h"
|
||||
#include "backend/optimizer/common/common_backend_optimization.h"
|
||||
|
@ -29,6 +31,8 @@
|
|||
#include "backend/optimizer/cpu/insert_format_transform_op.h"
|
||||
#include "backend/optimizer/pass/replace_node_by_proxy.h"
|
||||
#include "backend/optimizer/pass/erase_visit_attr.h"
|
||||
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "profiler/device/cpu/cpu_profiling.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
|
@ -113,6 +117,14 @@ void CPUDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
|
|||
|
||||
// Run final optimization.
|
||||
opt::CommonFinalOptimization(graph);
|
||||
|
||||
#ifdef ENABLE_AKG
|
||||
// Run graph kernel fusion optimization
|
||||
if (context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
|
||||
graphkernel::GraphKernelOptimize(graph);
|
||||
graph->SetExecOrderByDefault();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void CPUDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
|
||||
|
@ -173,11 +185,21 @@ void CPUDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const
|
|||
}
|
||||
|
||||
void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
|
||||
kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(bin_map);
|
||||
std::vector<AnfNodePtr> akg_nodes;
|
||||
for (const auto &node : nodes) {
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
if (AnfAlgo::IsControlOpExecInBackend(node)) {
|
||||
continue;
|
||||
}
|
||||
if (session::AnfRuntimeAlgorithm::GetKernelType(node) == KernelType::AKG_KERNEL) {
|
||||
if (!bin_map->initialized()) {
|
||||
bin_map->Initialize();
|
||||
}
|
||||
akg_nodes.push_back(node);
|
||||
continue;
|
||||
}
|
||||
std::string kernel_name = AnfAlgo::GetCNodeName(node);
|
||||
std::shared_ptr<kernel::CPUKernel> cpu_kernel = kernel::CPUKernelFactory::GetInstance().Create(kernel_name, node);
|
||||
if (!cpu_kernel) {
|
||||
|
@ -195,6 +217,10 @@ void CPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
|
|||
cpu_kernel->Init(node);
|
||||
AnfAlgo::SetKernelMod(cpu_kernel, node.get());
|
||||
}
|
||||
#ifdef ENABLE_AKG
|
||||
kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
|
||||
(void)akg_cpu_kernel_builder.AkgKernelParallelBuild(akg_nodes);
|
||||
#endif
|
||||
}
|
||||
|
||||
void CPUDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const {
|
||||
|
@ -212,8 +238,6 @@ bool CPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<Ad
|
|||
MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
|
||||
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
|
||||
MS_EXCEPTION_IF_NULL(kernel_mod);
|
||||
auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
|
||||
MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
|
||||
|
||||
#ifdef PLATFORM_86
|
||||
// Some CPU kernels need set the flush zero mode to improve performance.
|
||||
|
@ -226,6 +250,8 @@ bool CPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<Ad
|
|||
// Some CPU kernels can't initialize kernel and launch kernel in different thread, so reinitialize the kernels before
|
||||
// launch.
|
||||
if (kOpNotSupportMultiThreadExecList.find(AnfAlgo::GetCNodeName(kernel)) != kOpNotSupportMultiThreadExecList.end()) {
|
||||
auto cpu_kernel_mod = dynamic_cast<kernel::CPUKernel *>(kernel_mod);
|
||||
MS_EXCEPTION_IF_NULL(cpu_kernel_mod);
|
||||
cpu_kernel_mod->InitKernel(kernel);
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
|
|
|
@ -172,7 +172,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
|
|||
FlagRegister reg(flag_map);
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
bool is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
|
||||
bool is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
|
||||
|
||||
// Set opt_level first, some flags' default value depends on it.
|
||||
// Default optimization level is level 2 when enable graphkernel
|
||||
|
@ -192,7 +192,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
|
|||
|
||||
// Integer flags
|
||||
reg.AddFlag("online_tuning", &online_tuning);
|
||||
reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_gpu ? OpLevel_MAX : OpLevel_0);
|
||||
reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_ascend ? OpLevel_0 : OpLevel_MAX);
|
||||
|
||||
// String flags
|
||||
reg.AddFlag("repository_path", &repository_path);
|
||||
|
|
|
@ -489,8 +489,8 @@ def _check_target_specific_cfgs(device, arg_key):
|
|||
device_cfgs = {
|
||||
'enable_dump': ['Ascend'],
|
||||
'save_dump_path': ['Ascend'],
|
||||
'enable_graph_kernel': ['Ascend', 'GPU'],
|
||||
'graph_kernel_flags': ['Ascend', 'GPU'],
|
||||
'enable_graph_kernel': ['Ascend', 'GPU', 'CPU'],
|
||||
'graph_kernel_flags': ['Ascend', 'GPU', 'CPU'],
|
||||
'enable_reduce_precision': ['Ascend'],
|
||||
'enable_profiling': ['Ascend'],
|
||||
'profiling_options': ['Ascend'],
|
||||
|
|
|
@ -79,8 +79,11 @@ build_mindspore()
|
|||
if [[ "X$USE_GLOG" = "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DUSE_GLOG=ON"
|
||||
fi
|
||||
if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then
|
||||
if [[ "X$ENABLE_AKG" = "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_AKG=ON"
|
||||
if [[ "X$ENABLE_CPU" = "Xon" && "X$ENABLE_D" != "Xon" && "X$ENABLE_GPU" != "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DUSE_LLVM=ON"
|
||||
fi
|
||||
fi
|
||||
if [[ "X$ENABLE_ACL" = "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ACL=ON"
|
||||
|
|
|
@ -44,7 +44,7 @@ init_default_options()
|
|||
export LITE_PLATFORM=""
|
||||
export LITE_ENABLE_AAR="off"
|
||||
export USE_GLOG="on"
|
||||
export ENABLE_AKG="on"
|
||||
export ENABLE_AKG="off"
|
||||
export ENABLE_ACL="off"
|
||||
export ENABLE_D="off"
|
||||
export ENABLE_DEBUGGER="on"
|
||||
|
|
|
@ -40,6 +40,7 @@ parse_device()
|
|||
exit 1
|
||||
fi
|
||||
export CUDA_VERSION="$DEVICE_VERSION"
|
||||
export ENABLE_AKG="on"
|
||||
elif [[ "X$DEVICE" == "Xd" || "X$DEVICE" == "Xascend" ]]; then
|
||||
# version default 910
|
||||
if [[ "X$DEVICE_VERSION" == "X" ]]; then
|
||||
|
@ -54,6 +55,7 @@ parse_device()
|
|||
export ENABLE_ACL="on"
|
||||
ENABLE_CPU="on"
|
||||
export ENABLE_MPI="on"
|
||||
export ENABLE_AKG="on"
|
||||
else
|
||||
echo "Invalid value ${DEVICE_VERSION} for option -V"
|
||||
usage
|
||||
|
|
|
@ -21,7 +21,7 @@ usage()
|
|||
echo "Usage:"
|
||||
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t ut|st] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
|
||||
echo " [-a on|off] [-p on|off] [-i] [-R] [-D on|off] [-j[n]] [-e gpu|ascend|cpu] \\"
|
||||
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\"
|
||||
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K on|off] \\"
|
||||
echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\"
|
||||
echo " [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\"
|
||||
echo " [-L Tensor-RT path] [-y on|off] \\"
|
||||
|
@ -52,7 +52,7 @@ usage()
|
|||
echo " -V Specify the device version, if -e gpu, default CUDA 10.1, if -e ascend, default Ascend 910"
|
||||
echo " -I Enable compiling mindspore lite for arm64, arm32 or x86_64, default disable mindspore lite compilation"
|
||||
echo " -A Enable compiling mindspore lite aar package, option: on/off, default: off"
|
||||
echo " -K Compile with AKG, default on"
|
||||
echo " -K Compile with AKG, default on if -e gpu or -e ascend, else default off"
|
||||
echo " -B Enable debugger, default on"
|
||||
echo " -E Enable IBVERBS for parameter server, default off"
|
||||
echo " -l Compile with python dependency, default on"
|
||||
|
|
Loading…
Reference in New Issue