forked from mindspore-Ecosystem/mindspore
!14648 [GraphKernel]adapt for layernorm C++ code
From: @wenfangpei Reviewed-by: @coding2020,@gaoxiong1,@ckey_dou Signed-off-by: @ckey_dou
This commit is contained in:
commit
9aa645e202
|
@ -472,7 +472,7 @@ class GraphSplitAscend(GraphSplitByPattern):
|
|||
# Return the default area mode for `op`:
#   - "MatMul" is MODE_COMPOSITE only when its first input dtype is "float16", otherwise MODE_BASIC;
#   - "Tile", "BroadcastTo" and "ExpandDims" are MODE_COMPOSITE;
#   - every other primitive is MODE_BASIC.
# NOTE(review): this hunk appears to list both the removed condition (without "ExpandDims")
# and the added one (with "ExpandDims"); presumably only the latter is in the committed file.
def get_default_mode(self, op):
|
||||
if op.prim == "MatMul":
|
||||
return self.Area.MODE_COMPOSITE if op.inputs[0].dtype == "float16" else self.Area.MODE_BASIC
|
||||
if op.prim in ("Tile", "BroadcastTo"):
|
||||
if op.prim in ("Tile", "BroadcastTo", "ExpandDims"):
|
||||
return self.Area.MODE_COMPOSITE
|
||||
return self.Area.MODE_BASIC
|
||||
|
||||
|
|
|
@ -34,7 +34,9 @@
|
|||
#include "pipeline/jit/action.h"
|
||||
#include "utils/context/graph_kernel_flags.h"
|
||||
#include "vm/segment_runner.h"
|
||||
#if ENABLE_GPU
|
||||
#if ENABLE_D
|
||||
#include "runtime/device/ascend/kernel_select_ascend.h"
|
||||
#elif ENABLE_GPU
|
||||
#include "runtime/device/gpu/kernel_info_setter.h"
|
||||
#endif
|
||||
|
||||
|
@ -620,7 +622,11 @@ bool IsBasicFuseOp(const AnfNodePtr &node) {
|
|||
std::vector<PrimitivePtr> basic_ops = GetFusibleOpList();
|
||||
#if ENABLE_D
|
||||
if (!CheckProcessor(node)) {
|
||||
return false;
|
||||
std::vector<PrimitivePtr> fused_aicpu_op = {prim::kPrimExpandDims, prim::kPrimReshape};
|
||||
if (!std::any_of(fused_aicpu_op.begin(), fused_aicpu_op.end(),
|
||||
[&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); })) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return std::any_of(basic_ops.begin(), basic_ops.end(),
|
||||
|
@ -644,7 +650,9 @@ bool IsFusibleOp(const AnfNodePtr &node) {
|
|||
// Casts `node` to a CNode (checked with MS_EXCEPTION_IF_NULL) and forwards it, together with
// `kernel_type`, to the backend-specific SetKernelInfo: device::ascend when ENABLE_D is set,
// otherwise device::gpu when ENABLE_GPU is set. With neither macro set nothing else is done.
// NOTE(review): this hunk appears to show both the removed `#if ENABLE_GPU` line and the added
// `#if ENABLE_D` line; presumably only the `#if ENABLE_D ... #elif ENABLE_GPU ... #endif` form is committed.
void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) {
|
||||
auto cnode = node->cast<CNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
#if ENABLE_GPU
|
||||
#if ENABLE_D
|
||||
device::ascend::SetKernelInfo(cnode, kernel_type);
|
||||
#elif ENABLE_GPU
|
||||
device::gpu::SetKernelInfo(cnode, kernel_type);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -100,17 +100,17 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() {
|
|||
|
||||
PassManagerPtr GraphKernelOptimizer::Split() {
|
||||
auto pm = std::make_shared<PassManager>("graphkernel_stage4_split");
|
||||
|
||||
// Move the non-scalar tensor (in composite node) to parameter list
|
||||
pm->AddPass(std::make_shared<TensorPromotion>());
|
||||
|
||||
// Make certain nodes redundant so that they are used by only one user,
|
||||
// which can avoid unnecessary input-output and get better performance.
|
||||
if (is_gpu) {
|
||||
// preprocess for ShapeOpsSplitter
|
||||
pm->AddPass(std::make_shared<ExtendOutputForUpdateState>());
|
||||
std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
|
||||
pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops));
|
||||
}
|
||||
|
||||
// preprocess for ShapeOpsSplitter
|
||||
pm->AddPass(std::make_shared<ExtendOutputForUpdateState>());
|
||||
std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
|
||||
pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops));
|
||||
|
||||
// Split kernel according to costmodel
|
||||
pm->AddPass(std::make_shared<GraphKernelSplitter>());
|
||||
|
@ -120,11 +120,9 @@ PassManagerPtr GraphKernelOptimizer::Split() {
|
|||
pm->AddPass(std::make_shared<GetitemTuple>());
|
||||
|
||||
// Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
|
||||
if (is_gpu) {
|
||||
pm->AddPass(std::make_shared<MergeOutputForUpdateState>());
|
||||
pm->AddPass(std::make_shared<GraphKernelCSE>());
|
||||
pm->AddPass(std::make_shared<EliminateRedundantOutput>());
|
||||
}
|
||||
pm->AddPass(std::make_shared<MergeOutputForUpdateState>());
|
||||
pm->AddPass(std::make_shared<GraphKernelCSE>());
|
||||
pm->AddPass(std::make_shared<EliminateRedundantOutput>());
|
||||
return pm;
|
||||
}
|
||||
|
||||
|
|
|
@ -359,12 +359,19 @@ class Splitter {
|
|||
// Stores the cnode to be split (`main_cnode`), the func graph that cnode belongs to,
// and the scheme object that is kept in split_schemer_.
Splitter(const CNodePtr &main_cnode, SplitSchemerPtr split_schemer)
|
||||
: main_func_graph_(main_cnode->func_graph()), old_subgraph_cnode_(main_cnode), split_schemer_(split_schemer) {}
|
||||
|
||||
// Calls ResetKernelInfo on every node stored in inlined_nodes_.
void ResetInlinedNodesKernelInfo() {
|
||||
for (const auto &node : inlined_nodes_) {
|
||||
ResetKernelInfo(node);
|
||||
}
|
||||
}
|
||||
|
||||
// Maintain new subgraphs in main graph.
|
||||
// Runs the rebuild steps in a fixed order: bind func graphs, recover parameters, connect the
// pieces to the main graph (the only step that receives `cnodes_group_id`), update the
// sub-graph info, and finally reset the kernel info of the inlined nodes.
void RebuildGraph(const std::vector<size_t> &cnodes_group_id) {
|
||||
BindFuncGraph();
|
||||
RecoverParameter();
|
||||
ConnectToMainGraph(cnodes_group_id);
|
||||
UpdateSubGraphInfo();
|
||||
// Done last, after the sub-graph info has been updated.
ResetInlinedNodesKernelInfo();
|
||||
}
|
||||
|
||||
// Rebind nodes to its new sub_func_graph
|
||||
|
@ -420,7 +427,7 @@ class Splitter {
|
|||
}
|
||||
}
|
||||
if (AnfAlgo::IsRealKernel(node)) {
|
||||
ResetKernelInfo(node);
|
||||
inlined_nodes_.push_back(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -533,6 +540,7 @@ class Splitter {
|
|||
FuncGraphPtr main_func_graph_;
|
||||
CNodePtr old_subgraph_cnode_; // The cnode that holds the original sub_func_graph
|
||||
std::vector<CNodePtr> new_subgraph_cnodes_; // The cnode list that hold the new sub_func_graph
|
||||
std::vector<AnfNodePtr> inlined_nodes_;
|
||||
SplitSchemerPtr split_schemer_;
|
||||
std::unordered_map<ParameterPtr, AnfNodePtr> param_to_main_graph_node_map_;
|
||||
};
|
||||
|
|
|
@ -54,6 +54,7 @@
|
|||
#include "debug/data_dump/dump_json_parser.h"
|
||||
#include "debug/tensor_load.h"
|
||||
#include "debug/anf_ir_utils.h"
|
||||
#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
|
||||
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
|
||||
#include "backend/session/ascend_auto_monad.h"
|
||||
#include "debug/data_dump/e2e_dump_util.h"
|
||||
|
|
|
@ -515,6 +515,56 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kern
|
|||
}
|
||||
return select_status;
|
||||
}
|
||||
|
||||
// Rebuilds the selected kernel build info of `kernel_node` for the requested `kernel_type`.
//
// Both the node's kernel info and its currently selected build info must be non-null
// (checked with MS_EXCEPTION_IF_NULL). Graph-kernel nodes are returned untouched.
// Origin data format, input/output formats and device types, op pattern and fusion type are
// copied from the currently selected build info; only kernel type and processor are re-decided:
//   - `kernel_type` != UNKNOWN_KERNEL_TYPE: that type is used with the existing processor.
//   - `kernel_type` == UNKNOWN_KERNEL_TYPE: the KernelQuery result is tried first
//     (match -> TBE_KERNEL on AICORE), then the AICPUQuery result (match -> AICPU_KERNEL on AICPU).
//   - If the type is still UNKNOWN_KERNEL_TYPE after that, AKG_KERNEL on AICORE is used.
void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) {
|
||||
auto kernel_info = static_cast<device::KernelInfo *>(kernel_node->kernel_info());
|
||||
MS_EXCEPTION_IF_NULL(kernel_info);
|
||||
auto kernel_build_info = kernel_info->select_kernel_build_info();
|
||||
MS_EXCEPTION_IF_NULL(kernel_build_info);
|
||||
|
||||
if (AnfAlgo::IsGraphKernel(kernel_node)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Seed the builder with everything from the current build info except kernel type and processor.
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
|
||||
builder->SetOriginDataFormat(kernel_build_info->GetOriginDataFormat());
|
||||
builder->SetInputsFormat(kernel_build_info->GetAllInputFormats());
|
||||
builder->SetInputsDeviceType(kernel_build_info->GetAllInputDeviceTypes());
|
||||
builder->SetOutputsFormat(kernel_build_info->GetAllOutputFormats());
|
||||
builder->SetOutputsDeviceType(kernel_build_info->GetAllOutputDeviceTypes());
|
||||
builder->SetOpPattern(kernel_build_info->op_pattern());
|
||||
builder->SetFusionType(kernel_build_info->fusion_type());
|
||||
|
||||
// Defaults: the caller's kernel type and the processor of the current build info.
auto new_kernel_type = kernel_type;
|
||||
auto new_processor = kernel_build_info->processor();
|
||||
if (kernel_type == UNKNOWN_KERNEL_TYPE) {
|
||||
std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list;
|
||||
std::vector<std::shared_ptr<kernel::KernelBuildInfo>> aicpu_kernel_info_list;
|
||||
kernel::KernelQuery(kernel_node, &kernel_info_list, kernel_type);
|
||||
// NOTE(review): SetMatchedKernelInfo presumably also writes a build info onto the node;
// if so it is replaced by the SetSelectKernelBuildInfo call at the end -- confirm.
auto select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list);
|
||||
if (select_status != kNoMatched) {
|
||||
new_kernel_type = TBE_KERNEL;
|
||||
new_processor = kernel::Processor::AICORE;
|
||||
MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses TBE_KERNEL";
|
||||
} else {
|
||||
// No match in the first list: fall back to the AICPU kernel list.
kernel::AICPUQuery(kernel_node, &aicpu_kernel_info_list);
|
||||
select_status = SetMatchedKernelInfo(kernel_node, aicpu_kernel_info_list);
|
||||
if (select_status != kNoMatched) {
|
||||
new_kernel_type = AICPU_KERNEL;
|
||||
new_processor = kernel::Processor::AICPU;
|
||||
MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AICPU_KERNEL";
|
||||
}
|
||||
}
|
||||
}
|
||||
// Neither query matched (type is still unknown): default to an AKG kernel on AICORE.
if (new_kernel_type == UNKNOWN_KERNEL_TYPE) {
|
||||
new_kernel_type = AKG_KERNEL;
|
||||
new_processor = kernel::Processor::AICORE;
|
||||
MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AKG_KERNEL";
|
||||
}
|
||||
builder->SetKernelType(new_kernel_type);
|
||||
builder->SetProcessor(new_processor);
|
||||
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get());
|
||||
}
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -31,6 +31,7 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node,
|
|||
KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE);
|
||||
void SetTensorDeviceInfo(const CNodePtr &kernel_node);
|
||||
void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph);
|
||||
void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type);
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
Loading…
Reference in New Issue