actor runtime support GraphKernel

This commit is contained in:
limingqi107 2021-06-04 11:20:11 +08:00
parent f9d5a813e2
commit 7f634d12f0
11 changed files with 203 additions and 56 deletions

View File

@@ -297,6 +297,58 @@ std::vector<AnfNodePtr> AnfRuntimeAlgorithm::GetAllOutput(const AnfNodePtr &node
return ret;
}
std::vector<KernelWithIndex> AnfRuntimeAlgorithm::GetAllOutputWithIndex(const AnfNodePtr &node) {
std::vector<KernelWithIndex> ret;
std::vector<KernelWithIndex> ret_empty;
// The MakeTuple node needs to be expanded and visited recursively.
if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimMakeTuple)) {
auto make_tuple = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(make_tuple);
for (size_t i = 1; i < make_tuple->inputs().size(); i++) {
auto input_i_vector = GetAllOutputWithIndex(make_tuple->input(i));
(void)std::copy(input_i_vector.begin(), input_i_vector.end(), std::back_inserter(ret));
}
return ret;
}
auto outputs_num = AnfAlgo::GetOutputTensorNum(node);
if (!IsRealCNodeKernel(node)) {
outputs_num = 1;
}
// The output may be a tuple, so all the outputs of the node need to be visited.
for (size_t i = 0; i < outputs_num; ++i) {
const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(node, i, false);
MS_EXCEPTION_IF_NULL(output_with_index.first);
// The MakeTuple node needs to be visited recursively.
if (AnfAlgo::CheckPrimitiveType(output_with_index.first, prim::kPrimMakeTuple)) {
auto input_vector = GetAllOutputWithIndex(output_with_index.first);
(void)std::copy(input_vector.begin(), input_vector.end(), std::back_inserter(ret));
continue;
}
// Ignore the output of the front call node.
if (output_with_index.first->isa<CNode>()) {
auto cnode = output_with_index.first->cast<CNodePtr>();
auto inputs = cnode->inputs();
if (inputs[0]->isa<CNode>()) {
MS_LOG(INFO) << "The output is call node: " << output_with_index.first->DebugString();
return ret_empty;
}
}
// The InitDataSetQueue node has no output.
if (AnfAlgo::CheckPrimitiveType(output_with_index.first, prim::kPrimInitDataSetQueue)) {
return ret_empty;
}
ret.push_back(output_with_index);
}
return ret;
}
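A minimal caller sketch of the new interface (not part of this commit), assuming graph is a KernelGraphPtr; it flattens nested MakeTuple outputs into (node, output index) pairs:
// Sketch: enumerate all real outputs of a graph, including outputs nested in MakeTuple nodes.
auto output_pairs = AnfAlgo::GetAllOutputWithIndex(graph->output());
for (const auto &output_with_index : output_pairs) {
  MS_EXCEPTION_IF_NULL(output_with_index.first);
  MS_LOG(INFO) << "Real output: " << output_with_index.first->fullname_with_scope()
               << " with index: " << output_with_index.second;
}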
AnfNodePtr AnfRuntimeAlgorithm::GetCNodePrimitiveNode(const CNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
return node->input(kAnfPrimitiveIndex);

View File

@@ -72,6 +72,7 @@ class AnfRuntimeAlgorithm {
prim::kPrimMakeTuple});
static std::vector<AnfNodePtr> GetAllOutput(const AnfNodePtr &node,
const std::vector<PrimitivePtr> &return_types = {});
static std::vector<KernelWithIndex> GetAllOutputWithIndex(const AnfNodePtr &node);
// get cnode primitive
static AnfNodePtr GetCNodePrimitiveNode(const CNodePtr &node);
static void SetNodeInput(const CNodePtr &node, const AnfNodePtr &input_node, size_t index);

View File

@@ -1106,7 +1106,7 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr
MS_LOG(INFO) << "Node is not internal output";
return;
}
MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString();
MS_LOG(INFO) << "Replace internal output node " << node->DebugString() << " to " << new_node->DebugString();
auto &front_nodes = iter->second;
// Move specified front node to new node mapping
auto front_node_iter = front_nodes.find(src_output_idx);
@@ -1139,6 +1139,85 @@ AnfWithOutIndex KernelGraph::GetFrontNodeByInternalParameter(const AnfNodePtr &p
return AnfWithOutIndex();
}
void KernelGraph::CacheGraphOutputToFrontNodeWithIndex(const AnfNodePtr &backend_graph_output,
const AnfNodePtr &front_node) {
if ((backend_graph_output == nullptr) || (front_node == nullptr)) {
return;
}
auto backend_outputs = AnfAlgo::GetAllOutputWithIndex(backend_graph_output);
auto front_outputs = AnfAlgo::GetAllOutputWithIndex(front_node);
if (backend_outputs.size() != front_outputs.size()) {
MS_LOG(INFO) << "The size(" << backend_outputs.size()
<< ") of backend output: " << backend_graph_output->DebugString() << " is not equal to the size("
<< front_outputs.size() << ") of front output: " << front_node->DebugString();
return;
}
for (size_t i = 0; i < backend_outputs.size(); ++i) {
auto backend_output = backend_outputs[i];
auto front_output = front_outputs[i];
graph_output_to_front_node_map_[backend_output] = front_output;
MS_LOG(INFO) << "Backend output: " << backend_output.first->fullname_with_scope()
<< " with index: " << backend_output.second
<< " map to front node: " << front_output.first->fullname_with_scope()
<< " with index: " << front_output.second;
}
}
AnfWithOutIndex KernelGraph::GetFrontNodeWithIndexByGraphOutput(
const AnfWithOutIndex &backend_graph_output_with_index) const {
const auto &iter = graph_output_to_front_node_map_.find(backend_graph_output_with_index);
if (iter != graph_output_to_front_node_map_.end()) {
return iter->second;
}
return AnfWithOutIndex();
}
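A hedged round-trip sketch of the two new interfaces (backend_output and front_output are hypothetical nodes, not from this commit); the mapping is cached once by the session (see the ConstructOutput change below) and then queried per (node, index) pair:
// Sketch: cache the backend-to-front output mapping, then resolve a backend output.
graph->CacheGraphOutputToFrontNodeWithIndex(backend_output, front_output);
auto front_with_index = graph->GetFrontNodeWithIndexByGraphOutput({backend_output, 0});
if (front_with_index.first == nullptr) {
  // No front node is cached for this backend output.
}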
void KernelGraph::UpdateGraphOutputMap(const std::vector<AnfWithOutIndex> &old_outputs,
const std::vector<AnfWithOutIndex> &new_outputs) {
MS_LOG(INFO) << "The size of old outputs: " << old_outputs.size()
<< ", the size of new outputs: " << new_outputs.size();
if (old_outputs.size() != new_outputs.size()) {
MS_LOG(EXCEPTION) << "The size of old outputs is not equal to the size of new outputs.";
}
for (size_t i = 0; i < old_outputs.size(); ++i) {
auto old_output = old_outputs[i];
auto new_output = new_outputs[i];
if (old_output == new_output) {
continue;
}
// Update the graph output map.
if (graph_output_to_front_node_map_.count(old_output) > 0) {
MS_LOG(INFO) << "Replace backend output node " << old_output.first->fullname_with_scope() << " with index "
<< old_output.second << " to " << new_output.first->fullname_with_scope() << " with index "
<< new_output.second;
graph_output_to_front_node_map_[new_output] = graph_output_to_front_node_map_[old_output];
graph_output_to_front_node_map_.erase(old_output);
}
// Update the internal output map.
if (IsInternalOutput(old_output.first, old_output.second)) {
ReplaceInternalOutput(old_output.first, new_output.first, old_output.second, new_output.second);
}
if (old_output.first == new_output.first) {
continue;
}
// Update the front backend node map.
if (backend_front_anf_map_.count(old_output.first) > 0) {
MS_LOG(INFO) << "Replace backend output node " << old_output.first->fullname_with_scope() << " to "
<< new_output.first->fullname_with_scope();
auto front_node = backend_front_anf_map_[old_output.first];
front_backend_anf_map_[front_node] = new_output.first;
backend_front_anf_map_[new_output.first] = front_node;
(void)backend_front_anf_map_.erase(old_output.first);
}
}
}
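The intended call pattern appears in GraphCompiler below; condensed here as a sketch, assuming graph is a KernelGraphPtr and device_context provides OptimizeGraph:
// Sketch: capture the outputs around the optimization pass and fix up the output maps.
auto outputs_before = AnfAlgo::GetAllOutputWithIndex(graph->output());
device_context->OptimizeGraph(graph);
auto outputs_after = AnfAlgo::GetAllOutputWithIndex(graph->output());
graph->UpdateGraphOutputMap(outputs_before, outputs_after);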
AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const {
auto iter = front_to_internal_outputs_map_.find(front_node);
if (iter != front_to_internal_outputs_map_.end()) {

View File

@@ -72,6 +72,7 @@ class KernelGraph : public FuncGraph {
start_label_ = graph.start_label_;
end_goto_ = graph.end_goto_;
internal_parameter_to_front_node_map_ = graph.internal_parameter_to_front_node_map_;
graph_output_to_front_node_map_ = graph.graph_output_to_front_node_map_;
front_to_internal_outputs_map_ = graph.front_to_internal_outputs_map_;
internal_outputs_to_front_map_ = graph.internal_outputs_to_front_map_;
internal_outputs_tensor_map_ = graph.internal_outputs_tensor_map_;
@@ -206,9 +207,6 @@ class KernelGraph : public FuncGraph {
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, size_t src_output_idx,
size_t dst_output_idx);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node);
// Cache the internal parameter and its corresponding front node into internal_parameter_to_front_node_map_.
void CacheInternalParameterToFrontNode(const AnfNodePtr &parameter, const AnfWithOutIndex &front_node_with_index);
AnfWithOutIndex GetFrontNodeByInternalParameter(const AnfNodePtr &parameter) const;
AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const;
bool IsInternalOutput(const AnfNodePtr &node, size_t output_idx) const;
bool IsInternalOutput(const AnfNodePtr &node) const;
@@ -216,6 +214,18 @@ class KernelGraph : public FuncGraph {
void AddInternalOutputTensor(const AnfNodePtr &node, size_t output_idx, const tensor::TensorPtr &tensor);
tensor::TensorPtr GetInternalOutputTensor(const AnfNodePtr &node, size_t output_idx);
// Cache the internal parameter and its corresponding front node into internal_parameter_to_front_node_map_.
void CacheInternalParameterToFrontNode(const AnfNodePtr &parameter, const AnfWithOutIndex &front_node_with_index);
AnfWithOutIndex GetFrontNodeByInternalParameter(const AnfNodePtr &parameter) const;
// Cache the backend graph output nodes and their corresponding front nodes with output index into
// graph_output_to_front_node_map_.
void CacheGraphOutputToFrontNodeWithIndex(const AnfNodePtr &backend_graph_output, const AnfNodePtr &front_node);
AnfWithOutIndex GetFrontNodeWithIndexByGraphOutput(const AnfWithOutIndex &backend_graph_output_with_index) const;
// Update the related maps of the backend graph output nodes according to the modified backend output nodes.
void UpdateGraphOutputMap(const std::vector<AnfWithOutIndex> &old_outputs,
const std::vector<AnfWithOutIndex> &new_outputs);
uint32_t current_epoch() const { return current_epoch_; }
void set_current_epoch(uint32_t epoch) { current_epoch_ = epoch; }
void UpdateChildGraphOrder();
@@ -376,10 +386,15 @@ class KernelGraph : public FuncGraph {
CNodePtr start_label_;
CNodePtr end_goto_;
// An internal parameter is not an original parameter of the func graph; it is the output of the previous kernel
// graph, which is related to the input of this kernel graph. The key of the unordered map is the input of this
// kernel graph, and the value is the front node corresponding to the output of the previous kernel graph.
std::unordered_map<AnfNodePtr, AnfWithOutIndex> internal_parameter_to_front_node_map_;
// The key of the map is the backend graph output of this kernel graph, and the value is the corresponding front
// node with output index.
std::map<AnfWithOutIndex, AnfWithOutIndex> graph_output_to_front_node_map_;
std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_;
std::unordered_map<AnfNodePtr, std::unordered_map<size_t, std::pair<AnfNodePtr, bool>>>
internal_outputs_to_front_map_;

View File

@@ -1919,6 +1919,7 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std:
auto FindEqu = [graph, outputs, this](const AnfNodePtr &out) -> AnfNodePtr {
auto backend_anf = graph->GetBackendAnfByFrontAnf(out);
if (backend_anf != nullptr) {
graph->CacheGraphOutputToFrontNodeWithIndex(backend_anf, out);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {

View File

@@ -266,7 +266,11 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
MS_EXCEPTION_IF_NULL(device_context);
// Execute optimization pass.
auto outputs_before_optimizer = AnfAlgo::GetAllOutputWithIndex(graph->output());
device_context->OptimizeGraph(graph);
auto outputs_after_optimizer = AnfAlgo::GetAllOutputWithIndex(graph->output());
// Update the output map of the kernel graph according to the modified output nodes.
graph->UpdateGraphOutputMap(outputs_before_optimizer, outputs_after_optimizer);
// Generate 'KernelMod' for all kernels and set 'KernelMod' into each kernel;
// 'KernelMod' is the real execution object of the kernel.

View File

@@ -74,6 +74,17 @@ AnfNodePtr FetchFrontNodeByBackendNode(const AnfNodePtr &backend_node, const Ker
return front_node;
}
KernelWithIndex FetchFrontNodeWithIndexByGraphOutput(const KernelWithIndex &output_with_index,
const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
auto front_node_with_index = graph->GetFrontNodeWithIndexByGraphOutput(output_with_index);
// The PyNative forward graph does not have a front node, so the backend node is used instead.
if (front_node_with_index.first == nullptr) {
front_node_with_index = output_with_index;
}
return front_node_with_index;
}
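A short sketch of the fallback, assuming output_with_index comes from AnfAlgo::GetAllOutputWithIndex:
// Sketch: prefer the cached front node; in PyNative mode this falls back to the backend node.
auto origin_output_with_index = FetchFrontNodeWithIndexByGraphOutput(output_with_index, graph);
MS_EXCEPTION_IF_NULL(origin_output_with_index.first);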
// The branch of PrepareDataForValueNode that handles the tensor value type.
void PrepareDataForValueNodeTensor(const ValueNodePtr &node, const ValuePtr &node_value,
const DeviceContext *device_context) {
@@ -649,18 +660,16 @@ ActorSetPtr GraphScheduler::Build(const GraphCompilerInfo &graph_compiler_info,
void GraphScheduler::CacheGraphOutputToActor(const GraphCompilerInfo &graph_compiler_info) {
for (const auto &graph : graph_compiler_info.graphs_) {
MS_EXCEPTION_IF_NULL(graph);
const auto &outputs = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
for (const auto &output : outputs) {
const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
auto outputs = AnfAlgo::GetAllOutputWithIndex(graph->output());
for (const auto &output_with_index : outputs) {
auto output_kernel = output_with_index.first;
MS_EXCEPTION_IF_NULL(output_kernel);
const auto &front_node = graph->GetFrontAnfByBackendAnf(output_kernel);
if (front_node == nullptr) {
auto origin_output_with_index = graph->GetFrontNodeWithIndexByGraphOutput(output_with_index);
if (origin_output_with_index.first == nullptr) {
continue;
}
auto actor_output_index = output_with_index.second;
auto origin_output_with_index = KernelWithIndex(front_node, actor_output_index);
OpActor<DeviceTensor> *actor = nullptr;
if (IsKernelActor(output_kernel)) {
actor = FetchActor(output_kernel->fullname_with_scope());
@@ -684,7 +693,8 @@ void GraphScheduler::CacheGraphOutputToActor(const GraphCompilerInfo &graph_comp
MS_EXCEPTION_IF_NULL(actor);
MS_LOG(INFO) << "Cache the graph " << graph->graph_id() << " output node:" << output_kernel->fullname_with_scope()
<< " to actor:" << actor->GetAID().Name() << " with output index:" << actor_output_index;
<< " with index: " << output_with_index.second << " to actor:" << actor->GetAID().Name()
<< " with index:" << actor_output_index;
graph_output_to_actor_.emplace(origin_output_with_index, GraphOutputPair(actor, actor_output_index));
}
}
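On the consumer side, a lookup might read as below; this is a sketch, assuming GraphOutputPair is a pair of the producing actor and its output index:
// Sketch: find which actor produces a given front output pair.
const auto &actor_iter = graph_output_to_actor_.find(origin_output_with_index);
if (actor_iter != graph_output_to_actor_.end()) {
  auto *from_actor = actor_iter->second.first;        // the producing actor
  auto from_output_index = actor_iter->second.second;  // the actor output index
}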
@@ -1302,20 +1312,22 @@ void GraphScheduler::LinkControlArrowByAutoMonad(KernelActor *to_actor, const An
const std::unordered_set<PrimitivePtr, PrimitiveHasher, PrimitiveEqual> recursion_prims = {
prim::kPrimDepend, prim::kPrimUpdateState, prim::kPrimLoad, prim::kPrimMakeTuple};
for (const auto &real_depend_input : real_depend_inputs) {
auto real_depend_input_with_idx = AnfAlgo::VisitKernelWithReturnType(real_depend_input, 0, false, return_types);
auto real_depend_kernel = real_depend_input_with_idx.first;
// The monad node and the make tuple node need to be processed recursively.
if (AnfAlgo::IsOneOfPrimitiveCNode(real_depend_input, recursion_prims)) {
LinkControlArrowByAutoMonad(to_actor, real_depend_input);
if (AnfAlgo::IsOneOfPrimitiveCNode(real_depend_kernel, recursion_prims)) {
LinkControlArrowByAutoMonad(to_actor, real_depend_kernel);
continue;
}
if (!IsKernelActor(real_depend_input)) {
if (!IsKernelActor(real_depend_kernel)) {
continue;
}
// Link the control arrow between the kernel actors.
const auto &from_actor = dynamic_cast<KernelActor *>(FetchActor(real_depend_input->fullname_with_scope()));
MS_EXCEPTION_IF_NULL(from_actor);
MS_LOG(INFO) << "Link control arrow by auto monad, from actor: " << from_actor->GetAID().Name()
const auto &from_actor = dynamic_cast<KernelActor *>(FetchActor(real_depend_kernel->fullname_with_scope()));
MS_LOG(INFO) << "Link control arrow by auto monad, from actor: " << real_depend_kernel->fullname_with_scope()
<< ", to actor: " << to_actor->GetAID().Name();
MS_EXCEPTION_IF_NULL(from_actor);
from_actor->output_control_arrows_.emplace_back(to_actor->GetAID());
to_actor->input_controls_num_++;
}
@@ -1427,12 +1439,10 @@ void GraphScheduler::LinkOutputResultArrowForOutputActor(OutputActor *to_actor,
for (const auto &graph : graph_compiler_info.graphs_) {
MS_EXCEPTION_IF_NULL(graph);
++number;
const auto &outputs = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
for (const auto &output : outputs) {
const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
auto outputs = AnfAlgo::GetAllOutputWithIndex(graph->output());
for (const auto &output_with_index : outputs) {
MS_EXCEPTION_IF_NULL(output_with_index.first);
const auto &front_node = FetchFrontNodeByBackendNode(output_with_index.first, graph);
auto origin_output_with_index = KernelWithIndex(front_node, output_with_index.second);
auto origin_output_with_index = FetchFrontNodeWithIndexByGraphOutput(output_with_index, graph);
const auto &iter = graph_compiler_info.origin_outputs_order_.find(origin_output_with_index);
if (iter == graph_compiler_info.origin_outputs_order_.end()) {
continue;
@@ -1618,12 +1628,12 @@ void GraphScheduler::LinkDataArrowByCallInput(const GraphCompilerInfo &graph_com
// Collect the output of each funcgraph.
for (const auto &func_graph : func_graphs) {
// The funcgraph can only have one output.
const auto &outputs = AnfAlgo::GetAllOutput(func_graph->output(), {prim::kPrimTupleGetItem});
auto outputs = AnfAlgo::GetAllOutputWithIndex(func_graph->output());
if (outputs.size() != 1) {
MS_LOG(EXCEPTION) << "The func graph has more than one output, func graph:" << func_graph->ToString();
}
auto output_with_index = AnfAlgo::VisitKernelWithReturnType(outputs[0], 0);
auto output_with_index = outputs[0];
if (IsKernelActor(output_with_index.first)) {
// Input is a kernel actor.
const auto &iter = front_node_to_actor_.find(output_with_index.first);

View File

@@ -23,6 +23,7 @@
#include "utils/trace_base.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/common/common_backend_optimization.h"
#include "backend/optimizer/cpu/insert_cast_cpu.h"
#include "backend/optimizer/cpu/insert_format_transform_op.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
@@ -75,6 +76,9 @@ void CPUDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
SetOperatorInfo(graph->execution_order());
OptimizeGraphImpl(graph);
// Run final optimization.
opt::CommonFinalOptimization(graph);
// Remove the reorder after the PS feature finishes adapting push/pull in auto_monad.
auto execution_order = graph->execution_order();
AnfAlgo::ReorderPosteriorExecList(NOT_NULL(&execution_order));

View File

@@ -184,11 +184,15 @@ void GPUDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
// Optimization pass which is irrelevant to device type or format.
OptimizeGraphWithoutDeviceInfo(graph);
FormatTransformChecker::GetInstance().CheckSupportFormatTransform(graph);
SetOperatorInfo(graph->execution_order());
// Optimization pass which is relevant to device type or format.
OptimizeGraphWithDeviceInfo(graph);
// Run final optimization.
opt::CommonFinalOptimization(graph);
// Graph kernel fusion optimization
if (context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
opt::GraphKernelOptimize(graph);
@@ -270,6 +274,7 @@ void GPUDeviceContext::UpdateGraphDynamicShapeAttr(const NotNull<KernelGraphPtr>
void GPUDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
FormatTransformChecker::GetInstance().CheckSupportFormatTransform(graph);
SetOperatorInfo(graph->execution_order());
auto optimizer = std::make_shared<opt::GraphOptimizer>();

View File

@@ -514,30 +514,10 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(con
const auto &all_branch_output = ControlNodeParser::FetchAllBranchOutputs(root_graph);
for (const auto &branch_output : all_branch_output) {
size_t position = 0;
if (AnfAlgo::CheckPrimitiveType(branch_output, prim::kPrimMakeTuple)) {
const auto &outputs = AnfAlgo::GetAllOutput(branch_output, {prim::kPrimTupleGetItem});
outputs_num = outputs.size();
for (const auto &output : outputs) {
const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
MS_EXCEPTION_IF_NULL(output_with_index.first);
// The InitDataSetQueue kernel has no output.
if (AnfAlgo::GetCNodeName(output_with_index.first) == kInitDatasetQueueOpName) {
continue;
}
outputs_order.emplace(output_with_index, position++);
}
} else if (branch_output->isa<CNode>()) {
outputs_num = AnfAlgo::GetOutputTensorNum(branch_output);
for (size_t i = 0; i < outputs_num; i++) {
const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(branch_output, i, false);
MS_EXCEPTION_IF_NULL(output_with_index.first);
// The InitDataSetQueue kernel has no output.
if (AnfAlgo::GetCNodeName(output_with_index.first) == kInitDatasetQueueOpName) {
continue;
}
outputs_order.emplace(output_with_index, position++);
}
auto outputs = AnfAlgo::GetAllOutputWithIndex(branch_output);
outputs_num = outputs.size();
for (const auto &output : outputs) {
outputs_order.emplace(output, position++);
}
}
@@ -580,17 +560,12 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(
device_contexts.emplace_back(graph_info_to_context.second);
name.append(graph_info_to_context.first);
const auto &outputs = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
auto outputs = AnfAlgo::GetAllOutputWithIndex(graph->output());
for (const auto &output : outputs) {
const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
MS_EXCEPTION_IF_NULL(output_with_index.first);
// The InitDataSetQueue kernel has no output.
if (AnfAlgo::GetCNodeName(output_with_index.first) == kInitDatasetQueueOpName) {
continue;
}
outputs_order.emplace(output_with_index, position++);
outputs_order.emplace(output, position++);
}
}
std::vector<std::vector<int64_t> *> tensors_mask_list(1, const_cast<std::vector<int64_t> *>(tensors_mask));
std::vector<std::vector<TensorPtr> *> input_tensors_list(1,
const_cast<std::vector<tensor::TensorPtr> *>(input_tensors));

View File

@@ -372,6 +372,7 @@ inline const PrimitivePtr kPrimMemCpyAsync = std::make_shared<Primitive>("memcpy
inline const PrimitivePtr kPrimFill = std::make_shared<Primitive>("Fill");
inline const PrimitivePtr kPrimFusedPushWeight = std::make_shared<Primitive>("FusedPushWeight");
inline const PrimitivePtr kPrimFusedPullWeight = std::make_shared<Primitive>("FusedPullWeight");
inline const PrimitivePtr kPrimInitDataSetQueue = std::make_shared<Primitive>("InitDataSetQueue");
// Quant ops
inline const PrimitivePtr kPrimBatchNormFold = std::make_shared<Primitive>("BatchNormFold");
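With the primitive registered, call sites can filter it out as GetAllOutputWithIndex does above; a minimal check sketch, assuming node is an AnfNodePtr inside a function returning a vector of outputs:
// Sketch: skip nodes produced by InitDataSetQueue, which has no output.
if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimInitDataSetQueue)) {
  return {};  // nothing to collect for this node
}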