!14304 [GraphKernel] Dump GraphKernel split info as text; dump akg kernel launch fail message

From: @dayschan
Reviewed-by: @gaoxiong1,@gaoxiong1,@anyrenwei
Signed-off-by: @anyrenwei
This commit is contained in:
mindspore-ci-bot 2021-03-30 15:32:14 +08:00 committed by Gitee
commit 75fdaaa6aa
7 changed files with 92 additions and 21 deletions

View File

@ -14,33 +14,57 @@
# ============================================================================
"""GraphKernel splitter"""
import os
import json
import json.decoder as jd
import traceback
from mindspore import log as logger
from . import model
from . import utils
def reset_graphmode_for_inplaceassign(graph_list, graph_mode):
    """Mark every sub-graph that holds an InplaceAssign op as 'composite'.

    Args:
        graph_list: sequence of dumped sub-graph dicts; each one is read through
            its 'op_desc' list, whose entries are read through their 'name' key.
        graph_mode: list of mode strings, indexed like `graph_list`; it is
            updated in place.
    """
    for idx, graph in enumerate(graph_list):
        # Collect all names first so every op entry is inspected.
        names = [op['name'] for op in graph['op_desc']]
        if 'InplaceAssign' not in names:
            continue
        graph_mode[idx] = 'composite'
def split_with_json(json_str, flags_str):
    """Call costmodel to split GraphKernel.

    Args:
        json_str: JSON text describing the composite graph; its 'process'
            entry is passed to the cost model as the split target.
        flags_str: JSON text holding the split flags; it is forwarded to
            `_dump_split_info`, which reads the "dump_as_text" entry.

    Returns:
        A JSON string with the keys "multi_graph", "graph_desc" and
        "graph_mode", or None when one of the inputs is not valid JSON
        (the traceback is logged in that case).
    """
    try:
        graph_desc = json.loads(json_str)
        flags = json.loads(flags_str)
        target = graph_desc['process']
        comp = model.load_composite(graph_desc)
        graph_split, graph_mode = model.split(comp.graph, target)
        is_multi_graph = len(graph_split) > 1
        graph_list = list(map(comp.dump, graph_split))
        # Sub-graphs containing InplaceAssign are switched to 'composite' mode.
        _reset_graphmode_for_inplaceassign(graph_list, graph_mode)
        result = {"multi_graph": is_multi_graph,
                  "graph_desc": graph_list,
                  "graph_mode": graph_mode}
        _dump_split_info(flags, json_str, comp.graph, graph_split, graph_mode)
        return json.dumps(result)
    except jd.JSONDecodeError:
        logger.error(traceback.format_exc())
        return None
def _reset_graphmode_for_inplaceassign(graph_list, graph_mode):
"""Operator with InplaceAssign should always be composite op"""
for i, g in enumerate(graph_list):
if any([op['name'] == 'InplaceAssign' for op in g['op_desc']]):
graph_mode[i] = 'composite'
def _dump_split_info(flags, graph_json, graph_desc, subgraphs, graph_mode):
"""Dump split info as text"""
if not flags.get("dump_as_text", False):
return
utils.create_dir(utils.GRAPH_KERNEL_DUMP_PATH)
filename = os.path.join(utils.GRAPH_KERNEL_DUMP_PATH, "graph_kernel_split_mode.txt")
with open(filename, "a+") as f:
f.write("********** main graph: {} **********\n".format(graph_desc.name))
f.write("input json:\n{}\n".format(graph_json))
f.write("graph desc:\n{}\n".format(str(graph_desc)))
if len(subgraphs) > 1:
for i, g in enumerate(subgraphs):
f.write("-------- subgraph {}, mode: {} --------\n".format(i, graph_mode[i]))
f.write("{}\n".format(str(g)))
else:
f.write("Graph unchanged.\n")
f.write("\n")

View File

@ -0,0 +1,28 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""GraphKernel utils"""
import os
GRAPH_KERNEL_DUMP_PATH = "graph_kernel_dump"
def create_dir(pathname):
    """Try to create directory.

    Creates `pathname` together with any missing parent directories. The call
    is best-effort: if the directory already exists nothing happens, and any
    OSError (for example a permission problem, or a regular file already
    occupying the path) is ignored so that a failed dump directory never
    interrupts the caller.

    Args:
        pathname: path of the directory to create.
    """
    try:
        # exist_ok avoids a separate existence check before the creation.
        os.makedirs(pathname, exist_ok=True)
    except OSError:
        # Deliberately swallowed: dumping is a debugging aid only.
        pass

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -107,7 +107,9 @@ bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr),
reinterpret_cast<void **>(&runtimeargs[0]), 0);
if (result != CUDA_SUCCESS) {
MS_LOG(ERROR) << "Launch Kernel failed.";
const char *msg = nullptr;
cuGetErrorName(result, &msg);
MS_LOG(ERROR) << "Launch Kernel failed. error: " << msg;
return false;
}
return true;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -29,6 +29,7 @@
#include "backend/kernel_compiler/akg/akg_kernel_json_decoder.h"
#include "backend/optimizer/graph_kernel/graph_kernel_helper.h"
#include "debug/anf_ir_dump.h"
#include "utils/context/graph_kernel_flags.h"
namespace mindspore {
namespace opt {
@ -572,17 +573,19 @@ class CostModelSplitSchemer : public Splitter::SplitSchemer {
// call costmodel split function.
auto json_desc_str = json_desc.dump();
MS_LOG(DEBUG) << "CallPyFn: [" << kGraphKernelSplitFunc << "] with input json:\n" << json_desc_str;
auto ret = parse::python_adapter::CallPyFn(kGraphKernelModule, kGraphKernelSplitFunc, json_desc_str);
auto flags_str = CollectSplitFlags();
MS_LOG(DEBUG) << "CallPyFn: [" << kGraphKernelSplitFunc << "] with input json: " << json_desc_str
<< ". flag: " << flags_str;
auto ret = parse::python_adapter::CallPyFn(kGraphKernelModule, kGraphKernelSplitFunc, json_desc_str, flags_str);
if (py::isinstance<py::none>(ret)) {
MS_LOG(ERROR) << "CallPyFn: [" << kGraphKernelSplitFunc << "] return invalid result. input json:\n"
<< json_desc_str;
<< json_desc_str << ". flag: " << flags_str;
return false;
}
std::string split_graphs_str = py::cast<std::string>(ret);
if (split_graphs_str.empty()) {
MS_LOG(ERROR) << "CallPyFn: [" << kGraphKernelSplitFunc << "] return invalid result. input json:\n"
<< json_desc_str;
<< json_desc_str << ". flag: " << flags_str;
return false;
}
@ -713,6 +716,13 @@ class CostModelSplitSchemer : public Splitter::SplitSchemer {
}
}
virtual std::string CollectSplitFlags() {
const auto &flags = context::GraphKernelFlags::GetInstance();
nlohmann::json flag_json;
flag_json["dump_as_text"] = flags.dump_as_text;
return flag_json.dump();
}
std::shared_ptr<FuncGraph> func_graph_;
AnfNodePtrList topo_all_nodes_;
AnfNodePtrList topo_valid_nodes_;

View File

@ -49,12 +49,13 @@ class GraphKernelFlags {
public:
/**
* dump_as_text, unsupported now.
* Dump info as human-readable text.
* A directory "graph_kernel_dump" will be created, and all information will be dumped in this directory.
*/
bool dump_as_text{false};
/**
* opt_level, value from 0 to 3.
* Optimization level, value from 0 to 3.
* 0: GraphKernel disabled
* 1: GraphKernel enabled
* 2 and 3 are not supported now.
@ -93,17 +94,21 @@ class GraphKernelFlags {
std::vector<std::string> disable_expand_ops;
/**
* enable_cluster_ops, unsupported now.
* Additional clustering operators (case sensitive).
* The operators to be added into the default clustering operator list.
*/
std::vector<std::string> enable_cluster_ops;
/**
* enable_cluster_ops_only, unsupported now.
* Clustering operators to be enabled (case sensitive).
* Unlike the "enable_cluster_ops", the default list will be overwritten by this list.
* Note that the "enable_cluster_ops" and "disable_cluster_ops" will be ignored if this flag is set.
*/
std::vector<std::string> enable_cluster_ops_only;
/**
* disable_cluster_ops, unsupported now.
* Clustering operators to be disabled (case sensitive).
* The behavior is undefined when this list overlaps with "enable_cluster_ops".
*/
std::vector<std::string> disable_cluster_ops;

View File

@ -26,3 +26,5 @@ CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDi
}
// Stand-in implementation: ignores the module handle and unconditionally reports CUDA_SUCCESS.
CUresult cuModuleUnload(CUmodule hmod) { return CUDA_SUCCESS; }
CUresult cuGetErrorName(CUresult error, const char **pStr) { return CUDA_SUCCESS; }

View File

@ -46,5 +46,5 @@ CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDi
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
CUresult cuModuleUnload(CUmodule hmod);
CUresult cuGetErrorName(CUresult error, const char **pStr);
#endif // TESTS_UT_STUB_RUNTIME_INCLUDE_CUDA_H_