forked from mindspore-Ecosystem/mindspore
Dump graph_kernel_split info as human-readable text
This commit is contained in:
parent
0a0dc05d51
commit
3c6c30024c
|
@ -14,33 +14,57 @@
|
|||
# ============================================================================
|
||||
"""GraphKernel splitter"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import json.decoder as jd
|
||||
import traceback
|
||||
from mindspore import log as logger
|
||||
from . import model
|
||||
from . import utils
|
||||
|
||||
|
||||
def reset_graphmode_for_inplaceassign(graph_list, graph_mode):
    """Force 'composite' mode for every subgraph that contains InplaceAssign.

    Args:
        graph_list (list[dict]): dumped subgraph descriptions; each dict has an
            'op_desc' list of operator dicts keyed by 'name'.
        graph_mode (list[str]): per-subgraph mode strings, mutated in place.
    """
    for i, g in enumerate(graph_list):
        # Generator (not a list) lets any() short-circuit on the first hit.
        if any(op['name'] == 'InplaceAssign' for op in g['op_desc']):
            graph_mode[i] = 'composite'
|
||||
|
||||
|
||||
def split_with_json(json_str, flags_str):
    """Call the costmodel to split a GraphKernel graph.

    Args:
        json_str (str): JSON description of the composite graph; must contain
            a 'process' key naming the target backend.
        flags_str (str): JSON-encoded graph-kernel flags (e.g. "dump_as_text").

    Returns:
        str: JSON result with keys "multi_graph", "graph_desc" and
            "graph_mode", or ``None`` when either input fails to parse.
    """
    try:
        graph_desc = json.loads(json_str)
        flags = json.loads(flags_str)
        target = graph_desc['process']
        comp = model.load_composite(graph_desc)
        graph_split, graph_mode = model.split(comp.graph, target)
        is_multi_graph = len(graph_split) > 1
        graph_list = list(map(comp.dump, graph_split))
        # Subgraphs containing InplaceAssign must stay composite.
        _reset_graphmode_for_inplaceassign(graph_list, graph_mode)
        result = {"multi_graph": is_multi_graph,
                  "graph_desc": graph_list,
                  "graph_mode": graph_mode}
        # Best-effort debug dump, controlled by the "dump_as_text" flag.
        _dump_split_info(flags, json_str, comp.graph, graph_split, graph_mode)
        return json.dumps(result)
    except jd.JSONDecodeError:
        logger.error(traceback.format_exc())
        return None
|
||||
|
||||
|
||||
def _reset_graphmode_for_inplaceassign(graph_list, graph_mode):
|
||||
"""Operator with InplaceAssign should always be composite op"""
|
||||
for i, g in enumerate(graph_list):
|
||||
if any([op['name'] == 'InplaceAssign' for op in g['op_desc']]):
|
||||
graph_mode[i] = 'composite'
|
||||
|
||||
|
||||
def _dump_split_info(flags, graph_json, graph_desc, subgraphs, graph_mode):
    """Append a human-readable record of the split result to the dump file.

    No-op unless the "dump_as_text" flag is set. Records the input json, the
    main graph, and each resulting subgraph with its mode.
    """
    if not flags.get("dump_as_text", False):
        return
    utils.create_dir(utils.GRAPH_KERNEL_DUMP_PATH)
    filename = os.path.join(utils.GRAPH_KERNEL_DUMP_PATH, "graph_kernel_split_mode.txt")
    # Assemble the whole record first, then append it in one call.
    record = [
        "********** main graph: {} **********\n".format(graph_desc.name),
        "input json:\n{}\n".format(graph_json),
        "graph desc:\n{}\n".format(str(graph_desc)),
    ]
    if len(subgraphs) > 1:
        for idx, sub in enumerate(subgraphs):
            record.append("-------- subgraph {}, mode: {} --------\n".format(idx, graph_mode[idx]))
            record.append("{}\n".format(str(sub)))
    else:
        record.append("Graph unchanged.\n")
    record.append("\n")
    with open(filename, "a+") as f:
        f.writelines(record)
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""GraphKernel utils"""
|
||||
import os
|
||||
|
||||
GRAPH_KERNEL_DUMP_PATH = "graph_kernel_dump"
|
||||
|
||||
|
||||
def create_dir(pathname):
    """Try to create directory ``pathname``; silently ignore failure.

    Best-effort: the directory may already exist, or creation may be denied
    by the OS — callers proceed either way. Attempting the mkdir and catching
    OSError (EAFP) avoids the TOCTOU race of an exists-then-mkdir check.
    """
    try:
        os.mkdir(pathname)
    except OSError:
        pass
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -107,7 +107,9 @@ bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
|
|||
thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr),
|
||||
reinterpret_cast<void **>(&runtimeargs[0]), 0);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Launch Kernel failed.";
|
||||
const char *msg = nullptr;
|
||||
cuGetErrorName(result, &msg);
|
||||
MS_LOG(ERROR) << "Launch Kernel failed. error: " << msg;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -29,6 +29,7 @@
|
|||
#include "backend/kernel_compiler/akg/akg_kernel_json_decoder.h"
|
||||
#include "backend/optimizer/graph_kernel/graph_kernel_helper.h"
|
||||
#include "debug/anf_ir_dump.h"
|
||||
#include "utils/context/graph_kernel_flags.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
|
@ -572,17 +573,19 @@ class CostModelSplitSchemer : public Splitter::SplitSchemer {
|
|||
|
||||
// call costmodel split function.
|
||||
auto json_desc_str = json_desc.dump();
|
||||
MS_LOG(DEBUG) << "CallPyFn: [" << kGraphKernelSplitFunc << "] with input json:\n" << json_desc_str;
|
||||
auto ret = parse::python_adapter::CallPyFn(kGraphKernelModule, kGraphKernelSplitFunc, json_desc_str);
|
||||
auto flags_str = CollectSplitFlags();
|
||||
MS_LOG(DEBUG) << "CallPyFn: [" << kGraphKernelSplitFunc << "] with input json: " << json_desc_str
|
||||
<< ". flag: " << flags_str;
|
||||
auto ret = parse::python_adapter::CallPyFn(kGraphKernelModule, kGraphKernelSplitFunc, json_desc_str, flags_str);
|
||||
if (py::isinstance<py::none>(ret)) {
|
||||
MS_LOG(ERROR) << "CallPyFn: [" << kGraphKernelSplitFunc << "] return invalid result. input json:\n"
|
||||
<< json_desc_str;
|
||||
<< json_desc_str << ". flag: " << flags_str;
|
||||
return false;
|
||||
}
|
||||
std::string split_graphs_str = py::cast<std::string>(ret);
|
||||
if (split_graphs_str.empty()) {
|
||||
MS_LOG(ERROR) << "CallPyFn: [" << kGraphKernelSplitFunc << "] return invalid result. input json:\n"
|
||||
<< json_desc_str;
|
||||
<< json_desc_str << ". flag: " << flags_str;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -713,6 +716,13 @@ class CostModelSplitSchemer : public Splitter::SplitSchemer {
|
|||
}
|
||||
}
|
||||
|
||||
virtual std::string CollectSplitFlags() {
|
||||
const auto &flags = context::GraphKernelFlags::GetInstance();
|
||||
nlohmann::json flag_json;
|
||||
flag_json["dump_as_text"] = flags.dump_as_text;
|
||||
return flag_json.dump();
|
||||
}
|
||||
|
||||
std::shared_ptr<FuncGraph> func_graph_;
|
||||
AnfNodePtrList topo_all_nodes_;
|
||||
AnfNodePtrList topo_valid_nodes_;
|
||||
|
|
|
@ -49,12 +49,13 @@ class GraphKernelFlags {
|
|||
|
||||
public:
|
||||
/**
|
||||
* dump_as_text, unsupported now.
|
||||
* Dump info as human-readable text.
|
||||
* A directory "graph_kernel_dump" will be created, and all information will be dumped in this directory.
|
||||
*/
|
||||
bool dump_as_text{false};
|
||||
|
||||
/**
|
||||
* opt_level, value from 0 to 3.
|
||||
* Optimization level, value from 0 to 3.
|
||||
* 0: GraphKernel disabled
|
||||
* 1: GraphKernel enabled
|
||||
* 2 and 3 are not supported now.
|
||||
|
@ -93,17 +94,21 @@ class GraphKernelFlags {
|
|||
std::vector<std::string> disable_expand_ops;
|
||||
|
||||
/**
|
||||
* enable_cluster_ops, unsupported now.
|
||||
* Additional clustering operators (case sensitive).
|
||||
* The operators to be added into the default clustering operator list.
|
||||
*/
|
||||
std::vector<std::string> enable_cluster_ops;
|
||||
|
||||
/**
|
||||
* enable_cluster_ops_only, unsupported now.
|
||||
* Clustering operators to be enabled (case sensitive).
|
||||
* Unlike the "enable_cluster_ops", the default list will be overwritten by this list.
|
||||
* Note that the "enable_cluster_ops" and "disable_cluster_ops" will be ignored if this flag is set.
|
||||
*/
|
||||
std::vector<std::string> enable_cluster_ops_only;
|
||||
|
||||
/**
|
||||
* disable_cluster_ops, unsupported now.
|
||||
* Clustering operators to be disabled (case sensitive).
|
||||
* The behavior is undefined when this list overlaps with "enable_cluster_ops".
|
||||
*/
|
||||
std::vector<std::string> disable_cluster_ops;
|
||||
|
||||
|
|
|
@ -26,3 +26,5 @@ CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDi
|
|||
}
|
||||
|
||||
CUresult cuModuleUnload(CUmodule hmod) { return CUDA_SUCCESS; }
|
||||
|
||||
// UT stub: always reports success and leaves *pStr untouched.
CUresult cuGetErrorName(CUresult error, const char **pStr) { return CUDA_SUCCESS; }
|
||||
|
|
|
@ -46,5 +46,5 @@ CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDi
|
|||
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
|
||||
unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
|
||||
CUresult cuModuleUnload(CUmodule hmod);
|
||||
|
||||
CUresult cuGetErrorName(CUresult error, const char **pStr);
|
||||
#endif // TESTS_UT_STUB_RUNTIME_INCLUDE_CUDA_H_
|
||||
|
|
Loading…
Reference in New Issue