!20005 disable mindRT in control flow

Merge pull request !20005 from limingqi107/r1.3
This commit is contained in:
i-robot 2021-07-12 13:27:42 +00:00 committed by Gitee
commit ee5ff9d273
6 changed files with 58 additions and 8 deletions

View File

@ -94,6 +94,7 @@ namespace gpu {
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
using CollectiveInitializer = device::gpu::CollectiveInitializer;
using GetLocalRankId = device::gpu::GetLocalRankId;
using InitNCCLComm = device::gpu::InitNCCLComm;
void GPUSession::Init(uint32_t device_id) {
const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
@ -113,7 +114,14 @@ void GPUSession::Init(uint32_t device_id) {
ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id);
if (collective_inited) {
rank_id_ = GetRankId();
if (collective_handle_ != nullptr) {
auto init_nccl_comm_funcptr =
reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
(*init_nccl_comm_funcptr)();
}
}
auto &json_parser = DumpJsonParser::GetInstance();
// Dump json config file if dump is enabled
json_parser.CopyJsonToDir(rank_id_);

View File

@ -73,7 +73,9 @@ GroupManager::GroupManager() { groups_.clear(); }
#if !defined(NO_DLIB) || defined(ENABLE_GPU)
bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
const std::vector<uint32_t> ranks, int device_id) {
if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
// The group operation thread must be same with nccl init thread in the GPU device.
if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT) ||
(MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice)) {
return CommManager::GetInstance().CreateGroupSync(group_name, ranks);
} else {
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
@ -84,7 +86,9 @@ bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const s
bool GroupManager::DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name,
int device_id) {
if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
// The group operation thread must be same with nccl init thread in the GPU device.
if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT) ||
(MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice)) {
return CommManager::GetInstance().DestroyGroup(group_name);
} else {
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
@ -103,7 +107,9 @@ Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_
MS_EXCEPTION_IF_NULL(executor);
for (auto &group : group_info) {
bool ret = true;
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
// The group operation thread must be same with nccl init thread in the GPU device.
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_MINDRT) ||
(context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice)) {
ret = CommManager::GetInstance().CreateGroupSync(group.first, group.second);
} else {
ret = executor->CreateCommGroup(group.first, group.second);

View File

@ -55,6 +55,33 @@
namespace mindspore {
namespace pipeline {
namespace {
// Disable mindRT in the control flow scenario.
// MindRT does not yet support control flow (i.e. a compiled func graph that
// manages more than one graph), so in that case fall back to the old runtime:
// clear MS_CTX_ENABLE_MINDRT and recreate the backend so TaskEmitAction uses
// the non-mindRT path.
void ResetMindRTEnable(const ResourcePtr &res) {
  MS_EXCEPTION_IF_NULL(res);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  // Nothing to do when mindRT is already disabled.
  if (!context_ptr->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    return;
  }

  auto func_graph = res->func_graph();
  MS_EXCEPTION_IF_NULL(func_graph);
  // NOTE(review): the original re-checked func_graph != nullptr here, but
  // MS_EXCEPTION_IF_NULL above already throws on null, so that branch was dead.
  auto manager = func_graph->manager();
  if (manager == nullptr) {
    // Without a manager we cannot count sub-graphs; leave mindRT as-is.
    return;
  }
  // A single graph means no control flow, so mindRT can stay enabled.
  if (manager->func_graphs().size() == 1) {
    return;
  }

  MS_LOG(INFO) << "Disable mindRT in the multi graphs scenario.";
  context_ptr->set_param<bool>(MS_CTX_ENABLE_MINDRT, false);
  // The cached backend was created for mindRT; replace it with one built for
  // the old runtime so downstream actions pick up the right implementation.
  auto new_backend = compile::CreateBackend();
  MS_EXCEPTION_IF_NULL(new_backend);
  new_backend->SetDebugger();
  res->results()[kBackend] = new_backend;
}
void TaskEmitActionForMindRT(const ResourcePtr &res) {
MS_EXCEPTION_IF_NULL(res);
// Get the mindRT backend.
@ -544,6 +571,8 @@ bool TaskEmitAction(const ResourcePtr &res) {
if (res->func_graph() == nullptr) {
MS_LOG(EXCEPTION) << "TaskEmit args error";
}
// Disable mindRT in the control flow scenario.
ResetMindRTEnable(res);
FuncGraphPtr func_graph = res->func_graph();
MS_EXCEPTION_IF_NULL(func_graph);
auto bc_ptr = res->results()[kBackend].cast<compile::BackendPtr>();

View File

@ -25,6 +25,9 @@ namespace device {
namespace gpu {
void GPUDeviceManager::InitDevice() {
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SetDevice(SizeToInt(cur_dev_id_)), "Failed to set current device id");
if (dev_alive_) {
return;
}
CHECK_OP_RET_WITH_EXCEPT(CreateStream(&default_stream_), "Failed to create CUDA stream.");
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),

View File

@ -506,11 +506,6 @@ void GraphScheduler::BuildAndScheduleGlobalActor() {
}
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info) {
// Local maps and vectors clear.
graph_output_to_actor_.clear();
front_node_to_actor_.clear();
copy_actors_.clear();
MS_LOG(INFO) << "Graph(" << graph_compiler_info.name_ << ") transforms actor begin.";
if (graph_compiler_info.graphs_.size() == 0) {
MS_LOG(EXCEPTION) << "The number of graphs is zero.";
@ -534,6 +529,12 @@ ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info
MS_LOG(EXCEPTION) << "The actor set of " << graph_compiler_info.name_ << " is invalid.";
}
MS_LOG(INFO) << "Graph(" << graph_compiler_info.name_ << ") transforms actor end.";
// Local maps and vectors clear.
graph_output_to_actor_.clear();
front_node_to_actor_.clear();
copy_actors_.clear();
return actor_set.get();
}

View File

@ -90,6 +90,9 @@ def run_e2e_dump():
if context.get_context("device_target") == "Ascend":
assert len(os.listdir(dump_file_path)) == 5
output_name = "Add.Add-op1.0.0.*.output.0.DefaultFormat.npy"
elif context.get_context("device_target") == "CPU":
assert len(os.listdir(dump_file_path)) == 5
output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
else:
assert len(os.listdir(dump_file_path)) == 3
output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"