From 4645a43e089a45fbb4130bd137afb36f665eb5d8 Mon Sep 17 00:00:00 2001 From: ZPaC Date: Thu, 9 Jul 2020 11:12:35 +0800 Subject: [PATCH] Add ps module in batches --- mindspore/ccsrc/parallel/CMakeLists.txt | 1 + mindspore/ccsrc/parallel/ps/scheduler.cc | 32 ++++++ mindspore/ccsrc/parallel/ps/scheduler.h | 40 +++++++ mindspore/ccsrc/parallel/ps/util.cc | 128 ++++++++++++++++++++++ mindspore/ccsrc/parallel/ps/util.h | 47 ++++++++ tests/ut/cpp/CMakeLists.txt | 2 + third_party/patch/pslite/ps_lite.patch001 | 11 +- 7 files changed, 251 insertions(+), 10 deletions(-) create mode 100755 mindspore/ccsrc/parallel/ps/scheduler.cc create mode 100755 mindspore/ccsrc/parallel/ps/scheduler.h create mode 100644 mindspore/ccsrc/parallel/ps/util.cc create mode 100644 mindspore/ccsrc/parallel/ps/util.h diff --git a/mindspore/ccsrc/parallel/CMakeLists.txt b/mindspore/ccsrc/parallel/CMakeLists.txt index 940b1ed1d85..e435599e095 100644 --- a/mindspore/ccsrc/parallel/CMakeLists.txt +++ b/mindspore/ccsrc/parallel/CMakeLists.txt @@ -1,4 +1,5 @@ file(GLOB_RECURSE _PARALLEL_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") +list(REMOVE_ITEM _PARALLEL_SRC_FILES "ps/util.cc" "ps/scheduler.cc") if (ENABLE_DUMP_PROTO) list(REMOVE_ITEM _PARALLEL_SRC_FILES "parallel/strategy_checkpoint/parallel_strategy_checkpoint.cc") endif () diff --git a/mindspore/ccsrc/parallel/ps/scheduler.cc b/mindspore/ccsrc/parallel/ps/scheduler.cc new file mode 100755 index 00000000000..81cd5f9358e --- /dev/null +++ b/mindspore/ccsrc/parallel/ps/scheduler.cc @@ -0,0 +1,32 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "parallel/ps/scheduler.h"
+#include <unistd.h>  // sleep()
+#include "ps/ps.h"
+
+namespace mindspore {
+namespace parallel {
+namespace ps {
+void Scheduler::Run() {  // Calls ::ps::Start(0), then keeps the calling thread asleep forever.
+  ::ps::Start(0);
+  while (true) {  // no exit condition: Run() never returns to its caller
+    sleep(1);
+  }
+}
+}  // namespace ps
+}  // namespace parallel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/parallel/ps/scheduler.h b/mindspore/ccsrc/parallel/ps/scheduler.h
new file mode 100755
index 00000000000..e656bcfd22c
--- /dev/null
+++ b/mindspore/ccsrc/parallel/ps/scheduler.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_PARALLEL_PS_SCHEDULER_H_
+#define MINDSPORE_MINDSPORE_CCSRC_PARALLEL_PS_SCHEDULER_H_
+namespace mindspore {
+namespace parallel {
+namespace ps {
+class Scheduler {  // Singleton: the only object is the one handed out by GetInstance().
+ public:
+  static Scheduler &GetInstance() {
+    static Scheduler scheduler;  // function-local static, constructed on the first call
+    return scheduler;
+  }
+
+  void Run();  // declared here only; the body lives outside this header
+
+ private:
+  Scheduler(const Scheduler &) = delete;  // copying is disabled
+  Scheduler &operator=(const Scheduler &) = delete;
+  Scheduler() = default;  // private, so outside code cannot create further instances
+  ~Scheduler() = default;
+};
+}  // namespace ps
+}  // namespace parallel
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_PARALLEL_PS_SCHEDULER_H_
diff --git a/mindspore/ccsrc/parallel/ps/util.cc b/mindspore/ccsrc/parallel/ps/util.cc
new file mode 100644
index 00000000000..dbc258284e7
--- /dev/null
+++ b/mindspore/ccsrc/parallel/ps/util.cc
@@ -0,0 +1,128 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "parallel/ps/util.h"
+#include <cmath>
+#include <cstring>
+#include <string>
+#include <unordered_map>
+#include "parallel/ps/common.h"
+#include "common/utils.h"
+
+namespace mindspore {
+namespace parallel {
+namespace ps {
+namespace {
+// True when the environment variable named by kEnvRole holds exactly expected_role.
+bool RoleEquals(const char *expected_role) { return strcmp(common::GetEnv(kEnvRole).c_str(), expected_role) == 0; }
+}  // namespace
+
+// Optimizer name -> id. Keep in sync with id_to_optimizers, which lists the same pairs reversed.
+std::unordered_map<std::string, int> Util::optimizer_to_ids{
+  {kApplyMomentum, 0},
+  {kSparseAdam, 1},
+  {kSparseFtrl, 2},
+};
+
+// Optimizer id -> name.
+std::unordered_map<int, std::string> Util::id_to_optimizers{
+  {0, kApplyMomentum},
+  {1, kSparseAdam},
+  {2, kSparseFtrl},
+};
+
+// True when this process has any of the three roles checked below (worker, server or scheduler).
+bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); }
+
+bool Util::IsRoleOfWorker() { return RoleEquals(kEnvRoleOfWorker); }
+
+bool Util::IsRoleOfPServer() { return RoleEquals(kEnvRoleOfPServer); }
+
+bool Util::IsRoleOfScheduler() { return RoleEquals(kEnvRoleOfScheduler); }
+
+// When IsParamServerMode() holds, copies every non-empty kEnv* setting to its kDmlc* counterpart and
+// sets kDmlcRole according to the detected role. Does nothing otherwise.
+void Util::SetInternalEnvVar() {
+  if (!IsParamServerMode()) {
+    return;
+  }
+  const auto comm_type = common::GetEnv(kEnvCommType);
+  if (!comm_type.empty()) {
+    (void)common::SetEnv(kDmlcCommType, comm_type.c_str());
+  }
+  const auto interface = common::GetEnv(kEnvInterface);
+  if (!interface.empty()) {
+    (void)common::SetEnv(kDmlcInterface, interface.c_str());
+  }
+  const auto server_num = common::GetEnv(kEnvPServerNum);
+  if (!server_num.empty()) {
+    (void)common::SetEnv(kDmlcPServerNum, server_num.c_str());
+  }
+  const auto worker_num = common::GetEnv(kEnvWorkerNum);
+  if (!worker_num.empty()) {
+    (void)common::SetEnv(kDmlcWorkerNum, worker_num.c_str());
+  }
+  if (IsRoleOfScheduler()) {
+    (void)common::SetEnv(kDmlcRole, kRoleOfScheduler);
+  } else if (IsRoleOfPServer()) {
+    (void)common::SetEnv(kDmlcRole, kRoleOfPServer);
+  } else if (IsRoleOfWorker()) {
+    (void)common::SetEnv(kDmlcRole, kRoleOfWorker);
+  }
+  const auto scheduler_host = common::GetEnv(kEnvSchedulerHost);
+  if (!scheduler_host.empty()) {
+    (void)common::SetEnv(kDmlcSchedulerHost, scheduler_host.c_str());
+  }
+  const auto scheduler_port = common::GetEnv(kEnvSchedulerPort);
+  if (!scheduler_port.empty()) {
+    (void)common::SetEnv(kDmlcSchedulerPort, scheduler_port.c_str());
+  }
+}
+
+// Returns the id registered for optimizer `name`, or -1 when the name is unknown.
+int Util::optimizer_id(std::string name) {
+  const auto iter = optimizer_to_ids.find(name);  // single lookup instead of count() + operator[]
+  return iter == optimizer_to_ids.end() ? -1 : iter->second;
+}
+
+// Returns the optimizer name registered for `id`, or an empty string when the id is unknown.
+std::string Util::optimizer_name(int id) {
+  const auto iter = id_to_optimizers.find(id);
+  return iter == id_to_optimizers.end() ? std::string() : iter->second;
+}
+
+// True when `name` is one of the keys of optimizer_to_ids.
+bool Util::is_optimizer(std::string name) { return optimizer_to_ids.count(name) > 0; }
+
+// Size of rank_id's share when first_dim is split over server_num servers. Every rank gets
+// round(first_dim / server_num), except that when the split is uneven the last rank gets whatever
+// the earlier ranks left over (never less than 0). Returns 0 when server_num <= 0.
+int Util::LocalShard(int first_dim, int rank_id, int server_num) {
+  if (server_num <= 0) {
+    return 0;  // avoids the division by zero below
+  }
+  // The cast's target type was missing in the patch; double represents every int exactly.
+  const int shard_size = static_cast<int>(std::round(static_cast<double>(first_dim) / server_num));
+  if (first_dim % server_num == 0 || rank_id < server_num - 1) {
+    return shard_size;
+  }
+  // Rounding up can hand the earlier ranks more than first_dim in total; do not report a negative size.
+  const int last_shard = first_dim - shard_size * (server_num - 1);
+  return last_shard > 0 ? last_shard : 0;
+}
+}  // namespace ps
+}  // namespace parallel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/parallel/ps/util.h b/mindspore/ccsrc/parallel/ps/util.h
new file mode 100644
index 00000000000..b55ced0c97c
--- /dev/null
+++ b/mindspore/ccsrc/parallel/ps/util.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_PARALLEL_PS_UTIL_H_
+#define MINDSPORE_MINDSPORE_CCSRC_PARALLEL_PS_UTIL_H_
+
+#include <map>  // NOTE(review): this header name was missing in the patch; <map> is a guess - confirm
+#include <string>
+#include <unordered_map>
+#include "session/anf_runtime_algorithm.h"
+
+namespace mindspore {
+namespace parallel {
+namespace ps {
+class Util {  // Static-only helpers for parameter-server mode; the definitions are in util.cc.
+ public:
+  static bool IsParamServerMode();  // true when any of the three role checks below is true
+  static bool IsRoleOfWorker();
+  static bool IsRoleOfPServer();
+  static bool IsRoleOfScheduler();
+  static void SetInternalEnvVar();  // forwards the kEnv* environment settings to the kDmlc* variables
+  static int optimizer_id(std::string name);  // -1 when name is not a known optimizer
+  static std::string optimizer_name(int id);  // "" when id is not a known optimizer id
+  static bool is_optimizer(std::string name);
+  static int LocalShard(int first_dim, int rank_id, int server_num);  // rank_id's share of first_dim
+
+ private:
+  static std::unordered_map<std::string, int> optimizer_to_ids;  // optimizer name -> id
+  static std::unordered_map<int, std::string> id_to_optimizers;  // optimizer id -> name
+};
+}  // namespace ps
+}  // namespace parallel
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_PARALLEL_PS_UTIL_H_
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index dcc798165b7..e4d52f6eeec 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -115,6 +115,8 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/debug/dump_proto.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/ir/lite/tensor.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/parallel/strategy_checkpoint/parallel_strategy_checkpoint.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/parallel/ps/util.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/parallel/ps/scheduler.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/utils/anf_ir.pb.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/utils/node_strategy.pb.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/utils/load_onnx/anf_model_parser.cc")
diff --git a/third_party/patch/pslite/ps_lite.patch001 b/third_party/patch/pslite/ps_lite.patch001
index bdc7b11a4b2..e2e51e93c86 100644 --- a/third_party/patch/pslite/ps_lite.patch001 +++ b/third_party/patch/pslite/ps_lite.patch001 @@ -12,16 +12,7 @@ diff -Npur ps-lite-master/include/dmlc/base.h ps-lite-master-new/include/dmlc/ba /*! diff -Npur ps-lite-master/include/dmlc/logging.h ps-lite-master-new/include/dmlc/logging.h --- ps-lite-master/include/dmlc/logging.h 2020-02-29 13:59:55.000000000 +0800 -+++ ps-lite-master-new/include/dmlc/logging.h 2020-07-01 11:58:00.015919207 +0800 -@@ -13,7 +13,7 @@ - #include - #include - #include --#include "./base.h" -+//#include "./base.h" - - #if DMLC_LOG_STACK_TRACE - #include ++++ ps-lite-master-new/include/dmlc/logging.h 2020-07-08 21:35:33.334584767 +0800 @@ -52,7 +52,7 @@ struct Error : public std::runtime_error namespace dmlc {