forked from mindspore-Ecosystem/mindspore
add parallel executor stub
This commit is contained in:
parent
183b8f4725
commit
06a13736a6
|
@ -18,7 +18,11 @@
|
|||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#if !defined(NO_DLIB) || defined(ENABLE_GPU)
|
||||
#include "backend/session/executor_manager.h"
|
||||
#else
|
||||
#include "frontend/parallel/parallel_stub/executor_manager_stub.h"
|
||||
#endif
|
||||
#include "frontend/parallel/device_manager.h"
|
||||
#include "utils/comm_manager.h"
|
||||
#include "utils/ms_context.h"
|
||||
|
@ -66,6 +70,79 @@ Status Group::GetIndex(size_t *index) {
|
|||
|
||||
// Constructs the manager and makes sure the group table starts out empty.
GroupManager::GroupManager() {
  groups_.clear();
}
|
||||
|
||||
#if !defined(NO_DLIB) || defined(ENABLE_GPU)
|
||||
bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
|
||||
const std::vector<uint32_t> ranks, int device_id) {
|
||||
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
|
||||
MS_EXCEPTION_IF_NULL(executor);
|
||||
bool ret = executor->CreateCommGroup(group_name, ranks);
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool GroupManager::DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name,
|
||||
int device_id) {
|
||||
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
|
||||
MS_EXCEPTION_IF_NULL(executor);
|
||||
bool ret = executor->DestroyCommGroup(group_name);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Creates every communication group listed in `group_info` (pairs of group name and rank list)
// through the session executor selected by the device target and device id read from MsContext.
// Stops and returns FAILED at the first group the executor fails to create; returns SUCCESS
// once all groups have been created.
Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
  // Resolve the executor for the currently configured device.
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t target_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  auto comm_executor = session::ExecutorManager::Instance().GetExecutor(target, target_id);
  MS_EXCEPTION_IF_NULL(comm_executor);

  for (auto &entry : group_info) {
    if (comm_executor->CreateCommGroup(entry.first, entry.second)) {
      MS_LOG(INFO) << "Create group success, group name is " << entry.first << ", ranks is " << entry.second;
      continue;
    }
    MS_LOG(ERROR) << "Create group failed, group name is " << entry.first << ", ranks is " << entry.second;
    return FAILED;
  }

  return SUCCESS;
}
|
||||
#else
|
||||
bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
|
||||
const std::vector<uint32_t> ranks, int device_id) {
|
||||
MS_LOG(WARNING) << "Create group in stub";
|
||||
auto executor = parallel::ExecutorManager::Instance().GetExecutor(device_name, device_id);
|
||||
MS_EXCEPTION_IF_NULL(executor);
|
||||
return executor->CreateCommGroup(group_name, ranks);
|
||||
}
|
||||
|
||||
bool GroupManager::DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name,
|
||||
int device_id) {
|
||||
MS_LOG(WARNING) << "Destroy group in stub";
|
||||
auto executor = parallel::ExecutorManager::Instance().GetExecutor(device_name, device_id);
|
||||
MS_EXCEPTION_IF_NULL(executor);
|
||||
return executor->DestroyCommGroup(group_name);
|
||||
}
|
||||
|
||||
// Stub-build variant of CreateGroups: walks `group_info` (pairs of group name and rank list) and
// asks the parallel stub executor, selected by the device target and device id read from
// MsContext, to create each group. Returns FAILED at the first group the executor fails to
// create and SUCCESS once all groups have been created.
Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
  // Resolve the executor for the currently configured device.
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t target_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  auto stub_executor = parallel::ExecutorManager::Instance().GetExecutor(target, target_id);
  MS_EXCEPTION_IF_NULL(stub_executor);

  for (auto &entry : group_info) {
    if (stub_executor->CreateCommGroup(entry.first, entry.second)) {
      MS_LOG(INFO) << "Create group success, group name is " << entry.first << ", ranks is " << entry.second;
      continue;
    }
    MS_LOG(ERROR) << "Create group failed, group name is " << entry.first << ", ranks is " << entry.second;
    return FAILED;
  }

  return SUCCESS;
}
|
||||
#endif
|
||||
Status GroupManager::CreateGroup(const std::string &group_name, const std::vector<Device> &devices,
|
||||
mindspore::parallel::Group *const group) {
|
||||
// it is simple to use size to determine whether it is a world group
|
||||
|
@ -102,9 +179,7 @@ Status GroupManager::CreateGroup(const std::string &group_name, const std::vecto
|
|||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
|
||||
MS_EXCEPTION_IF_NULL(executor);
|
||||
bool ret = executor->CreateCommGroup(group_name, ranks);
|
||||
bool ret = CreateGroupByExecutor(device_name, group_name, ranks, device_id);
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "Create group failed, group name is " << group_name;
|
||||
return Status::FAILED;
|
||||
|
@ -123,9 +198,7 @@ Status GroupManager::DestroyGroup(const std::string &group_name) {
|
|||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
|
||||
MS_EXCEPTION_IF_NULL(executor);
|
||||
bool ret = executor->DestroyCommGroup(group_name);
|
||||
bool ret = DestroyGroupByExecutor(device_name, group_name, device_id);
|
||||
if (!ret) {
|
||||
return Status::FAILED;
|
||||
}
|
||||
|
@ -192,26 +265,5 @@ Status GroupManager::FindGroup(const std::string &name, mindspore::parallel::Gro
|
|||
|
||||
// Delegates to DestroyAllGroups() and deliberately discards the value it returns.
void GroupManager::Clear() {
  (void)DestroyAllGroups();
}
|
||||
|
||||
// Creates every communication group listed in `group_info` (pairs of group name and rank list)
// through the session executor selected by the device target and device id read from MsContext.
// Returns FAILED at the first group the executor fails to create and SUCCESS once all groups
// have been created.
Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
  // Resolve the executor for the currently configured device.
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t target_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  auto comm_executor = session::ExecutorManager::Instance().GetExecutor(target, target_id);
  MS_EXCEPTION_IF_NULL(comm_executor);

  for (auto &entry : group_info) {
    if (comm_executor->CreateCommGroup(entry.first, entry.second)) {
      MS_LOG(INFO) << "Create group success, group name is " << entry.first << ", ranks is " << entry.second;
      continue;
    }
    MS_LOG(ERROR) << "Create group failed, group name is " << entry.first << ", ranks is " << entry.second;
    return FAILED;
  }

  return SUCCESS;
}
|
||||
|
||||
} // namespace parallel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -67,6 +67,9 @@ class GroupManager {
|
|||
void Clear();
|
||||
|
||||
private:
|
||||
bool CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
|
||||
const std::vector<uint32_t> ranks, int device_id);
|
||||
bool DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name, int device_id);
|
||||
Status DestroyGroup(const std::string &group_name);
|
||||
// the key is group name (name_)
|
||||
std::map<std::string, Group> groups_;
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "frontend/parallel/parallel_stub/executor_manager_stub.h"
|
||||
namespace mindspore {
|
||||
namespace parallel {
|
||||
std::shared_ptr<Executor> ExecutorManager::GetExecutor(const std::string &device_name, int device_id) {
|
||||
std::string device_key = device_name + "_" + std::to_string(device_id);
|
||||
auto iter = executors_.find(device_key);
|
||||
if (iter != executors_.end()) {
|
||||
return iter->second;
|
||||
}
|
||||
auto executor = std::make_shared<Executor>(device_name, device_id);
|
||||
executors_[device_key] = executor;
|
||||
return executor;
|
||||
}
|
||||
|
||||
} // namespace parallel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
|
||||
#define MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include "frontend/parallel/parallel_stub/executor_stub.h"
|
||||
namespace mindspore {
|
||||
namespace parallel {
|
||||
class Executor;

// Singleton registry of the stub Executor objects used by the parallel front end when the
// real session executor manager is not compiled in.
class ExecutorManager {
 public:
  // Returns the single process-wide instance (a function-local static created on first use).
  static ExecutorManager &Instance() {
    static ExecutorManager instance;
    return instance;
  }

  // A singleton must not be duplicated: copying or assigning it would produce a second
  // executor table that diverges from the one returned by Instance().
  ExecutorManager(const ExecutorManager &) = delete;
  ExecutorManager &operator=(const ExecutorManager &) = delete;

  // Returns the executor associated with (`device_name`, `device_id`).
  std::shared_ptr<Executor> GetExecutor(const std::string &device_name, int device_id);

 private:
  // Construction and destruction are private so that Instance() is the only access path.
  ExecutorManager() = default;
  ~ExecutorManager() = default;

  // Executors handed out so far; the key is built by GetExecutor.
  std::map<std::string, std::shared_ptr<Executor>> executors_;
};
|
||||
} // namespace parallel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H
|
||||
#define MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
namespace mindspore {
|
||||
namespace parallel {
|
||||
// Stub executor: remembers the device it was created for and reports success for every
// communication-group request without doing any work.
class Executor {
 public:
  // Records the device name and id this executor stands in for.
  Executor(const std::string &device_name, uint32_t device_id) : device_name_(device_name), device_id_(device_id) {}
  ~Executor() = default;

  // Always returns true. `ranks` is taken by const reference: the stub neither modifies nor
  // stores it, so the former by-value parameter only forced a vector copy on every call.
  bool CreateCommGroup(const std::string &group_name, const std::vector<uint32_t> &ranks) const { return true; }

  // Always returns true.
  bool DestroyCommGroup(const std::string &group_name) const { return true; }

 private:
  std::string device_name_;
  uint32_t device_id_;
};
|
||||
} // namespace parallel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H
|
Loading…
Reference in New Issue