add parallel executor stub

This commit is contained in:
yao_yf 2021-04-08 10:35:11 +08:00
parent 183b8f4725
commit 06a13736a6
5 changed files with 192 additions and 27 deletions

View File

@ -18,7 +18,11 @@
#include <algorithm>
#include <vector>
#include <utility>
#if !defined(NO_DLIB) || defined(ENABLE_GPU)
#include "backend/session/executor_manager.h"
#else
#include "frontend/parallel/parallel_stub/executor_manager_stub.h"
#endif
#include "frontend/parallel/device_manager.h"
#include "utils/comm_manager.h"
#include "utils/ms_context.h"
@ -66,6 +70,79 @@ Status Group::GetIndex(size_t *index) {
// Constructs the manager and explicitly empties the group table.
GroupManager::GroupManager() {
  groups_.clear();
}
#if !defined(NO_DLIB) || defined(ENABLE_GPU)
// Creates the communication group `group_name` over `ranks` through the session executor
// obtained for (device_name, device_id). The executor pointer is checked with
// MS_EXCEPTION_IF_NULL before use; the result of CreateCommGroup is returned unchanged.
bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
                                         const std::vector<uint32_t> ranks, int device_id) {
  auto session_executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
  MS_EXCEPTION_IF_NULL(session_executor);
  return session_executor->CreateCommGroup(group_name, ranks);
}
// Destroys the communication group `group_name` through the session executor obtained for
// (device_name, device_id). The executor pointer is checked with MS_EXCEPTION_IF_NULL before
// use; the result of DestroyCommGroup is returned unchanged.
bool GroupManager::DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name,
                                          int device_id) {
  auto session_executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
  MS_EXCEPTION_IF_NULL(session_executor);
  return session_executor->DestroyCommGroup(group_name);
}
// Creates every (group name, rank list) pair in `group_info` through the session executor
// selected by the device target and device id stored in MsContext.
// Stops at the first group that cannot be created and returns FAILED; returns SUCCESS once
// all groups have been created (including when `group_info` is empty).
Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t target_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  auto session_executor = session::ExecutorManager::Instance().GetExecutor(target, target_id);
  MS_EXCEPTION_IF_NULL(session_executor);
  for (const auto &entry : group_info) {
    const bool created = session_executor->CreateCommGroup(entry.first, entry.second);
    if (created) {
      MS_LOG(INFO) << "Create group success, group name is " << entry.first << ", ranks is " << entry.second;
      continue;
    }
    // First failure aborts the whole batch; groups created so far are left as they are.
    MS_LOG(ERROR) << "Create group failed, group name is " << entry.first << ", ranks is " << entry.second;
    return FAILED;
  }
  return SUCCESS;
}
#else
// Stub-build variant: logs a warning, then forwards the group creation request to the
// parallel stub executor obtained for (device_name, device_id) and returns its result.
bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
                                         const std::vector<uint32_t> ranks, int device_id) {
  MS_LOG(WARNING) << "Create group in stub";
  auto stub_executor = parallel::ExecutorManager::Instance().GetExecutor(device_name, device_id);
  MS_EXCEPTION_IF_NULL(stub_executor);
  const bool created = stub_executor->CreateCommGroup(group_name, ranks);
  return created;
}
// Stub-build variant: logs a warning, then forwards the group destruction request to the
// parallel stub executor obtained for (device_name, device_id) and returns its result.
bool GroupManager::DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name,
                                          int device_id) {
  MS_LOG(WARNING) << "Destroy group in stub";
  auto stub_executor = parallel::ExecutorManager::Instance().GetExecutor(device_name, device_id);
  MS_EXCEPTION_IF_NULL(stub_executor);
  const bool destroyed = stub_executor->DestroyCommGroup(group_name);
  return destroyed;
}
// Stub-build variant of CreateGroups: walks `group_info` and asks the parallel stub executor
// (selected by the device target and device id stored in MsContext) to create each group.
// Returns FAILED at the first group the executor rejects, otherwise SUCCESS.
Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t target_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  auto stub_executor = parallel::ExecutorManager::Instance().GetExecutor(target, target_id);
  MS_EXCEPTION_IF_NULL(stub_executor);
  for (const auto &entry : group_info) {
    const bool created = stub_executor->CreateCommGroup(entry.first, entry.second);
    if (created) {
      MS_LOG(INFO) << "Create group success, group name is " << entry.first << ", ranks is " << entry.second;
      continue;
    }
    // First failure aborts the whole batch.
    MS_LOG(ERROR) << "Create group failed, group name is " << entry.first << ", ranks is " << entry.second;
    return FAILED;
  }
  return SUCCESS;
}
#endif
Status GroupManager::CreateGroup(const std::string &group_name, const std::vector<Device> &devices,
mindspore::parallel::Group *const group) {
// it is simple to use size to determine whether it is a world group
@ -102,9 +179,7 @@ Status GroupManager::CreateGroup(const std::string &group_name, const std::vecto
MS_EXCEPTION_IF_NULL(context_ptr);
std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
MS_EXCEPTION_IF_NULL(executor);
bool ret = executor->CreateCommGroup(group_name, ranks);
bool ret = CreateGroupByExecutor(device_name, group_name, ranks, device_id);
if (!ret) {
MS_LOG(ERROR) << "Create group failed, group name is " << group_name;
return Status::FAILED;
@ -123,9 +198,7 @@ Status GroupManager::DestroyGroup(const std::string &group_name) {
MS_EXCEPTION_IF_NULL(context_ptr);
std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
MS_EXCEPTION_IF_NULL(executor);
bool ret = executor->DestroyCommGroup(group_name);
bool ret = DestroyGroupByExecutor(device_name, group_name, device_id);
if (!ret) {
return Status::FAILED;
}
@ -192,26 +265,5 @@ Status GroupManager::FindGroup(const std::string &name, mindspore::parallel::Gro
// Destroys all groups; the status returned by DestroyAllGroups is intentionally discarded.
void GroupManager::Clear() {
  static_cast<void>(DestroyAllGroups());
}
// Creates each (group name, rank list) pair in `group_info` through the session executor
// selected by the device target and device id stored in MsContext.
// Returns FAILED as soon as one group cannot be created, otherwise SUCCESS.
Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t target_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  auto session_executor = session::ExecutorManager::Instance().GetExecutor(target, target_id);
  MS_EXCEPTION_IF_NULL(session_executor);
  for (const auto &entry : group_info) {
    const bool created = session_executor->CreateCommGroup(entry.first, entry.second);
    if (created) {
      MS_LOG(INFO) << "Create group success, group name is " << entry.first << ", ranks is " << entry.second;
      continue;
    }
    // First failure aborts the whole batch.
    MS_LOG(ERROR) << "Create group failed, group name is " << entry.first << ", ranks is " << entry.second;
    return FAILED;
  }
  return SUCCESS;
}
} // namespace parallel
} // namespace mindspore

View File

@ -67,6 +67,9 @@ class GroupManager {
void Clear();
private:
bool CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
const std::vector<uint32_t> ranks, int device_id);
bool DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name, int device_id);
Status DestroyGroup(const std::string &group_name);
// the key is group name (name_)
std::map<std::string, Group> groups_;

View File

@ -0,0 +1,31 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "frontend/parallel/parallel_stub/executor_manager_stub.h"
namespace mindspore {
namespace parallel {
std::shared_ptr<Executor> ExecutorManager::GetExecutor(const std::string &device_name, int device_id) {
std::string device_key = device_name + "_" + std::to_string(device_id);
auto iter = executors_.find(device_key);
if (iter != executors_.end()) {
return iter->second;
}
auto executor = std::make_shared<Executor>(device_name, device_id);
executors_[device_key] = executor;
return executor;
}
} // namespace parallel
} // namespace mindspore

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
#define MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
#include <set>
#include <map>
#include <string>
#include <memory>
#include "frontend/parallel/parallel_stub/executor_stub.h"
namespace mindspore {
namespace parallel {
class Executor;

// Singleton registry of stub executors, keyed by a string built from device name and id.
// Used in builds where the real session::ExecutorManager is not available.
class ExecutorManager {
 public:
  // Returns the single process-wide instance (function-local static, created on first use).
  static ExecutorManager &Instance() {
    static ExecutorManager instance;
    return instance;
  }

  // A singleton must not be duplicated: without these, `Instance() = Instance()` and
  // `new ExecutorManager(Instance())` would both compile.
  ExecutorManager(const ExecutorManager &) = delete;
  ExecutorManager &operator=(const ExecutorManager &) = delete;

  // Returns the executor associated with (device_name, device_id).
  std::shared_ptr<Executor> GetExecutor(const std::string &device_name, int device_id);

 private:
  // Construction and destruction are restricted to Instance().
  ExecutorManager() = default;
  ~ExecutorManager() = default;

  // Executors handed out so far, indexed by their device key.
  std::map<std::string, std::shared_ptr<Executor>> executors_;
};
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H
#define MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
namespace mindspore {
namespace parallel {
// Stub executor: records the device it was created for and reports success for every
// communication-group request without performing any work.
class Executor {
 public:
  // Stores the device name and id; neither is read by the stub operations below.
  Executor(const std::string &device_name, uint32_t device_id) : device_name_(device_name), device_id_(device_id) {}
  ~Executor() = default;

  // Always returns true. `ranks` is taken by const reference so a call never copies the
  // vector; both lvalue and temporary arguments still bind.
  bool CreateCommGroup(const std::string &group_name, const std::vector<uint32_t> &ranks) const {
    (void)group_name;  // intentionally unused in the stub
    (void)ranks;       // intentionally unused in the stub
    return true;
  }

  // Always returns true.
  bool DestroyCommGroup(const std::string &group_name) const {
    (void)group_name;  // intentionally unused in the stub
    return true;
  }

 private:
  std::string device_name_;
  uint32_t device_id_;
};
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H