add kbk features

parent cbb2c60804
commit b7dc93c5e2
@@ -28,6 +28,7 @@
#include "ops/framework_ops.h"
#include "ir/anf.h"
#include "utils/log_adapter.h"
#include "ir/func_graph_cloner.h"
#include "utils/shape_utils.h"
#include "include/common/utils/utils.h"
#include "include/common/utils/parallel_context.h"
@@ -2749,5 +2749,88 @@ void KernelGraphMgr::SetKernelGraphId(const KernelGraphPtr &kernel_graph) {
}

void KernelGraphMgr::UnifyMindIR(const KernelGraphPtr &graph) { opt::CommonUnifyMindIR(graph); }

namespace {
void CopyCNodeInfo(const FuncGraphPtr &func_graph, const uint32_t &target_graph_id, const AnfNodePtr &ori_node,
                   const AnfNodePtr &new_node) {
  MS_EXCEPTION_IF_NULL(new_node);
  MS_EXCEPTION_IF_NULL(ori_node);
  auto kernel_info = dynamic_cast<device::KernelInfo *>(new_node->kernel_info());
  // deep copy kernel info
  if (kernel_info != nullptr && kernel_info->has_build_info()) {
    // some check
    MS_EXCEPTION_IF_CHECK_FAIL(kernel_info->MutableKernelMod() == nullptr,
                               "Inline ERROR: " + ori_node->DebugString() + ", kernel mod is not nullptr");
    MS_EXCEPTION_IF_CHECK_FAIL(kernel_info->output_address_list().empty(),
                               "Inline ERROR: " + ori_node->DebugString() + ", output_address_list is not empty");
    MS_EXCEPTION_IF_CHECK_FAIL(kernel_info->workspace_address_list().empty(),
                               "Inline ERROR: " + ori_node->DebugString() + ", workspace_address_list is not empty");

    auto new_kernel_info = std::make_shared<device::KernelInfo>();
    auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(
      AnfRuntimeAlgorithm::GetSelectKernelBuildInfo(new_node));
    MS_EXCEPTION_IF_NULL(builder);
    MS_EXCEPTION_IF_NULL(new_kernel_info);
    new_kernel_info->set_select_kernel_build_info(builder->Build());
    new_kernel_info->set_graph_id(target_graph_id);
    new_kernel_info->set_feature_map_flag(kernel_info->is_feature_map());
    new_kernel_info->set_ref_map(false, kernel_info->out_in_ref_map());
    new_node->set_kernel_info(new_kernel_info);
  } else {
    auto new_kernel_info = std::make_shared<device::KernelInfo>();
    new_node->set_kernel_info(new_kernel_info);
  }
  if (ori_node->isa<CNode>()) {
    auto ori_cnode = ori_node->cast<CNodePtr>();
    if (common::AnfAlgo::HasNodeAttr(kAttrIsUBFusionOp, ori_cnode) &&
        common::AnfAlgo::GetNodeAttr<bool>(ori_node, kAttrIsUBFusionOp)) {
      // already done fusion compile
      auto ori_full_name = ori_cnode->fullname_with_scope();
      common::AnfAlgo::SetNodeAttr(kAttrOriFusionName, MakeValue(ori_full_name), new_node);
    }
    common::AnfAlgo::SetNodeAttr(kAttrNeedInline, MakeValue(ori_node->fullname_with_scope()), new_node);
    common::AnfAlgo::SetNodeAttr(kAttrPreKernelGraph, MakeValue(func_graph), new_node);
  }
}
}  // namespace

AnfNodePtr KernelGraphMgr::DoInline(const FuncGraphPtr &func_graph, const FuncGraphPtr &target_func_graph,
                                    const AnfNodePtrList &func_graph_args, const ScopePtr &scope,
                                    const uint32_t &target_graph_id,
                                    const std::map<session::AnfWithOutIndex, session::AnfWithOutIndex> &ref_map,
                                    const KernelGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(func_graph);
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(target_func_graph);
  Cloner cloner({}, false);
  if (scope != nullptr) {
    cloner.set_scope(scope);
  }
  cloner.AddClone(func_graph, target_func_graph, func_graph_args, kInline);
  auto node_list = TopoSort(func_graph->output());
  for (auto &ori_node : node_list) {
    MS_EXCEPTION_IF_NULL(ori_node);
    if (ori_node->isa<Parameter>()) {
      continue;
    }
    auto new_node = cloner[ori_node];
    MS_EXCEPTION_IF_NULL(new_node);
    if (new_node->isa<ValueNode>()) {
      auto value_node = new_node->cast<ValueNodePtr>();
      MS_EXCEPTION_IF_NULL(value_node);
      graph->AddValueNodeToGraph(value_node);
    }
    CopyCNodeInfo(func_graph, target_graph_id, ori_node, new_node);
  }
  for (const auto &kv : ref_map) {
    auto final_pair = kv.first;
    auto origin_pair = kv.second;
    final_pair.first = cloner[final_pair.first];
    origin_pair.first = cloner[origin_pair.first];
    auto new_origin_pair = common::AnfAlgo::VisitKernel(origin_pair.first, origin_pair.second);
    graph->AddRefCorrespondPairs(final_pair, new_origin_pair);
  }
  return cloner[func_graph->output()];
}
}  // namespace session
}  // namespace mindspore
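DoInline clones every node of func_graph into target_func_graph with kInline semantics: parameters are substituted by func_graph_args, kernel info is re-tagged with the caller's graph id, value nodes and ref pairs are re-registered on the destination graph, and the clone of the sub-graph's output replaces the call site. A hypothetical call site, for illustration only (the names call_node, sub_graph and caller_graph, and the KernelGraph::GetRefMap() accessor, are assumptions not shown in this commit):

// Hypothetical sketch, not part of the commit.
AnfNodePtr InlineCall(const CNodePtr &call_node, const KernelGraphPtr &sub_graph,
                      const KernelGraphPtr &caller_graph) {
  // The call's real inputs become the arguments bound to the sub-graph's parameters.
  AnfNodePtrList args(call_node->inputs().begin() + 1, call_node->inputs().end());
  return session::KernelGraphMgr::DoInline(sub_graph, caller_graph, args, call_node->scope(),
                                           caller_graph->graph_id(), sub_graph->GetRefMap(), caller_graph);
}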
@@ -98,6 +98,12 @@ class BACKEND_EXPORT KernelGraphMgr {

  mindspore::HashMap<FuncGraph *, KernelGraphPtr> GetFrontBackendGraphMap() const { return front_backend_graph_map_; }
  void CacheKernelGraph(const KernelGraphPtr &kg);
+  // do inline
+  static AnfNodePtr DoInline(const FuncGraphPtr &func_graph, const FuncGraphPtr &target_func_graph,
+                             const AnfNodePtrList &func_graph_args, const ScopePtr &scope,
+                             const uint32_t &target_graph_id,
+                             const std::map<session::AnfWithOutIndex, session::AnfWithOutIndex> &ref_map,
+                             const KernelGraphPtr &graph);

 private:
  void GetCNodeInfo(const CNodePtr &cnode, std::vector<AnfNodePtr> *cnode_inputs) const;
@@ -812,14 +812,13 @@ void Somas::InitCommonNodeInputs(const CNodePtr &kernel) {
    if ((op_name == kDynamicRNNOpName || op_name == kDynamicGRUV2OpName) && input_origin_type == kMetaTypeNone) {
      continue;
    }
-   auto index = AnfAlgo::GetInputKernelIdxByGraphIdx(kernel, i);
    size_t input_size = 0;
-   if (index >= input_size_list.size()) {
-     MS_LOG(INFO) << "Node: " << kernel->fullname_with_scope() << " input idx: " << index
+   if (i >= input_size_list.size()) {
+     MS_LOG(INFO) << "Node: " << kernel->fullname_with_scope() << " input idx: " << i
                   << " greater than the size of input_size_list: " << input_size_list.size()
                   << ", so use 0 as parameter size.";
    } else {
-     input_size = input_size_list.at(index);
+     input_size = input_size_list.at(i);
    }
    auto parameter =
      GetSomasParameter(prenode_index.first, prenode_index.second, input_size, kernel->fullname_with_scope());
@@ -1018,6 +1017,7 @@ void Somas::SummaryInputProcess(const session::KernelGraph &graph) {
#endif

void Somas::GraphOutputProcess(const session::KernelGraph &graph) {
+ bool need_reuse_graph_output = NeedReuseGraphOutput();
  size_t count = 0;
  auto outputs = common::AnfAlgo::GetAllOutputWithIndex(graph.output());
  for (auto &output : outputs) {
@@ -1045,8 +1045,12 @@ void Somas::GraphOutputProcess(const session::KernelGraph &graph) {
      MS_EXCEPTION_IF_NULL(node);
      if (output_index <= node->output_tensors_.size()) {
        auto &tensor = node->output_tensors_[output_index];
-       tensor->aligned_size_ = 0;
-       tensor->type_ = kGraphOutput;
+       if (need_reuse_graph_output) {
+         tensor->lifelong_value_ = kLifeLongGraphEnd;
+       } else {
+         tensor->aligned_size_ = 0;
+         tensor->type_ = kGraphOutput;
+       }
        count++;
      } else {
        MS_LOG(INTERNAL_EXCEPTION) << "Graph's output node " << output_kernel->fullname_with_scope()
@@ -1707,6 +1711,7 @@ void Somas::UpdateUnionTensorsOffset() {
  }
}

+namespace {
// Disjoint-set
size_t find_father(std::vector<size_t> *father, size_t x) {
  MS_EXCEPTION_IF_NULL(father);
@@ -1720,15 +1725,13 @@ size_t find_father(std::vector<size_t> *father, size_t x) {
  return (*father)[x];
}

-void Somas::UpdateUnionTensorsConflict() {
-  // Keep all constraints for first tensor in list
-  MS_EXCEPTION_IF_NULL(tensors_list_.back());
-  size_t cnt = tensors_list_.back()->GetId() + 1;
+std::vector<vector<size_t>> GetRegularUnionTensorsList(size_t cnt,
+                                                       const std::vector<vector<size_t>> &union_tensors_list) {
  std::vector<size_t> father;
  for (size_t i = 0; i < cnt; i++) {
    father.push_back(i);
  }
- for (auto union_node_list : union_tensors_list_) {
+ for (auto union_node_list : union_tensors_list) {
    if (union_node_list.empty()) {
      MS_LOG(INTERNAL_EXCEPTION) << "union node list is empty.";
    }
@@ -1740,19 +1743,32 @@ void Somas::UpdateUnionTensorsConflict() {
  }

  std::map<size_t, size_t> kv;
- std::vector<vector<size_t>> tmp_union;
- for (const auto &union_node_list : union_tensors_list_) {
+ std::vector<vector<size_t>> ret_union_list;
+ std::vector<std::set<size_t>> union_tensor_sets;
+ for (const auto &union_node_list : union_tensors_list) {
    for (size_t tid : union_node_list) {
      size_t fa = find_father(&father, tid);
      if (kv.find(fa) == kv.end()) {
-       tmp_union.emplace_back();
-       kv.emplace(fa, tmp_union.size() - 1);
+       ret_union_list.emplace_back();
+       union_tensor_sets.emplace_back();
+       kv.emplace(fa, ret_union_list.size() - 1);
      }
+     auto &union_tensor_set = union_tensor_sets[kv.at(fa)];
+     if (union_tensor_set.find(tid) == union_tensor_set.end()) {
+       ret_union_list[kv.at(fa)].push_back(tid);
+       union_tensor_set.insert(tid);
+     }
-     tmp_union[kv.at(fa)].push_back(tid);
    }
  }
+ return ret_union_list;
+}
+}  // namespace

- union_tensors_list_ = tmp_union;
+void Somas::UpdateUnionTensorsConflict() {
+  // Keep all constraints for first tensor in list
+  MS_EXCEPTION_IF_NULL(tensors_list_.back());
+  size_t cnt = tensors_list_.back()->GetId() + 1;
+  union_tensors_list_ = GetRegularUnionTensorsList(cnt, union_tensors_list_);

  for (auto union_node_list : union_tensors_list_) {
    size_t tid_0 = union_node_list[0];
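The extracted helper is a standard disjoint-set merge: union lists that share a tensor id collapse into one list, and the new union_tensor_sets guard drops the duplicate ids that the old tmp_union silently kept. A minimal standalone sketch of the same idea (illustrative names, not the MindSpore code):

#include <cstddef>
#include <iostream>
#include <map>
#include <set>
#include <vector>

size_t Find(std::vector<size_t> *father, size_t x) {
  while ((*father)[x] != x) {
    (*father)[x] = (*father)[(*father)[x]];  // path halving
    x = (*father)[x];
  }
  return x;
}

std::vector<std::vector<size_t>> Merge(size_t cnt, const std::vector<std::vector<size_t>> &lists) {
  std::vector<size_t> father(cnt);
  for (size_t i = 0; i < cnt; i++) father[i] = i;
  for (const auto &list : lists) {  // union every id with the head of its list
    for (size_t tid : list) father[Find(&father, tid)] = Find(&father, list[0]);
  }
  std::map<size_t, size_t> root_to_slot;
  std::vector<std::vector<size_t>> merged;
  std::vector<std::set<size_t>> seen;  // deduplicates ids inside a merged list
  for (const auto &list : lists) {
    for (size_t tid : list) {
      size_t fa = Find(&father, tid);
      if (root_to_slot.find(fa) == root_to_slot.end()) {
        merged.emplace_back();
        seen.emplace_back();
        root_to_slot.emplace(fa, merged.size() - 1);
      }
      if (seen[root_to_slot.at(fa)].insert(tid).second) {
        merged[root_to_slot.at(fa)].push_back(tid);
      }
    }
  }
  return merged;
}

int main() {
  // {0,1} and {1,2} share tensor 1 and merge into {0,1,2}; {3,4} stays separate.
  for (const auto &group : Merge(5, {{0, 1}, {1, 2}, {3, 4}})) {
    for (size_t tid : group) std::cout << tid << ' ';
    std::cout << '\n';
  }
  return 0;
}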
@@ -106,6 +106,7 @@ class BACKEND_EXPORT Somas {
  virtual bool DevSpecNodeProcess(const session::KernelGraph &graph) = 0;
  virtual void CommunicationTensorProcess(const std::vector<SomasTensorPtr> &tensors) const;
  virtual bool NeedContiguous(const std::vector<size_t> &inputs) const = 0;
+ virtual bool NeedReuseGraphOutput() const { return false; }
  // end

  // SOMAS Configuration
@@ -161,7 +161,7 @@ Status SomasSolverPre::Solving(const session::KernelGraph &graph, TensorsDescMap
      }
    }
  }
- common::ThreadPool::GetInstance().SyncRun(tasks);
+ (void)common::ThreadPool::GetInstance().SyncRun(tasks);
  BestInfo best_info;
  FindBest(total_sol, solvers, &best_info);
  auto end = std::chrono::system_clock::now();
@@ -16,6 +16,7 @@

#include "include/common/thread_pool.h"
#include <exception>
+#include "thread/threadlog.h"
#include "utils/log_adapter.h"
#include "utils/ms_exception.h"

@@ -36,11 +37,12 @@ void ThreadPool::SyncRunLoop(const std::shared_ptr<ThreadContext> &context) {
    }

    if (!context->task) {
+     MS_EXCEPTION_IF_NULL(context->cond_var);
      ++yield_count;
      if (yield_count > kYieldThreshold) {
        yield_count = 0;
        std::unique_lock<std::mutex> lock(context->mutex);
-       context->cond_var.wait(lock, [&context, this] { return context->task != nullptr || exit_run_; });
+       context->cond_var->wait(lock, [&context, this] { return context->task != nullptr || exit_run_; });
      } else {
        std::this_thread::yield();
        continue;
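The loop above is a hybrid spin-then-block wait: a worker yields up to kYieldThreshold times so that quickly arriving tasks are picked up without a system call, and only then parks on the (now heap-allocated) condition variable. A self-contained sketch of the pattern under assumed names (illustrative, not the MindSpore code):

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>

void HybridWait(std::atomic<bool> *ready, std::mutex *mtx, std::condition_variable *cv) {
  constexpr int kYieldThreshold = 1000;  // assumed value; the real constant lives in thread_pool.cc
  int yield_count = 0;
  while (!ready->load()) {
    if (++yield_count <= kYieldThreshold) {
      std::this_thread::yield();  // cheap retry while new work may arrive soon
    } else {
      yield_count = 0;
      std::unique_lock<std::mutex> lock(*mtx);
      cv->wait(lock, [ready] { return ready->load(); });  // sleep until signaled
    }
  }
}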
@@ -82,7 +84,7 @@ bool ThreadPool::SyncRun(const std::vector<Task> &tasks) {
    contexts_.resize(new_thread_num);
    for (size_t i = thread_num; i < new_thread_num; ++i) {
      contexts_[i] = std::make_shared<ThreadContext>();
-     sync_run_threads_.emplace_back(std::thread(&ThreadPool::SyncRunLoop, this, contexts_[i]));
+     sync_run_threads_.emplace_back(std::make_unique<std::thread>(&ThreadPool::SyncRunLoop, this, contexts_[i]));
    }
  }
  if (contexts_.empty()) {
@@ -98,13 +100,14 @@ bool ThreadPool::SyncRun(const std::vector<Task> &tasks) {
    running = false;
    for (size_t i = 0; i < used_thread_num; ++i) {
      MS_EXCEPTION_IF_NULL(contexts_[i]);
+     MS_EXCEPTION_IF_NULL(contexts_[i]->cond_var);
      auto &task_run = contexts_[i]->task;
      if (task_run) {
        running = true;
      } else if (task_index < task_num) {
        std::lock_guard<std::mutex> task_lock(contexts_[i]->mutex);
        contexts_[i]->task = &(tasks[task_index]);
-       contexts_[i]->cond_var.notify_one();
+       contexts_[i]->cond_var->notify_one();
        running = true;
        ++task_index;
      }
@@ -129,11 +132,30 @@ void ThreadPool::ClearThreadPool() {
  exit_run_ = true;
  for (auto &context : contexts_) {
    MS_EXCEPTION_IF_NULL(context);
-   context->cond_var.notify_one();
+   context->cond_var->notify_one();
  }
  for (auto &it : sync_run_threads_) {
-   if (it.joinable()) {
-     it.join();
+   MS_EXCEPTION_IF_NULL(it);
+   if (it->joinable()) {
+     it->join();
    }
  }
  sync_run_threads_.clear();
}

+void ThreadPool::ChildAfterFork() {
+  THREAD_INFO("common thread pool clear thread after fork in child process");
+  for (auto &context : contexts_) {
+    MS_EXCEPTION_IF_NULL(context);
+    if (context->cond_var != nullptr) {
+      (void)context->cond_var.release();
+      context->task = nullptr;
+    }
+  }
+  contexts_.clear();
+  for (auto &it : sync_run_threads_) {
+    if (it != nullptr) {
+      (void)it.release();
+    }
+  }
+  sync_run_threads_.clear();
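Switching the condition variables and threads to unique_ptr ownership is what makes ChildAfterFork safe: in a forked child the parent's worker threads no longer exist, and destroying a condition variable that a parent thread may still be blocked on is undefined behavior, so the child deliberately leaks both with release() instead of destroying them. How the hook is registered is not shown in this commit; a hypothetical wiring might look like:

#include <pthread.h>
#include "include/common/thread_pool.h"

// Hypothetical sketch only: run the child-side cleanup right after fork() so
// the child never joins or signals threads it does not own.
void RegisterThreadPoolForkHandler() {
  (void)pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr,
                       []() { mindspore::common::ThreadPool::GetInstance().ChildAfterFork(); });
}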
@@ -64,20 +64,21 @@ struct SomasInfo {
  std::map<size_t, void *> merged_base_addresses_;

  // Used to keep the graph output address when somas block memory free.
- void InsertGraphOutputInfo(device::DeviceAddress *graph_output_device_address, size_t graph_output_address_size) {
+ void InsertGraphOutputInfo(device::DeviceAddress *graph_output_device_address, size_t graph_output_address_offset,
+                            size_t graph_output_address_size) {
    // Not insert the repeat size.
-   if (graph_output_address_sizes_set_.count(graph_output_address_size) > 0) {
+   if (graph_output_address_offsets_set_.count(graph_output_address_offset) > 0) {
      MS_LOG(INFO) << "The graph:" << graph_id_
-                  << " output somas device is same for size: " << graph_output_address_size;
+                  << " output somas device is same for offset: " << graph_output_address_offset;
      return;
    }
    (void)graph_output_device_addresses_.emplace_back(graph_output_device_address);
    (void)graph_output_address_sizes_.emplace_back(graph_output_address_size);
-   (void)graph_output_address_sizes_set_.insert(graph_output_address_size);
+   (void)graph_output_address_offsets_set_.insert(graph_output_address_offset);
  }
  std::vector<device::DeviceAddress *> graph_output_device_addresses_;
  std::vector<size_t> graph_output_address_sizes_;
- std::set<size_t> graph_output_address_sizes_set_;
+ std::set<size_t> graph_output_address_offsets_set_;

  // The owner graph id.
  uint32_t graph_id_{0};
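Keying the dedup on the somas offset instead of the tensor size fixes a real collision: distinct graph outputs routinely share a size, but they can never share an offset inside one somas block. Illustrative calls (the info object, addresses and values are made up):

info.InsertGraphOutputInfo(addr_a, /*offset=*/0, /*size=*/512);    // recorded
info.InsertGraphOutputInfo(addr_b, /*offset=*/512, /*size=*/512);  // recorded now; the old size-keyed check wrongly skipped it
info.InsertGraphOutputInfo(addr_a, /*offset=*/0, /*size=*/512);    // true repeat, skipped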
@@ -38,8 +38,13 @@ using Task = std::function<Status()>;

struct ThreadContext {
  std::mutex mutex;
- std::condition_variable cond_var;
+ std::unique_ptr<std::condition_variable> cond_var{nullptr};
  const Task *task{nullptr};

+ ThreadContext() {
+   cond_var = std::make_unique<std::condition_variable>();
+   MS_EXCEPTION_IF_NULL(cond_var);
+ }
};

class COMMON_EXPORT ThreadPool {
@@ -51,6 +56,7 @@ class COMMON_EXPORT ThreadPool {
  bool SyncRun(const std::vector<Task> &tasks);
  size_t GetSyncRunThreadNum() const { return max_thread_num_; }
  void ClearThreadPool();
+ void ChildAfterFork();

 private:
  ThreadPool();
@@ -59,7 +65,7 @@ class COMMON_EXPORT ThreadPool {
  size_t max_thread_num_{1};
  std::mutex pool_mtx_;
  std::atomic_bool exit_run_ = {false};
- std::vector<std::thread> sync_run_threads_{};
+ std::vector<std::unique_ptr<std::thread>> sync_run_threads_{};
  std::vector<std::shared_ptr<ThreadContext>> contexts_;
};
}  // namespace common
@@ -46,7 +46,6 @@ file(GLOB_RECURSE UNUSED_SRC_IN_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
    "./dump/kernel_dumper.cc"
    "./dump/data_dumper.cc"
    "./dump/dumper_base.cc"
-   "./ascend_stream_assign.cc"
    "./ascend_host_queue.cc"
)
list(REMOVE_ITEM MS_DEVICE_910B ${UNUSED_SRC_IN_910B})
@@ -68,7 +68,7 @@ bool NoAdditionalMemory() {
  MS_EXCEPTION_IF_NULL(context);
  const auto is_cell_reuse = context->CellReuseLevel() != CellReuseLevel::kNoCellReuse;
  const auto is_multi_graph_sink = context->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK);
- return is_cell_reuse || is_multi_graph_sink;
+ return (is_cell_reuse || is_multi_graph_sink) && !context->IsKByKExecutorMode();
}
}  // namespace

@@ -133,6 +133,16 @@ void AscendStreamMng::CreateStreamWithFlags(size_t *stream_id, uint32_t flags, i
  AscendGmemAdapter::GetInstance().AddCallbackThread(stream);
}

+aclrtEvent AscendStreamMng::ApplyRtEvent() {
+  aclrtEvent rt_event = nullptr;
+  auto ret = aclrtCreateEvent(&rt_event);
+  if (ret != ACL_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "aclrtCreateEvent failed, ret:" << ret;
+  }
+  (void)events_.emplace_back(rt_event);
+  return rt_event;
+}
+
bool AscendStreamMng::DestroyStream(size_t stream_id) {
  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
  if (stream_id >= streams_.size()) {
@@ -25,6 +25,7 @@
#ifndef ENABLE_SECURITY
#include "include/backend/debug/data_dump/dump_json_parser.h"
#include "include/backend/optimizer/helper.h"
#include "mindspore/core/ops/framework_ops.h"
#include "plugin/device/ascend/hal/device/ascend_kernel_task.h"
#include "plugin/device/ascend/kernel/opapi/aclnn_kernel_build.h"
#include "plugin/device/ascend/kernel/acl/acl_kernel_build.h"
@@ -117,121 +118,6 @@ TypeId GetInputDeviceType(const AnfNodePtr &kernel_node, size_t input_idx) {
  return type;
}

bool IsEmptyTupleInput(const CNodePtr &kernel, const size_t i, const TypeId cur_type_id) {
  auto input_node = common::AnfAlgo::GetPrevNodeOutput(kernel, i).first;
  if (input_node->isa<ValueNode>()) {
    auto value_node = input_node->cast<ValueNodePtr>();
    if (cur_type_id == kTypeUnknown && value_node->value() != nullptr && value_node->value()->isa<ValueTuple>() &&
        value_node->value()->cast<ValueTuplePtr>()->size() == 0) {
      MS_LOG(DEBUG) << "Set int64 type for empty value tuple node:" << value_node->DebugString();
      return true;
    }
  }
  return false;
}

void GenerateKernelBuildInfo(const CNodePtr &kernel, const KernelType &kernel_type) {
  MS_EXCEPTION_IF_NULL(kernel);
  std::vector<std::string> input_formats;
  std::vector<std::string> output_formats;
  std::vector<std::string> input_reshape_types;
  std::vector<std::string> output_reshape_types;
  auto input_num = common::AnfAlgo::GetInputTensorNum(kernel);
  auto output_num = AnfUtils::GetOutputTensorNum(kernel);
  auto output_object_type = kernel::KernelObjectType::TENSOR;
  if (kernel_type == ACL_KERNEL) {
    transform::AclHelper::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
                                                  &output_reshape_types);
    // NOTE: acl default output objecttype is tensor, here are 2 special case:
    // case 1: when cnode output is tuple, and ge ops prototype output num is 1, the real output objecttype is tuple;
    // case 2: when cnode output is scalar, the real output objecttype is scalar
    // others: output objecttype is tensor
    std::string name = GetCNodeFuncName(kernel);
    const auto &info = transform::GeAdapterManager::GetInstance().GetInfo(name, true);
    auto adapter_output_num = info->GetNumStaticOutputsOfMsOpProto();
    auto cnode_output_object_type =
      kernel::TypeIdToKernelObjectType(AnfAlgo::GetAbstractObjectType(kernel->abstract()));
    if (adapter_output_num == 1 && cnode_output_object_type == kernel::KernelObjectType::TUPLE) {
      MS_LOG(INFO) << "acl node " << kernel->fullname_with_scope() << " output is real tuple";
      output_object_type = cnode_output_object_type;
      output_num = 1;
      auto output_format = output_formats[kFirstItem];
      auto output_reshape_type = output_reshape_types[kFirstItem];
      output_formats.clear();
      output_reshape_types.clear();
      output_formats.push_back(output_format);
      output_reshape_types.push_back(output_reshape_type);
    } else if (cnode_output_object_type == kernel::KernelObjectType::SCALAR) {
      output_object_type = cnode_output_object_type;
    }
  } else if (kernel_type == OPAPI_KERNEL) {
    transform::OpApiUtil::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
                                                  &output_reshape_types);
  } else {
    auto cand_format = GetValidFormat(input_num, output_num);
    if (cand_format.empty()) {
      MS_LOG(EXCEPTION) << "The kernel: " << kernel->fullname_with_scope()
                        << " does not have a supported dynamic shape format on the Ascend platform.";
    }
    input_formats = cand_format.at(kFirstItem).first;
    output_formats = cand_format.at(kFirstItem).second;
    input_reshape_types.assign(input_num, "");
    output_reshape_types.assign(output_num, "");
    for (size_t i = 0; i < common::AnfAlgo::GetInputTensorNum(kernel); i++) {
      auto input_format = AnfAlgo::GetPrevNodeOutputFormat(kernel, i);
      if ((!transform::AclHelper::CheckDefaultSupportFormat(input_format)) && (kernel_type != HCCL_KERNEL)) {
        MS_LOG(EXCEPTION) << "Aicpu kernel input not support this format: " << input_format
                          << ", kernel: " << kernel->fullname_with_scope() << ", input idx: " << i;
      }
    }
  }

  std::vector<TypeId> input_types;
  input_types.reserve(input_num);
  std::vector<TypeId> output_types;
  output_types.reserve(output_num);
  std::vector<kernel::KernelObjectType> output_object_types;
  output_object_types.reserve(output_num);
  auto input_object_types = kernel::TypeIdToKernelObjectType(AnfAlgo::GetAllInputObjectType(kernel));

  for (size_t i = 0; i < input_num; i++) {
    auto cur_input_type = GetInputDeviceType(kernel, i);
    if (IsEmptyTupleInput(kernel, i, cur_input_type)) {
      cur_input_type = TypeId::kNumberTypeInt64;
    }
    (void)input_types.push_back(cur_input_type);
  }
  for (size_t i = 0; i < output_num; i++) {
    (void)output_types.push_back(common::AnfAlgo::GetOutputInferDataType(kernel, i));
    // no tuple in PyNative dynamic shape
    (void)output_object_types.push_back(output_object_type);
  }
  auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
  MS_EXCEPTION_IF_NULL(builder);
  builder->SetKernelType(kernel_type);
  builder->SetInputsFormat(input_formats);
  builder->SetInputsDeviceType(input_types);
  builder->SetInputsKernelObjectType(input_object_types);
  builder->SetOutputsFormat(output_formats);
  builder->SetOutputsDeviceType(output_types);
  builder->SetOutputsKernelObjectType(output_object_types);
  builder->SetInputsReshapeType(input_reshape_types);
  builder->SetOutputsReshapeType(output_reshape_types);
  if (input_formats.size() != input_types.size() || input_formats.size() != input_object_types.size()) {
    MS_LOG(EXCEPTION) << "The input buildInfo size kernel: " << kernel->fullname_with_scope()
                      << "is not equal, input_formats size: " << input_formats.size()
                      << ", input_types size: " << input_types.size()
                      << ", input_object_types size: " << input_object_types.size();
  }
  if (output_formats.size() != output_types.size() || output_formats.size() != output_object_types.size()) {
    MS_LOG(EXCEPTION) << "The output buildInfo size kernel: " << kernel->fullname_with_scope()
                      << "is not equal, output_formats size: " << output_formats.size()
                      << ", output_types size: " << output_types.size()
                      << ", output_object_types size: " << output_object_types.size();
  }
  AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel.get());
}

void SetWeightFormat(const AnfNodePtr &real_input_node, std::vector<string> output_format, const CNodePtr &kernel_node,
                     size_t input_index, bool force_fresh = false) {
  MS_EXCEPTION_IF_NULL(real_input_node);
@@ -492,8 +378,123 @@ bool SetMatchKernelInfo(const CNodePtr &kernel_node, const std::vector<kernel::K
  }
  return find_valid;
}

bool IsEmptyTupleInput(const CNodePtr &kernel, const size_t i, const TypeId cur_type_id) {
  auto input_node = common::AnfAlgo::GetPrevNodeOutput(kernel, i).first;
  if (input_node->isa<ValueNode>()) {
    auto value_node = input_node->cast<ValueNodePtr>();
    if (cur_type_id == kTypeUnknown && value_node->value() != nullptr && value_node->value()->isa<ValueTuple>() &&
        value_node->value()->cast<ValueTuplePtr>()->size() == 0) {
      MS_LOG(DEBUG) << "Set int64 type for empty value tuple node:" << value_node->DebugString();
      return true;
    }
  }
  return false;
}
}  // namespace

void GenerateKernelBuildInfo(const CNodePtr &kernel, const KernelType &kernel_type) {
  MS_EXCEPTION_IF_NULL(kernel);
  std::vector<std::string> input_formats;
  std::vector<std::string> output_formats;
  std::vector<std::string> input_reshape_types;
  std::vector<std::string> output_reshape_types;
  auto input_num = common::AnfAlgo::GetInputTensorNum(kernel);
  auto output_num = AnfUtils::GetOutputTensorNum(kernel);
  auto output_object_type = kernel::KernelObjectType::TENSOR;
  if (kernel_type == ACL_KERNEL) {
    transform::AclHelper::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
                                                  &output_reshape_types);
    // NOTE: acl default output objecttype is tensor, here are 2 special case:
    // case 1: when cnode output is tuple, and ge ops prototype output num is 1, the real output objecttype is tuple;
    // case 2: when cnode output is scalar, the real output objecttype is scalar
    // others: output objecttype is tensor
    std::string name = GetCNodeFuncName(kernel);
    const auto &info = transform::GeAdapterManager::GetInstance().GetInfo(name, true);
    auto adapter_output_num = info->GetNumStaticOutputsOfMsOpProto();
    auto cnode_output_object_type =
      kernel::TypeIdToKernelObjectType(AnfAlgo::GetAbstractObjectType(kernel->abstract()));
    if (adapter_output_num == 1 && cnode_output_object_type == kernel::KernelObjectType::TUPLE) {
      MS_LOG(INFO) << "acl node " << kernel->fullname_with_scope() << " output is real tuple";
      output_object_type = cnode_output_object_type;
      output_num = 1;
      auto output_format = output_formats[kFirstItem];
      auto output_reshape_type = output_reshape_types[kFirstItem];
      output_formats.clear();
      output_reshape_types.clear();
      output_formats.push_back(output_format);
      output_reshape_types.push_back(output_reshape_type);
    } else if (cnode_output_object_type == kernel::KernelObjectType::SCALAR) {
      output_object_type = cnode_output_object_type;
    }
  } else if (kernel_type == OPAPI_KERNEL) {
    transform::OpApiUtil::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
                                                  &output_reshape_types);
  } else {
    auto cand_format = GetValidFormat(input_num, output_num);
    if (cand_format.empty()) {
      MS_LOG(EXCEPTION) << "The kernel: " << kernel->fullname_with_scope()
                        << " does not have a supported dynamic shape format on the Ascend platform.";
    }
    input_formats = cand_format.at(kFirstItem).first;
    output_formats = cand_format.at(kFirstItem).second;
    input_reshape_types.assign(input_num, "");
    output_reshape_types.assign(output_num, "");
    for (size_t i = 0; i < common::AnfAlgo::GetInputTensorNum(kernel); i++) {
      auto input_format = AnfAlgo::GetPrevNodeOutputFormat(kernel, i);
      if ((!transform::AclHelper::CheckDefaultSupportFormat(input_format)) && (kernel_type != HCCL_KERNEL)) {
        MS_LOG(EXCEPTION) << "Aicpu kernel input not support this format: " << input_format
                          << ", kernel: " << kernel->fullname_with_scope() << ", input idx: " << i;
      }
    }
  }

  std::vector<TypeId> input_types;
  input_types.reserve(input_num);
  std::vector<TypeId> output_types;
  output_types.reserve(output_num);
  std::vector<kernel::KernelObjectType> output_object_types;
  output_object_types.reserve(output_num);
  auto input_object_types = kernel::TypeIdToKernelObjectType(AnfAlgo::GetAllInputObjectType(kernel));

  for (size_t i = 0; i < input_num; i++) {
    auto cur_input_type = GetInputDeviceType(kernel, i);
    if (IsEmptyTupleInput(kernel, i, cur_input_type)) {
      cur_input_type = TypeId::kNumberTypeInt64;
    }
    (void)input_types.push_back(cur_input_type);
  }
  for (size_t i = 0; i < output_num; i++) {
    (void)output_types.push_back(common::AnfAlgo::GetOutputInferDataType(kernel, i));
    // no tuple in PyNative dynamic shape
    (void)output_object_types.push_back(output_object_type);
  }
  auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
  MS_EXCEPTION_IF_NULL(builder);
  builder->SetKernelType(kernel_type);
  builder->SetInputsFormat(input_formats);
  builder->SetInputsDeviceType(input_types);
  builder->SetInputsKernelObjectType(input_object_types);
  builder->SetOutputsFormat(output_formats);
  builder->SetOutputsDeviceType(output_types);
  builder->SetOutputsKernelObjectType(output_object_types);
  builder->SetInputsReshapeType(input_reshape_types);
  builder->SetOutputsReshapeType(output_reshape_types);
  if (input_formats.size() != input_types.size() || input_formats.size() != input_object_types.size()) {
    MS_LOG(EXCEPTION) << "The input buildInfo size kernel: " << kernel->fullname_with_scope()
                      << "is not equal, input_formats size: " << input_formats.size()
                      << ", input_types size: " << input_types.size()
                      << ", input_object_types size: " << input_object_types.size();
  }
  if (output_formats.size() != output_types.size() || output_formats.size() != output_object_types.size()) {
    MS_LOG(EXCEPTION) << "The output buildInfo size kernel: " << kernel->fullname_with_scope()
                      << "is not equal, output_formats size: " << output_formats.size()
                      << ", output_types size: " << output_types.size()
                      << ", output_object_types size: " << output_object_types.size();
  }
  AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel.get());
}

void HandleKernelSelectFailure(const KernelGraphPtr &graph, const CNodePtr &node,
                               const std::pair<std::string, ExceptionType> &failure_info) {
  auto msg = TryBackoffCpu(graph, node, failure_info);
@@ -522,6 +523,12 @@ std::tuple<bool, std::string, ExceptionType> SelectKernelInfoWithMsg(const CNode
                 << " is enabled dispatch in yaml, but not registered an aclnn kernelmod.";
  }

+ // for backend inline
+ if (IsPrimitiveCNode(node, prim::kPrimCallInline)) {
+   GenerateKernelBuildInfo(node, KernelType::UNKNOWN_KERNEL_TYPE);
+   return result;
+ }
+
  auto kernel_type = transform::AclHelper::GetKernelInfoFromGe(node, &acl_err_type);
  if (kernel_type == KernelType::ACL_KERNEL) {
    GenerateKernelBuildInfo(node, kernel_type);
@@ -30,6 +30,7 @@ void HandleKernelSelectFailure(const KernelGraphPtr &graph, const CNodePtr &node
                               const std::pair<std::string, ExceptionType> &failure_info);
std::tuple<bool, std::string, ExceptionType> SelectKernelInfoWithMsg(const CNodePtr &node, bool enable_aclnn);
void SetKernelInfoBeforeCreateKernel(const std::vector<CNodePtr> &nodes);
+void GenerateKernelBuildInfo(const CNodePtr &kernel, const KernelType &kernel_type);
}  // namespace ascend
}  // namespace device
}  // namespace mindspore
@@ -19,6 +19,8 @@ file(GLOB_RECURSE MS_HARDWARE_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
    "ge_kernel_executor.cc"
    "ge_utils.cc"
    "ge_graph_optimization.cc"
+   "acl_somas.cc"
+   "acl_stream_assign.cc"
)

set_property(SOURCE ${MS_HARDWARE_910B} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
@@ -0,0 +1,120 @@
/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/ascend/hal/hardware/acl_somas.h"
#include <string>
#include <map>
#include <utility>
#include <vector>
#include "include/backend/optimizer/helper.h"
#include "utils/ms_context.h"
#include "plugin/device/ascend/hal/device/ascend_stream_assign.h"
#include "ops/framework_op_name.h"

namespace mindspore {
namespace device {
namespace ascend {
using TensorType = somas::TensorType;
using LifeLongType = somas::LifeLongType;
constexpr size_t ALONE = 1;

bool AclSomas::Initialize() { return true; }

std::string AclSomas::GetDeviceName() const { return "Ascend"; }

size_t AclSomas::GetAlignSize(size_t original_size) const {
  constexpr size_t alignment = 512;
  constexpr size_t alignment_complement = 31;
  size_t aligned_size =
    (original_size > 0) ? ((original_size + alignment + alignment_complement) / alignment) * alignment : 0;
  return aligned_size;
}

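The arithmetic above pads every non-zero size by at least 32 bytes and then rounds up to a 512-byte multiple, i.e. floor((original_size + 543) / 512) * 512. A few spot checks of that claim (illustrative only, not part of the file):

constexpr size_t AlignSketch(size_t s) { return s > 0 ? (s + 512 + 31) / 512 * 512 : 0; }
static_assert(AlignSketch(1) == 512, "small sizes occupy one 512-byte block");
static_assert(AlignSketch(480) == 512, "480 + 32 still fits in one block");
static_assert(AlignSketch(481) == 1024, "481 + 32 crosses the block boundary");
static_assert(AlignSketch(0) == 0, "zero-sized tensors stay zero");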
bool AclSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  auto opt_level = ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL);
  return opt_level != kOptimizeO0;
}

bool AclSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  InitEventInfo(graph);
  return true;
}

void AclSomas::InitEventInfo(const session::KernelGraph &graph) {
  MS_LOG(DEBUG) << "Acl Somas InitEventInfo start.";
  event_map_ = {};
  auto &kernels = graph.execution_order();
  for (const auto &kernel : kernels) {
    auto type = common::AnfAlgo::GetCNodeName(kernel);
    if (type == kStreamSendOpName) {
      auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      auto iter = event_map_.find(event);
      if (iter == event_map_.end()) {
        auto pair = somas::EventPair();
        pair.send_ = kernel;
        event_map_[event] = pair;
      } else {
        iter->second.send_ = kernel;
      }
    } else if (type == kStreamRecvOpName) {
      auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      auto iter = event_map_.find(event);
      if (iter == event_map_.end()) {
        auto pair = somas::EventPair();
        pair.recv_ = kernel;
        event_map_[event] = pair;
      } else {
        iter->second.recv_ = kernel;
      }
    }
  }

  for (auto &event : event_map_) {
    auto send_iter = nodes_map_.find(event.second.send_.get());
    auto recv_iter = nodes_map_.find(event.second.recv_.get());
    if (send_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find Acl somas node for " << event.second.send_->fullname_with_scope();
      continue;
    }
    if (recv_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find Acl somas node for " << event.second.recv_->fullname_with_scope();
      continue;
    }
    AddControlTensor(send_iter->second.at(0), recv_iter->second.at(0));
  }
  MS_LOG(DEBUG) << "Acl Somas InitEventInfo end.";
}

bool AclSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return true; }

void AclSomas::CommunicationTensorProcess(const std::vector<somas::SomasTensorPtr> &tensors) const {
  if (tensors.size() != ALONE) {
    for (auto &tensor : tensors) {
      MS_EXCEPTION_IF_NULL(tensor);
      MS_EXCEPTION_IF_CHECK_FAIL(tensor->aligned_size_ != 0, "The size of communication tensor is zero, tensor id: " +
                                                               std::to_string(tensor->GetId()));
    }
  }
}

bool AclSomas::NeedContiguous(const std::vector<size_t> &inputs) const { return inputs.size() > ALONE; }

bool AclSomas::NeedReuseGraphOutput() const { return true; }
}  // namespace ascend
}  // namespace device
}  // namespace mindspore
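InitEventInfo above pairs StreamSend/StreamRecv kernels that carry the same kAttrEventId and turns each pair into a control tensor, so SOMAS lifetimes respect cross-stream ordering. A standalone sketch of the matching step (the simplified types and names are assumptions, not MindSpore code):

#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <vector>

struct EventPairSketch {
  std::string send;
  std::string recv;
};

// Scan kernels in execution order; the send and recv halves of each event id
// are joined into one pair, mirroring what AclSomas::InitEventInfo does.
std::map<uint32_t, EventPairSketch> MatchEvents(
    const std::vector<std::tuple<std::string, std::string, uint32_t>> &kernels) {
  std::map<uint32_t, EventPairSketch> event_map;
  for (const auto &[name, type, event_id] : kernels) {
    if (type == "StreamSend") {
      event_map[event_id].send = name;  // operator[] default-constructs the pair
    } else if (type == "StreamRecv") {
      event_map[event_id].recv = name;
    }
  }
  return event_map;
}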
@@ -0,0 +1,54 @@
/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_SOMAS_H_

#include <vector>
#include <string>
#include <map>
#include <utility>
#include <memory>
#include "backend/common/somas/somas.h"
#include "include/backend/device_type.h"

namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = session::KernelGraph;
using UnReuseType = somas::UnReuseType;
class AclSomas : public somas::Somas {
 private:
  bool Initialize() override;
  string GetDeviceName() const override;
  void CommunicationTensorProcess(const std::vector<somas::SomasTensorPtr> &tensors) const override;
  bool NeedContiguous(const std::vector<size_t> &inputs) const override;
  bool NeedReuseGraphOutput() const override;
  size_t GetAlignSize(size_t original_size) const override;

  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;

  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;

  void InitEventInfo(const session::KernelGraph &graph);
  std::map<uint32_t, somas::EventPair> event_map_;
};
}  // namespace ascend
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_SOMAS_H_
@@ -0,0 +1,322 @@
/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "plugin/device/ascend/hal/hardware/acl_stream_assign.h"
#include <algorithm>
#include <unordered_set>
#include <utility>

#include "include/backend/anf_runtime_algorithm.h"
#include "include/backend/optimizer/helper.h"
#include "include/common/utils/anfalgo.h"
#include "include/common/utils/parallel_context.h"
#include "include/common/utils/utils.h"
#include "ops/ascend_op_name.h"
#include "ops/framework_op_name.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"

namespace mindspore {
namespace device {
namespace ascend {
void AclStreamAssign::AssignStream(const NotNull<KernelGraphPtr> &kernel_graph) const {
  auto kernels = kernel_graph->execution_order();
  if (kernels.empty()) {
    return;
  }
  if (kernel_graph->is_from_single_op()) {
    MS_LOG(INFO) << "Not stream assign when pynative forward.";
    return;
  }
  for (const auto &node : kernels) {
    if (AnfAlgo::IsKernelSelectBackoffOp(node)) {
      continue;
    }
    if (common::AnfAlgo::IsCommunicationOp(node)) {
      AnfAlgo::SetStreamId(kWorldGroupStreamIndex, node.get());
      common::AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(kWorldGroupStreamIndex), node);
    } else {
      AnfAlgo::SetStreamId(kDefaultStreamIndex, node.get());
      common::AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(kDefaultStreamIndex), node);
    }
  }
  for (size_t i = 1; i < kernels.size(); ++i) {
    if (common::AnfAlgo::GetCNodeName(kernels[i - 1]) == kMemSetOpName) {
      auto stream_id = AnfAlgo::GetStreamId(kernels[i]);
      AnfAlgo::SetStreamId(stream_id, kernels[i - 1].get());
      common::AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(stream_id), kernels[i - 1]);
    }
  }
  InsertEventForNonTaskSink(kernel_graph);
}

void AclStreamAssign::GenKernelIoExecInfoMap(
  const NotNull<KernelGraphPtr> &kernel_graph,
  mindspore::HashMap<CNodePtr, NodeIoExecInfoPtr> *kernel_io_exec_info_map) const {
  auto &exec_kernels = kernel_graph->execution_order();
  for (size_t i = 0; i < exec_kernels.size(); ++i) {
    auto &process_kernel = exec_kernels[i];
    MS_EXCEPTION_IF_NULL(process_kernel);
    auto process_exec_info = std::make_shared<NodeExecInfo>();
    MS_EXCEPTION_IF_NULL(process_exec_info);
    process_exec_info->node = process_kernel;
    process_exec_info->stream_id = AnfAlgo::GetStreamId(process_kernel);
    process_exec_info->execution_order_index = i;
    auto process_io_exec_info = std::make_shared<NodeIoExecInfo>();
    MS_EXCEPTION_IF_NULL(process_io_exec_info);
    process_io_exec_info->node_exec_info = process_exec_info;
    process_io_exec_info->inputs = {};
    process_io_exec_info->outputs = {};
    (*kernel_io_exec_info_map)[process_kernel] = process_io_exec_info;
  }

  for (auto &process_kernel : exec_kernels) {
    MS_EXCEPTION_IF_NULL(process_kernel);
    auto process_iter = kernel_io_exec_info_map->find(process_kernel);
    if (process_iter == kernel_io_exec_info_map->end()) {
      MS_LOG(INFO) << "Can't get kernel io execution info for " << process_kernel->fullname_with_scope();
      continue;
    }
    auto process_io_exec_info = process_iter->second;
    MS_EXCEPTION_IF_NULL(process_io_exec_info);
    auto process_exec_info = process_iter->second->node_exec_info;
    MS_EXCEPTION_IF_NULL(process_exec_info);
    auto inputs = process_kernel->inputs();
    for (size_t i = 1; i < inputs.size(); i++) {
      auto input_node = common::AnfAlgo::VisitKernelWithReturnType(inputs[i], 0).first;
      MS_EXCEPTION_IF_NULL(input_node);
      if (AnfUtils::IsRealCNodeKernel(input_node)) {
        auto input_kernel = input_node->cast<CNodePtr>();
        MS_EXCEPTION_IF_NULL(input_kernel);
        auto iter = kernel_io_exec_info_map->find(input_kernel);
        if (iter == kernel_io_exec_info_map->end()) {
          MS_LOG(INFO) << "Can't get kernel io execution info for " << process_kernel->fullname_with_scope()
                       << "'s input node " << input_kernel->fullname_with_scope();
          continue;
        }
        auto input_io_exec_info = iter->second;
        auto input_exec_info = iter->second->node_exec_info;
        MS_EXCEPTION_IF_NULL(input_io_exec_info);
        process_io_exec_info->inputs.push_back(input_exec_info);
        input_io_exec_info->outputs.push_back(process_exec_info);
      }
    }
  }
}

void AclStreamAssign::UpdateEventsToExecutionOrder(
  const NotNull<KernelGraphPtr> &kernel_graph,
  const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &send_after_node,
  const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &recv_before_node) const {
  MS_LOG(DEBUG) << "Start UpdateEventsToExecutionOrder...";
  auto exec_kernels = kernel_graph->execution_order();
  std::vector<CNodePtr> new_exec_orders;
  for (auto &kernel : exec_kernels) {
    auto before_iter = recv_before_node.find(kernel);
    if (before_iter != recv_before_node.end()) {
      (void)std::copy(before_iter->second.begin(), before_iter->second.end(), std::back_inserter(new_exec_orders));
    }
    new_exec_orders.push_back(kernel);
    auto after_iter = send_after_node.find(kernel);
    if (after_iter != send_after_node.end()) {
      (void)std::copy(after_iter->second.begin(), after_iter->second.end(), std::back_inserter(new_exec_orders));
    }
  }
  auto graph_output = kernel_graph->output();
  auto graph_output_iter = recv_before_node.find(graph_output);
  if (graph_output_iter != recv_before_node.end()) {
    (void)std::copy(graph_output_iter->second.begin(), graph_output_iter->second.end(),
                    std::back_inserter(new_exec_orders));
  }

  kernel_graph->set_execution_order(new_exec_orders);
  MS_LOG(DEBUG) << "Finish UpdateEventsToExecutionOrder.";
}

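UpdateEventsToExecutionOrder only splices: every recv event lands immediately before its consuming kernel and every send event immediately after its producing kernel, with the original order otherwise untouched. A standalone sketch of that splice over plain strings (illustrative, not the MindSpore code):

#include <string>
#include <unordered_map>
#include <vector>

std::vector<std::string> Splice(const std::vector<std::string> &order,
                                const std::unordered_map<std::string, std::vector<std::string>> &send_after,
                                const std::unordered_map<std::string, std::vector<std::string>> &recv_before) {
  std::vector<std::string> result;
  for (const auto &kernel : order) {
    auto before = recv_before.find(kernel);
    if (before != recv_before.end()) {
      result.insert(result.end(), before->second.begin(), before->second.end());
    }
    result.push_back(kernel);
    auto after = send_after.find(kernel);
    if (after != send_after.end()) {
      result.insert(result.end(), after->second.begin(), after->second.end());
    }
  }
  return result;
}

// Example: order {"AllReduce", "MatMul"} with send_after["AllReduce"] = {"Send0"}
// and recv_before["MatMul"] = {"Recv0"} yields {"AllReduce", "Send0", "Recv0", "MatMul"}.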
void AclStreamAssign::InsertEventsForInputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
                                            const NodeIoExecInfoPtr &io_exec_info,
                                            mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                                            mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const {
  MS_EXCEPTION_IF_NULL(io_exec_info);
  auto process_stream_id = AnfAlgo::GetStreamId(kernel);
  auto input_exec_info_list = io_exec_info->inputs;
  mindspore::HashMap<uint32_t, NodeExecInfoPtr> stream_max_exec_node_map;

  for (auto &input : input_exec_info_list) {
    MS_EXCEPTION_IF_NULL(input);
    auto input_stream_id = input->stream_id;
    auto iter = stream_max_exec_node_map.find(input_stream_id);
    if (iter == stream_max_exec_node_map.end()) {
      stream_max_exec_node_map[input_stream_id] = input;
    } else {
      MS_EXCEPTION_IF_NULL(iter->second);
      if (input->execution_order_index > iter->second->execution_order_index) {
        iter->second = input;
      }
    }
  }

  for (auto input_exec : stream_max_exec_node_map) {
    MS_EXCEPTION_IF_NULL(input_exec.second);
    if (input_exec.second->stream_id == process_stream_id) {
      continue;
    }
    InsertEvents(kernel_graph, kernel, input_exec.second->node, kernel_send, kernel_recv, kernel);
  }
}

void AclStreamAssign::InsertEventsForOutputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
                                             const NodeIoExecInfoPtr &io_exec_info,
                                             mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                                             mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const {
  MS_EXCEPTION_IF_NULL(io_exec_info);
  auto process_stream_id = AnfAlgo::GetStreamId(kernel);
  auto output_exec_info_list = io_exec_info->outputs;
  mindspore::HashMap<uint32_t, NodeExecInfoPtr> stream_min_exec_node_map;
  for (auto &output : output_exec_info_list) {
    MS_EXCEPTION_IF_NULL(output);
    auto output_stream_id = output->stream_id;
    auto iter = stream_min_exec_node_map.find(output_stream_id);
    if (iter == stream_min_exec_node_map.end()) {
      stream_min_exec_node_map[output_stream_id] = output;
    } else {
      MS_EXCEPTION_IF_NULL(iter->second);
      if (output->execution_order_index < iter->second->execution_order_index) {
        iter->second = output;
      }
    }
  }

  for (auto output_exec : stream_min_exec_node_map) {
    MS_EXCEPTION_IF_NULL(output_exec.second);
    if (output_exec.second->stream_id == process_stream_id) {
      continue;
    }
    InsertEvents(kernel_graph, kernel, kernel, kernel_send, kernel_recv, output_exec.second->node);
  }

  // parallel op has output tensor, and it didn't connect to other kernel, it's output is graph output, sync it.
  if (output_exec_info_list.empty() && (AnfAlgo::GetOutputTensorNum(kernel) != 0)) {
    InsertEvents(kernel_graph, kernel, kernel, kernel_send, kernel_recv, kernel_graph->output());
  }
}

CNodePtr AclStreamAssign::CreateSendApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id,
                                                uint32_t stream_id) const {
  auto send_op = std::make_shared<Primitive>(kStreamSendOpName);
  MS_EXCEPTION_IF_NULL(send_op);
  auto send_apply = std::make_shared<ValueNode>(send_op);
  MS_EXCEPTION_IF_NULL(send_apply);
  auto send_node_ptr = graph_ptr->NewCNode({send_apply});
  MS_EXCEPTION_IF_NULL(send_node_ptr);
  common::AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), send_node_ptr);
  AnfAlgo::SetStreamId(stream_id, send_node_ptr.get());
  return send_node_ptr;
}

CNodePtr AclStreamAssign::CreateRecvApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id,
                                                uint32_t stream_id) const {
  auto recv_op = std::make_shared<Primitive>(kStreamRecvOpName);
  MS_EXCEPTION_IF_NULL(recv_op);
  auto recv_apply = std::make_shared<ValueNode>(recv_op);
  MS_EXCEPTION_IF_NULL(recv_apply);
  auto recv_node_ptr = graph_ptr->NewCNode({recv_apply});
  MS_EXCEPTION_IF_NULL(recv_node_ptr);
  common::AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), recv_node_ptr);
  AnfAlgo::SetStreamId(stream_id, recv_node_ptr.get());
  return recv_node_ptr;
}

void AclStreamAssign::InsertEvents(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &parallel_cnode,
                                   const AnfNodePtr &node_before_send,
                                   mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                                   mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv,
                                   const AnfNodePtr &node_after_recv) const {
  MS_EXCEPTION_IF_NULL(kernel_send);
  MS_EXCEPTION_IF_NULL(kernel_recv);
  AscendStreamMng &resource_manager = AscendStreamMng::GetInstance();
  uint32_t event_id = resource_manager.ApplyNewEvent();
  auto event = resource_manager.ApplyRtEvent();
  auto send_cnode = CreateSendApplyKernel(kernel_graph, event_id, AnfAlgo::GetStreamId(node_before_send));
  common::AnfAlgo::SetNodeAttr(kAttrRecordEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), send_cnode);
  auto send_iter = kernel_send->find(node_before_send);
  if (send_iter == kernel_send->end()) {
    (*kernel_send)[node_before_send] = {send_cnode};
  } else {
    send_iter->second.push_back(send_cnode);
  }

  CNodePtr recv_cnode = CreateRecvApplyKernel(kernel_graph, event_id, AnfAlgo::GetStreamId(node_after_recv));
  common::AnfAlgo::SetNodeAttr(kAttrWaitEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), recv_cnode);
  auto process_iter = kernel_recv->find(node_after_recv);
  if (process_iter == kernel_recv->end()) {
    (*kernel_recv)[node_after_recv] = {recv_cnode};
  } else {
    process_iter->second.push_back(recv_cnode);
  }

  if (parallel_cnode == node_before_send) {
    kernel_graph->InsertSendRecvPairForParallelOpOutputs(parallel_cnode, std::make_pair(send_cnode, recv_cnode));
    MS_LOG(INFO) << "Generate send/recv for parallel op " << parallel_cnode->fullname_with_scope() << "'s output."
                 << "Send node " << send_cnode->fullname_with_scope() << " after "
                 << node_before_send->fullname_with_scope() << ", recv node " << recv_cnode->fullname_with_scope()
                 << " before " << node_after_recv->fullname_with_scope();
  } else {
    kernel_graph->InsertSendRecvPairForParallelOpInputs(parallel_cnode, std::make_pair(send_cnode, recv_cnode));
    MS_LOG(INFO) << "Generate send/recv for parallel op " << parallel_cnode->fullname_with_scope() << "'s input."
                 << "Send node " << send_cnode->fullname_with_scope() << " after "
                 << node_before_send->fullname_with_scope() << ", recv node " << recv_cnode->fullname_with_scope()
                 << " before " << node_after_recv->fullname_with_scope();
  }
}

void AclStreamAssign::GenEventsForParallelOp(const NotNull<KernelGraphPtr> &kernel_graph,
                                             mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                                             mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const {
  MS_LOG(DEBUG) << "Start GenEventsForParallelOp...";
  auto exec_kernels = kernel_graph->execution_order();
  mindspore::HashMap<CNodePtr, NodeIoExecInfoPtr> kernel_io_exec_info_map;
  GenKernelIoExecInfoMap(kernel_graph, &kernel_io_exec_info_map);
  for (auto &process_kernel : exec_kernels) {
    if (AnfAlgo::IsKernelSelectBackoffOp(process_kernel)) {
      continue;
    }
    MS_EXCEPTION_IF_NULL(process_kernel);
    auto process_stream_id = AnfAlgo::GetStreamId(process_kernel);
    if (process_stream_id == kDefaultStreamIndex) {
      continue;
    }
    MS_LOG(DEBUG) << "Start GenEvents For ParallelOp " << process_kernel->fullname_with_scope();
    auto process_iter = kernel_io_exec_info_map.find(process_kernel);
    if (process_iter == kernel_io_exec_info_map.end()) {
      MS_LOG(INFO) << "Can't get node io execution info for " << process_kernel->fullname_with_scope();
      continue;
    }
    auto process_io_exec_info = process_iter->second;
    InsertEventsForInputs(kernel_graph, process_kernel, process_io_exec_info, kernel_send, kernel_recv);
    InsertEventsForOutputs(kernel_graph, process_kernel, process_io_exec_info, kernel_send, kernel_recv);
  }
  MS_LOG(DEBUG) << "Finish GenEventsForParallelOp.";
}

void AclStreamAssign::InsertEventForNonTaskSink(const NotNull<KernelGraphPtr> &kernel_graph) const {
  mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> kernel_send;
  mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> kernel_recv;
  AnfAlgo::SetStreamId(kDefaultStreamIndex, kernel_graph->output().get());
  GenEventsForParallelOp(kernel_graph, &kernel_send, &kernel_recv);
  UpdateEventsToExecutionOrder(kernel_graph, kernel_send, kernel_recv);
}
}  // namespace ascend
}  // namespace device
}  // namespace mindspore
@ -0,0 +1,104 @@
/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_STREAM_ASSIGN_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_STREAM_ASSIGN_H_

#include <functional>
#include <unordered_map>
#include <map>
#include <set>
#include <string>
#include <queue>
#include <vector>
#include <memory>
#include <unordered_set>
#include <utility>
#include "include/backend/kernel_graph.h"
#include "include/common/utils/contract.h"

namespace mindspore {
namespace device {
namespace ascend {
struct NodeExecInfo {
  CNodePtr node;
  uint32_t stream_id;
  size_t execution_order_index;
};
using NodeExecInfoPtr = std::shared_ptr<NodeExecInfo>;

struct NodeIoExecInfo {
  NodeExecInfoPtr node_exec_info;
  std::vector<NodeExecInfoPtr> inputs;
  std::vector<NodeExecInfoPtr> outputs;
};
using NodeIoExecInfoPtr = std::shared_ptr<NodeIoExecInfo>;

class AclStreamAssign {
 public:
  static AclStreamAssign &GetInstance() {
    static AclStreamAssign instance;  // Guaranteed to be destroyed.
    return instance;
  }

  AclStreamAssign(const AclStreamAssign &) = delete;
  AclStreamAssign &operator=(const AclStreamAssign &) = delete;

  void AssignStream(const NotNull<KernelGraphPtr> &kernel_graph) const;

 private:
  AclStreamAssign() = default;
  ~AclStreamAssign() = default;

  void GenKernelIoExecInfoMap(const NotNull<KernelGraphPtr> &kernel_graph,
                              mindspore::HashMap<CNodePtr, NodeIoExecInfoPtr> *kernel_io_exec_info_map) const;

  void UpdateEventsToExecutionOrder(
    const NotNull<KernelGraphPtr> &kernel_graph,
    const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &send_after_node,
    const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &recv_before_node) const;

  void GenEventsForParallelOp(const NotNull<KernelGraphPtr> &kernel_graph,
                              mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                              mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const;

  void InsertEventForNonTaskSink(const NotNull<KernelGraphPtr> &kernel_graph) const;

  void InsertEventsForInputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
                             const NodeIoExecInfoPtr &io_exec_info,
                             mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                             mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const;

  void InsertEventsForOutputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
                              const NodeIoExecInfoPtr &io_exec_info,
                              mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                              mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const;

  void InsertEvents(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &parallel_cnode,
                    const AnfNodePtr &node_before_send,
                    mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
                    mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv,
                    const AnfNodePtr &node_after_recv) const;

  CNodePtr CreateSendApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id, uint32_t stream_id) const;

  CNodePtr CreateRecvApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id, uint32_t stream_id) const;
};
}  // namespace ascend
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_STREAM_ASSIGN_H_
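The class is a Meyers singleton with deleted copy operations, so all stream assignment flows through one stateless instance, and only AssignStream is public. Minimal usage, matching the call site added later in this commit:

// kernel_graph is a KernelGraphPtr known to be non-null at this point.
AclStreamAssign::GetInstance().AssignStream(NOT_NULL(kernel_graph));
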
@ -88,8 +88,14 @@ bool AscendCollectiveCommLib::InitializeHccl() {
  MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << rank_id_str;

  auto mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
  bool ret = hccl::HcclAdapter::GetInstance().InitHccl(
    device_id, rank_id_str, full_path, mode == kGraphMode ? hccl::HcclMode::kGraph : hccl::HcclMode::kPynative);
  hccl::HcclMode hccl_mode = hccl::HcclMode::kGraph;
  if (mode == kPynativeMode) {
    hccl_mode = hccl::HcclMode::kPynative;
  } else if (ms_context->IsKByKExecutorMode()) {
    hccl_mode = hccl::HcclMode::kKernelByKernel;
  }

  bool ret = hccl::HcclAdapter::GetInstance().InitHccl(device_id, rank_id_str, full_path, hccl_mode);
  free(full_path);
  if (!ret) {
    MS_LOG(ERROR) << "Hcom init failed.";
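The replacement maps three execution modes onto HcclMode instead of the old two-way graph/pynative split. The same decision as a small pure function (ChooseHcclMode is a hypothetical helper; the enum values and predicates are the ones used above):

// Sketch: PyNative mode and kernel-by-kernel mode take precedence over graph sink.
hccl::HcclMode ChooseHcclMode(int execution_mode, bool kernel_by_kernel) {
  if (execution_mode == kPynativeMode) {
    return hccl::HcclMode::kPynative;
  }
  if (kernel_by_kernel) {
    return hccl::HcclMode::kKernelByKernel;
  }
  return hccl::HcclMode::kGraph;
}
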
@ -50,8 +50,13 @@ void GEGraphOptimization::OptimizeGEGraph(const KernelGraphPtr &graph) {
  MS_LOG(DEBUG) << "Status record: end optimize ge graph. graph id: " << graph->graph_id();
}

void GEGraphOptimization::OptimizeACLGraph(const KernelGraphPtr &graph) {
void GEGraphOptimization::OptimizeACLGraph(const KernelGraphPtr &graph, std::set<KernelGraphPtr> *const memo) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(memo);
  if (memo->find(graph) != memo->end()) {
    return;
  }
  memo->insert(graph);
  MS_LOG(DEBUG) << "Status record: start optimize acl graph. graph id: " << graph->graph_id();
  // empty graphs don't enter the backend
  if (graph->execution_order().empty()) {

@ -62,11 +67,20 @@ void GEGraphOptimization::OptimizeACLGraph(const KernelGraphPtr &graph) {
  }
  opt::AscendUnfoldInputsForSpecialNodes(graph);
  opt::GEBackendOptimizeACL(graph);
  for (auto &child_graph : graph->child_graph_order()) {
    OptimizeACLGraph(child_graph.lock(), memo);
  }
  MS_LOG(DEBUG) << "Status record: end optimize acl graph. graph id: " << graph->graph_id();
}

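Both overloads now thread a memo set through the recursion so that a child graph shared by several parents is optimized exactly once. The traversal pattern in isolation (Visit stands in for any of the memoized passes here; child_graph_order() holds weak pointers, hence lock()):

// Hedged sketch of the memoized graph traversal used by the new overloads.
void Visit(const KernelGraphPtr &graph, std::set<KernelGraphPtr> *memo) {
  if (graph == nullptr || memo->find(graph) != memo->end()) {
    return;  // already handled via another parent graph
  }
  memo->insert(graph);
  // ... per-graph optimization work goes here ...
  for (auto &child : graph->child_graph_order()) {
    Visit(child.lock(), memo);
  }
}
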
void GEGraphOptimization::OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph) {
void GEGraphOptimization::OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph,
                                                            std::set<KernelGraphPtr> *const memo) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(memo);
  if (memo->find(graph) != memo->end()) {
    return;
  }
  memo->insert(graph);
  MS_LOG(DEBUG) << "Status record: start optimize acl graph after kernel select. graph id: " << graph->graph_id();
  // empty graphs don't enter the backend
  if (graph->execution_order().empty()) {
@ -76,9 +90,26 @@ void GEGraphOptimization::OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr
    MS_LOG(DEBUG) << "Status record: end optimize acl graph after kernel select. graph id: " << graph->graph_id();
    return;
  }
  opt::GEBackendOptimizeACLAfterKernelSelect(graph);
  for (auto &child_graph : graph->child_graph_order()) {
    OptimizeACLGraphAfterKernelSelect(child_graph.lock(), memo);
  }
  MS_LOG(DEBUG) << "Status record: end optimize acl graph after kernel select. graph id: " << graph->graph_id();
}

void GEGraphOptimization::OptimizeACLGraphAfterInline(const KernelGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(DEBUG) << "Status record: start optimize acl graph after inline. graph id: " << graph->graph_id();
  // empty graphs don't enter the backend
  if (graph->execution_order().empty()) {
    MS_LOG(DEBUG) << graph->ToString() << " is empty graph.";
    AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
    graph->set_executable(false);
    MS_LOG(DEBUG) << "Status record: end optimize acl graph after inline. graph id: " << graph->graph_id();
    return;
  }
  opt::GEAfterInlineOptimize(graph);
  MS_LOG(DEBUG) << "Status record: end optimize acl graph after inline. graph id: " << graph->graph_id();
}

void GEGraphOptimization::UnifyMindIR(const KernelGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "Status record: start unify mindir. graph id: " << graph->graph_id();
@ -31,8 +31,9 @@ class GEGraphOptimization {
    return instance;
  }
  void OptimizeGEGraph(const KernelGraphPtr &graph);
  void OptimizeACLGraph(const KernelGraphPtr &graph);
  void OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph);
  void OptimizeACLGraph(const KernelGraphPtr &graph, std::set<KernelGraphPtr> *const memo);
  void OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph, std::set<KernelGraphPtr> *const memo);
  void OptimizeACLGraphAfterInline(const KernelGraphPtr &graph);
  void UnifyMindIR(const KernelGraphPtr &graph);
  void GEMindIRPass(const KernelGraphPtr &graph) const;

@ -16,13 +16,20 @@
#include "plugin/device/ascend/hal/hardware/ge_kernel_executor.h"
#include <utility>
#include <algorithm>
#include "include/common/utils/parallel_context.h"
#include "acl/acl_rt.h"
#include "acl/acl_op_compiler.h"
#include "include/common/profiler.h"
#include "mindspore/core/ops/array_ops.h"
#include "mindspore/core/ops/framework_ops.h"
#include "backend/common/session/kernel_graph_mgr.h"
#include "ops/auto_generate/gen_ops_primitive.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
#include "plugin/device/ascend/hal/hardware/ge_graph_optimization.h"
#include "plugin/device/ascend/hal/common/ascend_utils.h"
#include "plugin/device/ascend/hal/hardware/acl_somas.h"
#include "plugin/device/ascend/hal/hardware/acl_stream_assign.h"
#include "plugin/device/ascend/kernel/rts/rt_kernel_build.h"
#include "plugin/device/ascend/kernel/hccl/hccl_kernel_metadata.h"
#include "plugin/device/ascend/kernel/hccl/hccl_kernel_build.h"

@ -121,6 +128,71 @@ void SetAclOpPrecisionMode() {
    MS_LOG(EXCEPTION) << "Acl set op precision mode failed! Error flag is " << ret;
  }
}

void SelectKernel(const KernelGraphPtr &kernel_graph, std::set<KernelGraphPtr> *const memo) {
  // select kernel
  MS_EXCEPTION_IF_NULL(memo);
  if (memo->find(kernel_graph) != memo->end()) {
    return;
  }
  memo->insert(kernel_graph);
  bool aclnn_can_used = !kernel_graph->is_from_single_op();
  const auto &kernels = kernel_graph->execution_order();
  for (const auto &kernel : kernels) {
    auto [select_res, msg, etype] =
      device::ascend::SelectKernelInfoWithMsg(kernel, aclnn_can_used && kernel::IsEnabledAclnn(kernel));
    if (!select_res) {
      MS_LOG(INFO) << "node is " << kernel->fullname_with_scope() << " should backoff";
      std::pair<std::string, ExceptionType> failure_info = std::make_pair(msg, etype);
      device::ascend::HandleKernelSelectFailure(kernel_graph, kernel, failure_info);
    }
  }
  if (!kernel_graph->is_from_single_op()) {
    kernel_graph->SetKernelObjectTypesForUnrealNodes();
  }
  for (auto &child_graph : kernel_graph->child_graph_order()) {
    SelectKernel(child_graph.lock(), memo);
  }
}

void InlineSubGraph(const KernelGraphPtr &graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
  bool save_graphs = context_ptr->CanDump(kIntroductory);
  if (save_graphs) {
    std::string file_name = "hwopt_d_before_inline_graph_" + std::to_string(graph->graph_id()) + ".ir";
    DumpIR(file_name, graph, true, kWholeStack);
  }
#endif
  auto kernel_cnodes = graph->execution_order();
  for (auto &kernel_cnode : kernel_cnodes) {
    MS_EXCEPTION_IF_NULL(kernel_cnode);
    if (common::AnfAlgo::CheckPrimitiveType(kernel_cnode, prim::kPrimCallInline)) {
      auto sub_graph = common::AnfAlgo::GetNodeAttr<KernelGraphPtr>(kernel_cnode, kAttrKernelGraph);
      MS_EXCEPTION_IF_NULL(sub_graph);
      MS_LOG(INFO) << "InlineSubGraph: " << kernel_cnode->fullname_with_scope()
                   << ", sub graph: " << sub_graph->graph_id() << ", need inline: " << sub_graph->need_inline();
      auto main_graph = kernel_cnode->func_graph();
      MS_EXCEPTION_IF_NULL(main_graph);
      auto mng = main_graph->manager();
      auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_cnode->kernel_info());
      MS_EXCEPTION_IF_NULL(kernel_info);
      AnfNodePtrList inp(kernel_cnode->inputs().begin() + 1, kernel_cnode->inputs().end());
      const auto &ref_map = sub_graph->GetRefMap();
      auto out = session::KernelGraphMgr::DoInline(sub_graph, main_graph, inp, kernel_cnode->input(0)->scope(),
                                                   kernel_info->graph_id(), ref_map, graph);
      (void)mng->Replace(kernel_cnode, out);
    }
  }
  GEGraphOptimization::GetInstance().OptimizeACLGraphAfterInline(graph);
#ifdef ENABLE_DUMP_IR
  if (save_graphs) {
    std::string file_name = "hwopt_d_after_inline_graph_" + std::to_string(graph->graph_id()) + ".ir";
    DumpIR(file_name, graph, true, kWholeStack);
  }
#endif
}
}  // namespace

void GeKernelExecutor::Initialize() {
@ -170,24 +242,15 @@ void GeKernelExecutor::OptimizeGraph(const FuncGraphPtr &graph) const {
    return;
  }
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "GeOptimizeGraph", 1, 0, 0);
  GEGraphOptimization::GetInstance().OptimizeACLGraph(kernel_graph);
  bool aclnn_can_used = !kernel_graph->is_from_single_op();
  // select kernel
  const auto &kernels = kernel_graph->execution_order();
  for (const auto &kernel : kernels) {
    auto [select_res, msg, etype] =
      device::ascend::SelectKernelInfoWithMsg(kernel, aclnn_can_used && kernel::IsEnabledAclnn(kernel));
    if (!select_res) {
      MS_LOG(INFO) << "node is " << kernel->fullname_with_scope() << " should backoff";
      std::pair<std::string, ExceptionType> failure_info = std::make_pair(msg, etype);
      device::ascend::HandleKernelSelectFailure(kernel_graph, kernel, failure_info);
    }
  }
  if (!kernel_graph->is_from_single_op() && !kernel_graph->has_flag(kFlagIsPyNativeBpropKernelGraph)) {
    kernel_graph->SetKernelObjectTypesForUnrealNodes();
  }

  GEGraphOptimization::GetInstance().OptimizeACLGraphAfterKernelSelect(kernel_graph);
  std::set<KernelGraphPtr> memo;
  GEGraphOptimization::GetInstance().OptimizeACLGraph(kernel_graph, &memo);
  memo.clear();
  SelectKernel(kernel_graph, &memo);
  memo.clear();
  GEGraphOptimization::GetInstance().OptimizeACLGraphAfterKernelSelect(kernel_graph, &memo);
  memo.clear();
  InlineSubGraph(kernel_graph);
  OptimizeExecutionOrder(NOT_NULL(graph));
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "GeOptimizeGraph", 1, 0, 1);
}

@ -214,6 +277,79 @@ void GeKernelExecutor::CreateKernel(const std::vector<CNodePtr> &nodes) const {
  MS_LOG(DEBUG) << "Status record: end create kernel.";
}

namespace {
void CreateEventKernelMod(const KernelGraphPtr &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto nodes = kernel_graph->execution_order();
  for (auto &node : nodes) {
    MS_EXCEPTION_IF_NULL(node);
    if (!IsOneOfPrimitiveCNode(node, {prim::kPrimStreamSend, prim::kPrimStreamRecv})) {
      continue;
    }
    device::ascend::GenerateKernelBuildInfo(node, RT_KERNEL);
    auto kernel_mod_ptr = kernel::RtOpBuild(node);
    MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
    AnfAlgo::SetKernelMod(kernel_mod_ptr, node.get());
  }
}
}  // namespace

void GeKernelExecutor::DoStreamAssign(const KernelGraphPtr &kernel_graph) {
  MS_LOG(DEBUG) << "Status record: start stream assign.";
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // stream assign
  AclStreamAssign::GetInstance().AssignStream(NOT_NULL(kernel_graph));
  CreateEventKernelMod(kernel_graph);
#ifdef ENABLE_DUMP_IR
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool save_graphs = context_ptr->CanDump(kIntroductory);
  if (save_graphs) {
    std::string file_name = "hwopt_d_after_stream_assign_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
  }
#endif
  kernel_graph->PrintGraphExecuteOrder();
  MS_LOG(DEBUG) << "Status record: end stream assign.";
}

void GeKernelExecutor::DoSomas(const FuncGraphPtr &graph) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  MS_EXCEPTION_IF_NULL(graph);
  auto kernel_graph = graph->cast<KernelGraphPtr>();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  DoStreamAssign(kernel_graph);
  // somas
  MS_LOG(DEBUG) << "Status record: start do somas.";
  if (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) != kOptimizeO0) {
    auto somas = std::make_shared<AclSomas>();
    bool ret = somas->Assign(kernel_graph);
    if (ret) {
      MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id()
                   << " somas size: " << kernel_graph->somas_whole_block_size();
    } else if (somas->IsSupportSomas(*kernel_graph)) {
      MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id();
    }
  }
  MS_LOG(DEBUG) << "Status record: end do somas.";
}

void GeKernelExecutor::OptimizeExecutionOrder(const FuncGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto kernel_graph = graph->cast<KernelGraphPtr>();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  MS_LOG(DEBUG) << "Status record: start optimize execution order. graph id: " << kernel_graph->graph_id();
  auto execution_order = kernel_graph->execution_order();
  kernel_graph->EnableRuntimeCache();
  common::AnfAlgo::ReorderExecList(NOT_NULL(&execution_order));
  kernel_graph->DisableRuntimeCache();
  kernel_graph->set_execution_order(execution_order);
  MS_LOG(DEBUG) << "Status record: end optimize execution order. graph id: " << kernel_graph->graph_id();
}

void GeKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  profiler::CollectHostInfo("Ascend", "PreprocessBeforeRun", "GePreprocess", 1, 0, 0);
@ -250,16 +386,16 @@ void GeKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
    }
  }

  DoSomas(NOT_NULL(graph));

  profiler::CollectHostInfo("Ascend", "PreprocessBeforeRun", "GePreprocess", 1, 0, 1);
}

bool GeKernelExecutor::PySyncRuning() const {
bool GeKernelExecutor::PySyncRuning(size_t stream_id) const {
  MS_EXCEPTION_IF_NULL(res_manager_);
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  MS_EXCEPTION_IF_NULL(res_manager_);
  if ((ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) &&
      ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE) &&
      !res_manager_->SyncStream(kDefaultStreamIndex)) {
  if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE) && !res_manager_->SyncStream(stream_id)) {
    return false;
  }
  return true;
@ -299,6 +435,7 @@ bool GeKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<KernelT
  auto stream = AscendStreamMng::GetInstance().GetStream(stream_id);
  if (stream == nullptr) {
    stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
    stream_id = kDefaultStreamIndex;
  }
  MS_EXCEPTION_IF_NULL(stream);
#ifdef ENABLE_DEBUGGER
@ -329,10 +466,9 @@ bool GeKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<KernelT
    }
  }
  // for PyNative Sync Run mode
  auto ret = PySyncRuning();
  auto ret = PySyncRuning(stream_id);
  PROFILER_END(start_time, runtime::ProfilerModule::kKernel, runtime::ProfilerEvent::kKernelLaunch,
               kernel->fullname_with_scope(), false);

  return ret;
}

@ -60,6 +60,7 @@ class GeKernelExecutor : public KernelExecutor {
  // Unify the MindIR, the default behavior uses the common unified MindIR.
  void UnifyMindIR(const KernelGraphPtr &graph) const override;
  void AddMindIRPass(const KernelGraphPtr &graph) const override;
  void OptimizeExecutionOrder(const FuncGraphPtr &graph) const;

  // Get rank id for distributed training.
  uint32_t GetRankID() const override { return 0; }
@ -69,10 +70,12 @@ class GeKernelExecutor : public KernelExecutor {
                  const device::DeviceAddressPtrList &output_addr_list, const size_t &stream_id) const override;

 private:
  static void DoSomas(const FuncGraphPtr &graph);
  static void DoStreamAssign(const KernelGraphPtr &kernel_graph);
  // launch
  bool PySyncRuning() const;
  bool MemoryCopyAsync(const CNodePtr &node, const vector<KernelTensor *> &inputs,
                       const vector<KernelTensor *> &outputs) const;
  bool PySyncRuning(size_t stream_id) const;

  mutable std::set<CNodePtr> nop_op_to_memcpy_;
  // Maybe AscendDeviceResManager and GEDeviceResManager now
@ -170,9 +170,10 @@ HcclMode HcclAdapter::GetCurrentHcclMode() const {
  MS_EXCEPTION_IF_NULL(context);
  bool is_graph_mode = context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode;
  bool is_task_sink = context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
  bool graph_op_run = context->IsKByKExecutorMode();
  if (!is_graph_mode) {
    return HcclMode::kPynative;
  } else if (is_task_sink) {
  } else if (is_task_sink && !graph_op_run) {
    return HcclMode::kGraph;
  } else {
    return HcclMode::kKernelByKernel;
@ -190,10 +191,9 @@ void HcclAdapter::CheckExcutionMode() const {
}

std::string HcclAdapter::GetHcclModeString(HcclMode hccl_mode) {
  static std::map<HcclMode, std::string> kHcclModeString = {
    {HcclMode::kGraph, "GRAPH_MODE"},
    {HcclMode::kPynative, "PYNATIVE_MODE"},
    {HcclMode::kKernelByKernel, "GRAPH_MODE disable TASK_SINK"}};
  static std::map<HcclMode, std::string> kHcclModeString = {{HcclMode::kGraph, "GRAPH_MODE"},
                                                            {HcclMode::kPynative, "PYNATIVE_MODE"},
                                                            {HcclMode::kKernelByKernel, "KERNEL_BY_KERNEL_MODE"}};
  return kHcclModeString.at(hccl_mode);
}

@ -43,7 +43,11 @@ file(GLOB_RECURSE SRC_IN_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
    "pyboost/*.cc"
    "pyboost/auto_generate/*.cc"
    "pyboost/ops/*.cc"
    "ascend_kernel_mod.cc"
    "rts/send.cc"
    "rts/recv.cc"
    "rts/rt_kernel.cc"
    "rts/rt_kernel_build.cc"
    "rts/rt_kernel_info.cc"
)
list(REMOVE_ITEM SRC_IN_910B ${AICPU_OPS_SRC})

|
@ -91,9 +91,5 @@ bool ReduceSumAclnnKernelMod::Launch(const std::vector<KernelTensor *> &inputs,
|
|||
RunOp(stream_ptr, workspace);
|
||||
return true;
|
||||
}
|
||||
|
||||
MS_ACLLNN_KERNEL_FACTORY_REG(ReduceAll, ReduceAllAclnnKernelMod);
|
||||
MS_ACLLNN_KERNEL_FACTORY_REG(ReduceAny, ReduceAnyAclnnKernelMod);
|
||||
MS_ACLLNN_KERNEL_FACTORY_REG(ReduceSum, ReduceSumAclnnKernelMod);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -16,7 +16,6 @@

#include "plugin/device/ascend/kernel/rts/recv.h"
#include "utils/ms_context.h"
#include "plugin/device/ascend/hal/device/ge_runtime/task_info.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"

@ -25,8 +24,6 @@

namespace mindspore {
namespace kernel {
using mindspore::ge::model_runner::EventWaitTaskInfo;
using EventWaitTaskInfoPtr = std::shared_ptr<EventWaitTaskInfo>;

RecvKernel::~RecvKernel() {}

@ -62,14 +59,5 @@ bool RecvKernel::Launch(const std::vector<KernelTensor *> &, const std::vector<K
  }
  return true;
}

std::vector<TaskInfoPtr> RecvKernel::GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
                                             const std::vector<AddressPtr> &, uint32_t stream_id) {
  MS_LOG(INFO) << "RecvKernel GenTask event_id_:" << event_id_ << ", stream_id_:" << stream_id;
  stream_id_ = stream_id;
  EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(unique_name_, stream_id, event_id_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
}
}  // namespace kernel
}  // namespace mindspore

@ -32,8 +32,7 @@ class RecvKernel : public RtKernel {
  bool Init(const AnfNodePtr &anf_node) override;
  bool Launch(const std::vector<KernelTensor *> &, const std::vector<KernelTensor *> &,
              const std::vector<KernelTensor *> &, void *stream_ptr) override;
  std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                                   const std::vector<AddressPtr> &outputs, uint32_t stream_id) override;
  std::vector<KernelAttr> GetOpSupport() override { MS_LOG(EXCEPTION) << "This interface is not supported in RtKernel."; }

 private:
  uint32_t event_id_{0};
@ -41,12 +41,5 @@ RtKernel::RtKernel() {}
RtKernel::~RtKernel() {}

bool RtKernel::Init(const mindspore::AnfNodePtr & /* anf_node */) { return true; }

void RtKernel::SetInputSizeList(const std::vector<size_t> &size_list) { mutable_input_size_list_ = size_list; }
void RtKernel::SetOutputSizeList(const std::vector<size_t> &size_list) { mutable_output_size_list_ = size_list; }
void RtKernel::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { mutable_workspace_size_list_ = size_list; }

const std::vector<size_t> &RtKernel::GetOutputSizeList() const { return mutable_output_size_list_; }
const std::vector<size_t> &RtKernel::GetWorkspaceSizeList() const { return mutable_workspace_size_list_; }
}  // namespace kernel
}  // namespace mindspore

@ -22,26 +22,17 @@
#include <memory>
#include <map>
#include <string>
#include "plugin/device/ascend/kernel/ascend_kernel_mod.h"
#include "kernel/task_stream.h"
#include "kernel/kernel.h"
#include "acl/acl.h"
#include "acl/acl_rt.h"

namespace mindspore {
namespace kernel {
class RtKernel : public AscendKernelMod {
class RtKernel : public KernelMod {
 public:
  RtKernel();
  ~RtKernel() override;
  virtual bool Init(const AnfNodePtr &anf_node);
  void SetInputSizeList(const std::vector<size_t> &size_list) override;
  void SetOutputSizeList(const std::vector<size_t> &size_list) override;
  void SetWorkspaceSizeList(const std::vector<size_t> &size_list) override;
  const std::vector<size_t> &GetOutputSizeList() const override;
  const std::vector<size_t> &GetWorkspaceSizeList() const override;

 protected:
  mutable std::vector<size_t> mutable_input_size_list_;
  mutable std::vector<size_t> mutable_output_size_list_;
  mutable std::vector<size_t> mutable_workspace_size_list_;
};

using RTKernelPtr = std::shared_ptr<RtKernel>;

@ -35,7 +35,6 @@ KernelModPtr RtOpBuild(const AnfNodePtr &anf_node) {
    MS_LOG(ERROR) << "Rt Op initialize failed!";
    return nullptr;
  }
  ker_ptr->SetNode(anf_node);
  return ker_ptr;
}
}  // namespace kernel

@ -18,14 +18,10 @@
#include "runtime/event.h"
#include "acl/acl.h"
#include "acl/acl_rt.h"
#include "plugin/device/ascend/hal/device/ge_runtime/task_info.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"

using mindspore::ge::model_runner::EventRecordTaskInfo;
using EventRecordTaskInfoPtr = std::shared_ptr<EventRecordTaskInfo>;

namespace mindspore {
namespace kernel {
SendKernel::~SendKernel() {}
@ -57,14 +53,5 @@ bool SendKernel::Launch(const std::vector<KernelTensor *> &, const std::vector<K
  }
  return true;
}

std::vector<TaskInfoPtr> SendKernel::GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
                                             const std::vector<AddressPtr> &, uint32_t stream_id) {
  MS_LOG(INFO) << "SendKernel GenTask event id:" << event_id_ << ", stream id:" << stream_id;
  stream_id_ = stream_id;
  EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(unique_name_, stream_id, event_id_);
  MS_EXCEPTION_IF_NULL(task_info_ptr);
  return {task_info_ptr};
}
}  // namespace kernel
}  // namespace mindspore

@ -31,8 +31,7 @@ class SendKernel : public RtKernel {
  bool Init(const AnfNodePtr &anf_node) override;
  bool Launch(const std::vector<KernelTensor *> &, const std::vector<KernelTensor *> &,
              const std::vector<KernelTensor *> &, void *stream_ptr) override;
  std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
                                   const std::vector<AddressPtr> &, uint32_t stream_id) override;
  std::vector<KernelAttr> GetOpSupport() override { MS_LOG(EXCEPTION) << "This interface is not supported in RtKernel."; }

 private:
  uint32_t event_id_{0};
@ -39,6 +39,7 @@ file(GLOB_RECURSE MS_OPTIMIZER_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
    "./ir_fusion/histogram_fixed_width_fusion.cc"
    "./ir_fusion/adaptive_max_pool2d_fusion.cc"
    "./ir_fusion/flash_attention_fusion.cc"
    "./enhancer/eliminate_maketuple_getitem.cc"
)
list(REMOVE_ITEM MS_OPTIMIZER_910B
    "mindir/ascend_vm_op_adapter.cc"

@ -24,18 +24,14 @@
#include "ir/anf.h"
#include "include/backend/optimizer/helper.h"
#include "include/backend/optimizer/optimizer.h"
#include "plugin/device/ascend/optimizer/ascend_helper.h"

namespace mindspore {
namespace opt {
class EliminateMaketupleGetitem : public Pass {
 public:
  EliminateMaketupleGetitem() : Pass("eliminate_maketuple_getitem"), kernel_select_(std::make_shared<KernelSelect>()) {}
  EliminateMaketupleGetitem() : Pass("eliminate_maketuple_getitem") {}
  ~EliminateMaketupleGetitem() override = default;
  bool Run(const FuncGraphPtr &graph) override;

 private:
  KernelSelectPtr kernel_select_;
};
}  // namespace opt
}  // namespace mindspore

@ -19,6 +19,7 @@
#include <vector>
#include <utility>
#include "mindspore/core/ops/sequence_ops.h"
#include "mindspore/core/ops/framework_ops.h"
#include "plugin/device/ascend/optimizer/format_type/utils.h"

namespace mindspore {
@ -39,6 +39,8 @@ CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &
  builder.SetOutputsReshapeType({reshape_type});
  builder.SetInputsDeviceType({input_type});
  builder.SetOutputsDeviceType({output_type});
  builder.SetInputsKernelObjectType({kernel::KernelObjectType::TENSOR});
  builder.SetOutputsKernelObjectType({kernel::KernelObjectType::TENSOR});
  builder.SetFusionType(kernel::kPatternOpaque);
  builder.SetProcessor(kernel::Processor::AICORE);
  if (kernel::OpLib::FindOp(prim::kPrimCast->name(), kernel::kImplyTBE) != nullptr) {

@ -0,0 +1,66 @@
/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "plugin/device/ascend/optimizer/ge/process_call_inline.h"
#include <memory>
#include <string>
#include <vector>
#include "include/backend/anf_runtime_algorithm.h"
#include "include/backend/optimizer/helper.h"
#include "include/common/utils/anfalgo.h"
#include "mindspore/core/ops/framework_ops.h"
#include "utils/anf_utils.h"

namespace mindspore {
namespace opt {
namespace {
bool CheckCallInline(const CNodePtr &cnode) {
  if (!common::AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimCall)) {
    return false;
  }
  auto call_graph = cnode->input(kIndex1);
  auto sub_kernel_graph = session::AnfRuntimeAlgorithm::GetValueNodeKernelGraph(call_graph);
  return sub_kernel_graph->need_inline();
}
}  // namespace

const BaseRef ProcessCallInline::DefinePattern() const {
  VarPtr Xs = std::make_shared<SeqVar>();
  return VectorRef({prim::kPrimCall, Xs});
}

const AnfNodePtr ProcessCallInline::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &) const {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(node);
  auto cnode = node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (CheckCallInline(cnode)) {
    auto call_graph = cnode->input(kIndex1);
    auto sub_kernel_graph = session::AnfRuntimeAlgorithm::GetValueNodeKernelGraph(call_graph);
    std::vector<AnfNodePtr> call_inline_inputs = {
      NewValueNode(std::make_shared<Primitive>(prim::kPrimCallInline->name()))};
    for (size_t i = kIndex1; i < common::AnfAlgo::GetInputNum(cnode); i++) {
      call_inline_inputs.emplace_back(common::AnfAlgo::GetInputNode(cnode, i));
    }
    auto call_inline = graph->NewCNode(call_inline_inputs);
    MS_EXCEPTION_IF_NULL(call_inline);
    call_inline->set_abstract(cnode->abstract());
    common::AnfAlgo::SetNodeAttr(kAttrKernelGraph, MakeValue(sub_kernel_graph), call_inline);
    return call_inline;
  }
  return nullptr;
}
}  // namespace opt
}  // namespace mindspore
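Note the pass only retags the call site; the sub graph body is spliced in later by InlineSubGraph via KernelGraphMgr::DoInline. A schematic before/after of the rewrite (illustrative IR, not exact dump syntax):

// before: %0 = Call(@sub_kernel_graph, %x, %y)   // @sub_kernel_graph has need_inline() == true
// after:  %0 = CallInline(%x, %y)                // with attr kAttrKernelGraph = @sub_kernel_graph
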
@ -0,0 +1,32 @@
/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_OPTIMIZER_PROCESS_CALL_INLINE_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_OPTIMIZER_PROCESS_CALL_INLINE_H_

#include "include/backend/optimizer/optimizer.h"

namespace mindspore {
namespace opt {
class ProcessCallInline : public PatternProcessPass {
 public:
  explicit ProcessCallInline(bool multi_graph = true) : PatternProcessPass("process call inline", multi_graph) {}
  ~ProcessCallInline() override = default;
  const BaseRef DefinePattern() const override;
  const AnfNodePtr Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &) const override;
};
}  // namespace opt
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_OPTIMIZER_PROCESS_CALL_INLINE_H_
@ -18,12 +18,13 @@

#include <memory>
#include <string>
#include "backend/common/pass/dropout_gen_mask_fusion.h"
#include "backend/common/pass/common_subexpression_elimination.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "include/common/debug/anf_ir_dump.h"
#include "include/common/debug/dump_proto.h"
#include "include/backend/optimizer/optimizer.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "include/backend/debug/profiler/profiling.h"
#include "plugin/device/ascend/hal/hardware/ge_utils.h"
#include "plugin/device/ascend/optimizer/ge/all_to_all_v_for_ge.h"
#include "plugin/device/ascend/optimizer/ge/maketuple_depend_remover.h"
#include "plugin/device/ascend/optimizer/ge/expand_dims_for_batchnorm.h"

@ -31,14 +32,14 @@
#include "plugin/device/ascend/optimizer/ge/convert_condition_input_to_scalar.h"
#include "plugin/device/ascend/optimizer/ge/hcom/add_parallel_group_for_hcom.h"
#include "plugin/device/ascend/optimizer/ge/hcom/add_depend_for_all_gather.h"

#include "plugin/device/ascend/optimizer/ge/expander_fallback.h"
#include "plugin/device/ascend/optimizer/ge/trans_depend_value_to_int32.h"
#include "plugin/device/ascend/optimizer/ge/expander_fallback.h"
#include "plugin/device/ascend/optimizer/ge/insert_identity.h"
#include "plugin/device/ascend/optimizer/ge/dropout_gen_mask_depend.h"
#include "plugin/device/ascend/optimizer/ge/unfold_maketuple.h"
#include "plugin/device/ascend/optimizer/ge/unfold_nested_output.h"
#include "plugin/device/ascend/optimizer/ge/resize_bilinear_add_attr.h"
#include "plugin/device/ascend/optimizer/ge/process_call_inline.h"
#include "plugin/device/ascend/optimizer/format_type/deal_ref_output.h"
#include "plugin/device/ascend/optimizer/ge/hcom/insert_load_for_allgather.h"
#include "plugin/device/ascend/optimizer/format_type/set_fracz_group_attr.h"

@ -49,14 +50,14 @@
#include "plugin/device/ascend/optimizer/ge/scalar_unify_mindir.h"
#include "plugin/device/ascend/optimizer/ge/tuple_unify_mindir.h"
#include "plugin/device/ascend/optimizer/ir_fission/seed_adapter.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_grad_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/ascend_convert_tuple_input_to_dynamic_input.h"
#include "plugin/device/ascend/optimizer/backend_common_unify_mindir.h"
#include "plugin/device/ascend/optimizer/ge/remove_tensor_to_scalar_or_tuple_ops.h"
#include "plugin/device/ascend/optimizer/ge/scalar_ops_output_unify_mindir.h"
#include "backend/common/pass/convert_const_input_to_tensor_input.h"
#include "backend/common/pass/insert_type_transform_op.h"
#include "backend/common/pass/insert_tensor_move_for_communication.h"
#include "plugin/device/ascend/optimizer/enhancer/eliminate_maketuple_getitem.h"

namespace mindspore {
namespace opt {

@ -120,6 +121,7 @@ void GEBackendOptimizeACL(const KernelGraphPtr &kernel_graph) {
  opt_acl_pm->AddPass(std::make_shared<SeedAdapter>());
  opt_acl_pm->AddPass(std::make_shared<opt::AICpuLibSelectPass>());
  opt_acl_pm->AddPass(std::make_shared<opt::TransDependValueToInt32>());
  opt_acl_pm->AddPass(std::make_shared<opt::ProcessCallInline>());
  opt_acl_pm->AddPass(std::make_shared<opt::ExpanderFallback>());
  optimizer->AddPassManager(opt_acl_pm);
  (void)optimizer->Optimize(kernel_graph);

@ -155,6 +157,7 @@ void GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr &kernel_graph) {
  auto opt_acl_after_kernel_select_pm = std::make_shared<PassManager>("opt_acl_after_kernel_select_pm");
  opt_acl_after_kernel_select_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
  opt_acl_after_kernel_select_pm->AddPass(std::make_shared<InsertIdentity>());
  opt_acl_after_kernel_select_pm->AddPass(std::make_shared<InsertTensorMoveForCommunication>());
  opt_acl_after_kernel_select_pm->AddPass(std::make_shared<EraseVisitAttr>());
  opt_acl_after_kernel_select_pm->AddPass(std::make_shared<DealRefOutput>());
  if (!kernel_graph->is_from_single_op() && !kernel_graph->has_flag(kFlagIsPyNativeBpropKernelGraph)) {

@ -201,6 +204,36 @@ void GEUnifyMindIR(const KernelGraphPtr &kernel_graph) {
  profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_UnifyMindIR", 0, 0, 1);
}

void GEAfterInlineOptimize(const KernelGraphPtr &kernel_graph) {
  profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 0);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name =
      "hwopt_d_before_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph);
  }
#endif
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto after_inline_pm = std::make_shared<PassManager>("after_inline_pm");
  after_inline_pm->AddPass(std::make_shared<DropoutGenMaskFusion>());
  after_inline_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
  after_inline_pm->AddPass(std::make_shared<EliminateMaketupleGetitem>());
  optimizer->AddPassManager(after_inline_pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
#ifdef ENABLE_DUMP_IR
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name =
      "hwopt_d_after_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph);
  }
#endif
  profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 1);
}

void GEDynamicUnifyMindIR(const FuncGraphPtr &func_graph) {
  profiler::CollectHostInfo("GE", "GE Dynamic Shape Unify MindIR", "GEBackend_Dynamic_UnifyMindIR", 0, 0, 0);
  MS_EXCEPTION_IF_NULL(func_graph);

@ -24,6 +24,7 @@ namespace opt {
void GEBackendOptimization(const KernelGraphPtr &kernel_graph);
void GEBackendOptimizeACL(const KernelGraphPtr &kernel_graph);
void GEUnifyMindIR(const KernelGraphPtr &kernel_graph);
void GEAfterInlineOptimize(const KernelGraphPtr &kernel_graph);
void GEDynamicUnifyMindIR(const FuncGraphPtr &func_graph);
void GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr &kernel_graph);
PassManagerPtr GetGEUnifyMindIRPassManager();

@ -44,7 +44,8 @@ const AnfNodePtr AscendConvertTupleInputToDynamicInput::Process(const FuncGraphP
  static const PrimitiveSet need_unfold_calculate_node = {
    prim::kPrimAddN, prim::kPrimConcatD, prim::kPrimPack, prim::kPrimStack,
    prim::kPrimPrint, prim::kPrimConcat, prim::kPrimAccumulateNV2, prim::kPrimMeshgrid,
    prim::kPrimTensorSummary, prim::kPrimDynamicStitch, prim::kPrimParallelConcat, prim::kPrimIncreFlashAttention};
    prim::kPrimTensorSummary, prim::kPrimDynamicStitch, prim::kPrimParallelConcat, prim::kPrimIncreFlashAttention,
    prim::kPrimIdentityN};

  static const PrimitiveSet need_unfold_control_node = {prim::kPrimSwitchLayer, prim::kPrimCall, prim::kPrimSwitch,
                                                        prim::kPrimCallInline};

@ -26,6 +26,7 @@
#include "runtime/pynative/op_executor.h"
#include "pipeline/pynative/pynative_execute.h"
#include "pipeline/jit/ps/pipeline.h"
#include "include/common/thread_pool.h"
#include "include/common/pybind_api/api_register.h"

namespace mindspore {
@ -38,6 +39,10 @@ void RegisterForkCallbacks() {
  MS_LOG(DEBUG) << "Register ActorMgr fork callbacks.";
  ForkUtils::GetInstance().RegisterCallbacks(ActorMgr::GetActorMgrRef(), static_cast<void (ActorMgr::*)()>(nullptr),
                                             static_cast<void (ActorMgr::*)()>(nullptr), &ActorMgr::ChildAfterFork);
  MS_LOG(DEBUG) << "Register Common ThreadPool fork callbacks.";
  ForkUtils::GetInstance().RegisterCallbacks(
    &common::ThreadPool::GetInstance(), static_cast<void (common::ThreadPool::*)()>(nullptr),
    static_cast<void (common::ThreadPool::*)()>(nullptr), &common::ThreadPool::ChildAfterFork);
  MS_LOG(DEBUG) << "Register PyNativeExecutor fork callbacks.";
  ForkUtils::GetInstance().RegisterCallbacks(
    pynative::PyNativeExecutor::GetInstance(), &pynative::PyNativeExecutor::ParentBeforeFork,

@ -68,17 +68,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread
  }
}

bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
  MS_EXCEPTION_IF_NULL(node);
  if (strategy == GraphExecutionStrategy::kStep) {
    return false;
  }

  if (node->isa<CNode>() && common::AnfAlgo::IsGetNextNode(node)) {
    return true;
  }
  return false;
}
bool IsDeviceQueueDSActor(const AnfNodePtr &, GraphExecutionStrategy) { return false; }

bool IsHostQueueDSActor(const AnfNodePtr &node, const KernelGraphPtr &graph,
                        const std::vector<AnfNodePtr> &host_parameters, GraphExecutionStrategy strategy) {

@ -136,7 +126,7 @@ bool IsCustomActor(const AnfNodePtr &node) {
  return AnfUtils::IsCustomActorNode(node);
}

bool IsKernelActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
bool IsKernelActor(const AnfNodePtr &node, GraphExecutionStrategy) {
  MS_EXCEPTION_IF_NULL(node);
  if (IsCustomActor(node)) {
    return false;

@ -146,11 +136,7 @@ bool IsKernelActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
    return false;
  }

  if (strategy == GraphExecutionStrategy::kStep) {
    return true;
  }

  return !common::AnfAlgo::IsGetNextNode(node);
  return true;
}

bool IsSkippedKernelActor(const AnfNodePtr &node) {

@ -129,7 +129,7 @@ void KernelActor::InitOutputInfo() {
  }
  // Used to keep graph output address when somas block memory free, and reused by the ref count in other graphs.
  if (somas_graph_output_indexes_.count(i) > 0) {
    (void)somas_info_->InsertGraphOutputInfo(output_address.get(), somas_outputs[i].second);
    (void)somas_info_->InsertGraphOutputInfo(output_address.get(), somas_outputs[i].first, somas_outputs[i].second);
  } else {
    UpdateRefCount(output_address.get(), true);
  }

@ -22,6 +22,7 @@
#include <functional>
#include <string>
#include <utility>
#include <unordered_map>
#include "acl/acl_base.h"
#include "acl/acl.h"
#include "transform/acl_ir/op_api_convert.h"

@ -133,9 +134,32 @@ class ReleaseCall {
  T converted_params_;
};

class ApiCachePool {
 public:
  ApiCachePool() = default;
  ~ApiCachePool() = default;

  const char *get(const std::string &str) {
    auto it = pool_.find(str);
    if (it != pool_.end()) {
      return it->second.c_str();
    }
    auto [map_iter, inserted] = pool_.emplace(str, str);
    if (!inserted) {
      MS_LOG(EXCEPTION) << "Failed to cache api.";
    }
    return map_iter->second.c_str();
  }

 private:
  std::unordered_map<std::string, std::string> pool_;
};

// For normal generate executor.
#define GEN_EXECUTOR(aclnn_api, ...) \
  [](const char *api_name, const std::string &workspace_api_name, const auto &... args) -> auto { \
  [](const std::string &api_str, const std::string &workspace_api_name, const auto &... args) -> auto { \
    static transform::ApiCachePool api_cache_pool; \
    const char *api_name = api_cache_pool.get(api_str); \
    static const auto get_workspace_size_func_ptr = transform::GetOpApiFunc(workspace_api_name.c_str()); \
    if (get_workspace_size_func_ptr == nullptr) { \
      MS_LOG(EXCEPTION) << workspace_api_name << " not in " << transform::GetOpApiLibName() << ", please check!"; \
@ -163,7 +187,7 @@ class ReleaseCall {
    release_func = std::function<void()>(releas_call); \
    return std::make_tuple(workspace_size, executor, release_func); \
  } \
  (aclnn_api.c_str(), aclnn_api + "GetWorkspaceSize", __VA_ARGS__)
  (aclnn_api, aclnn_api + "GetWorkspaceSize", __VA_ARGS__)

// For custom generate executor.
#define GEN_EXECUTOR_CUST(aclnn_api, ...) \
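The macro change above is presumably the reason ApiCachePool exists: the old form passed aclnn_api.c_str() into the lambda, a pointer into a std::string argument that need not outlive the call, while the new form interns the name and hands out a pointer that stays valid for the life of the process. The same idea in isolation (NameInterner is an illustrative stand-in, not part of the commit):

#include <string>
#include <unordered_map>

// Interns strings so callers may hold the returned const char * indefinitely.
class NameInterner {
 public:
  const char *Intern(const std::string &name) {
    auto [it, inserted] = cache_.emplace(name, name);
    (void)inserted;             // an existing entry is fine; the pointer is shared
    return it->second.c_str();  // stable: unordered_map never relocates its nodes
  }

 private:
  std::unordered_map<std::string, std::string> cache_;
};
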
@ -129,7 +129,12 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
  set_param<bool>(MS_CTX_ENABLE_GRAD_COMM_OPT, false);
  set_param<bool>(MS_CTX_INTERLEAVED_MATMUL_COMM, false);
  set_param<bool>(MS_CTX_INTERLEAVED_LAYERNORM_COMM, false);
  set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0);
  if (target == kAscendDevice || target == kDavinciDevice) {
    // enable somas by default on ascend
    set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO1);
  } else {
    set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0);
  }
  set_param<uint32_t>(MS_CTX_OP_TIMEOUT, kOpTimeout);
  set_param<int>(MS_CTX_JIT_SYNTAX_LEVEL, kLax);
  set_param<std::string>(MS_CTX_CONV_FPROP_ALGO, "normal");

@ -166,7 +166,7 @@ def init(backend_name=None):

    if backend_name == "hccl":
        if _is_ps_mode():
            # Use MindSpore cluster to build network for Parameter Server traning.
            # Use MindSpore cluster to build network for Parameter Server training.
            init_cluster()
            if _is_role_sched() or _is_role_pserver():
                raise RuntimeError("Parameter server and scheduler should use 'CPU' as backend instead of 'Ascend'")
setup.py
@ -120,6 +120,8 @@ package_data = {
        'lib/plugin/*/*/*/*/*/*',
        'lib/plugin/*/*/*/*/*/*/*',
        'lib/plugin/*/*/*/*/*/*/*/*',
        'lib/plugin/*/*/*/*/*/*/*/*/*',
        'lib/plugin/*/*/*/*/*/*/*/*/*/*',
        'lib/*.so*',
        'lib/*.a',
        'lib/*.dylib*',

@ -70,7 +70,7 @@ def test_fastgelu_op_forward_ascend(context_mode, data_type):
    x = ms.Tensor(np.array([1., 2., 3.]).astype(data_type))
    out = fastgelu_forward_func(x)
    expect_out = np.array([0.845703, 1.9375, 2.982422]).astype(np.float32)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)


@pytest.mark.level0

@ -107,7 +107,7 @@ def test_fastgelu_op_backward_ascend(context_mode, data_type):
    x = ms.Tensor(np.array([1., 2., 3.]).astype(data_type))
    grads = fastgelu_backward_func(x)
    expect_out = np.array([1.069909, 1.072501, 1.022826]).astype(np.float32)
    np.testing.assert_allclose(grads.asnumpy(), expect_out, rtol=1e-4)
    np.testing.assert_allclose(grads.asnumpy(), expect_out, rtol=1e-2)


@pytest.mark.level1

@ -144,4 +144,4 @@ def test_fastgelu_op_vmap_ascend(context_mode, data_type):
    x = ms.Tensor(np.array([1., 2., 3.]).astype(data_type))
    out = fastgelu_vmap_func(x)
    expect_out = np.array([0.845703, 1.9375, 2.982422]).astype(np.float32)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)

@ -67,7 +67,7 @@ def test_fastgelugrad_op_forward_ascend(context_mode, data_type):
    dy = ms.Tensor(np.array([1, 2, 3]).astype(data_type))
    out = fastgelugrad_forward_func(dy, x)
    expect_out = np.array([1.069909, 2.145001, 3.068479]).astype(np.float32)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)


@pytest.mark.level1

@ -106,4 +106,4 @@ def test_fastgelugrad_op_vmap_ascend(context_mode, data_type):
    dy = ms.Tensor(np.array([1, 2, 3]).astype(data_type))
    out = fastgelugrad_vmap_func(dy, x)
    expect_out = np.array([1.069909, 2.145001, 3.068479]).astype(np.float32)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
    np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)

@ -14,6 +14,7 @@
# ============================================================================
"""burgers pinns"""
import time
import os
import pytest

import numpy as np

@ -55,6 +56,7 @@ def test_mindflow_burgers_pinns():
    Description: test train and eval
    Expectation: success
    """
    os.environ['GRAPH_OP_RUN'] = "0"
    context.set_context(mode=context.GRAPH_MODE)
    model = Net()
    optimizer = nn.Adam(model.trainable_params(), 0.0001)

@ -132,3 +134,4 @@ def test_mindflow_burgers_pinns():
    assert epoch_time < 0.01
    assert train_loss < 0.6
    assert eval_error < 0.8
    del os.environ['GRAPH_OP_RUN']

@ -14,6 +14,7 @@
# ============================================================================
"""navier stokes pinns"""
import time
import os
import pytest

import numpy as np

@ -55,6 +56,7 @@ def test_mindflow_navier_stokes():
    Description: test train
    Expectation: success
    """
    os.environ['GRAPH_OP_RUN'] = "0"
    context.set_context(mode=context.GRAPH_MODE)
    model = Net(in_channels=3, out_channels=3)
    optimizer = nn.Adam(model.trainable_params(), 0.0001)

@ -127,3 +129,4 @@ def test_mindflow_navier_stokes():
    else:
        assert epoch_time < 0.01
        assert train_loss < 0.8
    del os.environ['GRAPH_OP_RUN']

@ -35,7 +35,7 @@ def run_testcase(testcase_name, expect_memory_usage):
    os.remove(log_filename)
    assert not os.path.exists(log_filename)

    cmd = f"export GLOG_v=1; pytest -s test_recompute.py::" + testcase_name + " > " \
    cmd = f"export GLOG_v=1; export GRAPH_OP_RUN=0; pytest -s test_recompute.py::" + testcase_name + " > " \
          + log_filename + " 2>&1"
    subprocess.check_output(cmd, shell=True)
    assert os.path.exists(log_filename)