add kbk features

reku1997 2023-12-18 14:10:31 +08:00 committed by limingqi107
parent cbb2c60804
commit b7dc93c5e2
55 changed files with 1329 additions and 280 deletions

View File

@@ -28,6 +28,7 @@
#include "ops/framework_ops.h"
#include "ir/anf.h"
#include "utils/log_adapter.h"
#include "ir/func_graph_cloner.h"
#include "utils/shape_utils.h"
#include "include/common/utils/utils.h"
#include "include/common/utils/parallel_context.h"

View File

@@ -2749,5 +2749,88 @@ void KernelGraphMgr::SetKernelGraphId(const KernelGraphPtr &kernel_graph) {
}
void KernelGraphMgr::UnifyMindIR(const KernelGraphPtr &graph) { opt::CommonUnifyMindIR(graph); }
namespace {
void CopyCNodeInfo(const FuncGraphPtr &func_graph, const uint32_t &target_graph_id, const AnfNodePtr &ori_node,
const AnfNodePtr &new_node) {
MS_EXCEPTION_IF_NULL(new_node);
MS_EXCEPTION_IF_NULL(ori_node);
auto kernel_info = dynamic_cast<device::KernelInfo *>(new_node->kernel_info());
// deep copy kernel info
if (kernel_info != nullptr && kernel_info->has_build_info()) {
// some check
MS_EXCEPTION_IF_CHECK_FAIL(kernel_info->MutableKernelMod() == nullptr,
"Inline ERROR: " + ori_node->DebugString() + ", kernel mod is not nullptr");
MS_EXCEPTION_IF_CHECK_FAIL(kernel_info->output_address_list().empty(),
"Inline ERROR: " + ori_node->DebugString() + ", output_address_list is not empty");
MS_EXCEPTION_IF_CHECK_FAIL(kernel_info->workspace_address_list().empty(),
"Inline ERROR: " + ori_node->DebugString() + ", workspace_address_list is not empty");
auto new_kernel_info = std::make_shared<device::KernelInfo>();
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(
AnfRuntimeAlgorithm::GetSelectKernelBuildInfo(new_node));
MS_EXCEPTION_IF_NULL(builder);
MS_EXCEPTION_IF_NULL(new_kernel_info);
new_kernel_info->set_select_kernel_build_info(builder->Build());
new_kernel_info->set_graph_id(target_graph_id);
new_kernel_info->set_feature_map_flag(kernel_info->is_feature_map());
new_kernel_info->set_ref_map(false, kernel_info->out_in_ref_map());
new_node->set_kernel_info(new_kernel_info);
} else {
auto new_kernel_info = std::make_shared<device::KernelInfo>();
new_node->set_kernel_info(new_kernel_info);
}
if (ori_node->isa<CNode>()) {
auto ori_cnode = ori_node->cast<CNodePtr>();
if (common::AnfAlgo::HasNodeAttr(kAttrIsUBFusionOp, ori_cnode) &&
common::AnfAlgo::GetNodeAttr<bool>(ori_node, kAttrIsUBFusionOp)) {
// already done fusion compile
auto ori_full_name = ori_cnode->fullname_with_scope();
common::AnfAlgo::SetNodeAttr(kAttrOriFusionName, MakeValue(ori_full_name), new_node);
}
common::AnfAlgo::SetNodeAttr(kAttrNeedInline, MakeValue(ori_node->fullname_with_scope()), new_node);
common::AnfAlgo::SetNodeAttr(kAttrPreKernelGraph, MakeValue(func_graph), new_node);
}
}
} // namespace
AnfNodePtr KernelGraphMgr::DoInline(const FuncGraphPtr &func_graph, const FuncGraphPtr &target_func_graph,
const AnfNodePtrList &func_graph_args, const ScopePtr &scope,
const uint32_t &target_graph_id,
const std::map<session::AnfWithOutIndex, session::AnfWithOutIndex> &ref_map,
const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(target_func_graph);
Cloner cloner({}, false);
if (scope != nullptr) {
cloner.set_scope(scope);
}
cloner.AddClone(func_graph, target_func_graph, func_graph_args, kInline);
auto node_list = TopoSort(func_graph->output());
for (auto &ori_node : node_list) {
MS_EXCEPTION_IF_NULL(ori_node);
if (ori_node->isa<Parameter>()) {
continue;
}
auto new_node = cloner[ori_node];
MS_EXCEPTION_IF_NULL(new_node);
if (new_node->isa<ValueNode>()) {
auto value_node = new_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
graph->AddValueNodeToGraph(value_node);
}
CopyCNodeInfo(func_graph, target_graph_id, ori_node, new_node);
}
for (const auto &kv : ref_map) {
auto final_pair = kv.first;
auto origin_pair = kv.second;
final_pair.first = cloner[final_pair.first];
origin_pair.first = cloner[origin_pair.first];
auto new_origin_pair = common::AnfAlgo::VisitKernel(origin_pair.first, origin_pair.second);
graph->AddRefCorrespondPairs(final_pair, new_origin_pair);
}
return cloner[func_graph->output()];
}
} // namespace session
} // namespace mindspore

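Note: DoInline clones every non-parameter node of the callee into the caller via Cloner(kInline), registers cloned value nodes and remapped ref pairs on the target kernel graph, and returns the cloned output. A minimal sketch of a call site, following the pattern InlineSubGraph uses later in this commit (call_node, main_graph, main_graph_id and main_kernel_graph are placeholders, not names from the source):

// Sketch only: inline one kPrimCallInline node's sub graph into its caller.
auto sub_graph = common::AnfAlgo::GetNodeAttr<KernelGraphPtr>(call_node, kAttrKernelGraph);
AnfNodePtrList args(call_node->inputs().begin() + 1, call_node->inputs().end());
auto out = session::KernelGraphMgr::DoInline(sub_graph, main_graph, args,
                                             call_node->input(0)->scope(), main_graph_id,
                                             sub_graph->GetRefMap(), main_kernel_graph);
// 'out' then replaces call_node in the caller, e.g. via the graph manager:
// main_graph->manager()->Replace(call_node, out);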
View File

@@ -98,6 +98,12 @@ class BACKEND_EXPORT KernelGraphMgr {
mindspore::HashMap<FuncGraph *, KernelGraphPtr> GetFrontBackendGraphMap() const { return front_backend_graph_map_; }
void CacheKernelGraph(const KernelGraphPtr &kg);
// do inline
static AnfNodePtr DoInline(const FuncGraphPtr &func_graph, const FuncGraphPtr &target_func_graph,
const AnfNodePtrList &func_graph_args, const ScopePtr &scope,
const uint32_t &target_graph_id,
const std::map<session::AnfWithOutIndex, session::AnfWithOutIndex> &ref_map,
const KernelGraphPtr &graph);
private:
void GetCNodeInfo(const CNodePtr &cnode, std::vector<AnfNodePtr> *cnode_inputs) const;

View File

@@ -812,14 +812,13 @@ void Somas::InitCommonNodeInputs(const CNodePtr &kernel) {
if ((op_name == kDynamicRNNOpName || op_name == kDynamicGRUV2OpName) && input_origin_type == kMetaTypeNone) {
continue;
}
-auto index = AnfAlgo::GetInputKernelIdxByGraphIdx(kernel, i);
size_t input_size = 0;
-if (index >= input_size_list.size()) {
-MS_LOG(INFO) << "Node: " << kernel->fullname_with_scope() << " input idx: " << index
+if (i >= input_size_list.size()) {
+MS_LOG(INFO) << "Node: " << kernel->fullname_with_scope() << " input idx: " << i
<< " greater than the size of input_size_list: " << input_size_list.size()
<< ", so use 0 as parameter size.";
} else {
-input_size = input_size_list.at(index);
+input_size = input_size_list.at(i);
}
auto parameter =
GetSomasParameter(prenode_index.first, prenode_index.second, input_size, kernel->fullname_with_scope());
@@ -1018,6 +1017,7 @@ void Somas::SummaryInputProcess(const session::KernelGraph &graph) {
#endif
void Somas::GraphOutputProcess(const session::KernelGraph &graph) {
+bool need_reuse_graph_output = NeedReuseGraphOutput();
size_t count = 0;
auto outputs = common::AnfAlgo::GetAllOutputWithIndex(graph.output());
for (auto &output : outputs) {
@@ -1045,8 +1045,12 @@ void Somas::GraphOutputProcess(const session::KernelGraph &graph) {
MS_EXCEPTION_IF_NULL(node);
if (output_index <= node->output_tensors_.size()) {
auto &tensor = node->output_tensors_[output_index];
-tensor->aligned_size_ = 0;
-tensor->type_ = kGraphOutput;
+if (need_reuse_graph_output) {
+tensor->lifelong_value_ = kLifeLongGraphEnd;
+} else {
+tensor->aligned_size_ = 0;
+tensor->type_ = kGraphOutput;
+}
count++;
} else {
MS_LOG(INTERNAL_EXCEPTION) << "Graph's output node " << output_kernel->fullname_with_scope()
@@ -1707,6 +1711,7 @@ void Somas::UpdateUnionTensorsOffset() {
}
}
namespace {
// Disjoint-set
size_t find_father(std::vector<size_t> *father, size_t x) {
MS_EXCEPTION_IF_NULL(father);
@@ -1720,15 +1725,13 @@ size_t find_father(std::vector<size_t> *father, size_t x) {
return (*father)[x];
}
-void Somas::UpdateUnionTensorsConflict() {
-// Keep all constraints for first tensor in list
-MS_EXCEPTION_IF_NULL(tensors_list_.back());
-size_t cnt = tensors_list_.back()->GetId() + 1;
+std::vector<vector<size_t>> GetRegularUnionTensorsList(size_t cnt,
+const std::vector<vector<size_t>> &union_tensors_list) {
std::vector<size_t> father;
for (size_t i = 0; i < cnt; i++) {
father.push_back(i);
}
-for (auto union_node_list : union_tensors_list_) {
+for (auto union_node_list : union_tensors_list) {
if (union_node_list.empty()) {
MS_LOG(INTERNAL_EXCEPTION) << "union node list is empty.";
}
@@ -1740,19 +1743,32 @@ void Somas::UpdateUnionTensorsConflict() {
}
std::map<size_t, size_t> kv;
-std::vector<vector<size_t>> tmp_union;
-for (const auto &union_node_list : union_tensors_list_) {
+std::vector<vector<size_t>> ret_union_list;
+std::vector<std::set<size_t>> union_tensor_sets;
+for (const auto &union_node_list : union_tensors_list) {
for (size_t tid : union_node_list) {
size_t fa = find_father(&father, tid);
if (kv.find(fa) == kv.end()) {
-tmp_union.emplace_back();
-kv.emplace(fa, tmp_union.size() - 1);
+ret_union_list.emplace_back();
+union_tensor_sets.emplace_back();
+kv.emplace(fa, ret_union_list.size() - 1);
}
+auto &union_tensor_set = union_tensor_sets[kv.at(fa)];
+if (union_tensor_set.find(tid) == union_tensor_set.end()) {
+ret_union_list[kv.at(fa)].push_back(tid);
+union_tensor_set.insert(tid);
+}
-tmp_union[kv.at(fa)].push_back(tid);
}
}
+return ret_union_list;
+}
} // namespace
-union_tensors_list_ = tmp_union;
+void Somas::UpdateUnionTensorsConflict() {
+// Keep all constraints for first tensor in list
+MS_EXCEPTION_IF_NULL(tensors_list_.back());
+size_t cnt = tensors_list_.back()->GetId() + 1;
+union_tensors_list_ = GetRegularUnionTensorsList(cnt, union_tensors_list_);
for (auto union_node_list : union_tensors_list_) {
size_t tid_0 = union_node_list[0];

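Note: the refactored helper is a textbook disjoint-set pass: overlapping union lists are merged under a common root, and each merged list is emitted once with duplicate tensor ids dropped (the new union_tensor_sets guard). A self-contained illustration of the same idea, not the MindSpore code itself:

#include <cstddef>
#include <map>
#include <numeric>
#include <set>
#include <vector>

// Merge overlapping groups with a disjoint set, then emit every merged group
// exactly once with duplicate ids dropped, i.e. the same normalization
// GetRegularUnionTensorsList applies to union_tensors_list_.
std::vector<std::vector<size_t>> MergeGroups(size_t n, const std::vector<std::vector<size_t>> &groups) {
  std::vector<size_t> father(n);
  std::iota(father.begin(), father.end(), 0);
  auto find = [&father](size_t x) {
    while (father[x] != x) {
      father[x] = father[father[x]];  // path halving
      x = father[x];
    }
    return x;
  };
  for (const auto &g : groups) {
    if (g.empty()) continue;
    for (size_t id : g) {
      father[find(id)] = find(g[0]);  // union everything under the first element's root
    }
  }
  std::map<size_t, size_t> root_to_slot;
  std::vector<std::vector<size_t>> merged;
  std::vector<std::set<size_t>> seen;
  for (const auto &g : groups) {
    for (size_t id : g) {
      size_t root = find(id);
      if (root_to_slot.find(root) == root_to_slot.end()) {
        root_to_slot[root] = merged.size();
        merged.emplace_back();
        seen.emplace_back();
      }
      size_t slot = root_to_slot[root];
      if (seen[slot].insert(id).second) {
        merged[slot].push_back(id);  // keep first occurrence only
      }
    }
  }
  return merged;  // MergeGroups(6, {{0, 1}, {1, 2}, {4, 5}}) -> {{0, 1, 2}, {4, 5}}
}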
View File

@@ -106,6 +106,7 @@ class BACKEND_EXPORT Somas {
virtual bool DevSpecNodeProcess(const session::KernelGraph &graph) = 0;
virtual void CommunicationTensorProcess(const std::vector<SomasTensorPtr> &tensors) const;
virtual bool NeedContiguous(const std::vector<size_t> &inputs) const = 0;
virtual bool NeedReuseGraphOutput() const { return false; }
// end
// SOMAS Configuration

View File

@@ -161,7 +161,7 @@ Status SomasSolverPre::Solving(const session::KernelGraph &graph, TensorsDescMap
}
}
}
-common::ThreadPool::GetInstance().SyncRun(tasks);
+(void)common::ThreadPool::GetInstance().SyncRun(tasks);
BestInfo best_info;
FindBest(total_sol, solvers, &best_info);
auto end = std::chrono::system_clock::now();

View File

@@ -16,6 +16,7 @@
#include "include/common/thread_pool.h"
#include <exception>
#include "thread/threadlog.h"
#include "utils/log_adapter.h"
#include "utils/ms_exception.h"
@@ -36,11 +37,12 @@ void ThreadPool::SyncRunLoop(const std::shared_ptr<ThreadContext> &context) {
}
if (!context->task) {
+MS_EXCEPTION_IF_NULL(context->cond_var);
++yield_count;
if (yield_count > kYieldThreshold) {
yield_count = 0;
std::unique_lock<std::mutex> lock(context->mutex);
-context->cond_var.wait(lock, [&context, this] { return context->task != nullptr || exit_run_; });
+context->cond_var->wait(lock, [&context, this] { return context->task != nullptr || exit_run_; });
} else {
std::this_thread::yield();
continue;
@@ -82,7 +84,7 @@ bool ThreadPool::SyncRun(const std::vector<Task> &tasks) {
contexts_.resize(new_thread_num);
for (size_t i = thread_num; i < new_thread_num; ++i) {
contexts_[i] = std::make_shared<ThreadContext>();
-sync_run_threads_.emplace_back(std::thread(&ThreadPool::SyncRunLoop, this, contexts_[i]));
+sync_run_threads_.emplace_back(std::make_unique<std::thread>(&ThreadPool::SyncRunLoop, this, contexts_[i]));
}
}
if (contexts_.empty()) {
@@ -98,13 +100,14 @@ bool ThreadPool::SyncRun(const std::vector<Task> &tasks) {
running = false;
for (size_t i = 0; i < used_thread_num; ++i) {
MS_EXCEPTION_IF_NULL(contexts_[i]);
+MS_EXCEPTION_IF_NULL(contexts_[i]->cond_var);
auto &task_run = contexts_[i]->task;
if (task_run) {
running = true;
} else if (task_index < task_num) {
std::lock_guard<std::mutex> task_lock(contexts_[i]->mutex);
contexts_[i]->task = &(tasks[task_index]);
-contexts_[i]->cond_var.notify_one();
+contexts_[i]->cond_var->notify_one();
running = true;
++task_index;
}
@@ -129,11 +132,30 @@ void ThreadPool::ClearThreadPool() {
exit_run_ = true;
for (auto &context : contexts_) {
MS_EXCEPTION_IF_NULL(context);
-context->cond_var.notify_one();
+context->cond_var->notify_one();
}
for (auto &it : sync_run_threads_) {
-if (it.joinable()) {
-it.join();
+MS_EXCEPTION_IF_NULL(it);
+if (it->joinable()) {
+it->join();
}
}
sync_run_threads_.clear();
}
void ThreadPool::ChildAfterFork() {
THREAD_INFO("common thread pool clear thread after fork in child process");
for (auto &context : contexts_) {
MS_EXCEPTION_IF_NULL(context);
if (context->cond_var != nullptr) {
(void)context->cond_var.release();
context->task = nullptr;
}
}
contexts_.clear();
for (auto &it : sync_run_threads_) {
if (it != nullptr) {
(void)it.release();
}
}
sync_run_threads_.clear();

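Note: ChildAfterFork exists because after fork() only the calling thread survives in the child; the pool's worker threads and their condition variables belong to the parent, so the child must drop (leak) them rather than join or destroy them, hence the unique_ptr members and the release() calls above. A plausible wiring, assuming a hypothetical registration point at startup (pthread_atfork is POSIX, not part of this commit):

#include <pthread.h>
#include "include/common/thread_pool.h"

// Hypothetical: register once in the parent so the child process discards the
// stale pool state instead of touching threads it never owned.
void RegisterThreadPoolForkHandler() {
  (void)pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr,
                       /*child=*/[] { mindspore::common::ThreadPool::GetInstance().ChildAfterFork(); });
}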
View File

@@ -64,20 +64,21 @@ struct SomasInfo {
std::map<size_t, void *> merged_base_addresses_;
// Used to keep the graph output address when somas block memory free.
-void InsertGraphOutputInfo(device::DeviceAddress *graph_output_device_address, size_t graph_output_address_size) {
+void InsertGraphOutputInfo(device::DeviceAddress *graph_output_device_address, size_t graph_output_address_offset,
+size_t graph_output_address_size) {
// Not insert the repeat size.
-if (graph_output_address_sizes_set_.count(graph_output_address_size) > 0) {
+if (graph_output_address_offsets_set_.count(graph_output_address_offset) > 0) {
MS_LOG(INFO) << "The graph:" << graph_id_
-<< " output somas device is same for size: " << graph_output_address_size;
+<< " output somas device is same for offset: " << graph_output_address_offset;
return;
}
(void)graph_output_device_addresses_.emplace_back(graph_output_device_address);
(void)graph_output_address_sizes_.emplace_back(graph_output_address_size);
-(void)graph_output_address_sizes_set_.insert(graph_output_address_size);
+(void)graph_output_address_offsets_set_.insert(graph_output_address_offset);
}
std::vector<device::DeviceAddress *> graph_output_device_addresses_;
std::vector<size_t> graph_output_address_sizes_;
-std::set<size_t> graph_output_address_sizes_set_;
+std::set<size_t> graph_output_address_offsets_set_;
// The owner graph id.
uint32_t graph_id_{0};

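Note: keying the dedup set on the somas offset rather than the size fixes an aliasing hazard: distinct graph outputs often share a size, but only true aliases share an offset. A sketch with hypothetical addresses and values:

SomasInfo info;
device::DeviceAddress *addr_a = nullptr;  // placeholder: first output's device address
device::DeviceAddress *addr_b = nullptr;  // placeholder: second output's device address
// Same 512-byte size but different somas offsets: both are recorded now,
// whereas the old size-keyed set would wrongly skip the second one.
info.InsertGraphOutputInfo(addr_a, /*graph_output_address_offset=*/0, /*graph_output_address_size=*/512);
info.InsertGraphOutputInfo(addr_b, /*graph_output_address_offset=*/512, /*graph_output_address_size=*/512);
// A repeated offset is a genuine alias and is still skipped.
info.InsertGraphOutputInfo(addr_a, /*graph_output_address_offset=*/0, /*graph_output_address_size=*/512);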
View File

@@ -38,8 +38,13 @@ using Task = std::function<Status()>;
struct ThreadContext {
std::mutex mutex;
-std::condition_variable cond_var;
+std::unique_ptr<std::condition_variable> cond_var{nullptr};
const Task *task{nullptr};
+ThreadContext() {
+cond_var = std::make_unique<std::condition_variable>();
+MS_EXCEPTION_IF_NULL(cond_var);
+}
};
class COMMON_EXPORT ThreadPool {
@@ -51,6 +56,7 @@ class COMMON_EXPORT ThreadPool {
bool SyncRun(const std::vector<Task> &tasks);
size_t GetSyncRunThreadNum() const { return max_thread_num_; }
void ClearThreadPool();
void ChildAfterFork();
private:
ThreadPool();
@@ -59,7 +65,7 @@ class COMMON_EXPORT ThreadPool {
size_t max_thread_num_{1};
std::mutex pool_mtx_;
std::atomic_bool exit_run_ = {false};
-std::vector<std::thread> sync_run_threads_{};
+std::vector<std::unique_ptr<std::thread>> sync_run_threads_{};
std::vector<std::shared_ptr<ThreadContext>> contexts_;
};
} // namespace common

View File

@@ -46,7 +46,6 @@ file(GLOB_RECURSE UNUSED_SRC_IN_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"./dump/kernel_dumper.cc"
"./dump/data_dumper.cc"
"./dump/dumper_base.cc"
-"./ascend_stream_assign.cc"
"./ascend_host_queue.cc"
)
list(REMOVE_ITEM MS_DEVICE_910B ${UNUSED_SRC_IN_910B})

View File

@@ -68,7 +68,7 @@ bool NoAdditionalMemory() {
MS_EXCEPTION_IF_NULL(context);
const auto is_cell_reuse = context->CellReuseLevel() != CellReuseLevel::kNoCellReuse;
const auto is_multi_graph_sink = context->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK);
-return is_cell_reuse || is_multi_graph_sink;
+return (is_cell_reuse || is_multi_graph_sink) && !context->IsKByKExecutorMode();
}
} // namespace

View File

@@ -133,6 +133,16 @@ void AscendStreamMng::CreateStreamWithFlags(size_t *stream_id, uint32_t flags, i
AscendGmemAdapter::GetInstance().AddCallbackThread(stream);
}
aclrtEvent AscendStreamMng::ApplyRtEvent() {
aclrtEvent rt_event = nullptr;
auto ret = aclrtCreateEvent(&rt_event);
if (ret != ACL_ERROR_NONE) {
MS_LOG(EXCEPTION) << "aclrtCreateEvent failed, ret:" << ret;
}
(void)events_.emplace_back(rt_event);
return rt_event;
}
bool AscendStreamMng::DestroyStream(size_t stream_id) {
std::lock_guard<std::mutex> lock_streams(stream_mutex_);
if (stream_id >= streams_.size()) {

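Note: ApplyRtEvent creates and owns an aclrtEvent; the new AclStreamAssign pass (below) stores such an event on each Send/Recv pair via kAttrRecordEvent/kAttrWaitEvent. At launch time the pairing reduces to the standard ACL record/wait idiom; a sketch with placeholder streams (assumed launch-side behavior, not code from this commit):

#include "acl/acl_rt.h"

aclrtEvent event = AscendStreamMng::GetInstance().ApplyRtEvent();
// The producer stream marks the point after its last write ...
aclError ret = aclrtRecordEvent(event, producer_stream);
if (ret == ACL_ERROR_NONE) {
  // ... and the consumer stream blocks on it, with no host synchronization.
  ret = aclrtStreamWaitEvent(consumer_stream, event);
}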
View File

@@ -25,6 +25,7 @@
#ifndef ENABLE_SECURITY
#include "include/backend/debug/data_dump/dump_json_parser.h"
#include "include/backend/optimizer/helper.h"
#include "mindspore/core/ops/framework_ops.h"
#include "plugin/device/ascend/hal/device/ascend_kernel_task.h"
#include "plugin/device/ascend/kernel/opapi/aclnn_kernel_build.h"
#include "plugin/device/ascend/kernel/acl/acl_kernel_build.h"
@@ -117,121 +118,6 @@ TypeId GetInputDeviceType(const AnfNodePtr &kernel_node, size_t input_idx) {
return type;
}
bool IsEmptyTupleInput(const CNodePtr &kernel, const size_t i, const TypeId cur_type_id) {
auto input_node = common::AnfAlgo::GetPrevNodeOutput(kernel, i).first;
if (input_node->isa<ValueNode>()) {
auto value_node = input_node->cast<ValueNodePtr>();
if (cur_type_id == kTypeUnknown && value_node->value() != nullptr && value_node->value()->isa<ValueTuple>() &&
value_node->value()->cast<ValueTuplePtr>()->size() == 0) {
MS_LOG(DEBUG) << "Set int64 type for empty value tuple node:" << value_node->DebugString();
return true;
}
}
return false;
}
void GenerateKernelBuildInfo(const CNodePtr &kernel, const KernelType &kernel_type) {
MS_EXCEPTION_IF_NULL(kernel);
std::vector<std::string> input_formats;
std::vector<std::string> output_formats;
std::vector<std::string> input_reshape_types;
std::vector<std::string> output_reshape_types;
auto input_num = common::AnfAlgo::GetInputTensorNum(kernel);
auto output_num = AnfUtils::GetOutputTensorNum(kernel);
auto output_object_type = kernel::KernelObjectType::TENSOR;
if (kernel_type == ACL_KERNEL) {
transform::AclHelper::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
&output_reshape_types);
// NOTE: acl default output objecttype is tensor, here are 2 special case:
// case 1: when cnode output is tuple, and ge ops prototype output num is 1, the real output objecttype is tuple;
// case 2: when cnode output is scalar, the real output objecttype is scalar
// others: output objecttype is tensor
std::string name = GetCNodeFuncName(kernel);
const auto &info = transform::GeAdapterManager::GetInstance().GetInfo(name, true);
auto adapter_output_num = info->GetNumStaticOutputsOfMsOpProto();
auto cnode_output_object_type =
kernel::TypeIdToKernelObjectType(AnfAlgo::GetAbstractObjectType(kernel->abstract()));
if (adapter_output_num == 1 && cnode_output_object_type == kernel::KernelObjectType::TUPLE) {
MS_LOG(INFO) << "acl node " << kernel->fullname_with_scope() << " output is real tuple";
output_object_type = cnode_output_object_type;
output_num = 1;
auto output_format = output_formats[kFirstItem];
auto output_reshape_type = output_reshape_types[kFirstItem];
output_formats.clear();
output_reshape_types.clear();
output_formats.push_back(output_format);
output_reshape_types.push_back(output_reshape_type);
} else if (cnode_output_object_type == kernel::KernelObjectType::SCALAR) {
output_object_type = cnode_output_object_type;
}
} else if (kernel_type == OPAPI_KERNEL) {
transform::OpApiUtil::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
&output_reshape_types);
} else {
auto cand_format = GetValidFormat(input_num, output_num);
if (cand_format.empty()) {
MS_LOG(EXCEPTION) << "The kernel: " << kernel->fullname_with_scope()
<< " does not have a supported dynamic shape format on the Ascend platform.";
}
input_formats = cand_format.at(kFirstItem).first;
output_formats = cand_format.at(kFirstItem).second;
input_reshape_types.assign(input_num, "");
output_reshape_types.assign(output_num, "");
for (size_t i = 0; i < common::AnfAlgo::GetInputTensorNum(kernel); i++) {
auto input_format = AnfAlgo::GetPrevNodeOutputFormat(kernel, i);
if ((!transform::AclHelper::CheckDefaultSupportFormat(input_format)) && (kernel_type != HCCL_KERNEL)) {
MS_LOG(EXCEPTION) << "Aicpu kernel input not support this format: " << input_format
<< ", kernel: " << kernel->fullname_with_scope() << ", input idx: " << i;
}
}
}
std::vector<TypeId> input_types;
input_types.reserve(input_num);
std::vector<TypeId> output_types;
output_types.reserve(output_num);
std::vector<kernel::KernelObjectType> output_object_types;
output_object_types.reserve(output_num);
auto input_object_types = kernel::TypeIdToKernelObjectType(AnfAlgo::GetAllInputObjectType(kernel));
for (size_t i = 0; i < input_num; i++) {
auto cur_input_type = GetInputDeviceType(kernel, i);
if (IsEmptyTupleInput(kernel, i, cur_input_type)) {
cur_input_type = TypeId::kNumberTypeInt64;
}
(void)input_types.push_back(cur_input_type);
}
for (size_t i = 0; i < output_num; i++) {
(void)output_types.push_back(common::AnfAlgo::GetOutputInferDataType(kernel, i));
// no tuple in PyNative dynamic shape
(void)output_object_types.push_back(output_object_type);
}
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
MS_EXCEPTION_IF_NULL(builder);
builder->SetKernelType(kernel_type);
builder->SetInputsFormat(input_formats);
builder->SetInputsDeviceType(input_types);
builder->SetInputsKernelObjectType(input_object_types);
builder->SetOutputsFormat(output_formats);
builder->SetOutputsDeviceType(output_types);
builder->SetOutputsKernelObjectType(output_object_types);
builder->SetInputsReshapeType(input_reshape_types);
builder->SetOutputsReshapeType(output_reshape_types);
if (input_formats.size() != input_types.size() || input_formats.size() != input_object_types.size()) {
MS_LOG(EXCEPTION) << "The input buildInfo size kernel: " << kernel->fullname_with_scope()
<< "is not equal, input_formats size: " << input_formats.size()
<< ", input_types size: " << input_types.size()
<< ", input_object_types size: " << input_object_types.size();
}
if (output_formats.size() != output_types.size() || output_formats.size() != output_object_types.size()) {
MS_LOG(EXCEPTION) << "The output buildInfo size kernel: " << kernel->fullname_with_scope()
<< "is not equal, output_formats size: " << output_formats.size()
<< ", output_types size: " << output_types.size()
<< ", output_object_types size: " << output_object_types.size();
}
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel.get());
}
void SetWeightFormat(const AnfNodePtr &real_input_node, std::vector<string> output_format, const CNodePtr &kernel_node,
size_t input_index, bool force_fresh = false) {
MS_EXCEPTION_IF_NULL(real_input_node);
@@ -492,8 +378,123 @@ bool SetMatchKernelInfo(const CNodePtr &kernel_node, const std::vector<kernel::K
}
return find_valid;
}
bool IsEmptyTupleInput(const CNodePtr &kernel, const size_t i, const TypeId cur_type_id) {
auto input_node = common::AnfAlgo::GetPrevNodeOutput(kernel, i).first;
if (input_node->isa<ValueNode>()) {
auto value_node = input_node->cast<ValueNodePtr>();
if (cur_type_id == kTypeUnknown && value_node->value() != nullptr && value_node->value()->isa<ValueTuple>() &&
value_node->value()->cast<ValueTuplePtr>()->size() == 0) {
MS_LOG(DEBUG) << "Set int64 type for empty value tuple node:" << value_node->DebugString();
return true;
}
}
return false;
}
} // namespace
void GenerateKernelBuildInfo(const CNodePtr &kernel, const KernelType &kernel_type) {
MS_EXCEPTION_IF_NULL(kernel);
std::vector<std::string> input_formats;
std::vector<std::string> output_formats;
std::vector<std::string> input_reshape_types;
std::vector<std::string> output_reshape_types;
auto input_num = common::AnfAlgo::GetInputTensorNum(kernel);
auto output_num = AnfUtils::GetOutputTensorNum(kernel);
auto output_object_type = kernel::KernelObjectType::TENSOR;
if (kernel_type == ACL_KERNEL) {
transform::AclHelper::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
&output_reshape_types);
// NOTE: acl default output objecttype is tensor, here are 2 special case:
// case 1: when cnode output is tuple, and ge ops prototype output num is 1, the real output objecttype is tuple;
// case 2: when cnode output is scalar, the real output objecttype is scalar
// others: output objecttype is tensor
std::string name = GetCNodeFuncName(kernel);
const auto &info = transform::GeAdapterManager::GetInstance().GetInfo(name, true);
auto adapter_output_num = info->GetNumStaticOutputsOfMsOpProto();
auto cnode_output_object_type =
kernel::TypeIdToKernelObjectType(AnfAlgo::GetAbstractObjectType(kernel->abstract()));
if (adapter_output_num == 1 && cnode_output_object_type == kernel::KernelObjectType::TUPLE) {
MS_LOG(INFO) << "acl node " << kernel->fullname_with_scope() << " output is real tuple";
output_object_type = cnode_output_object_type;
output_num = 1;
auto output_format = output_formats[kFirstItem];
auto output_reshape_type = output_reshape_types[kFirstItem];
output_formats.clear();
output_reshape_types.clear();
output_formats.push_back(output_format);
output_reshape_types.push_back(output_reshape_type);
} else if (cnode_output_object_type == kernel::KernelObjectType::SCALAR) {
output_object_type = cnode_output_object_type;
}
} else if (kernel_type == OPAPI_KERNEL) {
transform::OpApiUtil::GetValidKernelBuildInfo(kernel, &input_formats, &output_formats, &input_reshape_types,
&output_reshape_types);
} else {
auto cand_format = GetValidFormat(input_num, output_num);
if (cand_format.empty()) {
MS_LOG(EXCEPTION) << "The kernel: " << kernel->fullname_with_scope()
<< " does not have a supported dynamic shape format on the Ascend platform.";
}
input_formats = cand_format.at(kFirstItem).first;
output_formats = cand_format.at(kFirstItem).second;
input_reshape_types.assign(input_num, "");
output_reshape_types.assign(output_num, "");
for (size_t i = 0; i < common::AnfAlgo::GetInputTensorNum(kernel); i++) {
auto input_format = AnfAlgo::GetPrevNodeOutputFormat(kernel, i);
if ((!transform::AclHelper::CheckDefaultSupportFormat(input_format)) && (kernel_type != HCCL_KERNEL)) {
MS_LOG(EXCEPTION) << "Aicpu kernel input not support this format: " << input_format
<< ", kernel: " << kernel->fullname_with_scope() << ", input idx: " << i;
}
}
}
std::vector<TypeId> input_types;
input_types.reserve(input_num);
std::vector<TypeId> output_types;
output_types.reserve(output_num);
std::vector<kernel::KernelObjectType> output_object_types;
output_object_types.reserve(output_num);
auto input_object_types = kernel::TypeIdToKernelObjectType(AnfAlgo::GetAllInputObjectType(kernel));
for (size_t i = 0; i < input_num; i++) {
auto cur_input_type = GetInputDeviceType(kernel, i);
if (IsEmptyTupleInput(kernel, i, cur_input_type)) {
cur_input_type = TypeId::kNumberTypeInt64;
}
(void)input_types.push_back(cur_input_type);
}
for (size_t i = 0; i < output_num; i++) {
(void)output_types.push_back(common::AnfAlgo::GetOutputInferDataType(kernel, i));
// no tuple in PyNative dynamic shape
(void)output_object_types.push_back(output_object_type);
}
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
MS_EXCEPTION_IF_NULL(builder);
builder->SetKernelType(kernel_type);
builder->SetInputsFormat(input_formats);
builder->SetInputsDeviceType(input_types);
builder->SetInputsKernelObjectType(input_object_types);
builder->SetOutputsFormat(output_formats);
builder->SetOutputsDeviceType(output_types);
builder->SetOutputsKernelObjectType(output_object_types);
builder->SetInputsReshapeType(input_reshape_types);
builder->SetOutputsReshapeType(output_reshape_types);
if (input_formats.size() != input_types.size() || input_formats.size() != input_object_types.size()) {
MS_LOG(EXCEPTION) << "The input buildInfo size kernel: " << kernel->fullname_with_scope()
<< "is not equal, input_formats size: " << input_formats.size()
<< ", input_types size: " << input_types.size()
<< ", input_object_types size: " << input_object_types.size();
}
if (output_formats.size() != output_types.size() || output_formats.size() != output_object_types.size()) {
MS_LOG(EXCEPTION) << "The output buildInfo size kernel: " << kernel->fullname_with_scope()
<< "is not equal, output_formats size: " << output_formats.size()
<< ", output_types size: " << output_types.size()
<< ", output_object_types size: " << output_object_types.size();
}
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel.get());
}
void HandleKernelSelectFailure(const KernelGraphPtr &graph, const CNodePtr &node,
const std::pair<std::string, ExceptionType> &failure_info) {
auto msg = TryBackoffCpu(graph, node, failure_info);
@@ -522,6 +523,12 @@ std::tuple<bool, std::string, ExceptionType> SelectKernelInfoWithMsg(const CNode
<< " is enabled dispatch in yaml, but not registered an aclnn kernelmod.";
}
// for backend inline
if (IsPrimitiveCNode(node, prim::kPrimCallInline)) {
GenerateKernelBuildInfo(node, KernelType::UNKNOWN_KERNEL_TYPE);
return result;
}
auto kernel_type = transform::AclHelper::GetKernelInfoFromGe(node, &acl_err_type);
if (kernel_type == KernelType::ACL_KERNEL) {
GenerateKernelBuildInfo(node, kernel_type);

View File

@@ -30,6 +30,7 @@ void HandleKernelSelectFailure(const KernelGraphPtr &graph, const CNodePtr &node
const std::pair<std::string, ExceptionType> &failure_info);
std::tuple<bool, std::string, ExceptionType> SelectKernelInfoWithMsg(const CNodePtr &node, bool enable_aclnn);
void SetKernelInfoBeforeCreateKernel(const std::vector<CNodePtr> &nodes);
void GenerateKernelBuildInfo(const CNodePtr &kernel, const KernelType &kernel_type);
} // namespace ascend
} // namespace device
} // namespace mindspore

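Note: GenerateKernelBuildInfo moves out of the anonymous namespace into this header so the backend-inline path can reuse it. A kPrimCallInline node never launches a kernel of its own, so it is called with UNKNOWN_KERNEL_TYPE purely to populate formats and dtypes, as in the selection code above (cnode here is a hypothetical CallInline node):

if (IsPrimitiveCNode(cnode, prim::kPrimCallInline)) {
  // Build formats/dtypes only; the sub graph is inlined later instead of launched.
  device::ascend::GenerateKernelBuildInfo(cnode, KernelType::UNKNOWN_KERNEL_TYPE);
}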
View File

@@ -19,6 +19,8 @@ file(GLOB_RECURSE MS_HARDWARE_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"ge_kernel_executor.cc"
"ge_utils.cc"
"ge_graph_optimization.cc"
+"acl_somas.cc"
+"acl_stream_assign.cc"
)
set_property(SOURCE ${MS_HARDWARE_910B} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)

View File

@@ -0,0 +1,120 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/ascend/hal/hardware/acl_somas.h"
#include <string>
#include <map>
#include <utility>
#include <vector>
#include "include/backend/optimizer/helper.h"
#include "utils/ms_context.h"
#include "plugin/device/ascend/hal/device/ascend_stream_assign.h"
#include "ops/framework_op_name.h"
namespace mindspore {
namespace device {
namespace ascend {
using TensorType = somas::TensorType;
using LifeLongType = somas::LifeLongType;
constexpr size_t ALONE = 1;
bool AclSomas::Initialize() { return true; }
std::string AclSomas::GetDeviceName() const { return "Ascend"; }
size_t AclSomas::GetAlignSize(size_t original_size) const {
constexpr size_t alignment = 512;
constexpr size_t alignment_complement = 31;
size_t aligned_size =
(original_size > 0) ? ((original_size + alignment + alignment_complement) / alignment) * alignment : 0;
return aligned_size;
}
bool AclSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto opt_level = ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL);
return opt_level != kOptimizeO0;
}
bool AclSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
InitEventInfo(graph);
return true;
}
void AclSomas::InitEventInfo(const session::KernelGraph &graph) {
MS_LOG(DEBUG) << "Acl Somas InitEventInfo start.";
event_map_ = {};
auto &kernels = graph.execution_order();
for (const auto &kernel : kernels) {
auto type = common::AnfAlgo::GetCNodeName(kernel);
if (type == kStreamSendOpName) {
auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
auto iter = event_map_.find(event);
if (iter == event_map_.end()) {
auto pair = somas::EventPair();
pair.send_ = kernel;
event_map_[event] = pair;
} else {
iter->second.send_ = kernel;
}
} else if (type == kStreamRecvOpName) {
auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
auto iter = event_map_.find(event);
if (iter == event_map_.end()) {
auto pair = somas::EventPair();
pair.recv_ = kernel;
event_map_[event] = pair;
} else {
iter->second.recv_ = kernel;
}
}
}
for (auto &event : event_map_) {
auto send_iter = nodes_map_.find(event.second.send_.get());
auto recv_iter = nodes_map_.find(event.second.recv_.get());
if (send_iter == nodes_map_.end()) {
MS_LOG(WARNING) << "Can't find Acl somas node for " << event.second.send_->fullname_with_scope();
continue;
}
if (recv_iter == nodes_map_.end()) {
MS_LOG(WARNING) << "Can't find Acl somas node for " << event.second.recv_->fullname_with_scope();
continue;
}
AddControlTensor(send_iter->second.at(0), recv_iter->second.at(0));
}
MS_LOG(DEBUG) << "Acl Somas InitEventInfo end.";
}
bool AclSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return true; }
void AclSomas::CommunicationTensorProcess(const std::vector<somas::SomasTensorPtr> &tensors) const {
if (tensors.size() != ALONE) {
for (auto &tensor : tensors) {
MS_EXCEPTION_IF_NULL(tensor);
MS_EXCEPTION_IF_CHECK_FAIL(tensor->aligned_size_ != 0, "The size of communication tensor is zero, tensor id: " +
std::to_string(tensor->GetId()));
}
}
}
bool AclSomas::NeedContiguous(const std::vector<size_t> &inputs) const { return inputs.size() > ALONE; }
bool AclSomas::NeedReuseGraphOutput() const { return true; }
} // namespace ascend
} // namespace device
} // namespace mindspore

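Note: a quick sanity check of GetAlignSize's arithmetic as a standalone snippet (not part of the source). The formula pads by one full 512-byte block plus a 31-byte guard before rounding down, so even an exact multiple of 512 grows by a block:

#include <cstddef>

constexpr size_t AlignUp512(size_t n) {  // mirrors AclSomas::GetAlignSize
  return n > 0 ? ((n + 512 + 31) / 512) * 512 : 0;
}
static_assert(AlignUp512(0) == 0, "zero-sized tensors stay zero");
static_assert(AlignUp512(1) == 512, "1 + 543 = 544 -> one block");
static_assert(AlignUp512(512) == 1024, "512 + 543 = 1055 -> two blocks");
static_assert(AlignUp512(1000) == 1536, "1000 + 543 = 1543 -> three blocks");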
View File

@@ -0,0 +1,54 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_SOMAS_H_
#include <vector>
#include <string>
#include <map>
#include <utility>
#include <memory>
#include "backend/common/somas/somas.h"
#include "include/backend/device_type.h"
namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = session::KernelGraph;
using UnReuseType = somas::UnReuseType;
class AclSomas : public somas::Somas {
private:
bool Initialize() override;
string GetDeviceName() const override;
void CommunicationTensorProcess(const std::vector<somas::SomasTensorPtr> &tensors) const override;
bool NeedContiguous(const std::vector<size_t> &inputs) const override;
bool NeedReuseGraphOutput() const override;
size_t GetAlignSize(size_t original_size) const override;
bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
void InitEventInfo(const session::KernelGraph &graph);
std::map<uint32_t, somas::EventPair> event_map_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_SOMAS_H_

View File

@@ -0,0 +1,322 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/ascend/hal/hardware/acl_stream_assign.h"
#include <algorithm>
#include <unordered_set>
#include <utility>
#include "include/backend/anf_runtime_algorithm.h"
#include "include/backend/optimizer/helper.h"
#include "include/common/utils/anfalgo.h"
#include "include/common/utils/parallel_context.h"
#include "include/common/utils/utils.h"
#include "ops/ascend_op_name.h"
#include "ops/framework_op_name.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
namespace mindspore {
namespace device {
namespace ascend {
void AclStreamAssign::AssignStream(const NotNull<KernelGraphPtr> &kernel_graph) const {
auto kernels = kernel_graph->execution_order();
if (kernels.empty()) {
return;
}
if (kernel_graph->is_from_single_op()) {
MS_LOG(INFO) << "Not stream assign when pynative forward.";
return;
}
for (const auto &node : kernels) {
if (AnfAlgo::IsKernelSelectBackoffOp(node)) {
continue;
}
if (common::AnfAlgo::IsCommunicationOp(node)) {
AnfAlgo::SetStreamId(kWorldGroupStreamIndex, node.get());
common::AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(kWorldGroupStreamIndex), node);
} else {
AnfAlgo::SetStreamId(kDefaultStreamIndex, node.get());
common::AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(kDefaultStreamIndex), node);
}
}
for (size_t i = 1; i < kernels.size(); ++i) {
if (common::AnfAlgo::GetCNodeName(kernels[i - 1]) == kMemSetOpName) {
auto stream_id = AnfAlgo::GetStreamId(kernels[i]);
AnfAlgo::SetStreamId(stream_id, kernels[i - 1].get());
common::AnfAlgo::SetNodeAttr(kAttrStreamId, MakeValue(stream_id), kernels[i - 1]);
}
}
InsertEventForNonTaskSink(kernel_graph);
}
void AclStreamAssign::GenKernelIoExecInfoMap(
const NotNull<KernelGraphPtr> &kernel_graph,
mindspore::HashMap<CNodePtr, NodeIoExecInfoPtr> *kernel_io_exec_info_map) const {
auto &exec_kernels = kernel_graph->execution_order();
for (size_t i = 0; i < exec_kernels.size(); ++i) {
auto &process_kernel = exec_kernels[i];
MS_EXCEPTION_IF_NULL(process_kernel);
auto process_exec_info = std::make_shared<NodeExecInfo>();
MS_EXCEPTION_IF_NULL(process_exec_info);
process_exec_info->node = process_kernel;
process_exec_info->stream_id = AnfAlgo::GetStreamId(process_kernel);
process_exec_info->execution_order_index = i;
auto process_io_exec_info = std::make_shared<NodeIoExecInfo>();
MS_EXCEPTION_IF_NULL(process_io_exec_info);
process_io_exec_info->node_exec_info = process_exec_info;
process_io_exec_info->inputs = {};
process_io_exec_info->outputs = {};
(*kernel_io_exec_info_map)[process_kernel] = process_io_exec_info;
}
for (auto &process_kernel : exec_kernels) {
MS_EXCEPTION_IF_NULL(process_kernel);
auto process_iter = kernel_io_exec_info_map->find(process_kernel);
if (process_iter == kernel_io_exec_info_map->end()) {
MS_LOG(INFO) << "Can't get kernel io execution info for " << process_kernel->fullname_with_scope();
continue;
}
auto process_io_exec_info = process_iter->second;
MS_EXCEPTION_IF_NULL(process_io_exec_info);
auto process_exec_info = process_iter->second->node_exec_info;
MS_EXCEPTION_IF_NULL(process_exec_info);
auto inputs = process_kernel->inputs();
for (size_t i = 1; i < inputs.size(); i++) {
auto input_node = common::AnfAlgo::VisitKernelWithReturnType(inputs[i], 0).first;
MS_EXCEPTION_IF_NULL(input_node);
if (AnfUtils::IsRealCNodeKernel(input_node)) {
auto input_kernel = input_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(input_kernel);
auto iter = kernel_io_exec_info_map->find(input_kernel);
if (iter == kernel_io_exec_info_map->end()) {
MS_LOG(INFO) << "Can't get kernel io execution info for " << process_kernel->fullname_with_scope()
<< "'s input node " << input_kernel->fullname_with_scope();
continue;
}
auto input_io_exec_info = iter->second;
auto input_exec_info = iter->second->node_exec_info;
MS_EXCEPTION_IF_NULL(input_io_exec_info);
process_io_exec_info->inputs.push_back(input_exec_info);
input_io_exec_info->outputs.push_back(process_exec_info);
}
}
}
}
void AclStreamAssign::UpdateEventsToExecutionOrder(
const NotNull<KernelGraphPtr> &kernel_graph,
const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &send_after_node,
const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &recv_before_node) const {
MS_LOG(DEBUG) << "Start UpdateEventsToExecutionOrder...";
auto exec_kernels = kernel_graph->execution_order();
std::vector<CNodePtr> new_exec_orders;
for (auto &kernel : exec_kernels) {
auto before_iter = recv_before_node.find(kernel);
if (before_iter != recv_before_node.end()) {
(void)std::copy(before_iter->second.begin(), before_iter->second.end(), std::back_inserter(new_exec_orders));
}
new_exec_orders.push_back(kernel);
auto after_iter = send_after_node.find(kernel);
if (after_iter != send_after_node.end()) {
(void)std::copy(after_iter->second.begin(), after_iter->second.end(), std::back_inserter(new_exec_orders));
}
}
auto graph_output = kernel_graph->output();
auto graph_output_iter = recv_before_node.find(graph_output);
if (graph_output_iter != recv_before_node.end()) {
(void)std::copy(graph_output_iter->second.begin(), graph_output_iter->second.end(),
std::back_inserter(new_exec_orders));
}
kernel_graph->set_execution_order(new_exec_orders);
MS_LOG(DEBUG) << "Finish UpdateEventsToExecutionOrder.";
}
void AclStreamAssign::InsertEventsForInputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
const NodeIoExecInfoPtr &io_exec_info,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const {
MS_EXCEPTION_IF_NULL(io_exec_info);
auto process_stream_id = AnfAlgo::GetStreamId(kernel);
auto input_exec_info_list = io_exec_info->inputs;
mindspore::HashMap<uint32_t, NodeExecInfoPtr> stream_max_exec_node_map;
for (auto &input : input_exec_info_list) {
MS_EXCEPTION_IF_NULL(input);
auto input_stream_id = input->stream_id;
auto iter = stream_max_exec_node_map.find(input_stream_id);
if (iter == stream_max_exec_node_map.end()) {
stream_max_exec_node_map[input_stream_id] = input;
} else {
MS_EXCEPTION_IF_NULL(iter->second);
if (input->execution_order_index > iter->second->execution_order_index) {
iter->second = input;
}
}
}
for (auto input_exec : stream_max_exec_node_map) {
MS_EXCEPTION_IF_NULL(input_exec.second);
if (input_exec.second->stream_id == process_stream_id) {
continue;
}
InsertEvents(kernel_graph, kernel, input_exec.second->node, kernel_send, kernel_recv, kernel);
}
}
void AclStreamAssign::InsertEventsForOutputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
const NodeIoExecInfoPtr &io_exec_info,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const {
MS_EXCEPTION_IF_NULL(io_exec_info);
auto process_stream_id = AnfAlgo::GetStreamId(kernel);
auto output_exec_info_list = io_exec_info->outputs;
mindspore::HashMap<uint32_t, NodeExecInfoPtr> stream_min_exec_node_map;
for (auto &output : output_exec_info_list) {
MS_EXCEPTION_IF_NULL(output);
auto output_stream_id = output->stream_id;
auto iter = stream_min_exec_node_map.find(output_stream_id);
if (iter == stream_min_exec_node_map.end()) {
stream_min_exec_node_map[output_stream_id] = output;
} else {
MS_EXCEPTION_IF_NULL(iter->second);
if (output->execution_order_index < iter->second->execution_order_index) {
iter->second = output;
}
}
}
for (auto output_exec : stream_min_exec_node_map) {
MS_EXCEPTION_IF_NULL(output_exec.second);
if (output_exec.second->stream_id == process_stream_id) {
continue;
}
InsertEvents(kernel_graph, kernel, kernel, kernel_send, kernel_recv, output_exec.second->node);
}
// parallel op has output tensor, and it didn't connect to other kernel, it's output is graph output, sync it.
if (output_exec_info_list.empty() && (AnfAlgo::GetOutputTensorNum(kernel) != 0)) {
InsertEvents(kernel_graph, kernel, kernel, kernel_send, kernel_recv, kernel_graph->output());
}
}
CNodePtr AclStreamAssign::CreateSendApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id,
uint32_t stream_id) const {
auto send_op = std::make_shared<Primitive>(kStreamSendOpName);
MS_EXCEPTION_IF_NULL(send_op);
auto send_apply = std::make_shared<ValueNode>(send_op);
MS_EXCEPTION_IF_NULL(send_apply);
auto send_node_ptr = graph_ptr->NewCNode({send_apply});
MS_EXCEPTION_IF_NULL(send_node_ptr);
common::AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), send_node_ptr);
AnfAlgo::SetStreamId(stream_id, send_node_ptr.get());
return send_node_ptr;
}
CNodePtr AclStreamAssign::CreateRecvApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id,
uint32_t stream_id) const {
auto recv_op = std::make_shared<Primitive>(kStreamRecvOpName);
MS_EXCEPTION_IF_NULL(recv_op);
auto recv_apply = std::make_shared<ValueNode>(recv_op);
MS_EXCEPTION_IF_NULL(recv_apply);
auto recv_node_ptr = graph_ptr->NewCNode({recv_apply});
MS_EXCEPTION_IF_NULL(recv_node_ptr);
common::AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), recv_node_ptr);
AnfAlgo::SetStreamId(stream_id, recv_node_ptr.get());
return recv_node_ptr;
}
void AclStreamAssign::InsertEvents(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &parallel_cnode,
const AnfNodePtr &node_before_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv,
const AnfNodePtr &node_after_recv) const {
MS_EXCEPTION_IF_NULL(kernel_send);
MS_EXCEPTION_IF_NULL(kernel_recv);
AscendStreamMng &resource_manager = AscendStreamMng::GetInstance();
uint32_t event_id = resource_manager.ApplyNewEvent();
auto event = resource_manager.ApplyRtEvent();
auto send_cnode = CreateSendApplyKernel(kernel_graph, event_id, AnfAlgo::GetStreamId(node_before_send));
common::AnfAlgo::SetNodeAttr(kAttrRecordEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), send_cnode);
auto send_iter = kernel_send->find(node_before_send);
if (send_iter == kernel_send->end()) {
(*kernel_send)[node_before_send] = {send_cnode};
} else {
send_iter->second.push_back(send_cnode);
}
CNodePtr recv_cnode = CreateRecvApplyKernel(kernel_graph, event_id, AnfAlgo::GetStreamId(node_after_recv));
common::AnfAlgo::SetNodeAttr(kAttrWaitEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), recv_cnode);
auto process_iter = kernel_recv->find(node_after_recv);
if (process_iter == kernel_recv->end()) {
(*kernel_recv)[node_after_recv] = {recv_cnode};
} else {
process_iter->second.push_back(recv_cnode);
}
if (parallel_cnode == node_before_send) {
kernel_graph->InsertSendRecvPairForParallelOpOutputs(parallel_cnode, std::make_pair(send_cnode, recv_cnode));
MS_LOG(INFO) << "Generate send/recv for parallel op " << parallel_cnode->fullname_with_scope() << "'s output."
<< "Send node " << send_cnode->fullname_with_scope() << " after "
<< node_before_send->fullname_with_scope() << ", recv node " << recv_cnode->fullname_with_scope()
<< " before " << node_after_recv->fullname_with_scope();
} else {
kernel_graph->InsertSendRecvPairForParallelOpInputs(parallel_cnode, std::make_pair(send_cnode, recv_cnode));
MS_LOG(INFO) << "Generate send/recv for parallel op " << parallel_cnode->fullname_with_scope() << "'s input."
<< "Send node " << send_cnode->fullname_with_scope() << " after "
<< node_before_send->fullname_with_scope() << ", recv node " << recv_cnode->fullname_with_scope()
<< " before " << node_after_recv->fullname_with_scope();
}
}
void AclStreamAssign::GenEventsForParallelOp(const NotNull<KernelGraphPtr> &kernel_graph,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const {
MS_LOG(DEBUG) << "Start GenEventsForParallelOp...";
auto exec_kernels = kernel_graph->execution_order();
mindspore::HashMap<CNodePtr, NodeIoExecInfoPtr> kernel_io_exec_info_map;
GenKernelIoExecInfoMap(kernel_graph, &kernel_io_exec_info_map);
for (auto &process_kernel : exec_kernels) {
if (AnfAlgo::IsKernelSelectBackoffOp(process_kernel)) {
continue;
}
MS_EXCEPTION_IF_NULL(process_kernel);
auto process_stream_id = AnfAlgo::GetStreamId(process_kernel);
if (process_stream_id == kDefaultStreamIndex) {
continue;
}
MS_LOG(DEBUG) << "Start GenEvents For ParallelOp " << process_kernel->fullname_with_scope();
auto process_iter = kernel_io_exec_info_map.find(process_kernel);
if (process_iter == kernel_io_exec_info_map.end()) {
MS_LOG(INFO) << "Can't get node io execution info for " << process_kernel->fullname_with_scope();
continue;
}
auto process_io_exec_info = process_iter->second;
InsertEventsForInputs(kernel_graph, process_kernel, process_io_exec_info, kernel_send, kernel_recv);
InsertEventsForOutputs(kernel_graph, process_kernel, process_io_exec_info, kernel_send, kernel_recv);
}
MS_LOG(DEBUG) << "Finish GenEventsForParallelOp.";
}
void AclStreamAssign::InsertEventForNonTaskSink(const NotNull<KernelGraphPtr> &kernel_graph) const {
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> kernel_send;
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> kernel_recv;
AnfAlgo::SetStreamId(kDefaultStreamIndex, kernel_graph->output().get());
GenEventsForParallelOp(kernel_graph, &kernel_send, &kernel_recv);
UpdateEventsToExecutionOrder(kernel_graph, kernel_send, kernel_recv);
}
} // namespace ascend
} // namespace device
} // namespace mindspore

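Note: everything in this file funnels through AssignStream: communication ops land on the world-group stream, all other kernels on the default stream, and GenEventsForParallelOp then fences every off-default kernel against its producers and consumers. A hedged sketch of the driver call (ge_kernel_executor.cc, which includes this header, is the natural call site):

// Run once after kernel selection, before the graph is executed.
device::ascend::AclStreamAssign::GetInstance().AssignStream(NOT_NULL(kernel_graph));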
View File

@@ -0,0 +1,104 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_STREAM_ASSIGN_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_STREAM_ASSIGN_H_
#include <functional>
#include <unordered_map>
#include <map>
#include <set>
#include <string>
#include <queue>
#include <vector>
#include <memory>
#include <unordered_set>
#include <utility>
#include "include/backend/kernel_graph.h"
#include "include/common/utils/contract.h"
namespace mindspore {
namespace device {
namespace ascend {
struct NodeExecInfo {
CNodePtr node;
uint32_t stream_id;
size_t execution_order_index;
};
using NodeExecInfoPtr = std::shared_ptr<NodeExecInfo>;
struct NodeIoExecInfo {
NodeExecInfoPtr node_exec_info;
std::vector<NodeExecInfoPtr> inputs;
std::vector<NodeExecInfoPtr> outputs;
};
using NodeIoExecInfoPtr = std::shared_ptr<NodeIoExecInfo>;
class AclStreamAssign {
public:
static AclStreamAssign &GetInstance() {
static AclStreamAssign instance; // Guaranteed to be destroyed.
return instance;
}
AclStreamAssign(const AclStreamAssign &) = delete;
AclStreamAssign &operator=(const AclStreamAssign &) = delete;
void AssignStream(const NotNull<KernelGraphPtr> &kernel_graph) const;
private:
AclStreamAssign() = default;
~AclStreamAssign() = default;
void GenKernelIoExecInfoMap(const NotNull<KernelGraphPtr> &kernel_graph,
mindspore::HashMap<CNodePtr, NodeIoExecInfoPtr> *kernel_io_exec_info_map) const;
void UpdateEventsToExecutionOrder(
const NotNull<KernelGraphPtr> &kernel_graph,
const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &send_after_node,
const mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> &recv_before_node) const;
void GenEventsForParallelOp(const NotNull<KernelGraphPtr> &kernel_graph,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const;
void InsertEventForNonTaskSink(const NotNull<KernelGraphPtr> &kernel_graph) const;
void InsertEventsForInputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
const NodeIoExecInfoPtr &io_exec_info,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const;
void InsertEventsForOutputs(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &kernel,
const NodeIoExecInfoPtr &io_exec_info,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv) const;
void InsertEvents(const NotNull<KernelGraphPtr> &kernel_graph, const CNodePtr &parallel_cnode,
const AnfNodePtr &node_before_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_send,
mindspore::HashMap<AnfNodePtr, std::vector<CNodePtr>> *kernel_recv,
const AnfNodePtr &node_after_recv) const;
CNodePtr CreateSendApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id, uint32_t stream_id) const;
CNodePtr CreateRecvApplyKernel(const NotNull<KernelGraphPtr> &graph_ptr, uint32_t event_id, uint32_t stream_id) const;
};
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ACL_STREAM_ASSIGN_H_

View File

@@ -88,8 +88,14 @@ bool AscendCollectiveCommLib::InitializeHccl() {
MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << rank_id_str;
auto mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
-bool ret = hccl::HcclAdapter::GetInstance().InitHccl(
-device_id, rank_id_str, full_path, mode == kGraphMode ? hccl::HcclMode::kGraph : hccl::HcclMode::kPynative);
+hccl::HcclMode hccl_mode = hccl::HcclMode::kGraph;
+if (mode == kPynativeMode) {
+hccl_mode = hccl::HcclMode::kPynative;
+} else if (ms_context->IsKByKExecutorMode()) {
+hccl_mode = hccl::HcclMode::kKernelByKernel;
+}
+bool ret = hccl::HcclAdapter::GetInstance().InitHccl(device_id, rank_id_str, full_path, hccl_mode);
free(full_path);
if (!ret) {
MS_LOG(ERROR) << "Hcom init failed.";

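Note: the mode selection now has three arms; factored into a helper it reads as follows (illustrative restatement, not code from the commit):

hccl::HcclMode ChooseHcclMode(int exec_mode, const std::shared_ptr<MsContext> &ms_context) {
  if (exec_mode == kPynativeMode) {
    return hccl::HcclMode::kPynative;  // eager execution
  }
  if (ms_context->IsKByKExecutorMode()) {
    return hccl::HcclMode::kKernelByKernel;  // the new KBK path
  }
  return hccl::HcclMode::kGraph;  // whole-graph sink
}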
View File

@@ -50,8 +50,13 @@ void GEGraphOptimization::OptimizeGEGraph(const KernelGraphPtr &graph) {
MS_LOG(DEBUG) << "Status record: end optimize ge graph. graph id: " << graph->graph_id();
}
-void GEGraphOptimization::OptimizeACLGraph(const KernelGraphPtr &graph) {
+void GEGraphOptimization::OptimizeACLGraph(const KernelGraphPtr &graph, std::set<KernelGraphPtr> *const memo) {
MS_EXCEPTION_IF_NULL(graph);
+MS_EXCEPTION_IF_NULL(memo);
+if (memo->find(graph) != memo->end()) {
+return;
+}
+memo->insert(graph);
MS_LOG(DEBUG) << "Status record: start optimize acl graph. graph id: " << graph->graph_id();
// empty graph dont entry to backend
if (graph->execution_order().empty()) {
@@ -62,11 +67,20 @@ }
}
opt::AscendUnfoldInputsForSpecialNodes(graph);
opt::GEBackendOptimizeACL(graph);
+for (auto &child_graph : graph->child_graph_order()) {
+OptimizeACLGraph(child_graph.lock(), memo);
+}
MS_LOG(DEBUG) << "Status record: end optimize acl graph. graph id: " << graph->graph_id();
}
-void GEGraphOptimization::OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph) {
+void GEGraphOptimization::OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph,
+std::set<KernelGraphPtr> *const memo) {
MS_EXCEPTION_IF_NULL(graph);
+MS_EXCEPTION_IF_NULL(memo);
+if (memo->find(graph) != memo->end()) {
+return;
+}
+memo->insert(graph);
MS_LOG(DEBUG) << "Status record: start optimize acl graph after kernel select. graph id: " << graph->graph_id();
// empty graph dont entry to backend
if (graph->execution_order().empty()) {
@@ -76,9 +90,26 @@ void GEGraphOptimization::OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr
MS_LOG(DEBUG) << "Status record: end optimize acl graph after kernel select. graph id: " << graph->graph_id();
}
opt::GEBackendOptimizeACLAfterKernelSelect(graph);
for (auto &child_graph : graph->child_graph_order()) {
OptimizeACLGraphAfterKernelSelect(child_graph.lock(), memo);
}
MS_LOG(DEBUG) << "Status record: end optimize acl graph after kernel select. graph id: " << graph->graph_id();
}
void GEGraphOptimization::OptimizeACLGraphAfterInline(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(DEBUG) << "Status record: start optimize acl graph after inline. graph id: " << graph->graph_id();
// empty graph dont entry to backend
if (graph->execution_order().empty()) {
MS_LOG(DEBUG) << graph->ToString() << " is empty graph.";
AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
graph->set_executable(false);
MS_LOG(DEBUG) << "Status record: end optimize acl graph after inline. graph id: " << graph->graph_id();
}
opt::GEAfterInlineOptimize(graph);
MS_LOG(DEBUG) << "Status record: end optimize acl graph after inline. graph id: " << graph->graph_id();
}
void GEGraphOptimization::UnifyMindIR(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start unify mindir. graph id: " << graph->graph_id();

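Note: threading a caller-owned memo through both recursive entry points guarantees each kernel graph in a shared child-graph hierarchy is visited exactly once. A sketch of the expected call pattern (hypothetical call site):

std::set<KernelGraphPtr> memo;
GEGraphOptimization::GetInstance().OptimizeACLGraph(root_graph, &memo);
memo.clear();  // each pass keeps its own visited set
GEGraphOptimization::GetInstance().OptimizeACLGraphAfterKernelSelect(root_graph, &memo);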
View File

@@ -31,8 +31,9 @@ class GEGraphOptimization {
return instance;
}
void OptimizeGEGraph(const KernelGraphPtr &graph);
-void OptimizeACLGraph(const KernelGraphPtr &graph);
-void OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph);
+void OptimizeACLGraph(const KernelGraphPtr &graph, std::set<KernelGraphPtr> *const memo);
+void OptimizeACLGraphAfterKernelSelect(const KernelGraphPtr &graph, std::set<KernelGraphPtr> *const memo);
+void OptimizeACLGraphAfterInline(const KernelGraphPtr &graph);
void UnifyMindIR(const KernelGraphPtr &graph);
void GEMindIRPass(const KernelGraphPtr &graph) const;

View File

@ -16,13 +16,20 @@
#include "plugin/device/ascend/hal/hardware/ge_kernel_executor.h"
#include <utility>
#include <algorithm>
#include "include/common/utils/parallel_context.h"
#include "acl/acl_rt.h"
#include "acl/acl_op_compiler.h"
#include "include/common/profiler.h"
#include "mindspore/core/ops/array_ops.h"
#include "mindspore/core/ops/framework_ops.h"
#include "backend/common/session/kernel_graph_mgr.h"
#include "ops/auto_generate/gen_ops_primitive.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
#include "plugin/device/ascend/hal/hardware/ge_graph_optimization.h"
#include "plugin/device/ascend/hal/common/ascend_utils.h"
#include "plugin/device/ascend/hal/hardware/acl_somas.h"
#include "plugin/device/ascend/hal/hardware/acl_stream_assign.h"
#include "plugin/device/ascend/kernel/rts/rt_kernel_build.h"
#include "plugin/device/ascend/kernel/hccl/hccl_kernel_metadata.h"
#include "plugin/device/ascend/kernel/hccl/hccl_kernel_build.h"
@ -121,6 +128,71 @@ void SetAclOpPrecisionMode() {
MS_LOG(EXCEPTION) << "Acl set op precision mode failed! Error flag is " << ret;
}
}
void SelectKernel(const KernelGraphPtr &kernel_graph, std::set<KernelGraphPtr> *const memo) {
// select kernel
MS_EXCEPTION_IF_NULL(memo);
if (memo->find(kernel_graph) != memo->end()) {
return;
}
memo->insert(kernel_graph);
bool aclnn_can_used = !kernel_graph->is_from_single_op();
const auto &kernels = kernel_graph->execution_order();
for (const auto &kernel : kernels) {
auto [select_res, msg, etype] =
device::ascend::SelectKernelInfoWithMsg(kernel, aclnn_can_used && kernel::IsEnabledAclnn(kernel));
if (!select_res) {
MS_LOG(INFO) << "node is " << kernel->fullname_with_scope() << " should backoff";
std::pair<std::string, ExceptionType> failure_info = std::make_pair(msg, etype);
device::ascend::HandleKernelSelectFailure(kernel_graph, kernel, failure_info);
}
}
if (!kernel_graph->is_from_single_op()) {
kernel_graph->SetKernelObjectTypesForUnrealNodes();
}
for (auto &child_graph : kernel_graph->child_graph_order()) {
SelectKernel(child_graph.lock(), memo);
}
}
void InlineSubGraph(const KernelGraphPtr &graph) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
bool save_graphs = context_ptr->CanDump(kIntroductory);
if (save_graphs) {
std::string file_name = "hwopt_d_before_inline_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph, true, kWholeStack);
}
#endif
auto kernel_cnodes = graph->execution_order();
for (auto &kernel_cnode : kernel_cnodes) {
MS_EXCEPTION_IF_NULL(kernel_cnode);
if (common::AnfAlgo::CheckPrimitiveType(kernel_cnode, prim::kPrimCallInline)) {
auto sub_graph = common::AnfAlgo::GetNodeAttr<KernelGraphPtr>(kernel_cnode, kAttrKernelGraph);
MS_EXCEPTION_IF_NULL(sub_graph);
MS_LOG(INFO) << "InlineSubGraph: " << kernel_cnode->fullname_with_scope()
<< ", sub graph: " << sub_graph->graph_id() << ", need inline: " << sub_graph->need_inline();
auto main_graph = kernel_cnode->func_graph();
MS_EXCEPTION_IF_NULL(main_graph);
auto mng = main_graph->manager();
auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_cnode->kernel_info());
MS_EXCEPTION_IF_NULL(kernel_info);
AnfNodePtrList inp(kernel_cnode->inputs().begin() + 1, kernel_cnode->inputs().end());
const auto &ref_map = sub_graph->GetRefMap();
auto out = session::KernelGraphMgr::DoInline(sub_graph, main_graph, inp, kernel_cnode->input(0)->scope(),
kernel_info->graph_id(), ref_map, graph);
(void)mng->Replace(kernel_cnode, out);
}
}
GEGraphOptimization::GetInstance().OptimizeACLGraphAfterInline(graph);
#ifdef ENABLE_DUMP_IR
if (save_graphs) {
std::string file_name = "hwopt_d_after_inline_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph, true, kWholeStack);
}
#endif
}
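For orientation, a hedged sketch of what InlineSubGraph does at the IR level; the node names below are illustrative, not real dump output:
// Before inlining, the main graph holds an opaque CallInline node whose
// kAttrKernelGraph attribute carries the sub graph:
//   %out = CallInline(%x, %y)            { kernel_graph: sub_graph_1 }
// DoInline clones sub_graph_1's body into the main graph with the call
// arguments bound to its parameters, and mng->Replace swaps the CallInline
// node for the cloned output, so the call boundary disappears:
//   %t0 = Mul(%x, %y)                    // cloned body of sub_graph_1
//   %out = Add(%t0, %x)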
} // namespace
void GeKernelExecutor::Initialize() {
@ -170,24 +242,15 @@ void GeKernelExecutor::OptimizeGraph(const FuncGraphPtr &graph) const {
return;
}
profiler::CollectHostInfo("Ascend", "Graph Optimization", "GeOptimizeGraph", 1, 0, 0);
GEGraphOptimization::GetInstance().OptimizeACLGraph(kernel_graph);
bool aclnn_can_used = !kernel_graph->is_from_single_op();
// select kernel
const auto &kernels = kernel_graph->execution_order();
for (const auto &kernel : kernels) {
auto [select_res, msg, etype] =
device::ascend::SelectKernelInfoWithMsg(kernel, aclnn_can_used && kernel::IsEnabledAclnn(kernel));
if (!select_res) {
MS_LOG(INFO) << "node is " << kernel->fullname_with_scope() << " should backoff";
std::pair<std::string, ExceptionType> failure_info = std::make_pair(msg, etype);
device::ascend::HandleKernelSelectFailure(kernel_graph, kernel, failure_info);
}
}
if (!kernel_graph->is_from_single_op() && !kernel_graph->has_flag(kFlagIsPyNativeBpropKernelGraph)) {
kernel_graph->SetKernelObjectTypesForUnrealNodes();
}
GEGraphOptimization::GetInstance().OptimizeACLGraphAfterKernelSelect(kernel_graph);
std::set<KernelGraphPtr> memo;
GEGraphOptimization::GetInstance().OptimizeACLGraph(kernel_graph, &memo);
memo.clear();
SelectKernel(kernel_graph, &memo);
memo.clear();
GEGraphOptimization::GetInstance().OptimizeACLGraphAfterKernelSelect(kernel_graph, &memo);
memo.clear();
InlineSubGraph(kernel_graph);
OptimizeExecutionOrder(NOT_NULL(graph));
profiler::CollectHostInfo("Ascend", "Graph Optimization", "GeOptimizeGraph", 1, 0, 1);
}
@ -214,6 +277,79 @@ void GeKernelExecutor::CreateKernel(const std::vector<CNodePtr> &nodes) const {
MS_LOG(DEBUG) << "Status record: end create kernel.";
}
namespace {
void CreateEventKernelMod(const KernelGraphPtr &kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto nodes = kernel_graph->execution_order();
for (auto &node : nodes) {
MS_EXCEPTION_IF_NULL(node);
if (!IsOneOfPrimitiveCNode(node, {prim::kPrimStreamSend, prim::kPrimStreamRecv})) {
continue;
}
device::ascend::GenerateKernelBuildInfo(node, RT_KERNEL);
auto kernel_mod_ptr = kernel::RtOpBuild(node);
MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
AnfAlgo::SetKernelMod(kernel_mod_ptr, node.get());
}
}
} // namespace
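As a hedged aside on what these Send/Recv kernel mods do at launch time (the ACL calls below illustrate the pattern; the exact calls live in send.cc/recv.cc):
// Cross-stream synchronization pattern implemented by the event kernels:
//   producer stream: record an event after the last producing kernel
//                    (StreamSend, e.g. aclrtRecordEvent(event, stream_a));
//   consumer stream: block until that event fires
//                    (StreamRecv, e.g. aclrtStreamWaitEvent(stream_b, event)).
// AssignStream inserts the pair so stream_b never starts a dependent kernel
// before stream_a reaches the record point.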
void GeKernelExecutor::DoStreamAssign(const KernelGraphPtr &kernel_graph) {
MS_LOG(DEBUG) << "Status record: start stream assign.";
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
MS_EXCEPTION_IF_NULL(kernel_graph);
// stream assign
AclStreamAssign::GetInstance().AssignStream(NOT_NULL(kernel_graph));
CreateEventKernelMod(kernel_graph);
#ifdef ENABLE_DUMP_IR
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->CanDump(kIntroductory);
if (save_graphs) {
std::string file_name = "hwopt_d_after_stream_assign_" + std::to_string(kernel_graph->graph_id()) + ".ir";
DumpIR(file_name, kernel_graph, true, kWholeStack);
}
#endif
kernel_graph->PrintGraphExecuteOrder();
MS_LOG(DEBUG) << "Status record: end stream assign.";
}
void GeKernelExecutor::DoSomas(const FuncGraphPtr &graph) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
MS_EXCEPTION_IF_NULL(graph);
auto kernel_graph = graph->cast<KernelGraphPtr>();
MS_EXCEPTION_IF_NULL(kernel_graph);
DoStreamAssign(kernel_graph);
// somas
MS_LOG(DEBUG) << "Status record: start do somas.";
if (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) != kOptimizeO0) {
auto somas = std::make_shared<AclSomas>();
bool ret = somas->Assign(kernel_graph);
if (ret) {
MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id()
<< " somas size: " << kernel_graph->somas_whole_block_size();
} else if (somas->IsSupportSomas(*kernel_graph)) {
MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id();
}
}
MS_LOG(DEBUG) << "Status record: end do somas.";
}
void GeKernelExecutor::OptimizeExecutionOrder(const FuncGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
auto kernel_graph = graph->cast<KernelGraphPtr>();
MS_EXCEPTION_IF_NULL(kernel_graph);
MS_LOG(DEBUG) << "Status record: start optimize execution order. graph id: " << kernel_graph->graph_id();
auto execution_order = kernel_graph->execution_order();
kernel_graph->EnableRuntimeCache();
common::AnfAlgo::ReorderExecList(NOT_NULL(&execution_order));
kernel_graph->DisableRuntimeCache();
kernel_graph->set_execution_order(execution_order);
MS_LOG(DEBUG) << "Status record: end optimize execution order. graph id: " << kernel_graph->graph_id();
}
void GeKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
profiler::CollectHostInfo("Ascend", "PreprocessBeforeRun", "GePreprocess", 1, 0, 0);
@ -250,16 +386,16 @@ void GeKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
}
}
DoSomas(NOT_NULL(graph));
profiler::CollectHostInfo("Ascend", "PreprocessBeforeRun", "GePreprocess", 1, 0, 1);
}
bool GeKernelExecutor::PySyncRuning() const {
bool GeKernelExecutor::PySyncRuning(size_t stream_id) const {
MS_EXCEPTION_IF_NULL(res_manager_);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
MS_EXCEPTION_IF_NULL(res_manager_);
if ((ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) &&
ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE) &&
!res_manager_->SyncStream(kDefaultStreamIndex)) {
if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE) && !res_manager_->SyncStream(stream_id)) {
return false;
}
return true;
@ -299,6 +435,7 @@ bool GeKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<KernelT
auto stream = AscendStreamMng::GetInstance().GetStream(stream_id);
if (stream == nullptr) {
stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
stream_id = kDefaultStreamIndex;
}
MS_EXCEPTION_IF_NULL(stream);
#ifdef ENABLE_DEBUGGER
@ -329,10 +466,9 @@ bool GeKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<KernelT
}
}
// for PyNative Sync Run mode
auto ret = PySyncRuning();
auto ret = PySyncRuning(stream_id);
PROFILER_END(start_time, runtime::ProfilerModule::kKernel, runtime::ProfilerEvent::kKernelLaunch,
kernel->fullname_with_scope(), false);
return ret;
}

View File

@ -60,6 +60,7 @@ class GeKernelExecutor : public KernelExecutor {
// Unify the MindIR, the default behavior uses the common unified MindIR.
void UnifyMindIR(const KernelGraphPtr &graph) const override;
void AddMindIRPass(const KernelGraphPtr &graph) const override;
void OptimizeExecutionOrder(const FuncGraphPtr &graph) const;
// Get rank id for distributed training.
uint32_t GetRankID() const override { return 0; }
@ -69,10 +70,12 @@ class GeKernelExecutor : public KernelExecutor {
const device::DeviceAddressPtrList &output_addr_list, const size_t &stream_id) const override;
private:
static void DoSomas(const FuncGraphPtr &graph);
static void DoStreamAssign(const KernelGraphPtr &kernel_graph);
// launch
bool PySyncRuning() const;
bool MemoryCopyAsync(const CNodePtr &node, const vector<KernelTensor *> &inputs,
const vector<KernelTensor *> &outputs) const;
bool PySyncRuning(size_t stream_id) const;
mutable std::set<CNodePtr> nop_op_to_memcpy_;
// May be AscendDeviceResManager or GEDeviceResManager now

View File

@ -170,9 +170,10 @@ HcclMode HcclAdapter::GetCurrentHcclMode() const {
MS_EXCEPTION_IF_NULL(context);
bool is_graph_mode = context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode;
bool is_task_sink = context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
bool graph_op_run = context->IsKByKExecutorMode();
if (!is_graph_mode) {
return HcclMode::kPynative;
} else if (is_task_sink) {
} else if (is_task_sink && !graph_op_run) {
return HcclMode::kGraph;
} else {
return HcclMode::kKernelByKernel;
@ -190,10 +191,9 @@ void HcclAdapter::CheckExcutionMode() const {
}
std::string HcclAdapter::GetHcclModeString(HcclMode hccl_mode) {
static std::map<HcclMode, std::string> kHcclModeString = {
{HcclMode::kGraph, "GRAPH_MODE"},
{HcclMode::kPynative, "PYNATIVE_MODE"},
{HcclMode::kKernelByKernel, "GRAPH_MODE disable TASK_SINK"}};
static std::map<HcclMode, std::string> kHcclModeString = {{HcclMode::kGraph, "GRAPH_MODE"},
{HcclMode::kPynative, "PYNATIVE_MODE"},
{HcclMode::kKernelByKernel, "KERNEL_BY_KERNEL_MODE"}};
return kHcclModeString.at(hccl_mode);
}
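Under the new rule, task sink alone no longer forces graph mode; the KbK executor check takes precedence. A compact restatement of the selection logic (hypothetical free function, not the adapter's API):
enum class Mode { kGraph, kPynative, kKernelByKernel };

Mode SelectHcclMode(bool is_graph_mode, bool is_task_sink, bool kbk_executor) {
  if (!is_graph_mode) {
    return Mode::kPynative;        // PyNative front end
  }
  if (is_task_sink && !kbk_executor) {
    return Mode::kGraph;           // whole-graph sink
  }
  return Mode::kKernelByKernel;    // everything else, incl. sink + KbK
}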

View File

@ -43,7 +43,11 @@ file(GLOB_RECURSE SRC_IN_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"pyboost/*.cc"
"pyboost/auto_generate/*.cc"
"pyboost/ops/*.cc"
"ascend_kernel_mod.cc"
"rts/send.cc"
"rts/recv.cc"
"rts/rt_kernel.cc"
"rts/rt_kernel_build.cc"
"rts/rt_kernel_info.cc"
)
list(REMOVE_ITEM SRC_IN_910B ${AICPU_OPS_SRC})

View File

@ -91,9 +91,5 @@ bool ReduceSumAclnnKernelMod::Launch(const std::vector<KernelTensor *> &inputs,
RunOp(stream_ptr, workspace);
return true;
}
MS_ACLLNN_KERNEL_FACTORY_REG(ReduceAll, ReduceAllAclnnKernelMod);
MS_ACLLNN_KERNEL_FACTORY_REG(ReduceAny, ReduceAnyAclnnKernelMod);
MS_ACLLNN_KERNEL_FACTORY_REG(ReduceSum, ReduceSumAclnnKernelMod);
} // namespace kernel
} // namespace mindspore

View File

@ -16,7 +16,6 @@
#include "plugin/device/ascend/kernel/rts/recv.h"
#include "utils/ms_context.h"
#include "plugin/device/ascend/hal/device/ge_runtime/task_info.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
@ -25,8 +24,6 @@
namespace mindspore {
namespace kernel {
using mindspore::ge::model_runner::EventWaitTaskInfo;
using EventWaitTaskInfoPtr = std::shared_ptr<EventWaitTaskInfo>;
RecvKernel::~RecvKernel() {}
@ -62,14 +59,5 @@ bool RecvKernel::Launch(const std::vector<KernelTensor *> &, const std::vector<K
}
return true;
}
std::vector<TaskInfoPtr> RecvKernel::GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "RecvKernel GenTask event_id_:" << event_id_ << ", stream_id_:" << stream_id;
stream_id_ = stream_id;
EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(unique_name_, stream_id, event_id_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}
} // namespace kernel
} // namespace mindspore

View File

@ -32,8 +32,7 @@ class RecvKernel : public RtKernel {
bool Init(const AnfNodePtr &anf_node) override;
bool Launch(const std::vector<KernelTensor *> &, const std::vector<KernelTensor *> &,
const std::vector<KernelTensor *> &, void *stream_ptr) override;
std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, uint32_t stream_id) override;
std::vector<KernelAttr> GetOpSupport() override { MS_LOG(EXCEPTION) << "This interface is not supported in RtKernel."; }
private:
uint32_t event_id_{0};

View File

@ -41,12 +41,5 @@ RtKernel::RtKernel() {}
RtKernel::~RtKernel() {}
bool RtKernel::Init(const mindspore::AnfNodePtr & /* anf_node */) { return true; }
void RtKernel::SetInputSizeList(const std::vector<size_t> &size_list) { mutable_input_size_list_ = size_list; }
void RtKernel::SetOutputSizeList(const std::vector<size_t> &size_list) { mutable_output_size_list_ = size_list; }
void RtKernel::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { mutable_workspace_size_list_ = size_list; }
const std::vector<size_t> &RtKernel::GetOutputSizeList() const { return mutable_output_size_list_; }
const std::vector<size_t> &RtKernel::GetWorkspaceSizeList() const { return mutable_workspace_size_list_; }
} // namespace kernel
} // namespace mindspore

View File

@ -22,26 +22,17 @@
#include <memory>
#include <map>
#include <string>
#include "plugin/device/ascend/kernel/ascend_kernel_mod.h"
#include "kernel/task_stream.h"
#include "kernel/kernel.h"
#include "acl/acl.h"
#include "acl/acl_rt.h"
namespace mindspore {
namespace kernel {
class RtKernel : public AscendKernelMod {
class RtKernel : public KernelMod {
public:
RtKernel();
~RtKernel() override;
virtual bool Init(const AnfNodePtr &anf_node);
void SetInputSizeList(const std::vector<size_t> &size_list) override;
void SetOutputSizeList(const std::vector<size_t> &size_list) override;
void SetWorkspaceSizeList(const std::vector<size_t> &size_list) override;
const std::vector<size_t> &GetOutputSizeList() const override;
const std::vector<size_t> &GetWorkspaceSizeList() const override;
protected:
mutable std::vector<size_t> mutable_input_size_list_;
mutable std::vector<size_t> mutable_output_size_list_;
mutable std::vector<size_t> mutable_workspace_size_list_;
};
using RTKernelPtr = std::shared_ptr<RtKernel>;

View File

@ -35,7 +35,6 @@ KernelModPtr RtOpBuild(const AnfNodePtr &anf_node) {
MS_LOG(ERROR) << "Rt Op initialize failed!";
return nullptr;
}
ker_ptr->SetNode(anf_node);
return ker_ptr;
}
} // namespace kernel

View File

@ -18,14 +18,10 @@
#include "runtime/event.h"
#include "acl/acl.h"
#include "acl/acl_rt.h"
#include "plugin/device/ascend/hal/device/ge_runtime/task_info.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
using mindspore::ge::model_runner::EventRecordTaskInfo;
using EventRecordTaskInfoPtr = std::shared_ptr<EventRecordTaskInfo>;
namespace mindspore {
namespace kernel {
SendKernel::~SendKernel() {}
@ -57,14 +53,5 @@ bool SendKernel::Launch(const std::vector<KernelTensor *> &, const std::vector<K
}
return true;
}
std::vector<TaskInfoPtr> SendKernel::GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "SendKernel GenTask event id:" << event_id_ << ", stream id:" << stream_id;
stream_id_ = stream_id;
EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(unique_name_, stream_id, event_id_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}
} // namespace kernel
} // namespace mindspore

View File

@ -31,8 +31,7 @@ class SendKernel : public RtKernel {
bool Init(const AnfNodePtr &anf_node) override;
bool Launch(const std::vector<KernelTensor *> &, const std::vector<KernelTensor *> &,
const std::vector<KernelTensor *> &, void *stream_ptr) override;
std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &, uint32_t stream_id) override;
std::vector<KernelAttr> GetOpSupport() override { MS_LOG(EXCEPTION) << "This interface is not supported in RtKernel."; }
private:
uint32_t event_id_{0};

View File

@ -39,6 +39,7 @@ file(GLOB_RECURSE MS_OPTIMIZER_910B RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"./ir_fusion/histogram_fixed_width_fusion.cc"
"./ir_fusion/adaptive_max_pool2d_fusion.cc"
"./ir_fusion/flash_attention_fusion.cc"
"./enhancer/eliminate_maketuple_getitem.cc"
)
list(REMOVE_ITEM MS_OPTIMIZER_910B
"mindir/ascend_vm_op_adapter.cc"

View File

@ -24,18 +24,14 @@
#include "ir/anf.h"
#include "include/backend/optimizer/helper.h"
#include "include/backend/optimizer/optimizer.h"
#include "plugin/device/ascend/optimizer/ascend_helper.h"
namespace mindspore {
namespace opt {
class EliminateMaketupleGetitem : public Pass {
public:
EliminateMaketupleGetitem() : Pass("eliminate_maketuple_getitem"), kernel_select_(std::make_shared<KernelSelect>()) {}
EliminateMaketupleGetitem() : Pass("eliminate_maketuple_getitem") {}
~EliminateMaketupleGetitem() override = default;
bool Run(const FuncGraphPtr &graph) override;
private:
KernelSelectPtr kernel_select_;
};
} // namespace opt
} // namespace mindspore

View File

@ -19,6 +19,7 @@
#include <vector>
#include <utility>
#include "mindspore/core/ops/sequence_ops.h"
#include "mindspore/core/ops/framework_ops.h"
#include "plugin/device/ascend/optimizer/format_type/utils.h"
namespace mindspore {

View File

@ -39,6 +39,8 @@ CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &
builder.SetOutputsReshapeType({reshape_type});
builder.SetInputsDeviceType({input_type});
builder.SetOutputsDeviceType({output_type});
builder.SetInputsKernelObjectType({kernel::KernelObjectType::TENSOR});
builder.SetOutputsKernelObjectType({kernel::KernelObjectType::TENSOR});
builder.SetFusionType(kernel::kPatternOpaque);
builder.SetProcessor(kernel::Processor::AICORE);
if (kernel::OpLib::FindOp(prim::kPrimCast->name(), kernel::kImplyTBE) != nullptr) {

View File

@ -0,0 +1,66 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/ascend/optimizer/ge/process_call_inline.h"
#include <memory>
#include <string>
#include <vector>
#include "include/backend/anf_runtime_algorithm.h"
#include "include/backend/optimizer/helper.h"
#include "include/common/utils/anfalgo.h"
#include "mindspore/core/ops/framework_ops.h"
#include "utils/anf_utils.h"
namespace mindspore {
namespace opt {
namespace {
bool CheckCallInline(const CNodePtr &cnode) {
if (!common::AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimCall)) {
return false;
}
auto call_graph = cnode->input(kIndex1);
auto sub_kernel_graph = session::AnfRuntimeAlgorithm::GetValueNodeKernelGraph(call_graph);
return sub_kernel_graph->need_inline();
}
} // namespace
const BaseRef ProcessCallInline::DefinePattern() const {
VarPtr Xs = std::make_shared<SeqVar>();
return VectorRef({prim::kPrimCall, Xs});
}
const AnfNodePtr ProcessCallInline::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
if (CheckCallInline(cnode)) {
auto call_graph = cnode->input(kIndex1);
auto sub_kernel_graph = session::AnfRuntimeAlgorithm::GetValueNodeKernelGraph(call_graph);
std::vector<AnfNodePtr> call_inline_inputs = {
NewValueNode(std::make_shared<Primitive>(prim::kPrimCallInline->name()))};
for (size_t i = kIndex1; i < common::AnfAlgo::GetInputNum(cnode); i++) {
call_inline_inputs.emplace_back(common::AnfAlgo::GetInputNode(cnode, i));
}
auto call_inline = graph->NewCNode(call_inline_inputs);
MS_EXCEPTION_IF_NULL(call_inline);
call_inline->set_abstract(cnode->abstract());
common::AnfAlgo::SetNodeAttr(kAttrKernelGraph, MakeValue(sub_kernel_graph), call_inline);
return call_inline;
}
return nullptr;
}
} // namespace opt
} // namespace mindspore
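Taken together with InlineSubGraph in ge_kernel_executor.cc above, this pass is the front half of a two-step flow; a hedged IR-level illustration (names made up):
// Step 1, GEBackendOptimizeACL (this pass): a Call whose target graph is
// marked need_inline() is rewritten into a CallInline that drops the graph
// operand and records it as an attribute instead:
//   %r = Call(@sub_graph, %a)   ->   %r = CallInline(%a) { kernel_graph: sub_graph }
// Step 2, after kernel selection: InlineSubGraph expands every CallInline
// in place via KernelGraphMgr::DoInline.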

View File

@ -0,0 +1,32 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_OPTIMIZER_PROCESS_CALL_INLINE_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_OPTIMIZER_PROCESS_CALL_INLINE_H_
#include "include/backend/optimizer/optimizer.h"
namespace mindspore {
namespace opt {
class ProcessCallInline : public PatternProcessPass {
public:
explicit ProcessCallInline(bool multi_graph = true) : PatternProcessPass("process call inline", multi_graph) {}
~ProcessCallInline() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &) const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_OPTIMIZER_PROCESS_CALL_INLINE_H_

View File

@ -18,12 +18,13 @@
#include <memory>
#include <string>
#include "backend/common/pass/dropout_gen_mask_fusion.h"
#include "backend/common/pass/common_subexpression_elimination.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "include/common/debug/anf_ir_dump.h"
#include "include/common/debug/dump_proto.h"
#include "include/backend/optimizer/optimizer.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "include/backend/debug/profiler/profiling.h"
#include "plugin/device/ascend/hal/hardware/ge_utils.h"
#include "plugin/device/ascend/optimizer/ge/all_to_all_v_for_ge.h"
#include "plugin/device/ascend/optimizer/ge/maketuple_depend_remover.h"
#include "plugin/device/ascend/optimizer/ge/expand_dims_for_batchnorm.h"
@ -31,14 +32,14 @@
#include "plugin/device/ascend/optimizer/ge/convert_condition_input_to_scalar.h"
#include "plugin/device/ascend/optimizer/ge/hcom/add_parallel_group_for_hcom.h"
#include "plugin/device/ascend/optimizer/ge/hcom/add_depend_for_all_gather.h"
#include "plugin/device/ascend/optimizer/ge/expander_fallback.h"
#include "plugin/device/ascend/optimizer/ge/trans_depend_value_to_int32.h"
#include "plugin/device/ascend/optimizer/ge/expander_fallback.h"
#include "plugin/device/ascend/optimizer/ge/insert_identity.h"
#include "plugin/device/ascend/optimizer/ge/dropout_gen_mask_depend.h"
#include "plugin/device/ascend/optimizer/ge/unfold_maketuple.h"
#include "plugin/device/ascend/optimizer/ge/unfold_nested_output.h"
#include "plugin/device/ascend/optimizer/ge/resize_bilinear_add_attr.h"
#include "plugin/device/ascend/optimizer/ge/process_call_inline.h"
#include "plugin/device/ascend/optimizer/format_type/deal_ref_output.h"
#include "plugin/device/ascend/optimizer/ge/hcom/insert_load_for_allgather.h"
#include "plugin/device/ascend/optimizer/format_type/set_fracz_group_attr.h"
@ -49,14 +50,14 @@
#include "plugin/device/ascend/optimizer/ge/scalar_unify_mindir.h"
#include "plugin/device/ascend/optimizer/ge/tuple_unify_mindir.h"
#include "plugin/device/ascend/optimizer/ir_fission/seed_adapter.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_grad_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/ascend_convert_tuple_input_to_dynamic_input.h"
#include "plugin/device/ascend/optimizer/backend_common_unify_mindir.h"
#include "plugin/device/ascend/optimizer/ge/remove_tensor_to_scalar_or_tuple_ops.h"
#include "plugin/device/ascend/optimizer/ge/scalar_ops_output_unify_mindir.h"
#include "backend/common/pass/convert_const_input_to_tensor_input.h"
#include "backend/common/pass/insert_type_transform_op.h"
#include "backend/common/pass/insert_tensor_move_for_communication.h"
#include "plugin/device/ascend/optimizer/enhancer/eliminate_maketuple_getitem.h"
namespace mindspore {
namespace opt {
@ -120,6 +121,7 @@ void GEBackendOptimizeACL(const KernelGraphPtr &kernel_graph) {
opt_acl_pm->AddPass(std::make_shared<SeedAdapter>());
opt_acl_pm->AddPass(std::make_shared<opt::AICpuLibSelectPass>());
opt_acl_pm->AddPass(std::make_shared<opt::TransDependValueToInt32>());
opt_acl_pm->AddPass(std::make_shared<opt::ProcessCallInline>());
opt_acl_pm->AddPass(std::make_shared<opt::ExpanderFallback>());
optimizer->AddPassManager(opt_acl_pm);
(void)optimizer->Optimize(kernel_graph);
@ -155,6 +157,7 @@ void GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr &kernel_graph) {
auto opt_acl_after_kernel_select_pm = std::make_shared<PassManager>("opt_acl_after_kernel_select_pm");
opt_acl_after_kernel_select_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
opt_acl_after_kernel_select_pm->AddPass(std::make_shared<InsertIdentity>());
opt_acl_after_kernel_select_pm->AddPass(std::make_shared<InsertTensorMoveForCommunication>());
opt_acl_after_kernel_select_pm->AddPass(std::make_shared<EraseVisitAttr>());
opt_acl_after_kernel_select_pm->AddPass(std::make_shared<DealRefOutput>());
if (!kernel_graph->is_from_single_op() && !kernel_graph->has_flag(kFlagIsPyNativeBpropKernelGraph)) {
@ -201,6 +204,36 @@ void GEUnifyMindIR(const KernelGraphPtr &kernel_graph) {
profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_UnifyMindIR", 0, 0, 1);
}
void GEAfterInlineOptimize(const KernelGraphPtr &kernel_graph) {
profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 0);
MS_EXCEPTION_IF_NULL(kernel_graph);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
if (context_ptr->CanDump(kIntroductory)) {
std::string file_name =
"hwopt_d_before_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
DumpIR(file_name, kernel_graph);
}
#endif
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto after_inline_pm = std::make_shared<PassManager>("after_inline_pm");
after_inline_pm->AddPass(std::make_shared<DropoutGenMaskFusion>());
after_inline_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
after_inline_pm->AddPass(std::make_shared<EliminateMaketupleGetitem>());
optimizer->AddPassManager(after_inline_pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
#ifdef ENABLE_DUMP_IR
if (context_ptr->CanDump(kIntroductory)) {
std::string file_name =
"hwopt_d_after_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
DumpIR(file_name, kernel_graph);
}
#endif
profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 1);
}
void GEDynamicUnifyMindIR(const FuncGraphPtr &func_graph) {
profiler::CollectHostInfo("GE", "GE Dynamic Shape Unify MindIR", "GEBackend_Dynamic_UnifyMindIR", 0, 0, 0);
MS_EXCEPTION_IF_NULL(func_graph);

View File

@ -24,6 +24,7 @@ namespace opt {
void GEBackendOptimization(const KernelGraphPtr &kernel_graph);
void GEBackendOptimizeACL(const KernelGraphPtr &kernel_graph);
void GEUnifyMindIR(const KernelGraphPtr &kernel_graph);
void GEAfterInlineOptimize(const KernelGraphPtr &kernel_graph);
void GEDynamicUnifyMindIR(const FuncGraphPtr &func_graph);
void GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr &kernel_graph);
PassManagerPtr GetGEUnifyMindIRPassManager();

View File

@ -44,7 +44,8 @@ const AnfNodePtr AscendConvertTupleInputToDynamicInput::Process(const FuncGraphP
static const PrimitiveSet need_unfold_calculate_node = {
prim::kPrimAddN, prim::kPrimConcatD, prim::kPrimPack, prim::kPrimStack,
prim::kPrimPrint, prim::kPrimConcat, prim::kPrimAccumulateNV2, prim::kPrimMeshgrid,
prim::kPrimTensorSummary, prim::kPrimDynamicStitch, prim::kPrimParallelConcat, prim::kPrimIncreFlashAttention};
prim::kPrimTensorSummary, prim::kPrimDynamicStitch, prim::kPrimParallelConcat, prim::kPrimIncreFlashAttention,
prim::kPrimIdentityN};
static const PrimitiveSet need_unfold_control_node = {prim::kPrimSwitchLayer, prim::kPrimCall, prim::kPrimSwitch,
prim::kPrimCallInline};

View File

@ -26,6 +26,7 @@
#include "runtime/pynative/op_executor.h"
#include "pipeline/pynative/pynative_execute.h"
#include "pipeline/jit/ps/pipeline.h"
#include "include/common/thread_pool.h"
#include "include/common/pybind_api/api_register.h"
namespace mindspore {
@ -38,6 +39,10 @@ void RegisterForkCallbacks() {
MS_LOG(DEBUG) << "Register ActorMgr fork callbacks.";
ForkUtils::GetInstance().RegisterCallbacks(ActorMgr::GetActorMgrRef(), static_cast<void (ActorMgr::*)()>(nullptr),
static_cast<void (ActorMgr::*)()>(nullptr), &ActorMgr::ChildAfterFork);
MS_LOG(DEBUG) << "Register Common ThreadPool fork callbacks.";
ForkUtils::GetInstance().RegisterCallbacks(
&common::ThreadPool::GetInstance(), static_cast<void (common::ThreadPool::*)()>(nullptr),
static_cast<void (common::ThreadPool::*)()>(nullptr), &common::ThreadPool::ChildAfterFork);
MS_LOG(DEBUG) << "Register PyNativeExecutor fork callbacks.";
ForkUtils::GetInstance().RegisterCallbacks(
pynative::PyNativeExecutor::GetInstance(), &pynative::PyNativeExecutor::ParentBeforeFork,

View File

@ -68,17 +68,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread
}
}
bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
MS_EXCEPTION_IF_NULL(node);
if (strategy == GraphExecutionStrategy::kStep) {
return false;
}
if (node->isa<CNode>() && common::AnfAlgo::IsGetNextNode(node)) {
return true;
}
return false;
}
bool IsDeviceQueueDSActor(const AnfNodePtr &, GraphExecutionStrategy) { return false; }
bool IsHostQueueDSActor(const AnfNodePtr &node, const KernelGraphPtr &graph,
const std::vector<AnfNodePtr> &host_parameters, GraphExecutionStrategy strategy) {
@ -136,7 +126,7 @@ bool IsCustomActor(const AnfNodePtr &node) {
return AnfUtils::IsCustomActorNode(node);
}
bool IsKernelActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
bool IsKernelActor(const AnfNodePtr &node, GraphExecutionStrategy) {
MS_EXCEPTION_IF_NULL(node);
if (IsCustomActor(node)) {
return false;
@ -146,11 +136,7 @@ bool IsKernelActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
return false;
}
if (strategy == GraphExecutionStrategy::kStep) {
return true;
}
return !common::AnfAlgo::IsGetNextNode(node);
return true;
}
bool IsSkippedKernelActor(const AnfNodePtr &node) {

View File

@ -129,7 +129,7 @@ void KernelActor::InitOutputInfo() {
}
// Used to keep the graph output address when the somas block memory is freed, so it can be reused via the ref count in other graphs.
if (somas_graph_output_indexes_.count(i) > 0) {
(void)somas_info_->InsertGraphOutputInfo(output_address.get(), somas_outputs[i].second);
(void)somas_info_->InsertGraphOutputInfo(output_address.get(), somas_outputs[i].first, somas_outputs[i].second);
} else {
UpdateRefCount(output_address.get(), true);
}

View File

@ -22,6 +22,7 @@
#include <functional>
#include <string>
#include <utility>
#include <unordered_map>
#include "acl/acl_base.h"
#include "acl/acl.h"
#include "transform/acl_ir/op_api_convert.h"
@ -133,9 +134,32 @@ class ReleaseCall {
T converted_params_;
};
class ApiCachePool {
public:
ApiCachePool() = default;
~ApiCachePool() = default;
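// Intern the name and return a pointer that stays valid for the pool's
// lifetime: std::unordered_map never relocates its elements, so the cached
// std::string (and its c_str()) survives rehashing.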
const char *get(const std::string &str) {
auto it = pool_.find(str);
if (it != pool_.end()) {
return it->second.c_str();
}
auto [map_iter, inserted] = pool_.emplace(str, str);
if (!inserted) {
MS_LOG(EXCEPTION) << "Failed to cache api.";
}
return map_iter->second.c_str();
}
private:
std::unordered_map<std::string, std::string> pool_;
};
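A minimal usage sketch of the pool (the Demo function and the op name are made up): repeated lookups of the same name return the identical interned pointer, which is what lets GEN_EXECUTOR below hand a stable const char * to code that may retain it.
// Minimal usage sketch (hypothetical Demo function; "aclnnAdd" is a
// placeholder op name). Needs <cassert> and <string>.
inline void Demo() {
  static ApiCachePool pool;              // static, as in GEN_EXECUTOR below
  const char *a = pool.get(std::string("aclnnAdd"));
  const char *b = pool.get("aclnnAdd");  // second lookup hits the cache
  assert(a == b);                        // same interned pointer, no copy
}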
// For normal generate executor.
#define GEN_EXECUTOR(aclnn_api, ...) \
[](const char *api_name, const std::string &workspace_api_name, const auto &... args) -> auto { \
[](const std::string &api_str, const std::string &workspace_api_name, const auto &... args) -> auto { \
static transform::ApiCachePool api_cache_pool; \
const char *api_name = api_cache_pool.get(api_str); \
static const auto get_workspace_size_func_ptr = transform::GetOpApiFunc(workspace_api_name.c_str()); \
if (get_workspace_size_func_ptr == nullptr) { \
MS_LOG(EXCEPTION) << workspace_api_name << " not in " << transform::GetOpApiLibName() << ", please check!"; \
@ -163,7 +187,7 @@ class ReleaseCall {
release_func = std::function<void()>(releas_call); \
return std::make_tuple(workspace_size, executor, release_func); \
} \
(aclnn_api.c_str(), aclnn_api + "GetWorkspaceSize", __VA_ARGS__)
(aclnn_api, aclnn_api + "GetWorkspaceSize", __VA_ARGS__)
// For custom generate executor.
#define GEN_EXECUTOR_CUST(aclnn_api, ...) \

View File

@ -129,7 +129,12 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<bool>(MS_CTX_ENABLE_GRAD_COMM_OPT, false);
set_param<bool>(MS_CTX_INTERLEAVED_MATMUL_COMM, false);
set_param<bool>(MS_CTX_INTERLEAVED_LAYERNORM_COMM, false);
set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0);
if (target == kAscendDevice || target == kDavinciDevice) {
// enable somas by default on ascend
set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO1);
} else {
set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0);
}
set_param<uint32_t>(MS_CTX_OP_TIMEOUT, kOpTimeout);
set_param<int>(MS_CTX_JIT_SYNTAX_LEVEL, kLax);
set_param<std::string>(MS_CTX_CONV_FPROP_ALGO, "normal");
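The practical effect, stated as a hedged comment (the Python-level override is an assumption about the usual knob):
// With target == kAscendDevice the default level is now kOptimizeO1, so the
// gate in GeKernelExecutor::DoSomas earlier in this commit
// (MS_CTX_MEMORY_OPTIMIZE_LEVEL != kOptimizeO0) passes by default and somas
// memory reuse runs for KbK graphs unless the user opts out, e.g. via
// set_context(memory_optimize_level="O0") at the Python layer (assumed knob).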

View File

@ -166,7 +166,7 @@ def init(backend_name=None):
if backend_name == "hccl":
if _is_ps_mode():
# Use MindSpore cluster to build network for Parameter Server traning.
# Use MindSpore cluster to build network for Parameter Server training.
init_cluster()
if _is_role_sched() or _is_role_pserver():
raise RuntimeError("Parameter server and scheduler should use 'CPU' as backend instead of 'Ascend'")

View File

@ -120,6 +120,8 @@ package_data = {
'lib/plugin/*/*/*/*/*/*',
'lib/plugin/*/*/*/*/*/*/*',
'lib/plugin/*/*/*/*/*/*/*/*',
'lib/plugin/*/*/*/*/*/*/*/*/*',
'lib/plugin/*/*/*/*/*/*/*/*/*/*',
'lib/*.so*',
'lib/*.a',
'lib/*.dylib*',

View File

@ -70,7 +70,7 @@ def test_fastgelu_op_forward_ascend(context_mode, data_type):
x = ms.Tensor(np.array([1., 2., 3.]).astype(data_type))
out = fastgelu_forward_func(x)
expect_out = np.array([0.845703, 1.9375, 2.982422]).astype(np.float32)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)
@pytest.mark.level0
@ -107,7 +107,7 @@ def test_fastgelu_op_backward_ascend(context_mode, data_type):
x = ms.Tensor(np.array([1., 2., 3.]).astype(data_type))
grads = fastgelu_backward_func(x)
expect_out = np.array([1.069909, 1.072501, 1.022826]).astype(np.float32)
np.testing.assert_allclose(grads.asnumpy(), expect_out, rtol=1e-4)
np.testing.assert_allclose(grads.asnumpy(), expect_out, rtol=1e-2)
@pytest.mark.level1
@ -144,4 +144,4 @@ def test_fastgelu_op_vmap_ascend(context_mode, data_type):
x = ms.Tensor(np.array([1., 2., 3.]).astype(data_type))
out = fastgelu_vmap_func(x)
expect_out = np.array([0.845703, 1.9375, 2.982422]).astype(np.float32)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)

View File

@ -67,7 +67,7 @@ def test_fastgelugrad_op_forward_ascend(context_mode, data_type):
dy = ms.Tensor(np.array([1, 2, 3]).astype(data_type))
out = fastgelugrad_forward_func(dy, x)
expect_out = np.array([1.069909, 2.145001, 3.068479]).astype(np.float32)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)
@pytest.mark.level1
@ -106,4 +106,4 @@ def test_fastgelugrad_op_vmap_ascend(context_mode, data_type):
dy = ms.Tensor(np.array([1, 2, 3]).astype(data_type))
out = fastgelugrad_vmap_func(dy, x)
expect_out = np.array([1.069909, 2.145001, 3.068479]).astype(np.float32)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-4)
np.testing.assert_allclose(out.asnumpy(), expect_out, rtol=1e-2)

View File

@ -14,6 +14,7 @@
# ============================================================================
"""burgers pinns"""
import time
import os
import pytest
import numpy as np
@ -55,6 +56,7 @@ def test_mindflow_burgers_pinns():
Description: test train and eval
Expectation: success
"""
os.environ['GRAPH_OP_RUN'] = "0"
context.set_context(mode=context.GRAPH_MODE)
model = Net()
optimizer = nn.Adam(model.trainable_params(), 0.0001)
@ -132,3 +134,4 @@ def test_mindflow_burgers_pinns():
assert epoch_time < 0.01
assert train_loss < 0.6
assert eval_error < 0.8
del os.environ['GRAPH_OP_RUN']

View File

@ -14,6 +14,7 @@
# ============================================================================
"""navier stokes pinns"""
import time
import os
import pytest
import numpy as np
@ -55,6 +56,7 @@ def test_mindflow_navier_stokes():
Description: test train
Expectation: success
"""
os.environ['GRAPH_OP_RUN'] = "0"
context.set_context(mode=context.GRAPH_MODE)
model = Net(in_channels=3, out_channels=3)
optimizer = nn.Adam(model.trainable_params(), 0.0001)
@ -127,3 +129,4 @@ def test_mindflow_navier_stokes():
else:
assert epoch_time < 0.01
assert train_loss < 0.8
del os.environ['GRAPH_OP_RUN']

View File

@ -35,7 +35,7 @@ def run_testcase(testcase_name, expect_memory_usage):
os.remove(log_filename)
assert not os.path.exists(log_filename)
cmd = f"export GLOG_v=1; pytest -s test_recompute.py::" + testcase_name + " > " \
cmd = f"export GLOG_v=1; export GRAPH_OP_RUN=0; pytest -s test_recompute.py::" + testcase_name + " > " \
+ log_filename + " 2>&1"
subprocess.check_output(cmd, shell=True)
assert os.path.exists(log_filename)