!27427 Bug Fixes of Ascend MindRT

Merge pull request !27427 from hwjiaorui/mindrt-bug-fix
i-robot 2021-12-13 08:31:24 +00:00 committed by Gitee
commit 1ecbadd7a8
9 changed files with 66 additions and 21 deletions

View File

@@ -298,8 +298,8 @@ void TbeKernelCompileManager::ParseTargetJobStatus(const nlohmann::json &json, T
     MS_LOG(EXCEPTION) << "Parse query result error.";
   }
   auto json_name = GetJsonValue<std::string>(query_result, kFusionOpName);
-  auto target_job_id = query_result.at(kJobId);
-  auto status = query_result.at(kStatus);
+  auto target_job_id = GetJsonValue<int>(query_result, kJobId);
+  auto status = GetJsonValue<std::string>(query_result, kStatus);
   auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(query_result, kProcessInfo);
   auto message = FilterExceptionMessage(all_logs);
   // save job status and exception message
@@ -354,8 +354,8 @@ void TbeKernelCompileManager::JsonAssemble(const std::string &job_type, const nl
     (*dst_json)[kJobContent] = job_info;
   } else if (job_type == kQuery) {
     nlohmann::json content;
-    content[kSourceId] = src_json[kSourceId];
-    content[kJobId] = src_json[kJobId];
+    content[kSourceId] = GetJsonValue<int>(src_json, kSourceId);
+    content[kJobId] = GetJsonValue<int>(src_json, kJobId);
     (*dst_json)[kJobContent] = content;
   } else {
     (*dst_json)[kJobContent] = src_json;
@@ -438,7 +438,8 @@ void TbeKernelCompileManager::SaveIOSizeInfo(const nlohmann::json &json, const s
   std::vector<size_t> input_size_list;
   std::vector<size_t> output_size_list;
   if (!output_nodes.empty()) {
-    (void)TbeKernelBuild::GetIOSize(json[kOpList], output_nodes, &input_size_list, &output_size_list);
+    (void)TbeKernelBuild::GetIOSize(GetJsonValue<nlohmann::json>(json, kOpList), output_nodes, &input_size_list,
+                                    &output_size_list);
   } else {
     (void)TbeKernelBuild::GetIOSize(json, &input_size_list, &output_size_list);
   }
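The recurring fix in this file replaces unchecked nlohmann::json access (.at(key) and operator[]) with the typed helper GetJsonValue, so a missing or mistyped field in a compile-service reply fails with a clear error instead of an uncaught json exception or a silently default-constructed value. A minimal sketch of what such a checked accessor can look like, assuming nlohmann::json; the name and error handling here are illustrative, not the actual MindSpore implementation:

#include <stdexcept>
#include <string>
#include <nlohmann/json.hpp>

// Hypothetical checked accessor: verify the key exists before extracting,
// and let get<T>() raise a typed error if the value has the wrong type.
template <typename T>
T GetJsonValueSketch(const nlohmann::json &json, const std::string &key) {
  auto iter = json.find(key);
  if (iter == json.end()) {
    throw std::runtime_error("missing json key: " + key);
  }
  return iter->get<T>();
}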

View File

@@ -1642,6 +1642,7 @@ void FinalizeHccl() {
   (void)FinalizeBackend();
 #else
   session::ExecutorManager::Instance().Clear();
+  device::DeviceContextManager::GetInstance().ClearDeviceContexts();
   device::KernelRuntimeManager::Instance().ClearRuntimeResource();
 #endif
 }

View File

@@ -255,18 +255,17 @@ bool CheckHitTargetDtype(const std::map<TypeId, TypeId> &type_map, const TypeId
 }
 bool TagRaiseReduce(const std::shared_ptr<kernel::KernelBuildInfo> &kernel_build_info, const CNodePtr &cnode,
-                    const std::map<TypeId, TypeId> &type_map) {
+                    const std::map<TypeId, TypeId> &type_map, bool *int64_flag) {
   // filter kernel info that does not support raise or reduce datatype
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(kernel_build_info);
-  bool flag = false;
   for (size_t input_index = 0; input_index < kernel_build_info->GetInputNum(); ++input_index) {
     auto in_dtype = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index);
     auto device_dtype = kernel_build_info->GetInputDeviceType(input_index);
     if (device_dtype == kNumberTypeFloat || device_dtype == kNumberTypeFloat32) {
       device_dtype = kNumberTypeFloat32;
     }
-    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) {
+    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) {
       return false;
     }
   }
@@ -278,14 +277,10 @@ bool TagRaiseReduce(const std::shared_ptr<kernel::KernelBuildInfo> &kernel_build
       device_dtype = kNumberTypeFloat32;
     }
-    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) {
+    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) {
       return false;
     }
   }
-  if (flag) {
-    auto node_name = AnfAlgo::GetCNodeName(cnode);
-    MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32.";
-  }
   return true;
 }
@@ -298,10 +293,11 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
   const std::map<TypeId, TypeId> reduce_map = {{kNumberTypeInt64, kNumberTypeInt32},
                                                {kNumberTypeFloat, kNumberTypeFloat16},
                                                {kNumberTypeFloat32, kNumberTypeFloat16}};
+  bool int64_reduce = false;
   // raise precision
   for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) {
     MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]);
-    if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map)) {
+    if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map, &int64_reduce)) {
       filtered_kernel_info_list.push_back(kernel_info_list[info_index]);
     }
   }
@@ -317,7 +313,7 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
   if (context_ptr->get_param<bool>(MS_CTX_ENABLE_REDUCE_PRECISION)) {
     for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) {
       MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]);
-      if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map)) {
+      if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map, &int64_reduce)) {
         filtered_kernel_info_list.push_back(kernel_info_list[info_index]);
       }
     }
@@ -325,6 +321,10 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
   if (!filtered_kernel_info_list.empty()) {
     *precision_reduce = true;
   }
+  if (int64_reduce) {
+    auto node_name = AnfAlgo::GetCNodeName(cnode);
+    MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32.";
+  }
   return filtered_kernel_info_list;
 }
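Net effect of this file's changes: TagRaiseReduce no longer logs on every candidate kernel info that hits the int64-to-int32 mapping; it only accumulates the fact into int64_flag, and the caller emits the warning once per node after filtering. A standalone sketch of the warn-once pattern; the candidate logic is mocked, not the real kernel selection:

#include <iostream>
#include <vector>

// Mocked stand-in for TagRaiseReduce: accepts a candidate and records
// whether acceptance required reducing int64 to int32.
bool TagCandidate(bool needs_int64_reduce, bool *int64_flag) {
  if (needs_int64_reduce) {
    *int64_flag = true;  // remember the reduction, but do not warn here
  }
  return true;
}

int main() {
  std::vector<bool> candidates = {false, true, true};
  bool int64_reduce = false;  // accumulated across all candidates
  for (bool c : candidates) {
    (void)TagCandidate(c, &int64_reduce);
  }
  if (int64_reduce) {  // one warning per node instead of one per candidate
    std::cout << "reduce precision from int64 to int32\n";
  }
  return 0;
}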

View File

@@ -292,5 +292,15 @@ std::string FetchActorName(KernelTransformType kernel_type, const std::string &a
   }
   return actor_name;
 }
+bool CheckMemcpyInDevice(const DeviceTensor *dst_device_addr, const DeviceTensor *src_device_addr) {
+  MS_EXCEPTION_IF_NULL(dst_device_addr);
+  if (src_device_addr == nullptr) {
+    return false;
+  }
+  return (src_device_addr->DeviceType() == dst_device_addr->DeviceType() &&
+          src_device_addr->format() == dst_device_addr->format() &&
+          src_device_addr->type_id() == dst_device_addr->type_id());
+}
 }  // namespace runtime
 }  // namespace mindspore
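The new helper gates device-to-device copies: a raw in-device copy is only well-defined when the source address exists and matches the destination's device type, format, and dtype; otherwise the caller (see the data-source actor change below) falls back to a host-side sync. A self-contained sketch of the predicate with stand-in types, not the MindSpore DeviceTensor class:

#include <iostream>
#include <string>

// Minimal stand-in for a device address: just the three fields the check compares.
struct FakeDeviceTensor {
  std::string device_type;
  std::string format;
  int type_id;
};

bool CheckMemcpyInDeviceSketch(const FakeDeviceTensor *dst, const FakeDeviceTensor *src) {
  if (dst == nullptr || src == nullptr) {
    return false;  // no source address: caller must sync through host memory
  }
  return src->device_type == dst->device_type && src->format == dst->format &&
         src->type_id == dst->type_id;
}

int main() {
  FakeDeviceTensor ascend_nc1hwc0{"Ascend", "NC1HWC0", 43};
  FakeDeviceTensor cpu_default{"CPU", "DefaultFormat", 43};
  std::cout << CheckMemcpyInDeviceSketch(&ascend_nc1hwc0, &ascend_nc1hwc0) << "\n";  // 1: in-device copy is safe
  std::cout << CheckMemcpyInDeviceSketch(&ascend_nc1hwc0, &cpu_default) << "\n";     // 0: fall back to host sync
  return 0;
}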

View File

@@ -210,6 +210,8 @@ KernelTransformType FetchKernelTransformType(const AnfNodePtr &node, const Kerne
                                              GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
 std::string FetchActorName(KernelTransformType kernel_type, const std::string &actor_set_name,
                            const AnfNodePtr &node = nullptr, const KernelGraphPtr &graph = nullptr);
+bool CheckMemcpyInDevice(const DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor);
 }  // namespace runtime
 }  // namespace mindspore

View File

@@ -239,10 +239,14 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons
     auto tensor_device_address = std::dynamic_pointer_cast<DeviceTensor>(host_tensor->device_address());
     // Sync data from host_tensor_device_address to device_tensor.
     if (tensor_device_address != nullptr) {
-      if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) {
-        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed.");
+      if (CheckMemcpyInDevice(device_tensor, tensor_device_address.get())) {
+        if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) {
+          SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed.");
+        }
+        continue;
+      } else {
+        host_tensor->data_sync(false);
       }
-      continue;
     }
     // Sync data from host_tensor to device_tensor.

View File

@@ -109,7 +109,7 @@ TensorPtr OutputActor::CreateOutputTensor(const AnfNodePtr &output_node, size_t
   const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(output_node, output_index, false);
   MS_EXCEPTION_IF_NULL(device_tensor);
   // In the input as output scenario, use the device tensor of node.
-  if (output_node->isa<ValueNode>() || output_node->isa<Parameter>()) {
+  if (IsPersistentDeviceTensor(output_node)) {
     tensor->set_device_address(device_tensor);
     return tensor;
   }
@@ -151,7 +151,7 @@ void OutputActor::UpdateOutputDeviceAddress() {
     auto output_index = output_nodes_[i].second;
     auto &tensor = outputs_[i];
     // In the input as output scenario, the output device tensor may come from the input tensor and can't be replaced.
-    if ((output_node == nullptr) || output_node->isa<ValueNode>() || output_node->isa<Parameter>()) {
+    if ((output_node == nullptr) || IsPersistentDeviceTensor(output_node)) {
       continue;
     }
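Both call sites now delegate to IsPersistentDeviceTensor rather than spelling out the node kinds. Assuming it covers the nodes whose device memory persists across executions (constants and parameters, per the checks it replaces), a plausible minimal equivalent looks like this; the node hierarchy is a stand-in, not the real AnfNode classes:

#include <memory>

// Illustrative node hierarchy only; MindSpore uses AnfNode/ValueNode/Parameter
// with an isa<T>() helper instead of dynamic_cast.
struct Node { virtual ~Node() = default; };
struct ValueNode : Node {};   // graph constants
struct Parameter : Node {};   // graph inputs / weights
using NodePtr = std::shared_ptr<Node>;

// Persistent device tensors keep their memory across runs, so output actors
// forward their addresses instead of replacing them.
bool IsPersistentDeviceTensorSketch(const NodePtr &node) {
  return node != nullptr && (dynamic_cast<const ValueNode *>(node.get()) != nullptr ||
                             dynamic_cast<const Parameter *>(node.get()) != nullptr);
}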

View File

@@ -28,6 +28,7 @@
 #include "runtime/hardware/ascend/ascend_graph_optimization.h"
 #include "backend/kernel_compiler/ascend_kernel_mod.h"
 #include "runtime/device/ascend/ascend_bucket.h"
+#include "common/util/error_manager/error_manager.h"
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
@@ -65,6 +66,7 @@ namespace ascend {
 using KernelGraph = mindspore::session::KernelGraph;
 const char kMsVm[] = "vm";
 constexpr size_t kAtomicCleanInputSize = 2;
+constexpr auto kUnknowErrorString = "Unknown error occurred";
 namespace {
 CNodePtr GetNextLabelSet(const std::vector<CNodePtr> &kernel_nodes, uint32_t index) {
   size_t node_sizes = kernel_nodes.size();
@@ -582,10 +584,30 @@ bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const {
   runtime_instance_->SetContext();
   device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph);
   auto ret = ExecuteGraph(graph);
+  if (!ret) {
+    MS_LOG(ERROR) << "run task error!";
+    ReportErrorMessage();
+    return ret;
+  }
+  ReportWarningMessage();
   MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id();
   return ret;
 }
+void AscendDeviceContext::ReportErrorMessage() const {
+  const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
+  if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) {
+    MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
+  }
+}
+void AscendDeviceContext::ReportWarningMessage() const {
+  const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
+  if (!warning_message.empty()) {
+    MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
+  }
+}
 bool AscendDeviceContext::SyncStream(size_t stream_id) const {
   MS_EXCEPTION_IF_NULL(runtime_instance_);
   return runtime_instance_->SyncStream();
@@ -597,7 +619,9 @@ bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const {
   return ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && IsGraphMode();
 }
-bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return IsGraphMode(); }
+bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const {
+  return device::KernelAdjust::NeedLoopSink() && IsGraphMode();
+}
 // kernel by kernel mode interface
 void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
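The launch path now surfaces the Ascend runtime's own diagnostics: on failure it pulls the accumulated text from ErrorManager and suppresses the generic "Unknown error occurred" placeholder; on success it still flushes any warnings. A standalone sketch of the filtering logic; the message source is mocked, not the real ErrorManager API:

#include <iostream>
#include <string>

constexpr auto kUnknowErrorStringSketch = "Unknown error occurred";

// Mocked message source standing in for ErrorManager::GetInstance().GetErrorMessage().
std::string FetchErrorMessageMock() { return "E10001: invalid attribute for op Conv2D"; }

// Log only when the runtime produced something more specific than the
// empty string or the generic placeholder.
void ReportErrorMessageSketch() {
  const std::string error_message = FetchErrorMessageMock();
  if (!error_message.empty() && error_message.find(kUnknowErrorStringSketch) == std::string::npos) {
    std::cerr << "Ascend error occurred, error message:\n" << error_message << "\n";
  }
}

int main() {
  ReportErrorMessageSketch();
  return 0;
}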

View File

@@ -138,6 +138,9 @@ class AscendDeviceContext : public DeviceContext {
   static bool IsGraphMode();
   bool SyncRuning() const;
+  void ReportErrorMessage() const;
+  void ReportWarningMessage() const;
+
   // Kernel Runtime --- only for task sink
   AscendKernelRuntime *runtime_instance_{nullptr};
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};