forked from mindspore-Ecosystem/mindspore
!27427 Bugs Fix of Ascend MindRT
Merge pull request !27427 from hwjiaorui/mindrt-bug-fix
This commit is contained in:
commit
1ecbadd7a8
|
@ -298,8 +298,8 @@ void TbeKernelCompileManager::ParseTargetJobStatus(const nlohmann::json &json, T
|
|||
MS_LOG(EXCEPTION) << "Parse query result error.";
|
||||
}
|
||||
auto json_name = GetJsonValue<std::string>(query_result, kFusionOpName);
|
||||
auto target_job_id = query_result.at(kJobId);
|
||||
auto status = query_result.at(kStatus);
|
||||
auto target_job_id = GetJsonValue<int>(query_result, kJobId);
|
||||
auto status = GetJsonValue<std::string>(query_result, kStatus);
|
||||
auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(query_result, kProcessInfo);
|
||||
auto message = FilterExceptionMessage(all_logs);
|
||||
// save job status and exception message
|
||||
|
@ -354,8 +354,8 @@ void TbeKernelCompileManager::JsonAssemble(const std::string &job_type, const nl
|
|||
(*dst_json)[kJobContent] = job_info;
|
||||
} else if (job_type == kQuery) {
|
||||
nlohmann::json content;
|
||||
content[kSourceId] = src_json[kSourceId];
|
||||
content[kJobId] = src_json[kJobId];
|
||||
content[kSourceId] = GetJsonValue<int>(src_json, kSourceId);
|
||||
content[kJobId] = GetJsonValue<int>(src_json, kJobId);
|
||||
(*dst_json)[kJobContent] = content;
|
||||
} else {
|
||||
(*dst_json)[kJobContent] = src_json;
|
||||
|
@ -438,7 +438,8 @@ void TbeKernelCompileManager::SaveIOSizeInfo(const nlohmann::json &json, const s
|
|||
std::vector<size_t> input_size_list;
|
||||
std::vector<size_t> output_size_list;
|
||||
if (!output_nodes.empty()) {
|
||||
(void)TbeKernelBuild::GetIOSize(json[kOpList], output_nodes, &input_size_list, &output_size_list);
|
||||
(void)TbeKernelBuild::GetIOSize(GetJsonValue<nlohmann::json>(json, kOpList), output_nodes, &input_size_list,
|
||||
&output_size_list);
|
||||
} else {
|
||||
(void)TbeKernelBuild::GetIOSize(json, &input_size_list, &output_size_list);
|
||||
}
|
||||
|
|
|
@ -1642,6 +1642,7 @@ void FinalizeHccl() {
|
|||
(void)FinalizeBackend();
|
||||
#else
|
||||
session::ExecutorManager::Instance().Clear();
|
||||
device::DeviceContextManager::GetInstance().ClearDeviceContexts();
|
||||
device::KernelRuntimeManager::Instance().ClearRuntimeResource();
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -255,18 +255,17 @@ bool CheckHitTargetDtype(const std::map<TypeId, TypeId> &type_map, const TypeId
|
|||
}
|
||||
|
||||
bool TagRaiseReduce(const std::shared_ptr<kernel::KernelBuildInfo> &kernel_build_info, const CNodePtr &cnode,
|
||||
const std::map<TypeId, TypeId> &type_map) {
|
||||
const std::map<TypeId, TypeId> &type_map, bool *int64_flag) {
|
||||
// filter out kernel info that does not support the raised or reduced datatype
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
MS_EXCEPTION_IF_NULL(kernel_build_info);
|
||||
bool flag = false;
|
||||
for (size_t input_index = 0; input_index < kernel_build_info->GetInputNum(); ++input_index) {
|
||||
auto in_dtype = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index);
|
||||
auto device_dtype = kernel_build_info->GetInputDeviceType(input_index);
|
||||
if (device_dtype == kNumberTypeFloat || device_dtype == kNumberTypeFloat32) {
|
||||
device_dtype = kNumberTypeFloat32;
|
||||
}
|
||||
if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) {
|
||||
if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -278,14 +277,10 @@ bool TagRaiseReduce(const std::shared_ptr<kernel::KernelBuildInfo> &kernel_build
|
|||
device_dtype = kNumberTypeFloat32;
|
||||
}
|
||||
|
||||
if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) {
|
||||
if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (flag) {
|
||||
auto node_name = AnfAlgo::GetCNodeName(cnode);
|
||||
MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32.";
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -298,10 +293,11 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
|
|||
const std::map<TypeId, TypeId> reduce_map = {{kNumberTypeInt64, kNumberTypeInt32},
|
||||
{kNumberTypeFloat, kNumberTypeFloat16},
|
||||
{kNumberTypeFloat32, kNumberTypeFloat16}};
|
||||
bool int64_reduce = false;
|
||||
// raise precision
|
||||
for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]);
|
||||
if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map)) {
|
||||
if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map, &int64_reduce)) {
|
||||
filtered_kernel_info_list.push_back(kernel_info_list[info_index]);
|
||||
}
|
||||
}
|
||||
|
@ -317,7 +313,7 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
|
|||
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_REDUCE_PRECISION)) {
|
||||
for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]);
|
||||
if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map)) {
|
||||
if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map, &int64_reduce)) {
|
||||
filtered_kernel_info_list.push_back(kernel_info_list[info_index]);
|
||||
}
|
||||
}
|
||||
|
@ -325,6 +321,10 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
|
|||
if (!filtered_kernel_info_list.empty()) {
|
||||
*precision_reduce = true;
|
||||
}
|
||||
if (int64_reduce) {
|
||||
auto node_name = AnfAlgo::GetCNodeName(cnode);
|
||||
MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32.";
|
||||
}
|
||||
return filtered_kernel_info_list;
|
||||
}
|
||||
|
||||
|
|
|
@ -292,5 +292,15 @@ std::string FetchActorName(KernelTransformType kernel_type, const std::string &a
|
|||
}
|
||||
return actor_name;
|
||||
}
|
||||
|
||||
// Reports whether the source and destination device tensors agree on all three
// of: DeviceType(), format() and type_id().
//
// dst_device_addr: checked with MS_EXCEPTION_IF_NULL before use.
// src_device_addr: may be null; a null source yields false rather than an error.
// Returns true only when all three properties compare equal.
// NOTE(review): presumably used to decide whether a direct device-to-device copy
// is valid - confirm against the callers.
bool CheckMemcpyInDevice(const DeviceTensor *dst_device_addr, const DeviceTensor *src_device_addr) {
|
||||
MS_EXCEPTION_IF_NULL(dst_device_addr);
|
||||
// A missing source is treated as "not matching" instead of being rejected.
if (src_device_addr == nullptr) {
|
||||
return false;
|
||||
}
|
||||
// All three properties must match for the result to be true.
return (src_device_addr->DeviceType() == dst_device_addr->DeviceType() &&
|
||||
src_device_addr->format() == dst_device_addr->format() &&
|
||||
src_device_addr->type_id() == dst_device_addr->type_id());
|
||||
}
|
||||
} // namespace runtime
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -210,6 +210,8 @@ KernelTransformType FetchKernelTransformType(const AnfNodePtr &node, const Kerne
|
|||
GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
|
||||
std::string FetchActorName(KernelTransformType kernel_type, const std::string &actor_set_name,
|
||||
const AnfNodePtr &node = nullptr, const KernelGraphPtr &graph = nullptr);
|
||||
|
||||
// Declaration of CheckMemcpyInDevice: takes a destination and a source DeviceTensor
// pointer and returns a bool.
// NOTE(review): the parameters are named *_device_tensor here, while the definition
// shown elsewhere in this commit names them dst_device_addr / src_device_addr -
// harmless in C++, but consider aligning the names.
bool CheckMemcpyInDevice(const DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor);
|
||||
} // namespace runtime
|
||||
} // namespace mindspore
|
||||
|
||||
|
|
|
@ -239,10 +239,14 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons
|
|||
auto tensor_device_address = std::dynamic_pointer_cast<DeviceTensor>(host_tensor->device_address());
|
||||
// Sync data from host_tensor_device_address to device_tensor.
|
||||
if (tensor_device_address != nullptr) {
|
||||
if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) {
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed.");
|
||||
if (CheckMemcpyInDevice(device_tensor, tensor_device_address.get())) {
|
||||
if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) {
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed.");
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
host_tensor->data_sync(false);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Sync data from host_tensor to device_tensor.
|
||||
|
|
|
@ -109,7 +109,7 @@ TensorPtr OutputActor::CreateOutputTensor(const AnfNodePtr &output_node, size_t
|
|||
const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(output_node, output_index, false);
|
||||
MS_EXCEPTION_IF_NULL(device_tensor);
|
||||
// In the input as output scenario, use the device tensor of node.
|
||||
if (output_node->isa<ValueNode>() || output_node->isa<Parameter>()) {
|
||||
if (IsPersistentDeviceTensor(output_node)) {
|
||||
tensor->set_device_address(device_tensor);
|
||||
return tensor;
|
||||
}
|
||||
|
@ -151,7 +151,7 @@ void OutputActor::UpdateOutputDeviceAddress() {
|
|||
auto output_index = output_nodes_[i].second;
|
||||
auto &tensor = outputs_[i];
|
||||
// In the input as output scenario, the output device tensor may come from the input tensor and can't be replaced.
|
||||
if ((output_node == nullptr) || output_node->isa<ValueNode>() || output_node->isa<Parameter>()) {
|
||||
if ((output_node == nullptr) || IsPersistentDeviceTensor(output_node)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "runtime/hardware/ascend/ascend_graph_optimization.h"
|
||||
#include "backend/kernel_compiler/ascend_kernel_mod.h"
|
||||
#include "runtime/device/ascend/ascend_bucket.h"
|
||||
#include "common/util/error_manager/error_manager.h"
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
|
@ -65,6 +66,7 @@ namespace ascend {
|
|||
using KernelGraph = mindspore::session::KernelGraph;
|
||||
const char kMsVm[] = "vm";
|
||||
constexpr size_t kAtomicCleanInputSize = 2;
|
||||
constexpr auto kUnknowErrorString = "Unknown error occurred";
|
||||
namespace {
|
||||
CNodePtr GetNextLabelSet(const std::vector<CNodePtr> &kernel_nodes, uint32_t index) {
|
||||
size_t node_sizes = kernel_nodes.size();
|
||||
|
@ -582,10 +584,30 @@ bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const {
|
|||
runtime_instance_->SetContext();
|
||||
device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph);
|
||||
auto ret = ExecuteGraph(graph);
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "run task error!";
|
||||
ReportErrorMessage();
|
||||
return ret;
|
||||
}
|
||||
ReportWarningMessage();
|
||||
MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id();
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Fetches the current message from ErrorManager::GetInstance().GetErrorMessage()
// and writes it to the ERROR log.
// Nothing is logged when the message is empty or when it contains
// kUnknowErrorString.
void AscendDeviceContext::ReportErrorMessage() const {
|
||||
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
// Skip empty messages and messages containing the "unknown error" marker.
if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
}
|
||||
|
||||
// Fetches the current message from ErrorManager::GetInstance().GetWarningMessage()
// and writes it to the WARNING log when it is non-empty.
void AscendDeviceContext::ReportWarningMessage() const {
|
||||
const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
|
||||
// An empty message produces no log output.
if (!warning_message.empty()) {
|
||||
MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
|
||||
}
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::SyncStream(size_t stream_id) const {
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
return runtime_instance_->SyncStream();
|
||||
|
@ -597,7 +619,9 @@ bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const {
|
|||
return ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && IsGraphMode();
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return IsGraphMode(); }
|
||||
// Returns true only when both device::KernelAdjust::NeedLoopSink() and
// IsGraphMode() are true. The `graph` argument is not read by this body.
bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const {
|
||||
return device::KernelAdjust::NeedLoopSink() && IsGraphMode();
|
||||
}
|
||||
|
||||
// kernel by kernel mode interface
|
||||
void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
|
||||
|
|
|
@ -138,6 +138,9 @@ class AscendDeviceContext : public DeviceContext {
|
|||
static bool IsGraphMode();
|
||||
bool SyncRuning() const;
|
||||
|
||||
void ReportErrorMessage() const;
|
||||
void ReportWarningMessage() const;
|
||||
|
||||
// Kernel Runtime --- only for task sink
|
||||
AscendKernelRuntime *runtime_instance_{nullptr};
|
||||
std::shared_ptr<MemoryManager> mem_manager_{nullptr};
|
||||
|
|
Loading…
Reference in New Issue