diff --git a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
index 5125e887542..81b9b78c9fe 100644
--- a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
+++ b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
@@ -38,10 +38,28 @@ int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) {
 }
 
 bool IsSameShapeTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
-  return tensor->Batch() == npu_tensor->GetTensorDimension().GetNumber() &&
-         tensor->Channel() == npu_tensor->GetTensorDimension().GetChannel() &&
-         tensor->Height() == npu_tensor->GetTensorDimension().GetHeight() &&
-         tensor->Width() == npu_tensor->GetTensorDimension().GetWidth();
+  if (tensor->shape().size() == 4) {
+    return tensor->Batch() == npu_tensor->GetTensorDimension().GetNumber() &&
+           tensor->Channel() == npu_tensor->GetTensorDimension().GetChannel() &&
+           tensor->Height() == npu_tensor->GetTensorDimension().GetHeight() &&
+           tensor->Width() == npu_tensor->GetTensorDimension().GetWidth();
+  }
+  if (tensor->shape().size() > 4) {
+    MS_LOG(ERROR) << "Npu does not support input tensors with more than 4 dims";
+    return false;
+  }
+  std::vector<int> npu_shape;
+  auto dim = tensor->shape().size();
+  if (dim > 0) {
+    npu_shape.push_back(npu_tensor->GetTensorDimension().GetNumber());
+  }
+  if (dim > 1) {
+    npu_shape.push_back(npu_tensor->GetTensorDimension().GetChannel());
+  }
+  if (dim > 2) {
+    npu_shape.push_back(npu_tensor->GetTensorDimension().GetHeight());
+  }
+  return npu_shape == tensor->shape();
 }
 
 int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@@ -49,10 +67,11 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
                      const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
                      const KernelCallBack &before, const KernelCallBack &after) {
   hiai::AiContext context;
+  std::vector<bool> inputs_visited(in_tensors.size(), false);
   for (int i = 0; i < npu_input_tensors_.size(); ++i) {
     int index = 0;
     for (; index < in_tensors.size(); index++) {
-      if (IsSameShapeTensor(in_tensors[index], npu_input_tensors_[i])) {
+      if (!inputs_visited[index] && IsSameShapeTensor(in_tensors[index], npu_input_tensors_[i])) {
         void *data = in_tensors[index]->data_c();
         if (data == nullptr) {
           MS_LOG(ERROR) << model_name_ << " Inputs data is nullptr";
@@ -60,6 +79,7 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
         }
         memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[index]->Size());
+        inputs_visited[index] = true;
         in_tensors[index]->set_ref_count(in_tensors[index]->ref_count() - 1);
         if (in_tensors[index]->ref_count() <= 0) {
           in_tensors[index]->FreeData();
@@ -85,33 +105,14 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
     return RET_ERROR;
   }
 
-  // For the output kernel of the entire model, and the format is nchw, the output tensor needs to be nchw TO nhwc.
-  std::vector<Tensor *> trans_tensors;
-  for (auto kernel : out_kernels) {
-    if (kernel->out_kernels().empty() && npu_trans_nodes.find(kernel->Type()) != npu_trans_nodes.end()) {
-      for (int i = 0; i < kernel->out_tensors().size(); ++i) {
-        trans_tensors.push_back(kernel->out_tensors()[i]);
-      }
-    }
-  }
   for (int i = 0; i < npu_output_tensors_.size(); ++i) {
     void *data = out_tensors[i]->MutableData();
     if (data == nullptr) {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
-
-    if (std::find(trans_tensors.begin(), trans_tensors.end(), out_tensors[i]) != trans_tensors.end()) {
-      // Change data&tensor shape nc->nh
-      PackNCHWToNHWCFp32(npu_output_tensors_[i]->GetBuffer(), data,
-                         npu_output_tensors_[i]->GetTensorDimension().GetNumber(),
-                         npu_output_tensors_[i]->GetTensorDimension().GetWidth() *
-                           npu_output_tensors_[i]->GetTensorDimension().GetHeight(),
-                         npu_output_tensors_[i]->GetTensorDimension().GetChannel());
-    } else {
-      memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
-      out_tensors[i]->ResetRefCount();
-    }
+    memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
+    out_tensors[i]->ResetRefCount();
   }
   return RET_OK;
 }
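Why the visited mask matters: input binding in Run() is purely shape-based, so a model with two same-shaped inputs would bind both NPU input tensors to the first matching lite tensor, feeding one input's data twice and the other's never. The sketch below shows the one-to-one matching pattern in isolation; it is a minimal standalone illustration (Shape and MatchInputs are made-up names, not the MindSpore API):

    #include <cstddef>
    #include <vector>

    using Shape = std::vector<int>;

    // For each device-side input i, find the first unconsumed host tensor with
    // the same shape; -1 if none remains. Marking a host tensor as visited the
    // moment it is bound is what keeps duplicate shapes from all resolving to
    // the same source tensor.
    std::vector<int> MatchInputs(const std::vector<Shape> &device, const std::vector<Shape> &host) {
      std::vector<bool> visited(host.size(), false);
      std::vector<int> mapping(device.size(), -1);
      for (size_t i = 0; i < device.size(); ++i) {
        for (size_t j = 0; j < host.size(); ++j) {
          if (!visited[j] && device[i] == host[j]) {
            mapping[i] = static_cast<int>(j);
            visited[j] = true;  // consume this host tensor
            break;
          }
        }
      }
      return mapping;
    }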
diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc
index fd0748d0338..13f76821c75 100644
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc
@@ -29,9 +29,8 @@ bool CheckFusion(kernel::LiteKernel *kernel) {
     return false;
   }
   auto post_flag =
-    std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(), [](const kernel::LiteKernel *out_kernel) {
-      return NPUPassUtils::IsNhwc2Nchw(out_kernel) && (!out_kernel->out_kernels().empty());
-    });
+    std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(),
+                [](const kernel::LiteKernel *out_kernel) { return NPUPassUtils::IsNhwc2Nchw(out_kernel); });
   return post_flag;
 }
 
@@ -41,15 +40,11 @@ bool CheckFormatFusion(kernel::LiteKernel *kernel) {
   }
   if (NPUPassUtils::IsNhwc2Nchw(kernel)) {
     return std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(),
-                       [](const kernel::LiteKernel *kernel) {
-                         return NPUPassUtils::IsNchw2Nhwc(kernel) && (!kernel->out_kernels().empty());
-                       });
+                       [](const kernel::LiteKernel *kernel) { return NPUPassUtils::IsNchw2Nhwc(kernel); });
   }
   if (NPUPassUtils::IsNchw2Nhwc(kernel)) {
     return std::all_of(kernel->out_kernels().begin(), kernel->out_kernels().end(),
-                       [](const kernel::LiteKernel *kernel) {
-                         return NPUPassUtils::IsNhwc2Nchw(kernel) && (!kernel->out_kernels().empty());
-                       });
+                       [](const kernel::LiteKernel *kernel) { return NPUPassUtils::IsNhwc2Nchw(kernel); });
   }
   return false;
 }
@@ -92,32 +87,32 @@ void NPUFusionPass::UpdatePreKernels(kernel::LiteKernel *cur_kernel) {
 }
 
 void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) {
+  auto cur_out_kernels = cur_kernel->out_kernels();
   for (auto out_kernel : cur_kernel->out_kernels()) {
     // graph out kernel
     if (out_kernel->out_kernels().empty()) {
-      continue;
-    }
-    auto post_kernel = out_kernel->out_kernels()[0];
+      cur_out_kernels.erase(find(cur_out_kernels.begin(), cur_out_kernels.end(), out_kernel));
+    } else {
+      auto post_kernel = out_kernel->out_kernels()[0];
+      auto post_in_kernels = post_kernel->in_kernels();
+      for (size_t i = 0; i < post_in_kernels.size(); i++) {
+        if (post_in_kernels[i] == out_kernel) {
+          post_in_kernels[i] = cur_kernel;
+          break;
+        }
+      }
+      post_kernel->set_in_kernels(post_in_kernels);
-    auto post_in_kernels = post_kernel->in_kernels();
-    for (size_t i = 0; i < post_in_kernels.size(); i++) {
-      if (post_in_kernels[i] == out_kernel) {
-        post_in_kernels[i] = cur_kernel;
-        break;
+      for (size_t i = 0; i < cur_out_kernels.size(); i++) {
+        if (cur_out_kernels[i] == out_kernel) {
+          cur_out_kernels[i] = post_kernel;
+          break;
+        }
       }
     }
-    post_kernel->set_in_kernels(post_in_kernels);
-
-    auto cur_out_kernels = cur_kernel->out_kernels();
-    for (size_t i = 0; i < cur_out_kernels.size(); i++) {
-      if (cur_out_kernels[i] == out_kernel) {
-        cur_out_kernels[i] = post_kernel;
-        break;
-      }
-    }
-    cur_kernel->set_out_kernels(cur_out_kernels);
     RemoveAndFreeKernel(out_kernel);
   }
+  cur_kernel->set_out_kernels(cur_out_kernels);
 }
 
 void UpdatePreTensors(kernel::LiteKernel *cur_kernel) {
@@ -145,6 +140,9 @@ void UpdatePostTensors(kernel::LiteKernel *cur_kernel) {
   auto tensor = cur_kernel->out_tensors()[0];
   for (auto out_kernel : cur_kernel->out_kernels()) {
     auto out_tensor = out_kernel->out_tensors()[0];
+    if (out_kernel->out_kernels().empty()) {
+      cur_kernel->set_out_tensors({out_kernel->out_tensors()[0]});
+    }
     for (auto post_kernel : out_kernel->out_kernels()) {
       auto tensors_vec = post_kernel->in_tensors();
       for (int i = 0; i < tensors_vec.size(); i++) {
@@ -197,6 +195,10 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) {
   auto in_tensor = kernel->in_tensors()[0];
   std::vector<kernel::LiteKernel *> pre_insert_kernels;
   for (const auto &trans_kernel : kernel->out_kernels()) {
+    if (trans_kernel->out_kernels().empty()) {
+      // kernel is a trans kernel; its input kernel num and input tensor num must both be 1
+      kernel->in_kernels()[0]->set_out_tensors({trans_kernel->out_tensors()[0]});
+    }
     for (const auto &post_kernel : trans_kernel->out_kernels()) {
       // update tensor
       auto tensors_vec = post_kernel->in_tensors();
@@ -218,8 +220,8 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) {
       }
       post_kernel->set_in_kernels(post_in_kernels);
       pre_insert_kernels.push_back(post_kernel);
-      RemoveAndFreeKernel(trans_kernel);
    }
+    RemoveAndFreeKernel(trans_kernel);
   }
   pre_kernel->set_out_kernels(pre_insert_kernels);
   RemoveAndFreeKernel(kernel);
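The UpdatePostKernels rewrite also switches to a safer graph-editing pattern: copy the successor list, rewrite the copy while walking the original, and write it back exactly once after the loop, so the list being iterated is never mutated mid-iteration. Graph-output successors, which the old code skipped with continue (so the fused kernel never took over the graph-output edge), are now erased from the copy. A condensed sketch of that pattern under assumed simplified types (Node and BypassSuccessors are placeholders, not the LiteKernel API; freeing the bypassed node is omitted):

    #include <algorithm>
    #include <vector>

    struct Node {
      std::vector<Node *> in;
      std::vector<Node *> out;
    };

    // Splice every successor of `node` out of the graph: a successor with no
    // successors of its own (a graph output) is simply dropped from the new
    // edge list; otherwise its single downstream node is re-pointed at `node`.
    void BypassSuccessors(Node *node) {
      auto new_out = node->out;  // edit a copy, never the list being iterated
      for (Node *succ : node->out) {
        if (succ->out.empty()) {
          new_out.erase(std::find(new_out.begin(), new_out.end(), succ));
        } else {
          Node *post = succ->out[0];
          std::replace(post->in.begin(), post->in.end(), succ, node);  // post now reads from node
          std::replace(new_out.begin(), new_out.end(), succ, post);    // node now feeds post
        }
      }
      node->out = new_out;  // single write-back after the loop
    }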
diff --git a/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc b/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc
index 3b63ab2c007..0bf88592728 100644
--- a/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc
+++ b/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc
@@ -42,7 +42,7 @@ int PadNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const
   int size = static_cast<int>(pad_->GetPaddings().size() / 2);
   ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32);
   ge::TensorPtr padding_tensor = std::make_shared<ge::Tensor>(padding_tensor_desc);
-  padding_tensor->SetData(reinterpret_cast<uint8_t *>(pad_->GetPaddings().data()), size * sizeof(int));
+  padding_tensor->SetData(reinterpret_cast<uint8_t *>(pad_->GetPaddings().data()), 2 * size * sizeof(int));
   auto paddings = new hiai::op::Const(name_ + "paddings");
   paddings->set_attr_value(padding_tensor);
diff --git a/mindspore/lite/src/runtime/kernel/npu/softmax_npu.cc b/mindspore/lite/src/runtime/kernel/npu/softmax_npu.cc
index a502a861098..c8cbeecd6be 100644
--- a/mindspore/lite/src/runtime/kernel/npu/softmax_npu.cc
+++ b/mindspore/lite/src/runtime/kernel/npu/softmax_npu.cc
@@ -24,6 +24,10 @@ using mindspore::schema::PrimitiveType_SoftMax;
 namespace mindspore::kernel {
 int SoftmaxNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                                 OpParameter *opParameter) {
+  if (inputs[0]->shape().size() > 4) {
+    MS_LOG(ERROR) << "Npu softmax does not support input tensors with more than 4 dims.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
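The pad_npu.cc change is a byte-count fix: GetPaddings() stores a {front, back} pair per axis, and the const tensor is declared with shape {size, 2}, where size = paddings.size() / 2 is the row count, so the buffer holds 2 * size ints in total; the old length of size * sizeof(int) uploaded only half the table. A quick self-check with made-up values:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> paddings = {0, 0, 1, 1, 2, 2, 0, 0};  // 4 axes x {front, back}
      int size = static_cast<int>(paddings.size() / 2);       // rows of the {size, 2} tensor
      assert(2 * size * sizeof(int) == paddings.size() * sizeof(int));  // full table, not half
      return 0;
    }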
diff --git a/mindspore/lite/test/models_caffe.cfg b/mindspore/lite/test/models_caffe.cfg
index 85e2931f8ac..fc7114da1ab 100644
--- a/mindspore/lite/test/models_caffe.cfg
+++ b/mindspore/lite/test/models_caffe.cfg
@@ -67,3 +67,4 @@ PoseNet_dla_17_x512
 ml_location_scene_division
 ml_tabel_recog
 ml_text_division
+6c_seg_nomean_20200610
diff --git a/mindspore/lite/test/models_npu.cfg b/mindspore/lite/test/models_npu.cfg
index 0d5d1e22899..82466639b89 100644
--- a/mindspore/lite/test/models_npu.cfg
+++ b/mindspore/lite/test/models_npu.cfg
@@ -1,4 +1,5 @@
 mobilenet_v2_1.0_224.tflite 2.5
 squeezenet.tflite 2.5
 inception_v3.tflite 1
+6c_seg_nomean_20200610 1.5
 porseg_tmp.onnx 1 2
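For context on the two .cfg additions: these lists drive the conversion and benchmark CI jobs, and judging only from the entries visible in this patch (the runner scripts are not shown), each line appears to follow

    <model_file> [<accuracy_limit> [<input_num>]]

so the new entry runs 6c_seg_nomean_20200610 on NPU with an accuracy tolerance of 1.5, while the existing porseg_tmp.onnx line additionally pins its input count. This reading of the fields is an assumption from context, not something the patch itself confirms.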