forked from mindspore-Ecosystem/mindspore
!13867 add pad and strided slice fusion npu
From: @zhaozhenlong Reviewed-by: Signed-off-by:
This commit is contained in:
commit
9b23952fc2
|
@ -18,6 +18,9 @@
|
|||
#include "src/runtime/agent/npu/optimizer/npu_pass_utils.h"
|
||||
#include "src/lite_kernel.h"
|
||||
#include "nnacl/concat_parameter.h"
|
||||
#include "nnacl/split_parameter.h"
|
||||
#include "nnacl/pad_parameter.h"
|
||||
#include "nnacl/strided_slice_parameter.h"
|
||||
|
||||
namespace mindspore::lite {
|
||||
bool CheckFusion(kernel::LiteKernel *kernel) {
|
||||
|
@ -119,7 +122,7 @@ void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) {
|
|||
}
|
||||
|
||||
void UpdatePreTensors(kernel::LiteKernel *cur_kernel) {
|
||||
auto tensors_vec = cur_kernel->in_tensors();
|
||||
auto tensors_vec = NPUPassUtils::GetNonConstInputs(cur_kernel);
|
||||
for (auto in_kernel : cur_kernel->in_kernels()) {
|
||||
lite::Tensor *cur_tensor = nullptr;
|
||||
auto in_tensor = in_kernel->in_tensors()[0];
|
||||
|
@ -136,6 +139,15 @@ void UpdatePreTensors(kernel::LiteKernel *cur_kernel) {
|
|||
}
|
||||
}
|
||||
}
|
||||
// add constant inputs back
|
||||
if (nodes2const_index.find(static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)) !=
|
||||
nodes2const_index.end()) {
|
||||
tensors_vec.resize(cur_kernel->in_tensors().size());
|
||||
auto const_index = nodes2const_index[static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)];
|
||||
for (auto index : const_index) {
|
||||
tensors_vec[index] = cur_kernel->in_tensors()[index];
|
||||
}
|
||||
}
|
||||
cur_kernel->set_in_tensors(tensors_vec);
|
||||
}
|
||||
|
||||
|
@ -275,15 +287,75 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
// Fusion step for a Split kernel.
// Calls UpdateKernel(kernel), then replaces the split_dim_ stored in the kernel's SplitParameter with the
// value TransFormAxis returns for it.
// NOTE(review): TransFormAxis presumably maps an NHWC axis to its NCHW counterpart -- confirm.
// Always returns RET_OK.
int NPUFusionPass::SplitFusion(kernel::LiteKernel *kernel) {
  UpdateKernel(kernel);
  auto *param = reinterpret_cast<SplitParameter *>(kernel->op_parameter());
  const auto remapped_dim = TransFormAxis(param->split_dim_);
  param->split_dim_ = remapped_dim;
  return RET_OK;
}
|
||||
|
||||
// Fusion step for a Pad kernel.
// Calls UpdateKernel(kernel), then reorders the first eight entries of PadParameter::paddings_ in place so
// that the (before, after) pairs go from n, h, w, c order to n, c, h, w order:
//   index : 0 1 2 3 4 5 6 7
//   before: n n h h w w c c
//   after : n n c c h h w w
// Always returns RET_OK.
int NPUFusionPass::PadFusion(kernel::LiteKernel *kernel) {
  UpdateKernel(kernel);
  auto *pads = reinterpret_cast<PadParameter *>(kernel->op_parameter())->paddings_;
  // Remember the channel pair before it is overwritten.
  const int channel_before = pads[6];
  const int channel_after = pads[7];
  // Shift the h and w pairs two slots to the right, walking backwards so nothing is clobbered early.
  for (int slot = 7; slot >= 4; --slot) {
    pads[slot] = pads[slot - 2];
  }
  // Drop the channel pair into the freed slots right after the batch pair.
  pads[2] = channel_before;
  pads[3] = channel_after;
  return RET_OK;
}
|
||||
|
||||
// Fusion step for a StridedSlice kernel (basic requirement: input is nhwc 4d).
// Calls UpdateKernel(kernel), then rewrites, in place:
//   * the int data of input tensors 1 (begin) and 2 (end),
//   * the int data of the stride tensor, taken from input 4 when the kernel has exactly five inputs and
//     from input 3 otherwise,
//   * the five mask fields of the StridedSliceParameter,
// each through the NHWC -> NCHW helpers of NPUPassUtils.
// NOTE(review): data_c() of the three tensors is dereferenced without a null check -- confirm the caller
// guarantees these inputs hold constant data.
// Always returns RET_OK.
int NPUFusionPass::StridedSliceFusion(kernel::LiteKernel *kernel) {
  UpdateKernel(kernel);
  auto *slice_param = reinterpret_cast<StridedSliceParameter *>(kernel->op_parameter());

  // Reinterprets a tensor's buffer as int* and permutes its four entries (unit size 1).
  auto permute_tensor_data = [](auto *tensor) {
    NPUPassUtils::AssistDataNHWC2NCHW(reinterpret_cast<int *>(tensor->data_c()), 1);
  };

  permute_tensor_data(kernel->in_tensors().at(1));  // begin
  permute_tensor_data(kernel->in_tensors().at(2));  // end
  const bool has_five_inputs = kernel->in_tensors().size() == 5;
  permute_tensor_data(has_five_inputs ? kernel->in_tensors().at(4) : kernel->in_tensors().at(3));  // stride

  slice_param->begins_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->begins_mask_);
  slice_param->ends_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->ends_mask_);
  slice_param->ellipsisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->ellipsisMask_);
  slice_param->newAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->newAxisMask_);
  slice_param->shrinkAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->shrinkAxisMask_);
  return RET_OK;
}
|
||||
|
||||
int NPUFusionPass::Run() {
|
||||
for (size_t i = 0; i < kernels->size(); i++) {
|
||||
auto kernel = (*kernels)[i];
|
||||
if (CheckFusion(kernel)) {
|
||||
switch (kernel->Type()) {
|
||||
case schema::PrimitiveType_Split:
|
||||
i -= kernel->in_kernels().size();
|
||||
SplitFusion(kernel);
|
||||
continue;
|
||||
case schema::PrimitiveType_Concat:
|
||||
i -= kernel->in_kernels().size();
|
||||
ConcatFusion(kernel);
|
||||
continue;
|
||||
case schema::PrimitiveType_PadFusion:
|
||||
i -= kernel->in_kernels().size();
|
||||
PadFusion(kernel);
|
||||
continue;
|
||||
case schema::PrimitiveType_StridedSlice:
|
||||
i -= kernel->in_kernels().size();
|
||||
StridedSliceFusion(kernel);
|
||||
continue;
|
||||
case schema::PrimitiveType_AddFusion:
|
||||
case schema::PrimitiveType_Activation:
|
||||
case schema::PrimitiveType_Eltwise:
|
||||
|
|
|
@ -39,6 +39,9 @@ class NPUFusionPass : public NPUBasePass {
|
|||
int CommonFusion(kernel::LiteKernel *kernel);
|
||||
int ConcatFusion(kernel::LiteKernel *kernel);
|
||||
int FormatFusion(kernel::LiteKernel *kernel);
|
||||
int SplitFusion(kernel::LiteKernel *kernel);
|
||||
int PadFusion(kernel::LiteKernel *kernel);
|
||||
int StridedSliceFusion(kernel::LiteKernel *kernel);
|
||||
|
||||
private:
|
||||
std::vector<kernel::LiteKernel *> *kernels;
|
||||
|
|
|
@ -23,8 +23,10 @@ namespace mindspore::lite {
|
|||
using kernel::KERNEL_ARCH::kNPU;
|
||||
// Result of GetInsertState(): tells NPUInsertTransformPass::Run() whether to call InsertPreNodes
// (PreInsert), InsertPostNodes (PostInsert), both (BothInsert), or neither (InsertNone) for a kernel.
enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert };
|
||||
std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = {
|
||||
schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise,
|
||||
schema::PrimitiveType_Activation};
|
||||
schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise,
|
||||
schema::PrimitiveType_Activation, schema::PrimitiveType_Split, schema::PrimitiveType_PadFusion,
|
||||
schema::PrimitiveType_StridedSlice, schema::PrimitiveType_Activation};
|
||||
|
||||
// this pass goal is to minimize subgraphs generated
|
||||
// by inserting nchw2nhwc or nhwc2nchw before or after the operator (e.g. concat, add, etc..) together with
|
||||
// fusion pass. If transpose inserted are more than half of input output, we will insert remaining input
|
||||
|
@ -44,7 +46,7 @@ std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = {
|
|||
// so we won't insert nc2nh or nh2nc when op's in kernels and out kernels contains no nc2nh or nh2nc.
|
||||
// This pass should be run after npu_transform_pass, which insert transpose for nchw-input-limited op like conv2d.
|
||||
|
||||
int GetInsertState(kernel::LiteKernel *kernel) {
|
||||
int NPUInsertTransformPass::GetInsertState(kernel::LiteKernel *kernel) {
|
||||
// filter out irrelevant kernel
|
||||
if (npu_insert_nodes.find(kernel->Type()) == npu_insert_nodes.end()) {
|
||||
return InsertNone;
|
||||
|
@ -52,15 +54,17 @@ int GetInsertState(kernel::LiteKernel *kernel) {
|
|||
|
||||
// current kernel is target kernel
|
||||
// use out kernels to count how many out lines from current kernel
|
||||
std::vector<Tensor *> in_tensors = NPUPassUtils::GetNonConstInputs(kernel);
|
||||
size_t in_out_tensor_num =
|
||||
kernel->in_tensors().size() + std::max(kernel->out_kernels().size(), static_cast<size_t>(1));
|
||||
in_tensors.size() +
|
||||
std::max(std::max(kernel->out_kernels().size(), static_cast<size_t>(1)), kernel->out_tensors().size());
|
||||
size_t transpose_input_num = 0;
|
||||
size_t transpose_output_num = 0;
|
||||
bool need_pre_insert = false;
|
||||
bool need_post_insert = false;
|
||||
// count number of input tensor from nc2nh and output tensor to nh2nc
|
||||
for (size_t i = 0; i < kernel->in_tensors().size(); ++i) {
|
||||
auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i);
|
||||
for (size_t i = 0; i < in_tensors.size(); ++i) {
|
||||
auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, in_tensors.at(i));
|
||||
if (NPUPassUtils::IsNchw2Nhwc(in_kernel)) {
|
||||
transpose_input_num++;
|
||||
} else {
|
||||
|
@ -81,21 +85,22 @@ int GetInsertState(kernel::LiteKernel *kernel) {
|
|||
// won't insert any thing if num of transpose tensor is smaller than half of total input output.
|
||||
// won't insert if total input output are all transpose tensor, the fusion pass will handle this.
|
||||
size_t transpose_tensor_num = transpose_input_num + transpose_output_num;
|
||||
if (transpose_tensor_num <= in_out_tensor_num / 2 || transpose_tensor_num == in_out_tensor_num) {
|
||||
if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num ||
|
||||
transpose_tensor_num == in_out_tensor_num) {
|
||||
return InsertNone;
|
||||
}
|
||||
|
||||
InsertState ret;
|
||||
if (need_pre_insert && !need_post_insert) {
|
||||
return PreInsert;
|
||||
}
|
||||
if (need_pre_insert && need_post_insert) {
|
||||
return BothInsert;
|
||||
}
|
||||
if (!need_pre_insert && need_post_insert) {
|
||||
return PostInsert;
|
||||
ret = PreInsert;
|
||||
} else if (need_pre_insert && need_post_insert) {
|
||||
ret = BothInsert;
|
||||
} else if (!need_pre_insert && need_post_insert) {
|
||||
ret = PostInsert;
|
||||
} else {
|
||||
ret = InsertNone;
|
||||
}
|
||||
|
||||
return InsertNone;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int NPUInsertTransformPass::InsertNode(kernel::LiteKernel *kernel, kernel::LiteKernel *post_kernel,
|
||||
|
@ -200,13 +205,20 @@ int NPUInsertTransformPass::InsertForOutputTensor(kernel::LiteKernel *kernel, ke
|
|||
int NPUInsertTransformPass::InsertPreNodes(kernel::LiteKernel *kernel,
|
||||
std::vector<kernel::LiteKernel *> *trans_kernels) {
|
||||
int ret = RET_OK;
|
||||
for (size_t i = 0; i < kernel->in_tensors().size(); ++i) {
|
||||
auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i);
|
||||
auto in_tensors = NPUPassUtils::GetNonConstInputs(kernel);
|
||||
for (auto tensor : in_tensors) {
|
||||
auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, tensor);
|
||||
if (NPUPassUtils::IsNchw2Nhwc(pre_kernel)) {
|
||||
continue;
|
||||
}
|
||||
// if this tensor is input of graph, pre_kernel is nullptr.
|
||||
ret = InsertForInputTensor(kernel, i, pre_kernel, trans_kernels);
|
||||
auto it = find(kernel->in_tensors().begin(), kernel->in_tensors().end(), tensor);
|
||||
if (it == kernel->in_tensors().end()) {
|
||||
MS_LOG(ERROR) << "Find in tensor index error";
|
||||
return RET_ERROR;
|
||||
}
|
||||
size_t index = it - kernel->in_tensors().begin();
|
||||
ret = InsertForInputTensor(kernel, index, pre_kernel, trans_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() << " failed.";
|
||||
return ret;
|
||||
|
@ -249,59 +261,63 @@ int NPUInsertTransformPass::InsertPostNodes(kernel::LiteKernel *kernel,
|
|||
|
||||
int NPUInsertTransformPass::Run() {
|
||||
std::vector<kernel::LiteKernel *> insert_kernels;
|
||||
for (size_t i = 0; i < all_kernels_->size(); i++) {
|
||||
auto kernel = (*all_kernels_)[i];
|
||||
if (kernel->desc().arch != kNPU) {
|
||||
continue;
|
||||
}
|
||||
auto insert_state = GetInsertState(kernel);
|
||||
insert_kernels.clear();
|
||||
// If the every output kernel is nhwc2nchw, insert
|
||||
// modify loop index add post_kernels.size() to the next kernel in the origin vector
|
||||
switch (insert_state) {
|
||||
case PreInsert: {
|
||||
auto ret = InsertPreNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
|
||||
<< " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
break;
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
for (size_t i = 0; i < all_kernels_->size(); i++) {
|
||||
auto kernel = (*all_kernels_)[i];
|
||||
if (kernel->desc().arch != kNPU) {
|
||||
continue;
|
||||
}
|
||||
case PostInsert: {
|
||||
auto ret = InsertPostNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed.";
|
||||
return RET_ERROR;
|
||||
auto insert_state = GetInsertState(kernel);
|
||||
insert_kernels.clear();
|
||||
// If the every output kernel is nhwc2nchw, insert
|
||||
// modify loop index add post_kernels.size() to the next kernel in the origin vector
|
||||
switch (insert_state) {
|
||||
case PreInsert: {
|
||||
auto ret = InsertPreNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
|
||||
<< " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
break;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
break;
|
||||
}
|
||||
case BothInsert: {
|
||||
auto ret = InsertPreNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
|
||||
<< " failed.";
|
||||
return RET_ERROR;
|
||||
case PostInsert: {
|
||||
auto ret = InsertPostNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name()
|
||||
<< " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
break;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
case BothInsert: {
|
||||
auto ret = InsertPreNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
|
||||
<< " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
|
||||
insert_kernels.clear();
|
||||
ret = InsertPostNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed.";
|
||||
return RET_ERROR;
|
||||
insert_kernels.clear();
|
||||
ret = InsertPostNodes(kernel, &insert_kernels);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name()
|
||||
<< " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
break;
|
||||
}
|
||||
all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
|
||||
i += insert_kernels.size();
|
||||
break;
|
||||
default:
|
||||
MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name();
|
||||
}
|
||||
default:
|
||||
MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name();
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
|
|
|
@ -34,6 +34,7 @@ class NPUInsertTransformPass : public NPUBasePass {
|
|||
int Run() override;
|
||||
|
||||
private:
|
||||
int GetInsertState(kernel::LiteKernel *kernel);
|
||||
int InsertPreNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels);
|
||||
|
||||
int InsertPostNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels);
|
||||
|
|
|
@ -25,7 +25,10 @@
|
|||
namespace mindspore::lite {
|
||||
using kernel::KERNEL_ARCH::kCPU;
|
||||
using kernel::KERNEL_ARCH::kNPU;
|
||||
|
||||
// For each listed primitive type, the set of input-tensor indices that NPUPassUtils::GetNonConstInputs
// leaves out of its result (the inputs treated as constant for that op).
std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index{
|
||||
{schema::PrimitiveType_Split, {1}},
|
||||
{schema::PrimitiveType_PadFusion, {1}},
|
||||
|
||||
{schema::PrimitiveType_StridedSlice, {1, 2, 3}}};
|
||||
kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors,
|
||||
const std::vector<Tensor *> &out_tensors,
|
||||
const InnerContext *ctx, const std::string &name) {
|
||||
|
@ -125,8 +128,8 @@ void NPUPassUtils::UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *pre_kernel,
|
|||
}
|
||||
std::copy(trans_kernels.begin(), trans_kernels.end(), std::back_inserter(cur_out_kernels));
|
||||
pre_kernel->set_out_kernels(cur_out_kernels);
|
||||
// For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor with
|
||||
// the input tensor of trans.
|
||||
// For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor
|
||||
// with the input tensor of trans.
|
||||
pre_kernel->set_out_tensors({trans_kernels.at(0)->in_tensors().at(0)});
|
||||
}
|
||||
|
||||
|
@ -158,7 +161,7 @@ void NPUPassUtils::UpdateNC2NHTransNodePostKernel(kernel::LiteKernel *kernel, ke
|
|||
Tensor *old_in_tensor = nullptr;
|
||||
// find out which input tensor of post_kernel should be updated
|
||||
for (size_t i = 0; i < post_in_tensors.size(); ++i) {
|
||||
if (KernelInputFromKernel(post_kernel, i) == kernel) {
|
||||
if (KernelInputFromKernel(post_kernel, post_in_tensors.at(i)) == kernel) {
|
||||
old_in_tensor = post_in_tensors.at(i);
|
||||
break;
|
||||
}
|
||||
|
@ -219,17 +222,16 @@ bool NPUPassUtils::IsNchw2Nhwc(const kernel::LiteKernel *kernel) {
|
|||
}
|
||||
return false;
|
||||
}
|
||||
kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index) {
|
||||
kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor) {
|
||||
// given a kernel and one of its input tensors, find which in-kernel outputs this tensor.
|
||||
// If input tensor is graph input, return nullptr.
|
||||
if (kernel == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
auto tensor = kernel->in_tensors().at(in_tensor_index);
|
||||
auto in_kernels = kernel->in_kernels();
|
||||
auto output_contain = [tensor](const kernel::LiteKernel *kernel) {
|
||||
auto output_contain = [in_tensor](const kernel::LiteKernel *kernel) {
|
||||
auto out_tensors = kernel->out_tensors();
|
||||
return std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end();
|
||||
return std::find(out_tensors.begin(), out_tensors.end(), in_tensor) != out_tensors.end();
|
||||
};
|
||||
auto it = std::find_if(in_kernels.begin(), in_kernels.end(), output_contain);
|
||||
if (it == in_kernels.end()) {
|
||||
|
@ -238,10 +240,57 @@ kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel
|
|||
return *it;
|
||||
}
|
||||
|
||||
// Returns the input tensors of `kernel` that are not registered as constant inputs.
//
// The kernel's primitive type (op_parameter()->type_) is looked up in nodes2const_index:
//   * no entry  -> every input tensor is returned, in order;
//   * an entry  -> input tensors whose position is in the registered index set are skipped, the rest are
//                  returned in their original order.
// A null `kernel` yields an empty vector.
std::vector<Tensor *> NPUPassUtils::GetNonConstInputs(kernel::LiteKernel *kernel) {
  if (kernel == nullptr) {
    return {};
  }
  const auto type = static_cast<schema::PrimitiveType>(kernel->op_parameter()->type_);
  const auto it = nodes2const_index.find(type);
  if (it == nodes2const_index.end()) {
    return kernel->in_tensors();
  }
  // Bind by reference: neither the index set nor the input list needs to be copied for a read-only scan.
  const auto &const_input_indices = it->second;
  const auto &in_tensors = kernel->in_tensors();
  std::vector<Tensor *> non_const_in_tensors;
  non_const_in_tensors.reserve(in_tensors.size());
  // size_t index avoids the signed/unsigned comparison against size(); the set is keyed by int.
  for (size_t i = 0; i < in_tensors.size(); ++i) {
    if (const_input_indices.count(static_cast<int>(i)) == 0) {
      non_const_in_tensors.push_back(in_tensors[i]);
    }
  }
  return non_const_in_tensors;
}
|
||||
|
||||
// Returns true when the kernel's second input tensor (index 1) has a one-dimensional shape and the
// ScaleParameter axis_ is either 3 or -1; false otherwise.
// `kernel` and its op_parameter() must be non-null (checked with MS_ASSERT only).
bool NPUPassUtils::Scale4dCase(const kernel::LiteKernel *kernel) {
  MS_ASSERT(kernel != nullptr && kernel->op_parameter() != nullptr);
  const auto *scale_param = reinterpret_cast<ScaleParameter *>(kernel->op_parameter());
  auto scale_tensor = kernel->in_tensors().at(1);
  if (scale_tensor->shape().size() != 1) {
    return false;
  }
  const auto axis = scale_param->axis_;
  return axis == 3 || axis == -1;
}
|
||||
|
||||
// Permutes `data` in place from n, h, w, c order to n, c, h, w order.
//
// `data` is read as four consecutive groups of `unit_size` ints (n, h, w, c). The n group is left alone;
// the c group moves to the second position and the h and w groups each move one position back.
// `data` must be non-null (checked with MS_ASSERT only) and hold at least 4 * unit_size ints.
void NPUPassUtils::AssistDataNHWC2NCHW(int *data, size_t unit_size) {
  MS_ASSERT(data != nullptr);
  int *second_group = data + unit_size;      // h before, c after
  int *third_group = data + 2 * unit_size;   // w before, h after
  int *fourth_group = data + 3 * unit_size;  // c before, w after
  for (size_t offset = 0; offset < unit_size; ++offset) {
    const int channel_value = fourth_group[offset];
    fourth_group[offset] = third_group[offset];
    third_group[offset] = second_group[offset];
    second_group[offset] = channel_value;
  }
}
|
||||
|
||||
// Converts a 4-bit per-axis mask from n, h, w, c bit order to n, c, h, w bit order.
//
// Bit i of `mask` (i = 0..3) is taken as the flag of axis i. The four flags are permuted with
// AssistDataNHWC2NCHW and packed back into the low four bits of the result. Bits 4 and above of `mask`
// are not carried over.
int NPUPassUtils::MaskDataNHWC2NCHW(int mask) {
  constexpr int kAxisCount = 4;
  const auto bits = static_cast<uint32_t>(mask);
  int axis_flags[kAxisCount];
  for (int axis = 0; axis < kAxisCount; ++axis) {
    axis_flags[axis] = ((bits >> axis) & 1u) != 0 ? 1 : 0;
  }
  AssistDataNHWC2NCHW(axis_flags, 1);
  int nchw_mask = 0;
  for (int axis = 0; axis < kAxisCount; ++axis) {
    if (axis_flags[axis] != 0) {
      nchw_mask |= 1 << axis;
    }
  }
  return nchw_mask;
}
|
||||
} // namespace mindspore::lite
|
||||
|
|
|
@ -17,9 +17,12 @@
|
|||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include "src/lite_kernel.h"
|
||||
namespace mindspore::lite {
|
||||
extern std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index;
|
||||
class NPUPassUtils {
|
||||
public:
|
||||
static kernel::LiteKernel *CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors,
|
||||
|
@ -52,8 +55,11 @@ class NPUPassUtils {
|
|||
static bool IsNhwc2Nchw(const kernel::LiteKernel *kernel);
|
||||
|
||||
static bool IsNchw2Nhwc(const kernel::LiteKernel *kernel);
|
||||
static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index);
|
||||
static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor);
|
||||
static std::vector<Tensor *> GetNonConstInputs(kernel::LiteKernel *kernel);
|
||||
static bool Scale4dCase(const kernel::LiteKernel *kernel);
|
||||
static void AssistDataNHWC2NCHW(int *data, size_t unit_size);
|
||||
static int MaskDataNHWC2NCHW(int mask);
|
||||
};
|
||||
} // namespace mindspore::lite
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_
|
||||
|
|
|
@ -14,7 +14,6 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
#include "src/runtime/agent/npu/optimizer/npu_transform_pass.h"
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/runtime/agent/npu/npu_manager.h"
|
||||
|
@ -22,7 +21,7 @@
|
|||
namespace mindspore::lite {
|
||||
using kernel::KERNEL_ARCH::kNPU;
|
||||
|
||||
static std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = {
|
||||
std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = {
|
||||
schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_Resize,
|
||||
schema::PrimitiveType_MaxPoolFusion, schema::PrimitiveType_AvgPoolFusion, schema::PrimitiveType_ScaleFusion};
|
||||
|
||||
|
|
|
@ -16,11 +16,14 @@
|
|||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include "src/lite_kernel.h"
|
||||
#include "src/runtime/agent/npu/optimizer/npu_base_pass.h"
|
||||
|
||||
namespace mindspore::lite {
|
||||
extern std::set<mindspore::schema::PrimitiveType> npu_trans_nodes;
|
||||
class NPUTransformPass : public NPUBasePass {
|
||||
public:
|
||||
int Run() override;
|
||||
|
|
|
@ -31,7 +31,7 @@ int PadNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std
|
|||
}
|
||||
if (inputs.size() >= 2 && inputs[1]->data_c() != nullptr) {
|
||||
for (int i = 0; i < inputs[1]->ElementsNum(); i++) {
|
||||
paddings_.push_back(static_cast<int *>(inputs[1]->data_c())[i]);
|
||||
param_->paddings_[i] = static_cast<int *>(inputs[1]->data_c())[i];
|
||||
}
|
||||
} else {
|
||||
MS_LOG(WARNING) << "NPU axis is attribute.";
|
||||
|
@ -50,7 +50,7 @@ int PadNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const
|
|||
int size = static_cast<int>(param_->padding_length / 2);
|
||||
ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32);
|
||||
ge::TensorPtr padding_tensor = std::make_shared<hiai::Tensor>(padding_tensor_desc);
|
||||
padding_tensor->SetData(reinterpret_cast<uint8_t *>(paddings_.data()), 2 * size * sizeof(int));
|
||||
padding_tensor->SetData(reinterpret_cast<uint8_t *>(param_->paddings_), 2 * size * sizeof(int));
|
||||
hiai_paddings_ = new hiai::op::Const(name_ + "paddings");
|
||||
hiai_paddings_->set_attr_value(padding_tensor);
|
||||
|
||||
|
|
|
@ -39,7 +39,6 @@ class PadNPUKernel : public NPUKernel {
|
|||
private:
|
||||
hiai::op::PadV2 *op_ = nullptr;
|
||||
PadParameter *param_;
|
||||
std::vector<int> paddings_;
|
||||
hiai::op::Const *hiai_paddings_ = nullptr;
|
||||
hiai::op::Const *hiai_constant_ = nullptr;
|
||||
};
|
||||
|
|
|
@ -77,3 +77,5 @@ ml_video_edit_img_segment_adaptise_pb2tflite.tflite 0.5 2
|
|||
ml_video_edit_imitate_filter.onnx 200
|
||||
hdc_mobilenet_1w_class.onnx 20
|
||||
hdc_age_medium 504
|
||||
posenet_mobilenet_float_075_1_default_1.tflite 395
|
||||
nasnet_mobile.tflite 1
|
||||
|
|
Loading…
Reference in New Issue