From c65435fdb79c53abf560702f906a2df0fa039dc8 Mon Sep 17 00:00:00 2001
From: zengxianglong
Date: Tue, 24 Aug 2021 12:45:59 +0800
Subject: [PATCH] optimize NPU insert transform pass and add test models

---
 .../delegate/npu/pass/npu_insert_transform_pass.cc | 14 ++++++++++++--
 mindspore/lite/test/config/models_npu.cfg          |  1 +
 mindspore/lite/test/config/models_onnx.cfg         |  3 ++-
 mindspore/lite/test/config/models_onnx_fp16.cfg    |  1 +
 mindspore/lite/test/config/models_tflite.cfg       |  1 +
 mindspore/lite/test/config/models_tflite_fp16.cfg  |  1 +
 .../lite/test/st/scripts/run_benchmark_codegen.sh  | 11 ++---------
 7 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc b/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc
index 118e5dc0838..426d33a9a1b 100644
--- a/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc
+++ b/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc
@@ -60,6 +60,8 @@ int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
     inputs.size() + std::max(std::max(op->out_ops().size(), static_cast<size_t>(1)), op->outputs().size());
   size_t transpose_input_num = 0;
   size_t transpose_output_num = 0;
+  size_t graph_input_num = 0;
+  size_t graph_output_num = 0;
   bool need_pre_insert = false;
   bool need_post_insert = false;
   // count number of input tensor from nc2nh and output tensor to nh2nc
@@ -70,10 +72,16 @@
     } else {
       need_pre_insert = true;
     }
+    if (in_op == nullptr) {
+      graph_input_num++;
+    }
   }
   if (op->out_ops().empty()) {
     need_post_insert = true;
   }
+  if (op->outputs().size() > op->out_ops().size()) {
+    graph_output_num = op->outputs().size() - op->out_ops().size();
+  }
   for (const auto out_op : op->out_ops()) {
     if (NPUPassUtils::IsNhwc2Nchw(out_op)) {
       transpose_output_num++;
@@ -82,10 +90,12 @@
     }
   }
 
-  // won't insert any thing if num of transpose tensor is smaller than half of total input output.
+  // won't insert anything if the num of transpose tensors is smaller than half of total op inputs and outputs, unless
+  // the current op is a graph input or output op, since we should avoid building a single-op subgraph in this case.
   // won't insert if total input output are all transpose tensor, the fusion pass will handle this.
   size_t transpose_tensor_num = transpose_input_num + transpose_output_num;
-  if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num ||
+  size_t connected_in_out_tensor_num = in_out_tensor_num - graph_output_num - graph_input_num;
+  if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < connected_in_out_tensor_num ||
       transpose_tensor_num == in_out_tensor_num) {
     return InsertNone;
   }
diff --git a/mindspore/lite/test/config/models_npu.cfg b/mindspore/lite/test/config/models_npu.cfg
index cd14797d0ef..d6d43c3abc8 100644
--- a/mindspore/lite/test/config/models_npu.cfg
+++ b/mindspore/lite/test/config/models_npu.cfg
@@ -89,3 +89,4 @@ ml_video_edit_makeup_mobilenetv203.onnx 2
 ml_video_edit_hairline_segmentation;3 0.5
 ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 0.5
 Modify_Out_ml_audio_kit_encoder_v5.pb;6:5,2,1,4,6,3;1:1,32:1,32:1,32:1:1,32
+ml_video_edit_hair_dyeing_migrate_v2_fix.onnx;4 1.5
diff --git a/mindspore/lite/test/config/models_onnx.cfg b/mindspore/lite/test/config/models_onnx.cfg
index ee3a57ac845..df761baf8b5 100644
--- a/mindspore/lite/test/config/models_onnx.cfg
+++ b/mindspore/lite/test/config/models_onnx.cfg
@@ -103,4 +103,5 @@ tiny-yolov3-11.onnx;2;1,224,224,3:1,2 3
 # cur acc for ml_video_edit_art_transfer is 2+%
 ml_video_edit_art_transfer.onnx;3
 ssd-10.onnx;;;;calib_only
-Q888_CV_face_recognition_self.onnx
\ No newline at end of file
+Q888_CV_face_recognition_self.onnx
+ml_video_edit_hair_dyeing_migrate_v2_fix.onnx;4
diff --git a/mindspore/lite/test/config/models_onnx_fp16.cfg b/mindspore/lite/test/config/models_onnx_fp16.cfg
index 0b864e15da5..1b95f896b5f 100644
--- a/mindspore/lite/test/config/models_onnx_fp16.cfg
+++ b/mindspore/lite/test/config/models_onnx_fp16.cfg
@@ -103,3 +103,4 @@ ml_video_edit_makeup_mobilenetv203.onnx 4
 # The input of ml_video_edit_hair_dyeing_migrate_v2.onnx should be between [0, 1]
 ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 2.5
 Q888_CV_face_recognition_self.onnx 3.5
+ml_video_edit_hair_dyeing_migrate_v2_fix.onnx;4 3
diff --git a/mindspore/lite/test/config/models_tflite.cfg b/mindspore/lite/test/config/models_tflite.cfg
index fef9fb298c7..2511a80aa5c 100644
--- a/mindspore/lite/test/config/models_tflite.cfg
+++ b/mindspore/lite/test/config/models_tflite.cfg
@@ -200,3 +200,4 @@ lite-model_albert_lite_base_squadv1_metadata_1.tflite;3:2,3,1
 lite-model_mobilebert_1_metadata_1.tflite;3
 Modify_Out_hiai_vad.tflite;2
 add_uint8.tflite;2
+coco_ssd_mobilenet_v1_1.0.tflite
diff --git a/mindspore/lite/test/config/models_tflite_fp16.cfg b/mindspore/lite/test/config/models_tflite_fp16.cfg
index 66fba94be91..0e5e4359b60 100644
--- a/mindspore/lite/test/config/models_tflite_fp16.cfg
+++ b/mindspore/lite/test/config/models_tflite_fp16.cfg
@@ -220,3 +220,4 @@ hdc_tb_cn_neg.tflite;3:3,1,2 295
 hiai_cv_labelDetectorModel_v3.tflite;2:2,1 2
 ml_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 1
 ml_ei_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 0.6
+coco_ssd_mobilenet_v1_1.0.tflite
diff --git a/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh b/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
index 54c84008c97..d20c5792c6f 100644
--- a/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
+++ b/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
@@ -267,29 +267,25 @@ fi
 if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
     # Run on x86-codegen-parallel
     echo "start Run x86 codegen parallel ..."
-    Run_x86_codegen ${build_parallal_path} ${ms_models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file} &
-    Run_x86_codegen_parallel_PID=$!
-    sleep 1
+    Run_x86_codegen ${build_parallal_path} ${ms_models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file}
+    Run_x86_codegen_parallel_status=$?
 fi
 if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen" || $backend == "codegen&train" ]]; then
     # Run on codegen
     echo "start Run arm64 codegen ..."
     Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm64_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm64"
     Run_arm64_codegen_status=$?
-    sleep 1
 fi
 if [[ $backend == "all" || $backend == "codegen" || $backend == "arm32_codegen" || $backend == "codegen&train" ]]; then
     # Run on arm32 codegen
     echo "start Run arm32 codegen ..."
     Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm32_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm32"
     Run_arm32_codegen_status=$?
-    sleep 1
 fi

 if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "codegen&train" ]]; then
     wait ${Run_x86_codegen_PID}
     Run_x86_codegen_status=$?
-
     if [[ ${Run_x86_codegen_status} != 0 ]];then
         echo "Run_x86 codegen failed"
         cat ${run_x86_codegen_log_file}
@@ -297,9 +293,6 @@ if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" ||
     fi
 fi
 if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
-    wait ${Run_x86_codegen_parallel_PID}
-    Run_x86_codegen_parallel_status=$?
-
     if [[ ${Run_x86_codegen_parallel_status} != 0 ]];then
         echo "Run_x86 codegen parallel failed"
         cat ${run_x86_codegen_log_file}
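
Note (illustration only, not part of the patch): the new skip condition in GetInsertState can be read as the
standalone decision rule sketched below. The function name and parameters are hypothetical and exist only to
restate the rule; the real logic lives in NPUInsertTransformPass::GetInsertState in npu_insert_transform_pass.cc.

    // Illustrative sketch of the patched rule, with hypothetical names.
    #include <cstddef>

    // Returns true when the patched condition would make GetInsertState return InsertNone.
    bool SkipTransformInsert(size_t transpose_input_num, size_t transpose_output_num,
                             size_t in_out_tensor_num, size_t graph_input_num,
                             size_t graph_output_num) {
      size_t transpose_tensor_num = transpose_input_num + transpose_output_num;
      // Tensors connected to other ops, i.e. total minus graph inputs and graph outputs.
      size_t connected_in_out_tensor_num = in_out_tensor_num - graph_output_num - graph_input_num;
      // Skip when no neighbor is a transpose, when transposes are fewer than half of the
      // connected tensors, or when every tensor is already a transpose (fusion handles that case).
      return transpose_tensor_num == 0 || transpose_tensor_num * 2 < connected_in_out_tensor_num ||
             transpose_tensor_num == in_out_tensor_num;
    }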