forked from mindspore-Ecosystem/mindspore
npu optimization and add models
This commit is contained in:
parent
eecff027bb
commit
c65435fdb7
|
@ -60,6 +60,8 @@ int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
|
||||||
inputs.size() + std::max(std::max(op->out_ops().size(), static_cast<size_t>(1)), op->outputs().size());
|
inputs.size() + std::max(std::max(op->out_ops().size(), static_cast<size_t>(1)), op->outputs().size());
|
||||||
size_t transpose_input_num = 0;
|
size_t transpose_input_num = 0;
|
||||||
size_t transpose_output_num = 0;
|
size_t transpose_output_num = 0;
|
||||||
|
size_t graph_input_num = 0;
|
||||||
|
size_t graph_output_num = 0;
|
||||||
bool need_pre_insert = false;
|
bool need_pre_insert = false;
|
||||||
bool need_post_insert = false;
|
bool need_post_insert = false;
|
||||||
// count number of input tensor from nc2nh and output tensor to nh2nc
|
// count number of input tensor from nc2nh and output tensor to nh2nc
|
||||||
|
@ -70,10 +72,16 @@ int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
|
||||||
} else {
|
} else {
|
||||||
need_pre_insert = true;
|
need_pre_insert = true;
|
||||||
}
|
}
|
||||||
|
if (in_op == nullptr) {
|
||||||
|
graph_input_num++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (op->out_ops().empty()) {
|
if (op->out_ops().empty()) {
|
||||||
need_post_insert = true;
|
need_post_insert = true;
|
||||||
}
|
}
|
||||||
|
if (op->outputs().size() > op->out_ops().size()) {
|
||||||
|
graph_output_num = op->outputs().size() - op->out_ops().size();
|
||||||
|
}
|
||||||
for (const auto out_op : op->out_ops()) {
|
for (const auto out_op : op->out_ops()) {
|
||||||
if (NPUPassUtils::IsNhwc2Nchw(out_op)) {
|
if (NPUPassUtils::IsNhwc2Nchw(out_op)) {
|
||||||
transpose_output_num++;
|
transpose_output_num++;
|
||||||
|
@ -82,10 +90,12 @@ int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// won't insert any thing if num of transpose tensor is smaller than half of total input output.
|
// won't insert anything if the number of transpose tensors is smaller than half of the total op inputs and op outputs, unless
|
||||||
|
// the current op is the graph input or output op, since we should avoid building a single-op subgraph in this case.
|
||||||
// won't insert if total input output are all transpose tensor, the fusion pass will handle this.
|
// won't insert if total input output are all transpose tensor, the fusion pass will handle this.
|
||||||
size_t transpose_tensor_num = transpose_input_num + transpose_output_num;
|
size_t transpose_tensor_num = transpose_input_num + transpose_output_num;
|
||||||
if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num ||
|
size_t connected_in_out_tensor_num = in_out_tensor_num - graph_output_num - graph_input_num;
|
||||||
|
if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < connected_in_out_tensor_num ||
|
||||||
transpose_tensor_num == in_out_tensor_num) {
|
transpose_tensor_num == in_out_tensor_num) {
|
||||||
return InsertNone;
|
return InsertNone;
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,3 +89,4 @@ ml_video_edit_makeup_mobilenetv203.onnx 2
|
||||||
ml_video_edit_hairline_segmentation;3 0.5
|
ml_video_edit_hairline_segmentation;3 0.5
|
||||||
ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 0.5
|
ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 0.5
|
||||||
Modify_Out_ml_audio_kit_encoder_v5.pb;6:5,2,1,4,6,3;1:1,32:1,32:1,32:1:1,32
|
Modify_Out_ml_audio_kit_encoder_v5.pb;6:5,2,1,4,6,3;1:1,32:1,32:1,32:1:1,32
|
||||||
|
ml_video_edit_hair_dyeing_migrate_v2_fix.onnx;4 1.5
|
||||||
|
|
|
@ -103,4 +103,5 @@ tiny-yolov3-11.onnx;2;1,224,224,3:1,2 3
|
||||||
# cur acc for ml_video_edit_art_transfer is 2+%
|
# cur acc for ml_video_edit_art_transfer is 2+%
|
||||||
ml_video_edit_art_transfer.onnx;3
|
ml_video_edit_art_transfer.onnx;3
|
||||||
ssd-10.onnx;;;;calib_only
|
ssd-10.onnx;;;;calib_only
|
||||||
Q888_CV_face_recognition_self.onnx
|
Q888_CV_face_recognition_self.onnx
|
||||||
|
ml_video_edit_hair_dyeing_migrate_v2_fix.onnx;4
|
||||||
|
|
|
@ -103,3 +103,4 @@ ml_video_edit_makeup_mobilenetv203.onnx 4
|
||||||
# The input of ml_video_edit_hair_dyeing_migrate_v2.onnx should be between [0, 1]
|
# The input of ml_video_edit_hair_dyeing_migrate_v2.onnx should be between [0, 1]
|
||||||
ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 2.5
|
ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 2.5
|
||||||
Q888_CV_face_recognition_self.onnx 3.5
|
Q888_CV_face_recognition_self.onnx 3.5
|
||||||
|
ml_video_edit_hair_dyeing_migrate_v2_fix.onnx;4 3
|
||||||
|
|
|
@ -200,3 +200,4 @@ lite-model_albert_lite_base_squadv1_metadata_1.tflite;3:2,3,1
|
||||||
lite-model_mobilebert_1_metadata_1.tflite;3
|
lite-model_mobilebert_1_metadata_1.tflite;3
|
||||||
Modify_Out_hiai_vad.tflite;2
|
Modify_Out_hiai_vad.tflite;2
|
||||||
add_uint8.tflite;2
|
add_uint8.tflite;2
|
||||||
|
coco_ssd_mobilenet_v1_1.0.tflite
|
||||||
|
|
|
@ -220,3 +220,4 @@ hdc_tb_cn_neg.tflite;3:3,1,2 295
|
||||||
hiai_cv_labelDetectorModel_v3.tflite;2:2,1 2
|
hiai_cv_labelDetectorModel_v3.tflite;2:2,1 2
|
||||||
ml_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 1
|
ml_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 1
|
||||||
ml_ei_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 0.6
|
ml_ei_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 0.6
|
||||||
|
coco_ssd_mobilenet_v1_1.0.tflite
|
||||||
|
|
|
@ -267,29 +267,25 @@ fi
|
||||||
if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
|
if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
|
||||||
# Run on x86-codegen-parallel
|
# Run on x86-codegen-parallel
|
||||||
echo "start Run x86 codegen parallel ..."
|
echo "start Run x86 codegen parallel ..."
|
||||||
Run_x86_codegen ${build_parallal_path} ${ms_models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file} &
|
Run_x86_codegen ${build_parallal_path} ${ms_models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file}
|
||||||
Run_x86_codegen_parallel_PID=$!
|
Run_x86_codegen_parallel_status=$?
|
||||||
sleep 1
|
|
||||||
fi
|
fi
|
||||||
if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen" || $backend == "codegen&train" ]]; then
|
if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen" || $backend == "codegen&train" ]]; then
|
||||||
# Run on codegen
|
# Run on codegen
|
||||||
echo "start Run arm64 codegen ..."
|
echo "start Run arm64 codegen ..."
|
||||||
Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm64_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm64"
|
Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm64_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm64"
|
||||||
Run_arm64_codegen_status=$?
|
Run_arm64_codegen_status=$?
|
||||||
sleep 1
|
|
||||||
fi
|
fi
|
||||||
if [[ $backend == "all" || $backend == "codegen" || $backend == "arm32_codegen" || $backend == "codegen&train" ]]; then
|
if [[ $backend == "all" || $backend == "codegen" || $backend == "arm32_codegen" || $backend == "codegen&train" ]]; then
|
||||||
# Run on arm32 codegen
|
# Run on arm32 codegen
|
||||||
echo "start Run arm32 codegen ..."
|
echo "start Run arm32 codegen ..."
|
||||||
Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm32_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm32"
|
Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm32_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm32"
|
||||||
Run_arm32_codegen_status=$?
|
Run_arm32_codegen_status=$?
|
||||||
sleep 1
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "codegen&train" ]]; then
|
if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "codegen&train" ]]; then
|
||||||
wait ${Run_x86_codegen_PID}
|
wait ${Run_x86_codegen_PID}
|
||||||
Run_x86_codegen_status=$?
|
Run_x86_codegen_status=$?
|
||||||
|
|
||||||
if [[ ${Run_x86_codegen_status} != 0 ]];then
|
if [[ ${Run_x86_codegen_status} != 0 ]];then
|
||||||
echo "Run_x86 codegen failed"
|
echo "Run_x86 codegen failed"
|
||||||
cat ${run_x86_codegen_log_file}
|
cat ${run_x86_codegen_log_file}
|
||||||
|
@ -297,9 +293,6 @@ if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" ||
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
|
if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
|
||||||
wait ${Run_x86_codegen_parallel_PID}
|
|
||||||
Run_x86_codegen_parallel_status=$?
|
|
||||||
|
|
||||||
if [[ ${Run_x86_codegen_parallel_status} != 0 ]];then
|
if [[ ${Run_x86_codegen_parallel_status} != 0 ]];then
|
||||||
echo "Run_x86 codegen parallel failed"
|
echo "Run_x86 codegen parallel failed"
|
||||||
cat ${run_x86_codegen_log_file}
|
cat ${run_x86_codegen_log_file}
|
||||||
|
|
Loading…
Reference in New Issue