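Fix micro codegen parallel mode: force thread_num_ = 1 for ops invoked serially and pass an explicit parallel flag to the codegen benchmark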

2022-08-04 17:23:07 +08:00 · 2022-08-04 17:23:07 +08:00 · 324b409ec0
parent 187cae9f1a
commit 324b409ec0
13 changed files with 106 additions and 51 deletions
--- a/mindspore/lite/test/config_level0/models_codegen.cfg
+++ b/mindspore/lite/test/config_level0/models_codegen.cfg
@ -1,3 +1,4 @@
+intent_detect_hi_v2.tflite
 hiai_model_0909_kd_rot_ps_softmax.tflite
 # hiai_chinese_english_recognize_model_float32.tflite
 # hiai_bigmodel_ghost_2_1_no_normalized_no_trans_tflite.tflite
--- a/mindspore/lite/test/config_level0/models_codegen_parallel.cfg
+++ b/mindspore/lite/test/config_level0/models_codegen_parallel.cfg
@ -1 +0,0 @@
-intent_detect_hi_v2.tflite
--- a/mindspore/lite/test/config_level1/models_codegen_parallel.cfg
+++ b/mindspore/lite/test/config_level1/models_codegen_parallel.cfg
@ -1 +0,0 @@
-
--- a/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
+++ b/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
@ -2,7 +2,7 @@
 source ./scripts/base_functions.sh

 function Run_x86_codegen() {
-    # $1:buildPath $2:modelPath $3:models_list $4:logFile $5:resultFile $6:micro_cofig
+    # $1:buildPath $2:modelPath $3:models_list $4:logFile $5:resultFile $6:micro_config $7:parallel_flag
    local bind_mode thread_num suffix run_result
    rm -rf $1
    mkdir -p $1
@ -74,7 +74,7 @@ function Run_x86_codegen() {
      bind_mode=""
      thread_num=""
      suffix=""
-      if [[ $3 =~ "parallel" ]]; then
+      if [[ $7 == "parallel" ]]; then
          bind_mode="0"
          thread_num="4"
          suffix="_parallel"
@ -426,7 +426,7 @@ fi
 # Set model-list
 models_codegen_config=${basepath}/../${config_folder}/models_codegen.cfg
 models_cortex_codegen_config=${basepath}/../${config_folder}/models_codegen_cortex.cfg
-models_codegen_parallel_config=${basepath}/../${config_folder}/models_codegen_parallel.cfg
+models_codegen_parallel_config=${basepath}/../${config_folder}/models_codegen.cfg

 #micro config
 micro_x86_config=${basepath}/../${config_folder}/micro/micro_x86.cfg
@ -470,7 +470,7 @@ echo "current backend is ${backend}"
 if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" ]]; then
    # Run on x86-codegen
    echo "start Run x86 codegen ..."
-    Run_x86_codegen ${build_path_x86} ${models_path} ${models_codegen_config} ${run_x86_codegen_log_file} ${run_benchmark_result_file} ${micro_x86_config}
+    Run_x86_codegen ${build_path_x86} ${models_path} ${models_codegen_config} ${run_x86_codegen_log_file} ${run_benchmark_result_file} ${micro_x86_config} ""
    Run_x86_codegen_status=$?
 #    Run_x86_codegen_PID=$!
 #    sleep 1
@ -478,7 +478,7 @@ fi
 if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" ]]; then
    # Run on x86-codegen-parallel
    echo "start Run x86 codegen parallel ..."
-    Run_x86_codegen ${build_path_parallel} ${models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file} ${micro_x86_parallel_config}
+    Run_x86_codegen ${build_path_parallel} ${models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file} ${micro_x86_parallel_config} "parallel"
    Run_x86_codegen_parallel_status=$?
 #    Run_x86_codegen_parallel_PID=$!
 #    sleep 1
--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc
@ -361,6 +361,9 @@ int ArithmeticFP32Coder::BatchScalarCalc(int task_id, CoderContext *const contex
  if (break_pos_ < 1) {
    return RET_ERROR;
  }
+  if (support_parallel_) {
+    thread_num_ = 1;
+  }
  int batch = arithmetic_parameter_->out_elements_num_ / arithmetic_parameter_->out_strides_[break_pos_ - 1];
  int batch_per_thread = UP_DIV(batch, thread_num_);

@ -378,7 +381,11 @@ int ArithmeticFP32Coder::BatchScalarCalc(int task_id, CoderContext *const contex

  arithmetic_wrapper_info_ = {offset0, stride0, offset1, stride1, out_offset, out_stride, arithmetic_func_type_};
  code->CodeStruct("arithmetic_wrapper_info", arithmetic_wrapper_info_);
-  code->CodeStruct("arithmetic_parameter", *arithmetic_parameter_);
+  std::string param_name = "arithmetic_parameter";
+  code->CodeStruct(param_name, *arithmetic_parameter_);
+  if (support_parallel_) {
+    *code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+  }
  code->CodeFunction("BatchScalarCalc", wrap_uint8(input0_ptr_str_), wrap_uint8(input1_ptr_str_),
                     wrap_uint8(output_ptr_str_), batch_size, arithmetic_parameter_->out_strides_[break_pos_ - 1], true,
                     arithmetic_func_str_, "&arithmetic_wrapper_info", "&arithmetic_parameter");
@ -388,6 +395,9 @@ int ArithmeticFP32Coder::BatchScalarCalc(int task_id, CoderContext *const contex

 int ArithmeticFP32Coder::BiasCalc(int task_id, CoderContext *const context, NNaclFp32Serializer *const code) {
  MS_CHECK_TRUE_RET(arithmetic_parameter_->ndim_ - 1 >= 0 && arithmetic_parameter_->ndim_ - 1 < 10, RET_ERROR);
+  if (support_parallel_) {
+    thread_num_ = 1;
+  }
  int last_shape = arithmetic_parameter_->out_shape_[arithmetic_parameter_->ndim_ - 1];
  int batch = arithmetic_parameter_->out_elements_num_ / last_shape;
  int batch_per_thread = UP_DIV(batch, thread_num_);
@ -398,7 +408,11 @@ int ArithmeticFP32Coder::BiasCalc(int task_id, CoderContext *const context, NNac

  int stride = last_shape * data_type_len_;
  int offset = stride * start_batch;
-  code->CodeStruct("arithmetic_parameter", *arithmetic_parameter_);
+  std::string param_name = "arithmetic_parameter";
+  code->CodeStruct(param_name, *arithmetic_parameter_);
+  if (support_parallel_) {
+    *code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+  }
  if (arithmetic_parameter_->in_elements_num0_ > arithmetic_parameter_->in_elements_num1_) {
    arithmetic_wrapper_info_ = {offset, stride, 0, 0, offset, stride, arithmetic_func_type_};
    code->CodeStruct("arithmetic_wrapper_info", arithmetic_wrapper_info_);
--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/concat_fp32_coder.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/concat_fp32_coder.cc
@ -71,7 +71,9 @@ int ConcatFP32Coder::DoCode(CoderContext *const context) {
    code << "shape_" << i << ", ";
  }
  code << "};\n";
-
+  if (support_parallel_) {
+    thread_num_ = 1;
+  }
  code.CodeFunction("Concat", "inputs_addr", input_num, axis_, "inputs_output_shape", output_tensor_->shape().size(),
                    output_tensor_, 0, thread_num_, sizeof(float));
  context->AppendCode(code.str());
--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc
@ -80,8 +80,13 @@ int ConvolutionDepthwiseFP32Coder::DoCode(CoderContext *const context) {
          {});
  nnacl::NNaclFp32Serializer code;
  // call the op function
-  code.CodeStruct("conv_parameter", *conv_param_);
-  code.CodeFunction("ConvDw", output_tensor_, input_tensor_, packed_weight_, bias_, "&conv_parameter", kDefaultTaskId);
+  std::string param_name = "conv_parameter";
+  code.CodeStruct(param_name, *conv_param_);
+  if (support_parallel_) {
+    code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+    code << "    " << param_name << ".thread_num_ = 1;\n";
+  }
+  code.CodeFunction("ConvDw", output_tensor_, input_tensor_, packed_weight_, bias_, "&" + param_name, kDefaultTaskId);
  context->AppendCode(code.str());
  return RET_OK;
 }
--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/groupnorm_fp32_coder.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/groupnorm_fp32_coder.cc
@ -74,7 +74,11 @@ int GroupNormFP32Coder::DoCode(CoderContext *const context) {
            "group_norm_fp32.c",
          });
  NNaclFp32Serializer code;
-  code.CodeStruct("gn_parameter", *gn_parameter);
+  std::string param_name = "gn_parameter";
+  code.CodeStruct(param_name, *gn_parameter);
+  if (support_parallel_) {
+    code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+  }
  code.CodeFunction("GroupNormFp32", input_tensor_, scale_tensor, offset_tensor, mean_, variance_, "&gn_parameter",
                    kDefaultTaskId, output_tensor_);
  MS_LOG(INFO) << "GroupNormFp32Code has been called";
--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc
@ -29,6 +29,9 @@ using mindspore::schema::PrimitiveType_MatMulFusion;

 namespace mindspore::lite::micro::nnacl {
 int MatMulFP32BaseCoder::ReSize() {
+  if (support_parallel_) {
+    thread_num_ = 1;
+  }
  ResizeParameter();
  MS_CHECK_TRUE(params_->col_align_ != 0, "params_->col_align_ = 0");
  thread_count_ = MSMIN(thread_num_, UP_DIV(params_->col_align_, col_tile_));
@ -175,8 +178,12 @@ int MatMulFP32BaseCoder::DoCode(CoderContext *const context) {
  CollectFilesForTarget(context);
  NNaclFp32Serializer code, init_code;
  size_t w_buf_size = 0;
+  std::string param_name = "mat_mul_parameter";

-  code.CodeStruct("mat_mul_parameter", *params_);
+  code.CodeStruct(param_name, *params_);
+  if (support_parallel_) {
+    code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+  }
  init_code.CodeStruct("mat_mul_parameter", *params_);
  // do bias packing to init
  if (input_tensors_.size() == DIMENSION_3D) {
--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/pooling_fp32_coder.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/pooling_fp32_coder.cc
@ -42,7 +42,8 @@ int PoolingFP32Coder::DoCode(CoderContext *const context) {
  pooling_parameter->thread_num_ = pooling_parameter->op_parameter_.thread_num_;

  NNaclFp32Serializer code;
-  code.CodeStruct("pooling_parameter", *pooling_parameter);
+  std::string param_name = "pooling_parameter";
+  code.CodeStruct(param_name, *pooling_parameter);
  float minf = -FLT_MAX;
  float maxf = FLT_MAX;
  Collect(context,
@ -71,12 +72,18 @@ int PoolingFP32Coder::DoCode(CoderContext *const context) {
  }
  if (pooling_parameter->pool_mode_ == PoolMode_MaxPool) {
    if (!support_parallel_) {
+      code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+      code << "    " << param_name << ".thread_num_ = 1;\n";
      code.CodeFunction("MaxPooling", input_tensor_, output_tensor_, "&pooling_parameter", kDefaultTaskId, minf, maxf);
    } else {
      code.CodeBaseStruct("PoolingFp32Args", kRunArgs, input_tensor_, output_tensor_, minf, maxf, "&pooling_parameter");
      code.CodeFunction(kParallelLaunch, "DoMaxPooling", kRunArgsAddr, "pooling_parameter.op_parameter_.thread_num_");
    }
  } else {
+    if (support_parallel_) {
+      code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+      code << "    " << param_name << ".thread_num_ = 1;\n";
+    }
    code.CodeFunction("AvgPooling", input_tensor_, output_tensor_, "&pooling_parameter", kDefaultTaskId, minf, maxf);
  }

--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/softmax_fp32_coder.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/softmax_fp32_coder.cc
@ -60,9 +60,13 @@ int SoftMaxFP32Coder::DoCode(CoderContext *const context) {
            "exp_fp32.c",
          });
  NNaclFp32Serializer code;
-  code.CodeStruct("softmax_parameter", *softmax_param_);
+  std::string param_name = "softmax_parameter";
+  code.CodeStruct(param_name, *softmax_param_);
  code.CodeFunction("memset", sum_data_, "0", sum_data_size_);
  auto primitive_type = softmax_param_->op_parameter_.type_;
+  if (support_parallel_) {
+    code << "    " << param_name << ".op_parameter_.thread_num_ = 1;\n";
+  }
  if (primitive_type == schema::PrimitiveType_Softmax) {
    code.CodeFunction("Softmax", input_tensor_, output_tensor_, sum_data_, "&softmax_parameter");
  } else {
--- a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc
@ -23,19 +23,19 @@
 namespace mindspore::lite::micro::nnacl {
 int NNaclFp32Serializer::count = 0;
 void NNaclFp32Serializer::CodeStruct(const std::string &name, const PoolingParameter &pooling_parameter) {
-  CodeBaseStruct("PoolingParameter", name,
-                 // Primitive parameter
-                 pooling_parameter.op_parameter_, pooling_parameter.pool_mode_, pooling_parameter.round_mode_,
-                 pooling_parameter.pad_mode_, pooling_parameter.act_type_, pooling_parameter.avg_mode_,
-                 pooling_parameter.global_, pooling_parameter.window_w_, pooling_parameter.window_h_,
-                 pooling_parameter.stride_w_, pooling_parameter.stride_h_,
-                 // shape correlative
-                 pooling_parameter.input_w_, pooling_parameter.input_h_, pooling_parameter.input_batch_,
-                 pooling_parameter.input_channel_, pooling_parameter.output_w_, pooling_parameter.output_h_,
-                 pooling_parameter.output_batch_, pooling_parameter.output_channel_, pooling_parameter.pad_u_,
-                 pooling_parameter.pad_d_, pooling_parameter.pad_l_, pooling_parameter.pad_r_,
-                 // other parameter
-                 gThreadNum, nullptr, pooling_parameter.quantize_);
+  CodeBaseStruct<false>("PoolingParameter", name,
+                        // Primitive parameter
+                        pooling_parameter.op_parameter_, pooling_parameter.pool_mode_, pooling_parameter.round_mode_,
+                        pooling_parameter.pad_mode_, pooling_parameter.act_type_, pooling_parameter.avg_mode_,
+                        pooling_parameter.global_, pooling_parameter.window_w_, pooling_parameter.window_h_,
+                        pooling_parameter.stride_w_, pooling_parameter.stride_h_,
+                        // shape correlative
+                        pooling_parameter.input_w_, pooling_parameter.input_h_, pooling_parameter.input_batch_,
+                        pooling_parameter.input_channel_, pooling_parameter.output_w_, pooling_parameter.output_h_,
+                        pooling_parameter.output_batch_, pooling_parameter.output_channel_, pooling_parameter.pad_u_,
+                        pooling_parameter.pad_d_, pooling_parameter.pad_l_, pooling_parameter.pad_r_,
+                        // other parameter
+                        gThreadNum, nullptr, pooling_parameter.quantize_);
 }

 void NNaclFp32Serializer::CodeStruct(const std::string &name, const BatchNormParameter &batch_norm_parameter) {
@ -45,19 +45,20 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const BatchNormPar
 }

 void NNaclFp32Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) {
-  CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_,
-                 arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_,
-                 ToString(arithmetic_parameter.in_shape0_), arithmetic_parameter.in_elements_num0_,
-                 ToString(arithmetic_parameter.in_shape1_), arithmetic_parameter.in_elements_num1_,
-                 ToString(arithmetic_parameter.out_shape_), arithmetic_parameter.out_elements_num_,
-                 ToString(arithmetic_parameter.in_strides0_), ToString(arithmetic_parameter.in_strides1_),
-                 ToString(arithmetic_parameter.out_strides_), ToString(arithmetic_parameter.multiples0_),
-                 ToString(arithmetic_parameter.multiples1_));
+  CodeBaseStruct<false>("ArithmeticParameter", name, arithmetic_parameter.op_parameter_,
+                        arithmetic_parameter.broadcasting_, arithmetic_parameter.ndim_,
+                        arithmetic_parameter.activation_type_, ToString(arithmetic_parameter.in_shape0_),
+                        arithmetic_parameter.in_elements_num0_, ToString(arithmetic_parameter.in_shape1_),
+                        arithmetic_parameter.in_elements_num1_, ToString(arithmetic_parameter.out_shape_),
+                        arithmetic_parameter.out_elements_num_, ToString(arithmetic_parameter.in_strides0_),
+                        ToString(arithmetic_parameter.in_strides1_), ToString(arithmetic_parameter.out_strides_),
+                        ToString(arithmetic_parameter.multiples0_), ToString(arithmetic_parameter.multiples1_));
 }

 void NNaclFp32Serializer::CodeStruct(const std::string &name, const SoftmaxParameter &softmax_parameter) {
-  CodeBaseStruct("SoftmaxParameter", name, softmax_parameter.op_parameter_, softmax_parameter.axis_,
-                 ToString(softmax_parameter.input_shape_), softmax_parameter.element_size_, softmax_parameter.n_dim_);
+  CodeBaseStruct<false>("SoftmaxParameter", name, softmax_parameter.op_parameter_, softmax_parameter.axis_,
+                        ToString(softmax_parameter.input_shape_), softmax_parameter.element_size_,
+                        softmax_parameter.n_dim_);
 }

 void NNaclFp32Serializer::CodeStruct(const std::string &name, const ConvParameter &conv_parameter) {
@ -74,14 +75,14 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const ConvParamete
 }

 void NNaclFp32Serializer::CodeStruct(const std::string &name, const MatMulParameter &mat_mul_parameter) {
-  CodeBaseStruct("MatMulParameter", name, mat_mul_parameter.op_parameter_, mat_mul_parameter.has_bias_,
-                 mat_mul_parameter.row_, mat_mul_parameter.col_, mat_mul_parameter.row_4_, mat_mul_parameter.row_6_,
-                 mat_mul_parameter.row_12_, mat_mul_parameter.row_16_, mat_mul_parameter.row_align_,
-                 mat_mul_parameter.col_4_, mat_mul_parameter.col_8_, mat_mul_parameter.col_align_,
-                 mat_mul_parameter.deep_, mat_mul_parameter.deep_4_, mat_mul_parameter.deep_16_,
-                 mat_mul_parameter.deep_align_, mat_mul_parameter.batch, mat_mul_parameter.a_transpose_,
-                 mat_mul_parameter.b_transpose_, mat_mul_parameter.a_const_, mat_mul_parameter.b_const_,
-                 mat_mul_parameter.act_type_, mat_mul_parameter.use_axis_, mat_mul_parameter.axis_);
+  CodeBaseStruct<false>(
+    "MatMulParameter", name, mat_mul_parameter.op_parameter_, mat_mul_parameter.has_bias_, mat_mul_parameter.row_,
+    mat_mul_parameter.col_, mat_mul_parameter.row_4_, mat_mul_parameter.row_6_, mat_mul_parameter.row_12_,
+    mat_mul_parameter.row_16_, mat_mul_parameter.row_align_, mat_mul_parameter.col_4_, mat_mul_parameter.col_8_,
+    mat_mul_parameter.col_align_, mat_mul_parameter.deep_, mat_mul_parameter.deep_4_, mat_mul_parameter.deep_16_,
+    mat_mul_parameter.deep_align_, mat_mul_parameter.batch, mat_mul_parameter.a_transpose_,
+    mat_mul_parameter.b_transpose_, mat_mul_parameter.a_const_, mat_mul_parameter.b_const_, mat_mul_parameter.act_type_,
+    mat_mul_parameter.use_axis_, mat_mul_parameter.axis_);
 }

 void NNaclFp32Serializer::CodeStruct(const std::string &name, const ScaleParameter &scale_parameter) {
@ -163,8 +164,8 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const TransFuncStr
 }

 void NNaclFp32Serializer::CodeStruct(const std::string &name, const GroupNormParameter &gn_param) {
-  CodeBaseStruct("GroupNormParameter", name, gn_param.op_parameter_, gn_param.epsilon_, gn_param.num_groups_,
-                 gn_param.channel_, gn_param.unit_, gn_param.batch_, gn_param.affine_);
+  CodeBaseStruct<false>("GroupNormParameter", name, gn_param.op_parameter_, gn_param.epsilon_, gn_param.num_groups_,
+                        gn_param.channel_, gn_param.unit_, gn_param.batch_, gn_param.affine_);
 }
 void NNaclFp32Serializer::CodeStruct(const std::string &name, const ActivationParameter &activation_parameter) {
  CodeBaseStruct("ActivationParameter", name, activation_parameter.op_parameter_, activation_parameter.type_,
--- a/mindspore/lite/tools/converter/micro/coder/wrapper/fp32/arithmetic_fp32_wrapper.c
+++ b/mindspore/lite/tools/converter/micro/coder/wrapper/fp32/arithmetic_fp32_wrapper.c
@ -92,9 +92,21 @@ int ArithmeticFp32Run(void *cdata, int task_id, float lhs_scale, float rhs_scale
  int completed_size = task_id * size;
  int cur_size = MSMIN(size, args->size_ - completed_size);
  if (cur_size <= 0) return NNACL_OK;
-  void *input0 = (void *)((float *)args->input0_ + completed_size);
-  void *input1 = (void *)((float *)args->input1_ + completed_size);
  void *output = (void *)((float *)args->output_ + completed_size);
+  void *input0;
+  void *input1;
+  if (args->is_opt_) {
+    if (args->param->in_elements_num0_ == 1) {
+      input0 = (void *)args->input0_;
+      input1 = (void *)((float *)args->input1_ + completed_size);
+    } else {
+      input0 = (void *)((float *)args->input0_ + completed_size);
+      input1 = (void *)args->input1_;
+    }
+  } else {
+    input0 = (void *)((float *)args->input0_ + completed_size);
+    input1 = (void *)((float *)args->input1_ + completed_size);
+  }
  ArithmeticExecute(input0, input1, output, cur_size, args->is_opt_, args->func_type_, args->arithmetic_func_,
                    args->param);
  return NNACL_OK;