!12391 add conv1x1 coder

From: @zhujingxuan
Reviewed-by: 
Signed-off-by:
mindspore-ci-bot 2021-02-19 15:09:11 +08:00 committed by Gitee
commit 3cc3d5c9cf
10 changed files with 678 additions and 6 deletions

${MICRO_DIR}/cmake/file_list.cmake

@@ -81,6 +81,7 @@ set(CODER_OPCODERS_SRC
${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
@@ -126,13 +127,10 @@ set(LITE_KERNEL_SRC
${LITE_DIR}/nnacl/int8/fixed_point.c
${LITE_DIR}/nnacl/fp32/matmul_fp32.c
${LITE_DIR}/nnacl/int8/conv3x3_int8.c
)
set(MICRO_ADAPTER_SRC
${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
${MICRO_DIR}/wrapper/int8/conv_init_int8.c
${LITE_DIR}/nnacl/int8/conv1x1_int8.c
${LITE_DIR}/nnacl/base/conv1x1_base.c
)
list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MICRO_ADAPTER_SRC})
${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC})

${MICRO_DIR}/cmake/wrapper.cmake

@@ -0,0 +1,12 @@
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(MICRO_WRAPPER_SRC
${LITE_DIR}/src/runtime/thread_pool.c
${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
${MICRO_DIR}/wrapper/int8/conv_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c
)
list(APPEND FILE_SET ${MICRO_WRAPPER_SRC})

CMakeLists.txt

@@ -19,6 +19,7 @@ include_directories(${TOP_DIR}/mindspore/core/)
#include coder
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)
include(${MICRO_DIR}/cmake/file_list.cmake)
include(${MICRO_DIR}/cmake/wrapper.cmake)
add_executable(codegen main.cc ${FILE_SET})
add_dependencies(codegen fbs_src)
add_dependencies(codegen fbs_inner_src)

${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc

@@ -0,0 +1,193 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h"
#include <string>
#include <vector>
#include "securec/include/securec.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "micro/coder/opcoders/file_collector.h"
#include "micro/coder/log.h"
#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
namespace mindspore::lite::micro::nnacl {
int Conv2D1x1Int8Coder::Prepare(CoderContext *const context) {
matmul_param_ = new (std::nothrow) MatMulParameter();
MS_CHECK_PTR(matmul_param_);
MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Init failed");
MS_CHECK_RET_CODE(Conv2DBaseCoder::SetQuantParam(), "SetQuantParam failed");
filter_peroc_ = (conv_param_->conv_quant_arg_.filter_arg_num_ != kPerTensor);
if (filter_peroc_) {
MS_CHECK_RET_CODE(InitFilterPeroc(), "InitFilterPeroc failed.");
}
CheckSupportOptimize();
MS_CHECK_RET_CODE(InitWeightBias(context), "InitWeightBias failed");
MS_CHECK_RET_CODE(InitParam(), "InitParam failed");
MS_CHECK_RET_CODE(InitRunBuf(), "InitRunBuf failed");
return RET_OK;
}
int Conv2D1x1Int8Coder::DoCode(CoderContext *const context) {
Collect(context,
{"nnacl/int8/conv1x1_int8.h", "nnacl/common_func.h", "wrapper/int8/conv1x1_init_int8.h",
"wrapper/int8/conv1x1_run_int8.h"},
{"common_func.c", "pack.c", "conv1x1_int8.c", "matmul_int8.c", "fixed_point.c", "conv1x1_init_int8.c",
"conv1x1_run_int8.c"});
nnacl::NNaclInt8Serializer code;
code.CodeStruct("conv_param", *conv_param_);
code.CodeStruct("matmul_param", *matmul_param_);
code.CodeBaseStruct("Conv1x1Args", "args", input_sum_, filter_zp_ptr_, left_shift_, right_shift_, multiplier_,
packed_weight_, bias_data_, packed_input_, nullptr, nullptr, 0, 0, "conv_param", "matmul_param",
matmul_func_, pre_trans_input_, support_optimize_, filter_peroc_);
code.CodeFunction("Conv1x1Run", input_tensor_, "args", "THREAD_POOL_DEFAULT", thread_num_s_, output_tensor_);
context->AppendCode(code.str());
return RET_OK;
}
void Conv2D1x1Int8Coder::CheckSupportOptimize() {
support_optimize_ = false;
matmul_func_ = "MatMulInt8_4x16_r";
if (target_ == kARM64) {
matmul_func_ = "MatMulDpInt8_optimize_handler";
}
}
int Conv2D1x1Int8Coder::InitWeightBias(CoderContext *const context) {
int32_t input_channel = filter_tensor_->Channel();
int32_t output_channel = filter_tensor_->Batch();
int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
nnacl::NNaclInt8Serializer code;
packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
MS_CHECK_PTR(packed_weight_);
bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
MS_CHECK_PTR(bias_data_);
std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
std::string filter_zp_str = "";
if (filter_peroc_) {
filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
} else {
MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
filter_zp_str = "filter_zp";
code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
}
if (target_ == kARM64) {
code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
output_channel, input_zp, "GetSupportOptFlag()", filter_peroc_, packed_weight_str,
bias_data_str);
} else {
code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
output_channel, input_zp, support_optimize_, filter_peroc_, packed_weight_str,
bias_data_str);
}
context->AppendInitCode(code.str());
return RET_OK;
}
int Conv2D1x1Int8Coder::InitFilterPeroc() {
int32_t output_channel = filter_tensor_->Batch();
int round_oc;
if (target_ == kARM32A) {
round_oc = UP_ROUND(output_channel, C2NUM);
} else {
round_oc = MSMAX(UP_ROUND(output_channel, C16NUM), UP_ROUND(output_channel, C4NUM));
}
MS_CHECK_TRUE(conv_quant_arg_->filter_arg_num_ == static_cast<size_t>(output_channel),
"weight per channel quant param length is not equal to filter num, filter is not PerChannel");
size_t output_size = output_channel * sizeof(int32_t);
size_t oc_size = round_oc * sizeof(int32_t);
/* filter zp */
filter_zp_ptr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_size, kOfflinePackWeight));
MS_CHECK_PTR(filter_zp_ptr_);
MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
for (int fi = 0; fi < output_channel; fi++) {
filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
}
/* left shift */
left_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
MS_CHECK_PTR(left_shift_);
MS_CHECK_RET_CODE(memset_s(left_shift_, oc_size, 0, oc_size), "memset left_shift_ failed");
MS_CHECK_RET_CODE(memcpy_s(left_shift_, oc_size, conv_param_->conv_quant_arg_.left_shift_, output_size),
"memcpy_s left_shift_ failed");
/* right shift */
right_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
MS_CHECK_PTR(right_shift_);
MS_CHECK_RET_CODE(memset_s(right_shift_, oc_size, 0, oc_size), "memset right_shift_ failed");
MS_CHECK_RET_CODE(memcpy_s(right_shift_, oc_size, conv_param_->conv_quant_arg_.right_shift_, output_size),
"memcpy_s right_shift_ failed");
/* multiplier */
multiplier_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
MS_CHECK_PTR(multiplier_);
MS_CHECK_RET_CODE(memset_s(multiplier_, oc_size, 0, oc_size), "memset multiplier_ failed");
MS_CHECK_RET_CODE(memcpy_s(multiplier_, oc_size, conv_param_->conv_quant_arg_.quant_multiplier_, output_size),
"memcpy_s multiplier_ failed");
return RET_OK;
}
int Conv2D1x1Int8Coder::InitParam() {
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
conv_param_->stride_w_ != 1);
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->col_ = conv_param_->output_channel_;
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);
int row_pack_count = C4NUM;
/* init input sum size */
input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
if (pre_trans_input_) {
input_ptr_ = reinterpret_cast<int8_t *>(
allocator_->Malloc(kNumberTypeInt8, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t), kWorkspace));
MS_CHECK_PTR(input_ptr_);
}
return RET_OK;
}
int Conv2D1x1Int8Coder::InitRunBuf() {
input_sum_ =
reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, input_sum_size_ * sizeof(int32_t), kWorkspace));
MS_CHECK_PTR(input_sum_);
size_t size = MSMAX(UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM),
UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM));
packed_input_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, size * sizeof(int8_t), kWorkspace));
MS_CHECK_PTR(packed_input_);
return RET_OK;
}
} // namespace mindspore::lite::micro::nnacl

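For orientation: a minimal sketch of the inference code DoCode above emits, assuming CodeStruct/CodeBaseStruct expand into C struct literals and CodeFunction into a plain call (that is how the output reads, but the exact rendering belongs to the serializer). All g_* symbols, offsets, and field values are hypothetical placeholders, not actual codegen output:

/* hedged sketch of the generated code; names and values are made up */
ConvParameter conv_param = {0};     /* fields filled in by CodeStruct    */
MatMulParameter matmul_param = {0}; /* fields filled in by CodeStruct    */
Conv1x1Args args = {
    (int32_t *)(g_workspace + 0),   /* input_sum_   (kWorkspace)         */
    g_filter_zp,                    /* filter_zp_ptr_                    */
    g_left_shift, g_right_shift, g_multiplier,
    g_packed_weight, g_bias_data,   /* packed online by Conv1x1Init      */
    (int8_t *)(g_workspace + 512),  /* packed_input_ (kWorkspace)        */
    NULL, NULL, 0, 0,               /* input/output ptrs and thread
                                       strides are set inside Conv1x1Run */
    &conv_param, &matmul_param,
    MatMulInt8_4x16_r,              /* matmul_func_ (non-ARM64 path)     */
    false, false, true};            /* pre_trans_input_, support_optimize_,
                                       filter_peroc_                     */
Conv1x1Run(g_input0, &args, THREAD_POOL_DEFAULT, 1, g_output0);
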
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h

@@ -0,0 +1,67 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_
#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_
#include "micro/coder/opcoders/base/conv2d_base_coder.h"
#include <memory>
#include <string>
#include <vector>
#include "nnacl/conv_parameter.h"
namespace mindspore::lite::micro::nnacl {
class Conv2D1x1Int8Coder final : public Conv2DBaseCoder {
public:
Conv2D1x1Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
const Model::Node *node, size_t node_index, Target target)
: Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
int Prepare(CoderContext *const context) override;
int DoCode(CoderContext *const context) override;
~Conv2D1x1Int8Coder() override = default;
private:
void CheckSupportOptimize();
int InitWeightBias(CoderContext *const context);
int InitFilterPeroc();
int InitParam();
int InitRunBuf();
int32_t *input_sum_{nullptr}; /* per-oc */
int32_t *filter_zp_ptr_{nullptr}; /* per-oc up round */
int32_t *left_shift_{nullptr}; /* per-oc up round */
int32_t *right_shift_{nullptr}; /* per-oc up round */
int32_t *multiplier_{nullptr}; /* per-oc up round */
int8_t *packed_weight_{nullptr};
int32_t *bias_data_{nullptr};
int8_t *packed_input_{nullptr};
int8_t *input_ptr_{nullptr};
int8_t *output_ptr_{nullptr};
size_t input_sum_size_{0};
MatMulParameter *matmul_param_{nullptr};
std::string matmul_func_;
bool pre_trans_input_{false};
bool support_optimize_{false};
bool filter_peroc_{false};
};
} // namespace mindspore::lite::micro::nnacl
#endif // MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_

${MICRO_DIR}/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc

@@ -60,6 +60,16 @@ void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParamete
conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_, conv_parameter.act_type_);
}
void NNaclInt8Serializer::CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter) {
CodeBaseStruct("MatMulParameter", name, matmul_parameter.op_parameter_, matmul_parameter.has_bias_,
matmul_parameter.row_, matmul_parameter.col_, matmul_parameter.row_4_, matmul_parameter.row_6_,
matmul_parameter.row_12_, matmul_parameter.row_16_, matmul_parameter.row_align_,
matmul_parameter.col_4_, matmul_parameter.col_8_, matmul_parameter.col_align_, matmul_parameter.deep_,
matmul_parameter.deep_4_, matmul_parameter.deep_16_, matmul_parameter.batch,
matmul_parameter.a_transpose_, matmul_parameter.b_transpose_, matmul_parameter.a_const_,
matmul_parameter.b_const_, matmul_parameter.act_type_);
}
void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) {
CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_,
arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_,

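Assuming CodeBaseStruct renders its arguments as a brace-initialized struct in the generated source (the pattern the conv1x1 coder depends on), the MatMulParameter overload above would emit roughly the following, with the field order shown above and every value made up:

/* hedged sketch of the emitted text, not actual serializer output */
MatMulParameter matmul_param = {
    {0},             /* op_parameter_                    */
    false,           /* has_bias_                        */
    64, 32,          /* row_, col_                       */
    64, 66, 72, 64,  /* row_4_, row_6_, row_12_, row_16_ */
    64,              /* row_align_                       */
    32, 32, 32,      /* col_4_, col_8_, col_align_       */
    16, 16, 16,      /* deep_, deep_4_, deep_16_         */
    1,               /* batch                            */
    false, false,    /* a_transpose_, b_transpose_       */
    false, true,     /* a_const_, b_const_               */
    ActType_No};     /* act_type_                        */
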
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c

@@ -0,0 +1,90 @@
/*
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "wrapper/int8/conv1x1_init_int8.h"
#include <stdlib.h>
#include <string.h>
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/errorcode.h"
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
int8_t **packed_weight, int32_t **bias_data) {
if (packed_weight == NULL || bias_data == NULL) {
return NNACL_ERR;
}
#ifdef ENABLE_ARM32
/* InitWeightBiasArm32 */
/* weight */
size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
int8_t *packed_weight_ = (int8_t *)(malloc(size));
if (packed_weight_ == NULL) {
return NNACL_ERR;
}
memset(packed_weight_, 0, size);
RowMajor2Row2x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
/* bias */
size = UP_ROUND(output_channel, C2NUM);
int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
if (bias_data_ == NULL) {
free(packed_weight_);
return NNACL_ERR;
}
memset(bias_data_, 0, size * sizeof(int32_t));
if (src_bias != NULL) {
memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
}
#else
/* InitWeightBias */
/* weight */
size_t size = support_optimize ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C16NUM) * sizeof(int8_t)
: UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
int8_t *packed_weight_ = (int8_t *)(malloc(size));
if (packed_weight_ == NULL) {
return NNACL_ERR;
}
memset(packed_weight_, 0, size);
if (support_optimize) {
RowMajor2Row4x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
} else {
RowMajor2Row16x4MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
}
/* bias */
size = support_optimize ? UP_ROUND(output_channel, C16NUM) : UP_ROUND(output_channel, C4NUM);
int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
if (bias_data_ == NULL) {
free(packed_weight_);
return NNACL_ERR;
}
memset(bias_data_, 0, size * sizeof(int32_t));
if (src_bias != NULL) {
memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
}
#endif
/* InitBiasByzp */
/* bias = bias - v2 x zp1 + zp1 x zp2 */
for (int oc = 0; oc < output_channel; oc++) {
int32_t weight_sum_value = 0;
int32_t filter_zp = (filter_peroc) ? filter_zps[oc] : filter_zps[0];
for (int ic = 0; ic < input_channel; ic++) {
weight_sum_value += src_weight[oc * input_channel + ic];
}
bias_data_[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
}
*packed_weight = packed_weight_;
*bias_data = bias_data_;
return NNACL_OK;
}

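The InitBiasByzp loop implements the comment bias = bias - v2 x zp1 + zp1 x zp2: expanding the quantized dot product sum((x - zp_in) * (w - zp_w)) gives sum(x*w) - zp_w*sum(x) - zp_in*sum(w) + ic*zp_in*zp_w. The two weight-only terms are folded into the bias here, while zp_w*sum(x) is supplied at run time through input_sum_. A small self-contained check of that identity (all values arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Check the zero-point folding used by Conv1x1Init for one output channel:
 *   sum((x - zp_in) * (w - zp_w))
 *     == sum(x * w) - zp_w * sum(x)            (computed at run time)
 *      + zp_w * zp_in * ic - sum(w) * zp_in    (folded into the bias)   */
int main(void) {
  enum { IC = 5 };
  const int8_t x[IC] = {3, -7, 20, 0, -1};  /* arbitrary input row   */
  const int8_t w[IC] = {10, -4, 6, 2, -9};  /* arbitrary 1x1 filter  */
  const int32_t zp_in = 5, zp_w = -2;       /* arbitrary zero points */
  int32_t ref = 0, raw = 0, sum_x = 0, sum_w = 0;
  for (int i = 0; i < IC; i++) {
    ref += (x[i] - zp_in) * (w[i] - zp_w);
    raw += x[i] * w[i];
    sum_x += x[i];
    sum_w += w[i];
  }
  const int32_t bias_fold = zp_w * zp_in * IC - sum_w * zp_in;
  const int32_t runtime = raw - zp_w * sum_x + bias_fold;
  printf("direct=%d folded=%d\n", ref, runtime); /* prints equal values */
  return 0;
}
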
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.h

@@ -0,0 +1,28 @@
/*
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
#include <stdint.h>
#include <stdbool.h>
#include "nnacl/conv_parameter.h"
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
int8_t **packed_weight, int32_t **bias_data);
#endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_

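InitWeightBias in the coder emits a call to this entry point into the generated init segment. Assuming CodeFunctionWithCheck wraps the call in an error check, the output would read roughly like this; all g_* symbols, shapes, and zero points are hypothetical:

/* hedged sketch of the emitted init code, per-tensor quant case */
int32_t filter_zp[1] = {-2};
if (Conv1x1Init(g_weight, g_bias, filter_zp,
                64 /* input_channel */, 128 /* output_channel */,
                5 /* input_zp */, GetSupportOptFlag() /* ARM64 */,
                false /* filter_peroc */,
                (int8_t **)&g_packed_weight,
                (int32_t **)&g_bias_data) != NNACL_OK) {
  return RET_ERROR; /* assumed error path of the generated code */
}
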
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c

@@ -0,0 +1,224 @@
/*
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "wrapper/int8/conv1x1_run_int8.h"
#include "nnacl/base/conv1x1_base.h"
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/int8/pack_int8.h"
#include "nnacl/int8/conv1x1_int8.h"
#include "nnacl/errorcode.h"
void Pre1x1Trans(Conv1x1Args *args, int8_t *src_input, int8_t *src_output) {
args->output_ptr_ = src_output;
if (args->pre_trans_input_) {
Conv1x1InputPack(src_input, args->input_ptr_, args->conv_param_, sizeof(int8_t));
} else {
args->input_ptr_ = src_input;
}
}
int OcOptPre(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int cur_stride = args->thread_stride_hw_ * C4NUM;
int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
int cur_hw = MSMIN(cur_stride, res_stride);
if (cur_hw <= 0) {
return NNACL_OK;
}
int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
if (args->filter_peroc_) {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
} else {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
}
return NNACL_OK;
}
int RunArm64OptOc(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int stride = args->thread_stride_oc_ * C16NUM;
int cur_stride = task_id * stride;
int res_stride = args->matmul_param_->col_ - cur_stride;
int cur_oc = MSMIN(stride, res_stride);
if (cur_oc <= 0) {
return NNACL_OK;
}
bool filter_peroc = args->filter_peroc_;
int32_t *cur_left_shift =
filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
int32_t *cur_right_shift =
filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
int32_t *cur_multiplier =
filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
Conv1x1Int8Opt(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_4_,
args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_4_, cur_left_shift, cur_right_shift,
cur_multiplier, args->conv_param_, args->matmul_func_, cur_zp);
return NNACL_OK;
}
int RunArmOc(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
#ifdef ENABLE_ARM32
int col_tile = C2NUM;
#else
int col_tile = C4NUM;
#endif
int stride = args->thread_stride_oc_ * col_tile;
int cur_stride = task_id * stride;
int res_stride = args->matmul_param_->col_ - cur_stride;
int cur_oc = MSMIN(stride, res_stride);
if (cur_oc <= 0) {
return NNACL_OK;
}
bool filter_peroc = args->filter_peroc_;
int32_t *cur_left_shift =
filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
int32_t *cur_right_shift =
filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
int32_t *cur_multiplier =
filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
Conv1x1Int8(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_16_,
args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_16_, cur_left_shift, cur_right_shift,
cur_multiplier, args->conv_param_, cur_zp);
return NNACL_OK;
}
int RunArm64OptHw(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int cur_stride = args->thread_stride_hw_ * C4NUM;
int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
int cur_hw = MSMIN(cur_stride, res_stride);
if (cur_hw <= 0) {
return NNACL_OK;
}
int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
if (args->filter_peroc_) {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
} else {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
}
Conv1x1Int8Opt(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
args->matmul_param_->col_, args->matmul_param_->deep_4_, args->left_shift_, args->right_shift_,
args->multiplier_, args->conv_param_, args->matmul_func_, args->filter_zp_ptr_);
return NNACL_OK;
}
int RunArmHw(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int cur_stride = args->thread_stride_hw_ * C4NUM;
int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
int cur_hw = MSMIN(cur_stride, res_stride);
if (cur_hw <= 0) {
return NNACL_OK;
}
int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
int8_t *hw_packed_in =
args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_16_;
int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, args->matmul_param_->deep_);
if (args->filter_peroc_) {
PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, 1, UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
} else {
PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
}
Conv1x1Int8(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
args->matmul_param_->col_, args->matmul_param_->deep_16_, args->left_shift_, args->right_shift_,
args->multiplier_, args->conv_param_, args->filter_zp_ptr_);
return NNACL_OK;
}
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out) {
int row_pack_count = C4NUM;
int col_pack_count;
#ifdef ENABLE_ARM32
col_pack_count = C2NUM;
#else
if (args->support_optimize_) {
col_pack_count = C16NUM;
} else {
col_pack_count = C4NUM;
}
#endif
int hw_thread_count = UP_DIV(args->matmul_param_->row_, row_pack_count);
int oc_thread_count = UP_DIV(args->matmul_param_->col_, col_pack_count);
size_t thread_count_hw = MSMIN(thread_num, hw_thread_count);
args->thread_stride_hw_ = UP_DIV(hw_thread_count, thread_count_hw);
size_t thread_count_oc = MSMIN(thread_num, oc_thread_count);
args->thread_stride_oc_ = UP_DIV(oc_thread_count, thread_count_oc);
bool parallel_by_oc = oc_thread_count > thread_num;
for (int batch_index = 0; batch_index < args->conv_param_->input_batch_; batch_index++) {
Pre1x1Trans(args,
src_in + batch_index * args->conv_param_->input_h_ * args->conv_param_->input_w_ *
args->conv_param_->input_channel_,
src_out + batch_index * args->matmul_param_->row_ * args->matmul_param_->col_);
if (parallel_by_oc) {
/* input transpose and input sum */
if (args->support_optimize_) {
ParallelLaunch(thread_pool, OcOptPre, args, thread_count_hw);
} else {
RowMajor2Row16x4MajorInt8(args->input_ptr_, args->packed_input_, args->matmul_param_->row_,
args->matmul_param_->deep_);
if (args->filter_peroc_) {
PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_, 1, args->matmul_param_->row_4_,
args->matmul_param_->deep_16_);
} else {
PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_,
args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
args->matmul_param_->row_4_, args->matmul_param_->deep_16_);
}
}
/* matmul parallel by oc */
if (args->support_optimize_) {
ParallelLaunch(thread_pool, RunArm64OptOc, args, thread_count_oc);
} else {
ParallelLaunch(thread_pool, RunArmOc, args, thread_count_oc);
}
} else {
/* matmul parallel by hw */
if (args->support_optimize_) {
ParallelLaunch(thread_pool, RunArm64OptHw, args, thread_count_hw);
} else {
ParallelLaunch(thread_pool, RunArmHw, args, thread_count_hw);
}
}
}
}

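Conv1x1Run picks a parallel axis (output channels when there are more oc tiles than threads, otherwise output pixels), and each task then derives its slice from thread_stride_* exactly as the RunArm* callbacks above do: stride = tiles_per_task * tile_size, clamped by what is left. A standalone sketch of that arithmetic with made-up sizes:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  const int col = 37, col_tile = 16, thread_num = 4; /* hypothetical sizes */
  int oc_tiles = UP_DIV(col, col_tile);       /* 3 tiles of 16 channels    */
  int threads = MSMIN(thread_num, oc_tiles);  /* no more tasks than tiles  */
  int stride_oc = UP_DIV(oc_tiles, threads);  /* tiles per task            */
  for (int task_id = 0; task_id < threads; task_id++) {
    int start = task_id * stride_oc * col_tile;
    int cur_oc = MSMIN(stride_oc * col_tile, col - start);
    if (cur_oc <= 0) continue;                /* trailing tasks may be empty */
    printf("task %d: channels [%d, %d)\n", task_id, start, start + cur_oc);
  }
  return 0; /* prints [0,16) [16,32) [32,37), matching RunArm64OptOc */
}
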
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.h

@@ -0,0 +1,49 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
#include <stdint.h>
#include <stdbool.h>
#include "nnacl/conv_parameter.h"
#include "nnacl/matmul_parameter.h"
#include "src/runtime/thread_pool.h"
typedef struct {
int32_t *input_sum_; /* per-oc */
int32_t *filter_zp_ptr_; /* per-oc up round */
int32_t *left_shift_; /* per-oc up round */
int32_t *right_shift_; /* per-oc up round */
int32_t *multiplier_; /* per-oc up round */
int8_t *packed_weight_;
int32_t *bias_data_;
int8_t *packed_input_;
int8_t *input_ptr_;
int8_t *output_ptr_;
size_t thread_stride_hw_;
size_t thread_stride_oc_;
ConvParameter *conv_param_;
MatMulParameter *matmul_param_;
MATMUL_OPT_DP_FUNC matmul_func_;
bool pre_trans_input_;
bool support_optimize_;
bool filter_peroc_;
} Conv1x1Args;
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out);
#endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_