!12391 add conv1x1 coder
From: @zhujingxuan Reviewed-by: Signed-off-by:
This commit is contained in:
commit
3cc3d5c9cf
|
@ -81,6 +81,7 @@ set(CODER_OPCODERS_SRC
|
|||
${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc
|
||||
${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc
|
||||
${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc
|
||||
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc
|
||||
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
|
||||
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
|
||||
${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
|
||||
|
@ -126,13 +127,10 @@ set(LITE_KERNEL_SRC
|
|||
${LITE_DIR}/nnacl/int8/fixed_point.c
|
||||
${LITE_DIR}/nnacl/fp32/matmul_fp32.c
|
||||
${LITE_DIR}/nnacl/int8/conv3x3_int8.c
|
||||
)
|
||||
set(MICRO_ADAPTER_SRC
|
||||
${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
|
||||
${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
|
||||
${MICRO_DIR}/wrapper/int8/conv_init_int8.c
|
||||
${LITE_DIR}/nnacl/int8/conv1x1_int8.c
|
||||
${LITE_DIR}/nnacl/base/conv1x1_base.c
|
||||
)
|
||||
|
||||
list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
|
||||
${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MICRO_ADAPTER_SRC})
|
||||
${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC})
|
||||
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
# thread_pool.c requires pthreads; propagate the flag to all wrapper sources.
# (lowercased `set` to match the convention used in the rest of the cmake files)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")

# Runtime wrapper sources that are compiled alongside the generated code.
set(MICRO_WRAPPER_SRC
    ${LITE_DIR}/src/runtime/thread_pool.c
    ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
    ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
    ${MICRO_DIR}/wrapper/int8/conv_init_int8.c
    ${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c
    ${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c
)

list(APPEND FILE_SET ${MICRO_WRAPPER_SRC})
|
|
@ -19,6 +19,7 @@ include_directories(${TOP_DIR}/mindspore/core/)
|
|||
#include coder
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)
|
||||
include(${MICRO_DIR}/cmake/file_list.cmake)
|
||||
include(${MICRO_DIR}/cmake/wrapper.cmake)
|
||||
add_executable(codegen main.cc ${FILE_SET})
|
||||
add_dependencies(codegen fbs_src)
|
||||
add_dependencies(codegen fbs_inner_src)
|
||||
|
|
|
@ -0,0 +1,193 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "securec/include/securec.h"
|
||||
#include "src/runtime/kernel/arm/base/convolution_base.h"
|
||||
#include "micro/coder/opcoders/file_collector.h"
|
||||
#include "micro/coder/log.h"
|
||||
#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
|
||||
|
||||
namespace mindspore::lite::micro::nnacl {
|
||||
|
||||
// Prepares the 1x1 int8 conv coder: quant params, packed weight/bias,
// matmul shape parameters and run buffers. Returns RET_OK on success.
int Conv2D1x1Int8Coder::Prepare(CoderContext *const context) {
  // Guard against leaking a previously allocated parameter block if
  // Prepare() is invoked more than once on the same coder instance.
  if (matmul_param_ == nullptr) {
    matmul_param_ = new (std::nothrow) MatMulParameter();
    MS_CHECK_PTR(matmul_param_);
  }
  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Init failed");
  MS_CHECK_RET_CODE(Conv2DBaseCoder::SetQuantParam(), "SetQuantParam failed");
  // Per-output-channel quantization when there is more than one filter
  // quant parameter (i.e. not per-tensor).
  filter_peroc_ = (conv_param_->conv_quant_arg_.filter_arg_num_ != kPerTensor);
  if (filter_peroc_) {
    MS_CHECK_RET_CODE(InitFilterPeroc(), "InitFilterPeroc failed.");
  }
  CheckSupportOptimize();
  MS_CHECK_RET_CODE(InitWeightBias(context), "InitWeightBias failed");
  MS_CHECK_RET_CODE(InitParam(), "InitParam failed");
  MS_CHECK_RET_CODE(InitRunBuf(), "InitRunBuf failed");
  return RET_OK;
}
|
||||
|
||||
// Emits the per-inference code for the 1x1 int8 convolution: declares the
// conv/matmul parameter structs and a Conv1x1Args pack, then calls the
// Conv1x1Run wrapper in the generated source.
int Conv2D1x1Int8Coder::DoCode(CoderContext *const context) {
  // Register the headers and source files the generated code depends on.
  Collect(context,
          {"nnacl/int8/conv1x1_int8.h", "nnacl/common_func.h", "wrapper/int8/conv1x1_init_int8.h",
           "wrapper/int8/conv1x1_run_int8.h"},
          {"common_func.c", "pack.c", "conv1x1_int8.c", "matmul_int8.c", "fixed_point.c", "conv1x1_init_int8.c",
           "conv1x1_run_int8.c"});

  nnacl::NNaclInt8Serializer code;

  // Serialize the parameter structs by value into the generated code.
  code.CodeStruct("conv_param", *conv_param_);
  code.CodeStruct("matmul_param", *matmul_param_);

  // Argument order must match the Conv1x1Args field order declared in
  // wrapper/int8/conv1x1_run_int8.h; the two nullptr / 0,0 slots are the
  // input_ptr_/output_ptr_ and thread_stride fields filled at run time.
  code.CodeBaseStruct("Conv1x1Args", "args", input_sum_, filter_zp_ptr_, left_shift_, right_shift_, multiplier_,
                      packed_weight_, bias_data_, packed_input_, nullptr, nullptr, 0, 0, "conv_param", "matmul_param",
                      matmul_func_, pre_trans_input_, support_optimize_, filter_peroc_);

  code.CodeFunction("Conv1x1Run", input_tensor_, "args", "THREAD_POOL_DEFAULT", thread_num_s_, output_tensor_);

  context->AppendCode(code.str());
  return RET_OK;
}
|
||||
|
||||
void Conv2D1x1Int8Coder::CheckSupportOptimize() {
|
||||
support_optimize_ = false;
|
||||
matmul_func_ = "MatMulInt8_4x16_r";
|
||||
if (target_ == kARM64) {
|
||||
matmul_func_ = "MatMulDpInt8_optimize_handler";
|
||||
}
|
||||
}
|
||||
|
||||
// Emits init-time code that packs the weight tensor and builds the bias
// (with zero-points folded in) by calling the Conv1x1Init wrapper.
// The packed buffers themselves are allocated by the generated code.
int Conv2D1x1Int8Coder::InitWeightBias(CoderContext *const context) {
  int32_t input_channel = filter_tensor_->Channel();
  int32_t output_channel = filter_tensor_->Batch();
  int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;

  nnacl::NNaclInt8Serializer code;

  // kOnlineSize placeholders: actual sizes are computed at init time
  // inside Conv1x1Init, which mallocs and returns the real buffers.
  packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(packed_weight_);
  bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(bias_data_);

  // Out-parameter expressions for the generated Conv1x1Init call.
  std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
  std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
  std::string filter_zp_str = "";
  if (filter_peroc_) {
    // Per-channel: use the zp array prepared by InitFilterPeroc().
    filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
  } else {
    // Per-tensor: emit a one-element zp array inline in the generated code.
    MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
    filter_zp_str = "filter_zp";
    code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
  }

  if (target_ == kARM64) {
    // On ARM64 the SDOT capability is probed at runtime via GetSupportOptFlag().
    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
                               output_channel, input_zp, "GetSupportOptFlag()", filter_peroc_, packed_weight_str,
                               bias_data_str);
  } else {
    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
                               output_channel, input_zp, support_optimize_, filter_peroc_, packed_weight_str,
                               bias_data_str);
  }

  context->AppendInitCode(code.str());
  return RET_OK;
}
|
||||
|
||||
// For per-output-channel quantization: materializes the per-channel
// zero-point, shift and multiplier arrays, padded (zero-filled) to the
// output-channel tile size the kernels read in whole tiles.
int Conv2D1x1Int8Coder::InitFilterPeroc() {
  int32_t output_channel = filter_tensor_->Batch();
  int round_oc;
  if (target_ == kARM32A) {
    round_oc = UP_ROUND(output_channel, C2NUM);
  } else {
    // NOTE(review): UP_ROUND(x, C16NUM) >= UP_ROUND(x, C4NUM) always, so the
    // MSMAX looks redundant — kept to mirror the runtime kernel's sizing.
    round_oc = MSMAX(UP_ROUND(output_channel, C16NUM), UP_ROUND(output_channel, C4NUM));
  }

  MS_CHECK_TRUE(conv_quant_arg_->filter_arg_num_ == static_cast<size_t>(output_channel),
                "weight per channel quant param length is not equal to filter num, filter is not PerChannel");
  size_t output_size = output_channel * sizeof(int32_t);  // valid (unpadded) bytes
  size_t oc_size = round_oc * sizeof(int32_t);            // padded allocation size

  /* filter zp */
  filter_zp_ptr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_size, kOfflinePackWeight));
  MS_CHECK_PTR(filter_zp_ptr_);
  MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
  for (int fi = 0; fi < output_channel; fi++) {
    filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
  }

  /* left shift: zero-padded to round_oc, first output_channel entries copied */
  left_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(left_shift_);
  MS_CHECK_RET_CODE(memset_s(left_shift_, oc_size, 0, oc_size), "memset left_shift_ failed");
  MS_CHECK_RET_CODE(memcpy_s(left_shift_, oc_size, conv_param_->conv_quant_arg_.left_shift_, output_size),
                    "memcpy_s left_shift_ failed");

  /* right shift: same padding scheme as left shift */
  right_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(right_shift_);
  MS_CHECK_RET_CODE(memset_s(right_shift_, oc_size, 0, oc_size), "memset right_shift_ failed");
  MS_CHECK_RET_CODE(memcpy_s(right_shift_, oc_size, conv_param_->conv_quant_arg_.right_shift_, output_size),
                    "memcpy_s right_shift_ failed");
  /* multiplier: same padding scheme */
  multiplier_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(multiplier_);
  MS_CHECK_RET_CODE(memset_s(multiplier_, oc_size, 0, oc_size), "memset multiplier_ failed");
  MS_CHECK_RET_CODE(memcpy_s(multiplier_, oc_size, conv_param_->conv_quant_arg_.quant_multiplier_, output_size),
                    "memcpy_s multiplier_ failed");

  return RET_OK;
}
|
||||
|
||||
// Derives the matmul view of the 1x1 convolution:
//   row = out_h * out_w, deep = in_channel, col = out_channel,
// plus the tile-rounded variants the packed kernels index with.
int Conv2D1x1Int8Coder::InitParam() {
  // Any padding or stride != 1 means the input must be repacked first.
  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                      conv_param_->stride_w_ != 1);

  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->col_ = conv_param_->output_channel_;
  matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
  matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
  matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);

  int row_pack_count = C4NUM;
  /* init input sum size: one int32 sum per packed row */
  input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);

  if (pre_trans_input_) {
    // Workspace for the strided/padded input gathered into dense rows.
    input_ptr_ = reinterpret_cast<int8_t *>(
      allocator_->Malloc(kNumberTypeInt8, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t), kWorkspace));
    MS_CHECK_PTR(input_ptr_);
  }

  return RET_OK;
}
|
||||
|
||||
// Allocates the per-inference workspace: the input-sum vector and the
// packed-input buffer, sized for whichever kernel layout (8x4 or 4x16)
// needs the larger buffer so either path can run.
int Conv2D1x1Int8Coder::InitRunBuf() {
  input_sum_ =
    reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, input_sum_size_ * sizeof(int32_t), kWorkspace));
  MS_CHECK_PTR(input_sum_);

  // Max of the two packing layouts' footprints (optimized vs generic path).
  size_t size = MSMAX(UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM),
                      UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM));

  packed_input_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, size * sizeof(int8_t), kWorkspace));
  MS_CHECK_PTR(packed_input_);
  return RET_OK;
}
|
||||
|
||||
} // namespace mindspore::lite::micro::nnacl
|
|
@ -0,0 +1,67 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_
|
||||
#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_
|
||||
#include "micro/coder/opcoders/base/conv2d_base_coder.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "nnacl/conv_parameter.h"
|
||||
|
||||
namespace mindspore::lite::micro::nnacl {
|
||||
class Conv2D1x1Int8Coder final : public Conv2DBaseCoder {
|
||||
public:
|
||||
Conv2D1x1Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
|
||||
const Model::Node *node, size_t node_index, Target target)
|
||||
: Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
|
||||
|
||||
int Prepare(CoderContext *const context) override;
|
||||
|
||||
int DoCode(CoderContext *const context) override;
|
||||
|
||||
~Conv2D1x1Int8Coder() override = default;
|
||||
|
||||
private:
|
||||
void CheckSupportOptimize();
|
||||
|
||||
int InitWeightBias(CoderContext *const context);
|
||||
|
||||
int InitFilterPeroc();
|
||||
|
||||
int InitParam();
|
||||
|
||||
int InitRunBuf();
|
||||
|
||||
int32_t *input_sum_{nullptr}; /* per-oc */
|
||||
int32_t *filter_zp_ptr_{nullptr}; /* per-oc up round */
|
||||
int32_t *left_shift_{nullptr}; /* per-oc up round */
|
||||
int32_t *right_shift_{nullptr}; /* per-oc up round */
|
||||
int32_t *multiplier_{nullptr}; /* per-oc up round */
|
||||
int8_t *packed_weight_{nullptr};
|
||||
int32_t *bias_data_{nullptr};
|
||||
int8_t *packed_input_{nullptr};
|
||||
int8_t *input_ptr_{nullptr};
|
||||
int8_t *output_ptr_{nullptr};
|
||||
size_t input_sum_size_{0};
|
||||
MatMulParameter *matmul_param_{nullptr};
|
||||
std::string matmul_func_;
|
||||
bool pre_trans_input_{false};
|
||||
bool support_optimize_{false};
|
||||
bool filter_peroc_{false};
|
||||
};
|
||||
} // namespace mindspore::lite::micro::nnacl
|
||||
#endif // MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_
|
|
@ -60,6 +60,16 @@ void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParamete
|
|||
conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_, conv_parameter.act_type_);
|
||||
}
|
||||
|
||||
// Emits a C initializer for a MatMulParameter variable named `name` into the
// generated source. NOTE(review): the argument order here must match the
// field order of the MatMulParameter declaration — verify when that struct
// changes, since a mismatch silently mis-initializes the generated struct.
void NNaclInt8Serializer::CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter) {
  CodeBaseStruct("MatMulParameter", name, matmul_parameter.op_parameter_, matmul_parameter.has_bias_,
                 matmul_parameter.row_, matmul_parameter.col_, matmul_parameter.row_4_, matmul_parameter.row_6_,
                 matmul_parameter.row_12_, matmul_parameter.row_16_, matmul_parameter.row_align_,
                 matmul_parameter.col_4_, matmul_parameter.col_8_, matmul_parameter.col_align_, matmul_parameter.deep_,
                 matmul_parameter.deep_4_, matmul_parameter.deep_16_, matmul_parameter.batch,
                 matmul_parameter.a_transpose_, matmul_parameter.b_transpose_, matmul_parameter.a_const_,
                 matmul_parameter.b_const_, matmul_parameter.act_type_);
}
|
||||
|
||||
void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) {
|
||||
CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_,
|
||||
arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_,
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "wrapper/int8/conv1x1_init_int8.h"
#include <memory.h>
#include <stdlib.h>
#include <string.h>
#include "nnacl/errorcode.h"
#include "nnacl/int8/matmul_int8.h"
|
||||
|
||||
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
|
||||
int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
|
||||
int8_t **packed_weight, int32_t **bias_data) {
|
||||
if (packed_weight == NULL || bias_data == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
#ifdef ENABLE_ARM32
|
||||
/* InitWeightBiasArm32 */
|
||||
/* weight */
|
||||
size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
|
||||
int8_t *packed_weight_ = (int8_t *)(malloc(size));
|
||||
if (packed_weight_ == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
memset(packed_weight_, 0, size);
|
||||
RowMajor2Row2x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
|
||||
/* bias */
|
||||
size = UP_ROUND(output_channel, C2NUM);
|
||||
int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
|
||||
if (bias_data_ == NULL) {
|
||||
free(packed_weight_);
|
||||
return NNACL_ERR;
|
||||
}
|
||||
memset(bias_data_, 0, size * sizeof(int32_t));
|
||||
if (src_bias != NULL) {
|
||||
memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
|
||||
}
|
||||
#else
|
||||
/* InitWeightBias */
|
||||
/* weight */
|
||||
size_t size = support_optimize ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C16NUM) * sizeof(int8_t)
|
||||
: UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
|
||||
int8_t *packed_weight_ = (int8_t *)(malloc(size));
|
||||
if (packed_weight_ == NULL) {
|
||||
return NNACL_ERR;
|
||||
}
|
||||
memset(packed_weight_, 0, size);
|
||||
if (support_optimize) {
|
||||
RowMajor2Row4x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
|
||||
} else {
|
||||
RowMajor2Row16x4MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
|
||||
}
|
||||
/* bias */
|
||||
size = support_optimize ? UP_ROUND(output_channel, C16NUM) : UP_ROUND(output_channel, C4NUM);
|
||||
int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
|
||||
if (bias_data_ == NULL) {
|
||||
free(packed_weight_);
|
||||
return NNACL_ERR;
|
||||
}
|
||||
memset(bias_data_, 0, size * sizeof(int32_t));
|
||||
if (src_bias != NULL) {
|
||||
memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
|
||||
}
|
||||
#endif
|
||||
/* InitBiasByzp */
|
||||
/* bias = bias - v2 x zp1 + zp1 x zp2 */
|
||||
for (int oc = 0; oc < output_channel; oc++) {
|
||||
int32_t weight_sum_value = 0;
|
||||
int32_t filter_zp = (filter_peroc) ? filter_zps[oc] : filter_zps[0];
|
||||
for (int ic = 0; ic < input_channel; ic++) {
|
||||
weight_sum_value += src_weight[oc * input_channel + ic];
|
||||
}
|
||||
bias_data_[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
|
||||
}
|
||||
|
||||
*packed_weight = packed_weight_;
|
||||
*bias_data = bias_data_;
|
||||
return NNACL_OK;
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
/*
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
|
||||
#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include "nnacl/conv_parameter.h"
|
||||
|
||||
/* Packs the 1x1-conv weight for the target matmul layout and folds the
 * input/filter zero-points into the bias. On success *packed_weight and
 * *bias_data receive malloc'ed buffers that the caller owns and must
 * free(). Returns NNACL_OK, or NNACL_ERR on invalid args / OOM. */
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
                int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
                int8_t **packed_weight, int32_t **bias_data);
|
||||
|
||||
#endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
|
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "wrapper/int8/conv1x1_run_int8.h"
|
||||
#include "nnacl/base/conv1x1_base.h"
|
||||
#include "nnacl/int8/matmul_int8.h"
|
||||
#include "nnacl/int8/pack_int8.h"
|
||||
#include "nnacl/int8/conv1x1_int8.h"
|
||||
#include "nnacl/errorcode.h"
|
||||
|
||||
/* Binds the current batch's input/output pointers into the argument pack.
 * When padding or stride requires it, the input is repacked into the
 * pre-allocated transform buffer; otherwise it is referenced in place. */
void Pre1x1Trans(Conv1x1Args *args, int8_t *src_input, int8_t *src_output) {
  args->output_ptr_ = src_output;
  if (!args->pre_trans_input_) {
    args->input_ptr_ = src_input;
    return;
  }
  Conv1x1InputPack(src_input, args->input_ptr_, args->conv_param_, sizeof(int8_t));
}
|
||||
|
||||
/* ParallelLaunch task for the optimized (SDOT) oc-parallel path: packs this
 * task's slice of rows into the 4x4 layout and computes the per-row input
 * sums used later for zero-point compensation. */
int OcOptPre(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    /* no rows left for this task id */
    return NNACL_OK;
  }
  /* offsets of this task's row slice in the raw, packed and sum buffers */
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;

  if (args->filter_peroc_) {
    /* per-channel: sums are scaled by the per-oc zp later, so use factor 1 */
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
  } else {
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
  }
  return NNACL_OK;
}
|
||||
|
||||
/* ParallelLaunch task: SDOT-optimized 1x1 matmul, parallelized over output
 * channels in tiles of C16NUM. Each task handles one contiguous oc slice. */
int RunArm64OptOc(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int stride = args->thread_stride_oc_ * C16NUM;
  int cur_stride = task_id * stride;
  int res_stride = args->matmul_param_->col_ - cur_stride;
  int cur_oc = MSMIN(stride, res_stride);
  if (cur_oc <= 0) {
    /* no output channels left for this task id */
    return NNACL_OK;
  }

  bool filter_peroc = args->filter_peroc_;
  /* per-channel quantization indexes the padded per-oc arrays at this
   * task's first channel; per-tensor uses the shared scalar parameters */
  int32_t *cur_left_shift =
    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
  int32_t *cur_right_shift =
    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
  int32_t *cur_multiplier =
    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;

  Conv1x1Int8Opt(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_4_,
                 args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
                 args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_4_, cur_left_shift, cur_right_shift,
                 cur_multiplier, args->conv_param_, args->matmul_func_, cur_zp);
  return NNACL_OK;
}
|
||||
|
||||
/* ParallelLaunch task: generic 1x1 matmul, parallelized over output channels.
 * The oc tile width depends on the architecture (2 on ARM32, 4 elsewhere). */
int RunArmOc(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
#ifdef ENABLE_ARM32
  int col_tile = C2NUM;
#else
  int col_tile = C4NUM;
#endif
  int stride = args->thread_stride_oc_ * col_tile;
  int cur_stride = task_id * stride;
  int res_stride = args->matmul_param_->col_ - cur_stride;
  int cur_oc = MSMIN(stride, res_stride);
  if (cur_oc <= 0) {
    /* no output channels left for this task id */
    return NNACL_OK;
  }

  bool filter_peroc = args->filter_peroc_;
  /* per-channel quantization indexes the padded per-oc arrays at this
   * task's first channel; per-tensor uses the shared scalar parameters */
  int32_t *cur_left_shift =
    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
  int32_t *cur_right_shift =
    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
  int32_t *cur_multiplier =
    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;

  /* generic path uses the 16-deep packed weight layout (deep_16_) */
  Conv1x1Int8(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_16_,
              args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
              args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_16_, cur_left_shift, cur_right_shift,
              cur_multiplier, args->conv_param_, cur_zp);
  return NNACL_OK;
}
|
||||
|
||||
/* ParallelLaunch task: SDOT-optimized path parallelized over spatial rows
 * (hw). Each task packs its own row slice, computes input sums, then runs
 * the matmul over the full set of output channels. */
int RunArm64OptHw(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    /* no rows left for this task id */
    return NNACL_OK;
  }
  /* offsets of this task's row slice in the input, output, packed and sum buffers */
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;

  if (args->filter_peroc_) {
    /* per-channel: sums are scaled by the per-oc zp inside the kernel, so use factor 1 */
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
  } else {
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
  }

  Conv1x1Int8Opt(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
                 args->matmul_param_->col_, args->matmul_param_->deep_4_, args->left_shift_, args->right_shift_,
                 args->multiplier_, args->conv_param_, args->matmul_func_, args->filter_zp_ptr_);
  return NNACL_OK;
}
|
||||
|
||||
/* ParallelLaunch task: generic path parallelized over spatial rows (hw).
 * Each task packs its row slice into the 16x4 layout, computes input sums,
 * then runs the matmul over the full set of output channels. */
int RunArmHw(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    /* no rows left for this task id */
    return NNACL_OK;
  }

  /* offsets of this task's row slice in the input, output, packed and sum buffers */
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
  int8_t *hw_packed_in =
    args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_16_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;

  RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, args->matmul_param_->deep_);

  if (args->filter_peroc_) {
    /* per-channel: sums are scaled by the per-oc zp inside the kernel, so use factor 1 */
    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, 1, UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
  } else {
    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
                             UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
  }

  Conv1x1Int8(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
              args->matmul_param_->col_, args->matmul_param_->deep_16_, args->left_shift_, args->right_shift_,
              args->multiplier_, args->conv_param_, args->filter_zp_ptr_);
  return NNACL_OK;
}
|
||||
|
||||
/* Drives the whole int8 1x1 convolution. Per batch it binds/repacks the
 * input, then runs the matmul parallelized either over output channels (oc)
 * or over spatial rows (hw), whichever exposes more work per thread. */
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out) {
  int row_pack_count = C4NUM;
  int col_pack_count;

#ifdef ENABLE_ARM32
  col_pack_count = C2NUM;
#else
  if (args->support_optimize_) {
    col_pack_count = C16NUM;
  } else {
    col_pack_count = C4NUM;
  }
#endif
  /* Degenerate shapes would make hw/oc_thread_count zero and a non-positive
   * thread_num would make thread_count_* zero — either way the UP_DIV calls
   * below would divide by zero. Guard both. */
  if (args->matmul_param_->row_ <= 0 || args->matmul_param_->col_ <= 0) {
    return;
  }
  if (thread_num < 1) {
    thread_num = 1;
  }
  int hw_thread_count = UP_DIV(args->matmul_param_->row_, row_pack_count);
  int oc_thread_count = UP_DIV(args->matmul_param_->col_, col_pack_count);
  size_t thread_count_hw = MSMIN(thread_num, hw_thread_count);
  args->thread_stride_hw_ = UP_DIV(hw_thread_count, thread_count_hw);
  size_t thread_count_oc = MSMIN(thread_num, oc_thread_count);
  args->thread_stride_oc_ = UP_DIV(oc_thread_count, thread_count_oc);
  /* split over oc only when there are more oc tiles than threads */
  bool parallel_by_oc = oc_thread_count > thread_num;

  for (int batch_index = 0; batch_index < args->conv_param_->input_batch_; batch_index++) {
    Pre1x1Trans(args,
                src_in + batch_index * args->conv_param_->input_h_ * args->conv_param_->input_w_ *
                             args->conv_param_->input_channel_,
                src_out + batch_index * args->matmul_param_->row_ * args->matmul_param_->col_);
    if (parallel_by_oc) {
      /* input transpose and input sum */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, OcOptPre, args, thread_count_hw);
      } else {
        RowMajor2Row16x4MajorInt8(args->input_ptr_, args->packed_input_, args->matmul_param_->row_,
                                  args->matmul_param_->deep_);
        if (args->filter_peroc_) {
          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_, 1, args->matmul_param_->row_4_,
                                   args->matmul_param_->deep_16_);
        } else {
          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_,
                                   args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
                                   args->matmul_param_->row_4_, args->matmul_param_->deep_16_);
        }
      }
      /* matmul parallel by oc */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, RunArm64OptOc, args, thread_count_oc);
      } else {
        ParallelLaunch(thread_pool, RunArmOc, args, thread_count_oc);
      }
    } else {
      /* matmul parallel by hw */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, RunArm64OptHw, args, thread_count_hw);
      } else {
        ParallelLaunch(thread_pool, RunArmHw, args, thread_count_hw);
      }
    }
  }
}
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
|
||||
#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include "nnacl/conv_parameter.h"
|
||||
#include "nnacl/matmul_parameter.h"
|
||||
#include "src/runtime/thread_pool.h"
|
||||
|
||||
/* Argument pack emitted by the code generator and handed to Conv1x1Run().
 * Buffers are allocated at init/prepare time by the generated code; the
 * pointers marked "current batch" are rebound each batch by Pre1x1Trans(). */
typedef struct {
  int32_t *input_sum_;      /* per-oc */
  int32_t *filter_zp_ptr_;  /* per-oc up round */
  int32_t *left_shift_;     /* per-oc up round */
  int32_t *right_shift_;    /* per-oc up round */
  int32_t *multiplier_;     /* per-oc up round */
  int8_t *packed_weight_;   /* weight packed for the target matmul layout */
  int32_t *bias_data_;      /* bias with zero-points folded in */
  int8_t *packed_input_;    /* workspace for the packed input rows */
  int8_t *input_ptr_;       /* current batch input (possibly pre-transformed) */
  int8_t *output_ptr_;      /* current batch output */
  size_t thread_stride_hw_; /* rows-of-4 per task; set inside Conv1x1Run */
  size_t thread_stride_oc_; /* oc tiles per task; set inside Conv1x1Run */
  ConvParameter *conv_param_;
  MatMulParameter *matmul_param_;
  MATMUL_OPT_DP_FUNC matmul_func_;
  bool pre_trans_input_;    /* true when pad/stride require an input repack */
  bool support_optimize_;   /* true on the SDOT-optimized ARM64 path */
  bool filter_peroc_;       /* per-output-channel quantization */
} Conv1x1Args;
|
||||
|
||||
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out);
|
||||
|
||||
#endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
|
Loading…
Reference in New Issue