!5878 reduce prod support int
Merge pull request !5878 from zhaozhenlong/lite/issue/reduce_int
Commit 8200410f20
@@ -46,6 +46,15 @@ void IndirectGemmFp32(float *output, const float *input, const float *weight, co
 int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
 int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
 int offset4d(const int *shape, const int *dims);
+inline bool isAddOverflow(int32_t x, int32_t y) {
+  int32_t sum = x + y;
+  return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
+}
+
+inline bool isMulOverflow(int32_t x, int32_t y) {
+  int32_t p = x * y;
+  return (x != 0) && (p / x != y);
+}
 
 #ifdef ENABLE_ARM64
 void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);
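
An aside on the helper idiom above: isMulOverflow inspects the wrapped product after the fact, which strictly speaking relies on signed overflow behavior that C leaves undefined. A minimal standalone sketch of an equivalent check done with a widening 64-bit multiply instead (names here are illustrative, not part of the patch):

#include <cstdint>
#include <cstdio>

// Hypothetical widening variant: compute the product in 64 bits, then test
// whether it still fits in int32_t, avoiding wrapped signed arithmetic.
static bool mulWouldOverflow(int32_t x, int32_t y) {
  int64_t p = static_cast<int64_t>(x) * static_cast<int64_t>(y);
  return p > INT32_MAX || p < INT32_MIN;
}

int main() {
  int32_t acc = 1;
  const int32_t values[] = {100000, 50000};  // product is 5e9, above INT32_MAX
  for (int32_t v : values) {
    if (mulWouldOverflow(acc, v)) {
      std::printf("overflow detected\n");
      return 1;
    }
    acc *= v;
  }
  std::printf("product = %d\n", acc);
  return 0;
}
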
@@ -17,6 +17,7 @@
 #include <float.h>
 #include "nnacl/fp32/reduce.h"
 #include "nnacl/errorcode.h"
+#include "nnacl/common_func.h"
 
 int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
                const int tid, const int thread_num) {
@@ -123,6 +124,31 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size,
   }
   return NNACL_OK;
 }
+
+int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
+                  const int tid, const int thread_num) {
+  if (src_data == NULL || dst_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  int i, j, k;
+  for (j = tid; j < outer_size; j += thread_num) {
+    const int *outer_src = src_data + j * axis_size * inner_size;
+    int *outer_dst = dst_data + j * inner_size;
+    for (k = 0; k < inner_size; k++) {
+      const int *inner_src = outer_src + k;
+      int *inner_dst = outer_dst + k;
+      int tmp = 1;
+      for (i = 0; i < axis_size; i++) {
+        if (isMulOverflow(tmp, inner_src[i * inner_size])) {
+          return NNACL_ERRCODE_MUL_OVERFLOW;
+        }
+        tmp *= inner_src[i * inner_size];
+      }
+      *inner_dst = tmp;
+    }
+  }
+  return NNACL_OK;
+}
 int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                     float *dst_data, const int tid, const int thread_num) {
   if (src_data == NULL || dst_data == NULL) {
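
To make the indexing above concrete: the source is viewed as [outer_size, axis_size, inner_size] and the middle axis is reduced, so a [2, 3] tensor reduced over axis 1 has outer_size = 2, axis_size = 3, inner_size = 1. A single-threaded standalone sketch of the same loop nest (overflow check omitted for brevity; not linked against nnacl):

#include <cstdio>

// Same loop structure as IntReduceProd with tid = 0, thread_num = 1.
// src is treated as [outer_size, axis_size, inner_size]; the middle axis is reduced.
void reduceProdSketch(int outer_size, int inner_size, int axis_size, const int *src, int *dst) {
  for (int j = 0; j < outer_size; ++j) {
    const int *outer_src = src + j * axis_size * inner_size;
    int *outer_dst = dst + j * inner_size;
    for (int k = 0; k < inner_size; ++k) {
      int tmp = 1;
      for (int i = 0; i < axis_size; ++i) {
        tmp *= outer_src[i * inner_size + k];
      }
      outer_dst[k] = tmp;
    }
  }
}

int main() {
  const int src[6] = {1, 2, 3, 4, 5, 6};  // shape [2, 3], reduce over axis 1
  int dst[2] = {0, 0};
  reduceProdSketch(/*outer_size=*/2, /*inner_size=*/1, /*axis_size=*/3, src, dst);
  std::printf("%d %d\n", dst[0], dst[1]);  // prints: 6 120
  return 0;
}
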
@@ -32,6 +32,8 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c
               const int tid, const int thread_num);
 int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
                const int tid, const int thread_num);
+int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
+                  const int tid, const int thread_num);
 int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                     float *dst_data, const int tid, const int thread_num);
 #ifdef __cplusplus
@@ -18,16 +18,7 @@
 #include "nnacl/int8/reduce_int8.h"
 #include "nnacl/errorcode.h"
 #include "nnacl/quantization/fixed_point.h"
-
-inline bool isAddOverflow(int32_t x, int32_t y) {
-  int32_t sum = x + y;
-  return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
-}
-
-inline bool isMulOverflow(int32_t x, int32_t y) {
-  int32_t p = x * y;
-  return (x != 0) && (p / x != y);
-}
+#include "nnacl/common_func.h"
 
 // Get x such that (x-zp_in) * scale_in = mean
 // Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
@@ -268,7 +259,7 @@ int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis
         RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
                               (tmp - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset)),
                               quant->in_out_multiplier_),
-                            quant->in_out_right_shift_ + base_offset);
+                            quant->in_out_right_shift_ + base_offset);
       if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
         return NNACL_ERRCODE_ADD_OVERFLOW;
       }
@@ -53,6 +53,8 @@
 
 typedef enum LiteDataType {
   kDataTypeFloat,
+  kDataTypeInt,
+  kDataTypeInt8,
 } LiteDataType;
 
 
@@ -257,6 +257,8 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *>
 }
 
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Mean, CpuReduceInt8KernelCreator)
@@ -64,6 +64,7 @@ int ReduceCPUKernel::Init() {
     }
     case static_cast<int>(ReduceMode_ReduceProd): {
       reducer_ = ReduceProd;
+      int_reducer_ = IntReduceProd;
       break;
     }
     case static_cast<int>(ReduceMode_ReduceSumSquare): {
@@ -81,10 +82,25 @@ int ReduceCPUKernel::Init() {
   return ReSize();
 }
 
-int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); }
+int ReduceCPUKernel::ReSize() {
+  if (in_tensors().at(0)->data_type() == kNumberTypeFloat32) {
+    data_type_ = kDataTypeFloat;
+  } else {
+    data_type_ = kDataTypeInt;
+  }
+  return ReduceBaseCPUKernel::ReSize();
+}
 
 int ReduceCPUKernel::CallReduceUnit(int task_id) {
-  auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_);
+  int ret;
+  if (data_type_ == kDataTypeFloat) {
+    ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_),
+                   static_cast<float *>(dst_data_), task_id, context_->thread_num_);
+  } else {
+    ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_),
+                       static_cast<int *>(dst_data_), task_id, context_->thread_num_);
+  }
+
   return ret;
 }
 
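
The dispatch above keeps one function pointer per element type and erases the buffer types to void *, with a tag (set in ReSize from the input tensor's data type) picking the branch at call time. A compressed standalone sketch of that idiom (simplified signatures; not the actual kernel classes):

#include <cstdio>

// Reducer signatures mirror the nnacl ones, trimmed to the essentials.
typedef int (*FloatReducer)(int n, const float *src, float *dst);
typedef int (*IntReducer)(int n, const int *src, int *dst);

enum DataType { kFloat, kInt };

static int floatProd(int n, const float *src, float *dst) {
  float tmp = 1.0f;
  for (int i = 0; i < n; ++i) tmp *= src[i];
  *dst = tmp;
  return 0;
}

static int intProd(int n, const int *src, int *dst) {
  int tmp = 1;
  for (int i = 0; i < n; ++i) tmp *= src[i];
  *dst = tmp;
  return 0;
}

// Type-erased dispatch: the tag decides which pointer gets the void * buffers.
static int callReduce(DataType t, FloatReducer fr, IntReducer ir, int n, const void *src, void *dst) {
  if (t == kFloat) {
    return fr(n, static_cast<const float *>(src), static_cast<float *>(dst));
  }
  return ir(n, static_cast<const int *>(src), static_cast<int *>(dst));
}

int main() {
  const int src[3] = {2, 3, 4};
  int dst = 0;
  callReduce(kInt, floatProd, intProd, 3, src, &dst);
  std::printf("%d\n", dst);  // prints: 24
  return 0;
}
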
@@ -110,12 +126,12 @@ int ReduceCPUKernel::Run() {
     return ret;
   }
 
-  src_data_ = static_cast<float *>(in_tensors_.at(0)->MutableData());
+  src_data_ = in_tensors_.at(0)->MutableData();
   for (size_t i = 0; i < static_cast<size_t>(num_axes_); ++i) {
     if (i != static_cast<size_t>(num_axes_ - 1)) {
       dst_data_ = data_buffers_[i];
     } else {
-      dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
+      dst_data_ = out_tensors_.at(0)->MutableData();
     }
     outer_size_ = outer_sizes_[i];
     inner_size_ = inner_sizes_[i];
@@ -135,7 +151,12 @@ int ReduceCPUKernel::Run() {
 int ReduceCPUKernel::MallocTmpBuffer() {
   data_buffers_.clear();
   for (auto size : buffer_sizes_) {
-    float *buffer = reinterpret_cast<float *>(context_->allocator->Malloc(size * sizeof(float)));
+    void *buffer;
+    if (data_type_ == kDataTypeFloat) {
+      buffer = context_->allocator->Malloc(size * sizeof(float));
+    } else {
+      buffer = context_->allocator->Malloc(size * sizeof(int));
+    }
     if (buffer == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed.";
       return RET_ERROR;
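
On the platforms lite targets, sizeof(float) and sizeof(int) are both 4 bytes, so the two branches currently request the same byte count; the branch documents the element type and stays correct if the types ever diverge. A small illustrative helper (hypothetical, not in the patch) that centralizes the choice:

#include <cstddef>
#include <cstdlib>

enum DataType { kFloat, kInt };

// Hypothetical helper mapping the runtime tag to its element size.
static size_t ElementSize(DataType t) { return t == kFloat ? sizeof(float) : sizeof(int); }

int main() {
  const size_t count = 16;
  void *buffer = std::malloc(count * ElementSize(kInt));  // stands in for allocator->Malloc
  if (buffer == nullptr) return 1;
  std::free(buffer);
  return 0;
}
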
@@ -146,8 +167,7 @@ int ReduceCPUKernel::MallocTmpBuffer() {
 }
 
 void ReduceCPUKernel::FreeTmpBuffer() {
-  for (size_t i = 0; i < data_buffers_.size(); i++) {
-    float *buffer = data_buffers_[i];
+  for (auto buffer : data_buffers_) {
     if (buffer != nullptr) {
       context_->allocator->Free(buffer);
       buffer = nullptr;
@@ -29,6 +29,8 @@ namespace mindspore::kernel {
 class ReduceCPUKernel : public ReduceBaseCPUKernel {
   typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                          float *dst_data, const int tid, const int thread_num);
+  typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data,
+                            int *dst_data, const int tid, const int thread_num);
 
  public:
   ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -36,9 +38,10 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
                   const mindspore::lite::PrimitiveC *primitive)
       : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
   ~ReduceCPUKernel() {
     FreeTmpBuffer();
     src_data_ = nullptr;
     dst_data_ = nullptr;
     reducer_ = nullptr;
+    int_reducer_ = nullptr;
   }
 
   int Init() override;
@@ -48,9 +51,12 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
 
  private:
   Reducer reducer_ = nullptr;
-  std::vector<float *> data_buffers_;
-  const float *src_data_ = nullptr;
-  float *dst_data_ = nullptr;
+  IntReducer int_reducer_ = nullptr;
+  std::vector<void *> data_buffers_;
+  LiteDataType data_type_;
+
+  const void *src_data_ = nullptr;
+  void *dst_data_ = nullptr;
 
  private:
   int MallocTmpBuffer();