forked from OSchip/llvm-project
185 lines
6.7 KiB
C++
185 lines
6.7 KiB
C++
//===- FakeQuantSupport.cpp - Support utilities for FakeQuant ops ---------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Dialect/Quant/FakeQuantSupport.h"
|
|
#include "mlir/Dialect/Quant/QuantTypes.h"
|
|
|
|
using namespace mlir;
|
|
using namespace mlir::quant;
|
|
|
|
static bool getDefaultStorageParams(unsigned numBits, bool narrowRange,
|
|
bool isSigned, MLIRContext *ctx,
|
|
Type &storageType, int64_t &qmin,
|
|
int64_t &qmax) {
|
|
// Hard-coded type mapping from TFLite.
|
|
if (numBits <= 8) {
|
|
storageType = IntegerType::get(8, ctx);
|
|
if (isSigned) {
|
|
qmin = -128;
|
|
qmax = 127;
|
|
} else {
|
|
qmin = 0;
|
|
qmax = 255;
|
|
}
|
|
} else if (numBits <= 16) {
|
|
storageType = IntegerType::get(16, ctx);
|
|
if (isSigned) {
|
|
qmin = -32768;
|
|
qmax = 32767;
|
|
} else {
|
|
qmin = 0;
|
|
qmax = 65535;
|
|
}
|
|
} else if (numBits <= 32) {
|
|
storageType = IntegerType::get(32, ctx);
|
|
if (isSigned) {
|
|
qmin = std::numeric_limits<int32_t>::min();
|
|
qmax = std::numeric_limits<int32_t>::max();
|
|
} else {
|
|
qmin = std::numeric_limits<uint32_t>::min();
|
|
qmax = std::numeric_limits<uint32_t>::max();
|
|
}
|
|
} else {
|
|
return true;
|
|
}
|
|
|
|
// Handle narrowRange.
|
|
if (narrowRange) {
|
|
qmin += 1;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// This is a specific implementation of nudging:
|
|
// If 0.0 < rmin < rmax or rmin < rmax < 0.0, the range will be shifted
|
|
// to include 0.0, but the range width size (rmax-rmin) isn't changed. The zero
|
|
// point is derived from the shifted range, and the scale isn't changed. As
|
|
// a consequence some values, which are supposed in the original [rmin, rmax]
|
|
// range will be outside the shifted range and be clamped during quantization.
|
|
// TODO: we should nudge the scale as well, but that requires the
|
|
// fake quant op used in the training to use the nudged scale as well.
|
|
static void getNudgedScaleAndZeroPoint(int64_t qmin, int64_t qmax, double rmin,
|
|
double rmax, double &scale,
|
|
int64_t &nudgedZeroPoint) {
|
|
// Determine the scale.
|
|
const double qminDouble = qmin;
|
|
const double qmaxDouble = qmax;
|
|
scale = (rmax - rmin) / (qmaxDouble - qminDouble);
|
|
|
|
// Zero point computation.
|
|
// In float, solve the affine equation for any known pair
|
|
// (real value, corresponding quantized value), of which, two such pairs
|
|
// are known: (rmin, qmin), (rmax, qmax).
|
|
// The arithmetic error on the zero point computed from either pair will be
|
|
// roughly machine_epsilon * (sum of absolute values of terms).
|
|
// Use the variant that adds the smaller error.
|
|
const double zeroPointFromMin = qminDouble - rmin / scale;
|
|
const double zeroPointFromMinError =
|
|
std::abs(qminDouble) + std::abs(rmin / scale);
|
|
const double zeroPointFromMax = qmaxDouble - rmax / scale;
|
|
const double zeroPointFromMaxError =
|
|
std::abs(qmaxDouble) + std::abs(rmax / scale);
|
|
|
|
const double zeroPointDouble = (zeroPointFromMinError < zeroPointFromMaxError)
|
|
? zeroPointFromMin
|
|
: zeroPointFromMax;
|
|
|
|
// Now nudge the zero point to be an integer.
|
|
nudgedZeroPoint = 0;
|
|
if (zeroPointDouble < qminDouble) {
|
|
nudgedZeroPoint = qmin;
|
|
} else if (zeroPointDouble > qmaxDouble) {
|
|
nudgedZeroPoint = qmax;
|
|
} else {
|
|
nudgedZeroPoint = round(zeroPointDouble);
|
|
}
|
|
|
|
// By construction, the nudged zero point should always be in range.
|
|
assert(nudgedZeroPoint >= qmin);
|
|
assert(nudgedZeroPoint <= qmax);
|
|
}
|
|
|
|
UniformQuantizedType
|
|
mlir::quant::fakeQuantAttrsToType(Location loc, unsigned numBits, double rmin,
|
|
double rmax, bool narrowRange,
|
|
Type expressedType, bool isSigned) {
|
|
MLIRContext *ctx = expressedType.getContext();
|
|
unsigned flags = isSigned ? QuantizationFlags::Signed : 0;
|
|
Type storageType;
|
|
int64_t qmin;
|
|
int64_t qmax;
|
|
if (getDefaultStorageParams(numBits, narrowRange, isSigned, ctx, storageType,
|
|
qmin, qmax)) {
|
|
return (emitError(loc, "unsupported FakeQuant number of bits: ") << numBits,
|
|
nullptr);
|
|
}
|
|
|
|
// Special case where min/max is close enough. The tensor contents are all
|
|
// 0.0s, so the scale is set to 1.0 and the tensor can be quantized to zero
|
|
// points and dequantized to 0.0.
|
|
if (std::fabs(rmax - rmin) < std::numeric_limits<double>::epsilon()) {
|
|
return UniformQuantizedType::getChecked(flags, storageType, expressedType,
|
|
1.0, qmin, qmin, qmax, loc);
|
|
}
|
|
|
|
double scale;
|
|
int64_t nudgedZeroPoint;
|
|
getNudgedScaleAndZeroPoint(qmin, qmax, rmin, rmax, scale, nudgedZeroPoint);
|
|
|
|
return UniformQuantizedType::getChecked(flags, storageType, expressedType,
|
|
scale, nudgedZeroPoint, qmin, qmax,
|
|
loc);
|
|
}
|
|
|
|
UniformQuantizedPerAxisType mlir::quant::fakeQuantAttrsToType(
|
|
Location loc, unsigned numBits, int32_t quantizedDimension,
|
|
ArrayRef<double> rmins, ArrayRef<double> rmaxs, bool narrowRange,
|
|
Type expressedType, bool isSigned) {
|
|
size_t axis_size = rmins.size();
|
|
if (axis_size != rmaxs.size()) {
|
|
return (emitError(loc, "mismatched per-axis min and max size: ")
|
|
<< axis_size << " vs. " << rmaxs.size(),
|
|
nullptr);
|
|
}
|
|
|
|
MLIRContext *ctx = expressedType.getContext();
|
|
Type storageType;
|
|
int64_t qmin;
|
|
int64_t qmax;
|
|
if (getDefaultStorageParams(numBits, narrowRange, isSigned, ctx, storageType,
|
|
qmin, qmax)) {
|
|
return (emitError(loc, "unsupported FakeQuant number of bits: ") << numBits,
|
|
nullptr);
|
|
}
|
|
|
|
SmallVector<double, 4> scales;
|
|
SmallVector<int64_t, 4> zeroPoints;
|
|
scales.reserve(axis_size);
|
|
zeroPoints.reserve(axis_size);
|
|
for (size_t axis = 0; axis != axis_size; ++axis) {
|
|
double rmin = rmins[axis];
|
|
double rmax = rmaxs[axis];
|
|
if (std::fabs(rmax - rmin) < std::numeric_limits<double>::epsilon()) {
|
|
scales.push_back(1.0);
|
|
zeroPoints.push_back(qmin);
|
|
continue;
|
|
}
|
|
|
|
double scale;
|
|
int64_t nudgedZeroPoint;
|
|
getNudgedScaleAndZeroPoint(qmin, qmax, rmin, rmax, scale, nudgedZeroPoint);
|
|
scales.push_back(scale);
|
|
zeroPoints.push_back(nudgedZeroPoint);
|
|
}
|
|
|
|
unsigned flags = isSigned ? QuantizationFlags::Signed : 0;
|
|
return UniformQuantizedPerAxisType::getChecked(
|
|
flags, storageType, expressedType, scales, zeroPoints, quantizedDimension,
|
|
qmin, qmax, loc);
|
|
}
|