!19854 clearing the untrusted code of quantization

Merge pull request !19854 from Erpim/master
This commit is contained in:
i-robot 2021-07-12 11:24:10 +00:00 committed by Gitee
commit 5cde059ef3
21 changed files with 231 additions and 165 deletions

View File

@ -42,7 +42,6 @@ const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGradGpuKernel::GetWork
bool FakeLearnedScaleQuantPerChannelGradGpuKernel::Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
MS_LOG(EXCEPTION) << "Input number is " << input_num
<< ", but FakeLearnedScaleQuantPerChannelGrad GpuKernel OP needs 4 input.";
@ -109,7 +108,7 @@ bool FakeLearnedScaleQuantPerChannelGradGpuKernel::Launch(const std::vector<Addr
if (global_step_ >= quant_delay_) {
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float) * num_channels_,
cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float) * kChannelLen,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed");
CalLSQNudgePerChannel(input, quant_num_, input_alpha, input_quant_max, input_div_alpha, input_quant, neg_trunc_,
@ -118,7 +117,7 @@ bool FakeLearnedScaleQuantPerChannelGradGpuKernel::Launch(const std::vector<Addr
neg_trunc_, num_channels_, reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float) * num_channels_,
cudaMemcpyAsync(grad_alpha, alpha_no_grad, sizeof(float) * kChannelLen,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,

View File

@ -36,7 +36,6 @@ const std::vector<size_t> &FakeLearnedScaleQuantPerLayerGradGpuKernel::GetWorksp
bool FakeLearnedScaleQuantPerLayerGradGpuKernel::Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
MS_LOG(EXCEPTION) << "Input number is " << input_num
<< ", but FakeLearnedScaleQuantPerLayerGrad GpuKernel OP needs 4 input.";

View File

@ -29,7 +29,7 @@ namespace mindspore {
namespace opt {
namespace {
void CreateOutputsOfLSQPerLayerGradD(const FuncGraphPtr &graph, const CNodePtr &lsq_perlayer_grad_node,
std::vector<AnfNodePtr> *lsq_perlayer_grad_d_outputs) {
std::vector<AnfNodePtr> *const lsq_perlayer_grad_d_outputs) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(lsq_perlayer_grad_node);
const auto &lsq_perlayer_grad_inputs = lsq_perlayer_grad_node->inputs();
@ -58,7 +58,7 @@ void CreateOutputsOfLSQPerLayerGradD(const FuncGraphPtr &graph, const CNodePtr &
void CreateOutputsOfLSQPerLayerReduceGrad(const FuncGraphPtr &graph, const CNodePtr &lsq_perlayer_grad_node,
const std::vector<AnfNodePtr> &lsq_perlayer_grad_d_outputs,
std::vector<AnfNodePtr> *lsq_perlayer_reduce_grad_outputs) {
std::vector<AnfNodePtr> *const lsq_perlayer_reduce_grad_outputs) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(lsq_perlayer_grad_node);
MS_EXCEPTION_IF_NULL(lsq_perlayer_reduce_grad_outputs);
@ -86,7 +86,7 @@ void CreateOutputsOfLSQPerLayerReduceGrad(const FuncGraphPtr &graph, const CNode
}
void CreateOutputsOfLSQPerChannelGradD(const FuncGraphPtr &graph, const CNodePtr &lsq_perchannel_grad_node,
std::vector<AnfNodePtr> *lsq_perchannel_grad_d_outputs) {
std::vector<AnfNodePtr> *const lsq_perchannel_grad_d_outputs) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(lsq_perchannel_grad_node);
const auto &lsq_perchannel_grad_inputs = lsq_perchannel_grad_node->inputs();
@ -116,7 +116,7 @@ void CreateOutputsOfLSQPerChannelGradD(const FuncGraphPtr &graph, const CNodePtr
void CreateOutputsOfLSQPerChannelReduceGrad(const FuncGraphPtr &graph, const CNodePtr &lsq_perchannel_grad_node,
const std::vector<AnfNodePtr> &lsq_perchannel_grad_d_outputs,
std::vector<AnfNodePtr> *lsq_perchannel_reduce_grad_outputs) {
std::vector<AnfNodePtr> *const lsq_perchannel_reduce_grad_outputs) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(lsq_perchannel_grad_node);
MS_EXCEPTION_IF_NULL(lsq_perchannel_reduce_grad_outputs);

View File

@ -30,7 +30,7 @@ from ...nn.layer import quant
from ...ops import functional as F
from ..common import QuantDtype
from .quantizer import Quantizer, OptimizeOption
from .quant_utils import compute_KL_threshold
from .quant_utils import compute_kl_threshold
__all__ = ["QuantizationAwareTraining", "create_quant_config"]
@ -281,7 +281,8 @@ class QuantizationAwareTraining(Quantizer):
mode=self.mode)
self.eps = 1e-5
def _convert_op_name(self, name):
@staticmethod
def _convert_op_name(name):
pattern = re.compile(r'([A-Z]{1})')
name_new = re.sub(pattern, r'_\1', name).lower()
if name_new[0] == '_':
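For reference, a minimal standalone sketch of the conversion this (now static) helper performs; the trailing return is inferred from the body shown above:

    import re

    def convert_op_name(name):
        # insert '_' before each capital letter, lower-case, then strip a leading '_'
        name_new = re.sub(r'([A-Z])', r'_\1', name).lower()
        return name_new[1:] if name_new.startswith('_') else name_new

    print(convert_op_name("BatchNormFold"))  # -> batch_norm_fold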
@ -382,7 +383,7 @@ class QuantizationAwareTraining(Quantizer):
scale_factor = (subcell.batchnorm.gamma.data.asnumpy() /
np.sqrt(subcell.batchnorm.moving_variance.data.asnumpy() + self.eps))
subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
min_init, max_init = self._KL_init(subcell_weight_para, self.weight_dtype)
min_init, max_init = self._kl_init(subcell_weight_para, self.weight_dtype)
self.quant_config = self.quant_config._replace(
weight=self.quant_config.weight.partial_init(min_init=min_init, max_init=max_init))
@ -485,7 +486,7 @@ class QuantizationAwareTraining(Quantizer):
scale_factor = (subcell.batchnorm.gamma.data.asnumpy() /
np.sqrt(subcell.batchnorm.moving_variance.data.asnumpy() + self.eps))
subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
min_init, max_init = self._KL_init(subcell_weight_para, self.weight_dtype)
min_init, max_init = self._kl_init(subcell_weight_para, self.weight_dtype)
self.quant_config = self.quant_config._replace(
weight=self.quant_config.weight.partial_init(min_init=min_init, max_init=max_init))
@ -533,16 +534,16 @@ class QuantizationAwareTraining(Quantizer):
quant_dtype=self.act_dtype)
raise ValueError("Unsupported activation in auto quant: ", act_class)
def _KL_init(self, subcell_weight_para, weight_dtype):
def _kl_init(self, subcell_weight_para, weight_dtype):
"""
Calculate the value of max_init and min_init with compute_KL_threshold.
Calculate the value of max_init and min_init with compute_kl_threshold.
"""
if self.weight_channel:
max_init = [compute_KL_threshold(weight_para_each, weight_dtype)
max_init = [compute_kl_threshold(weight_para_each, weight_dtype)
for weight_para_each in subcell_weight_para]
min_init = [-x for x in max_init]
else:
max_init = [compute_KL_threshold(subcell_weight_para, weight_dtype)]
max_init = [compute_kl_threshold(subcell_weight_para, weight_dtype)]
min_init = [-x for x in max_init]
return min_init, max_init
@ -567,15 +568,17 @@ class QuantizationAwareTraining(Quantizer):
raise ValueError("The `_set_mixed_bits` function is currently only valid for `LEARNED_SCALE` "
"optimize_option.")
self.quantizable_idx = []
quantizable_idx = []
pass_cell = None
for i, cell_and_name in enumerate(network.cells_and_names()):
cell = cell_and_name[1]
if isinstance(cell, (nn.Conv2dBnAct, nn.DenseBnAct)) and cell is not pass_cell:
self.quantizable_idx.append(i)
quantizable_idx.append(i)
assert len(self.quantizable_idx) == len(strategy)
quantizable_layer_bit_dict = {idx: bit for idx, bit in zip(self.quantizable_idx, strategy)}
if len(quantizable_idx) != len(strategy):
raise ValueError("The dimension of quantifiable layers is not consistent with that of strategy.")
quantizable_layer_bit_dict = {idx: bit for idx, bit in zip(quantizable_idx, strategy)}
type_map = {
QuantDtype.INT2.num_bits: QuantDtype.INT2,
QuantDtype.INT3.num_bits: QuantDtype.INT3,
@ -587,7 +590,7 @@ class QuantizationAwareTraining(Quantizer):
}
for i, cell_and_name in enumerate(network.cells_and_names()):
cell = cell_and_name[1]
if i not in self.quantizable_idx:
if i not in quantizable_idx:
continue
else:
if isinstance(cell, (nn.Conv2dBnAct, nn.DenseBnAct)):
@ -598,7 +601,7 @@ class QuantizationAwareTraining(Quantizer):
scale_factor = (cell.conv.gamma.data.asnumpy() /
np.sqrt(cell.conv.moving_variance.data.asnumpy() + self.eps))
subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
min_init, max_init = self._KL_init(subcell_weight_para, cell.weight_dtype)
min_init, max_init = self._kl_init(subcell_weight_para, cell.weight_dtype)
cell.conv.fake_quant_weight.reset(quant_dtype=cell.weight_dtype,
min_init=min_init,
max_init=max_init)
@ -608,7 +611,7 @@ class QuantizationAwareTraining(Quantizer):
scale_factor = (cell.dense.gamma.data.asnumpy() /
np.sqrt(cell.dense.moving_variance.data.asnumpy() + self.eps))
subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
min_init, max_init = self._KL_init(subcell_weight_para, cell.weight_dtype)
min_init, max_init = self._kl_init(subcell_weight_para, cell.weight_dtype)
cell.dense.fake_quant_weight.reset(quant_dtype=cell.weight_dtype,
min_init=min_init,
max_init=max_init)

View File

@ -222,7 +222,7 @@ def without_fold_batchnorm(weight, cell_quant):
return weight, bias
def compute_KL_threshold(data, bitwidth):
def compute_kl_threshold(data, bitwidth):
r"""
Using KL-J Distance to calculate the clip threshold.
@ -232,20 +232,18 @@ def compute_KL_threshold(data, bitwidth):
Outputs:
Tensor with shape 1, the clip threshold of the data.
"""
bitwidth = bitwidth.num_bits
data_min = 0
data_max = np.abs(data).max()
if data_max < 1e-5:
return 1e-5
hist, bin_edges = np.histogram(np.abs(data), bins='sqrt', range=(data_min, data_max), density=True)
hist, bin_edges = np.histogram(np.abs(data), bins='sqrt', range=(0, data_max), density=True)
# For the sake of efficiency, we limit the maximum number of bins to 1024 in `sqrt` mode. If the
# histogram exceeds that size, fall back to the default bins config.
largest_bin_size = 1024
if hist.shape[0] > largest_bin_size:
hist, bin_edges = np.histogram(np.abs(data), range=(data_min, data_max), density=True)
hist, bin_edges = np.histogram(np.abs(data), range=(0, data_max), density=True)
hist = hist / np.sum(hist)
cumsum = np.cumsum(hist)
bit_pow_range = pow(2, int(bitwidth) - 1)
bit_pow_range = pow(2, int(bitwidth.num_bits) - 1)
threshold = []
scaling_factor = []
kl = []
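A hedged NumPy sketch of the histogram preparation above; the subsequent KL-J search over `threshold`, `scaling_factor`, and `kl` is elided, and `num_bits` stands in for `bitwidth.num_bits`:

    import numpy as np

    def kl_threshold_prep(data, num_bits):
        data_max = np.abs(data).max()
        if data_max < 1e-5:
            return 1e-5
        hist, bin_edges = np.histogram(np.abs(data), bins='sqrt', range=(0, data_max), density=True)
        # cap 'sqrt' mode at 1024 bins; beyond that, fall back to the default bins config
        if hist.shape[0] > 1024:
            hist, bin_edges = np.histogram(np.abs(data), range=(0, data_max), density=True)
        hist = hist / np.sum(hist)
        cumsum = np.cumsum(hist)
        bit_pow_range = 2 ** (num_bits - 1)
        return hist, cumsum, bit_pow_range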
@ -349,11 +347,11 @@ def load_nonquant_param_into_quant_net(quant_model, params_dict, quant_new_param
subcell_weight_para = subcell_weight_para * scale_factor.reshape(-1, 1, 1, 1)
if cell.fake_quant_weight.per_channel:
max_init = [compute_KL_threshold(weight_para_each, cell.fake_quant_weight.quant_dtype)
max_init = [compute_kl_threshold(weight_para_each, cell.fake_quant_weight.quant_dtype)
for weight_para_each in subcell_weight_para]
min_init = [-x for x in max_init]
else:
max_init = [compute_KL_threshold(subcell_weight_para, cell.fake_quant_weight.quant_dtype)]
max_init = [compute_kl_threshold(subcell_weight_para, cell.fake_quant_weight.quant_dtype)]
min_init = [-x for x in max_init]
cell.fake_quant_weight.reset(quant_dtype=cell.fake_quant_weight.quant_dtype,

View File

@ -57,4 +57,8 @@ class Quantizer(ABC):
@abstractmethod
def quantize(self, network):
pass
"""
Quant API to convert the input network to a quantization aware training network.
Args:
network (Cell): Network to be quantized.
"""

View File

@ -1535,6 +1535,7 @@ class ActQuant(_QuantActivation):
quant_dtype=quant_dtype,
neg_trunc=self.neg_trunc,
narrow_range=self.narrow_range)
def construct(self, x):
if self.fake_before:
x = self.fake_quant_act_before(x)

View File

@ -125,7 +125,7 @@ def get_bprop_batchnorm_fold2(self):
@bprop_getters.register(Q.BatchNormFoldD)
def get_bprop_BatchNormFold(self):
def get_bprop_batchnormfold(self):
"""Generate bprop for BatchNormFold for Ascend"""
op = Q.BatchNormFoldGradD(self.epsilon, self.is_training, self.freeze_bn)
@ -137,7 +137,7 @@ def get_bprop_BatchNormFold(self):
@bprop_getters.register(P.BNTrainingReduce)
def get_bprop_BNTrainingReduce(self):
def get_bprop_bn_training_reduce(self):
"""Generate bprop for BNTrainingReduce for Ascend"""
def bprop(x, out, dout):

View File

@ -57,6 +57,43 @@ def _batchnorm_fold_tbe():
return
def _batchnorm_fold_compute(x_input, x_sum, x_square_sum, mean, variance, momentum, epsilon):
"""_batchnorm_fold_compute"""
shape_x = te.lang.cce.util.shape_to_list(x_input.shape)
num = shape_x[0] * shape_x[2] * shape_x[3]
num_rec = 1.0 / num
# compute the mean of x
batch_mean = te.lang.cce.vmuls(x_sum, num_rec)
# compute the variance of x
variance_div = te.lang.cce.vmuls(x_square_sum, num_rec)
mean_square = te.lang.cce.vmul(batch_mean, batch_mean)
batch_var_biased = te.lang.cce.vsub(variance_div, mean_square)
batch_std = te.lang.cce.vsqrt(te.lang.cce.vadds(batch_var_biased, epsilon))
if num == 1:
batch_var_scaler = 0.0
else:
batch_var_scaler = float(num) / (num - 1)
batch_var_unbiased = te.lang.cce.vmuls(batch_var_biased, batch_var_scaler)
factor = 1.0 - momentum
factor_reverse = momentum
mean_mul = te.lang.cce.vmuls(batch_mean, factor)
mean_mul_rev = te.lang.cce.vmuls(mean, factor_reverse)
mean_updated = te.lang.cce.vadd(mean_mul, mean_mul_rev)
var_mul = te.lang.cce.vmuls(batch_var_unbiased, factor)
var_mul_rev = te.lang.cce.vmuls(variance, factor_reverse)
variance_updated = te.lang.cce.vadd(var_mul, var_mul_rev)
y = te.lang.cce.vadds(x_input, 0.0)
running_mean = te.lang.cce.vadds(mean, 0.0)
running_std = te.lang.cce.vsqrt(te.lang.cce.vadds(variance, epsilon))
res = [y, batch_mean, batch_std, running_mean, running_std, mean_updated, variance_updated]
return res
@util.check_input_type(dict, dict, dict, dict, dict,
dict, dict, dict, dict, dict, dict, dict,
float, float, bool, int, str, str)
@ -108,39 +145,7 @@ def batchnorm_fold(x, x_sum, x_square_sum, mean, variance,
mean = tvm.placeholder(shape_mean, name="mean", dtype=dtype_mean.lower())
variance = tvm.placeholder(shape_mean, name="variance", dtype=dtype_variance.lower())
shape_x = te.lang.cce.util.shape_to_list(x_input.shape)
num = shape_x[0] * shape_x[2] * shape_x[3]
num_rec = 1.0 / num
# compute the mean of x
batch_mean = te.lang.cce.vmuls(x_sum, num_rec)
# compute the variance of x
variance_div = te.lang.cce.vmuls(x_square_sum, num_rec)
mean_square = te.lang.cce.vmul(batch_mean, batch_mean)
batch_var_biased = te.lang.cce.vsub(variance_div, mean_square)
batch_std = te.lang.cce.vsqrt(te.lang.cce.vadds(batch_var_biased, epsilon))
if num == 1:
batch_var_scaler = 0.0
else:
batch_var_scaler = float(num) / (num - 1)
batch_var_unbiased = te.lang.cce.vmuls(batch_var_biased, batch_var_scaler)
factor = 1.0 - momentum
factor_reverse = momentum
mean_mul = te.lang.cce.vmuls(batch_mean, factor)
mean_mul_rev = te.lang.cce.vmuls(mean, factor_reverse)
mean_updated = te.lang.cce.vadd(mean_mul, mean_mul_rev)
var_mul = te.lang.cce.vmuls(batch_var_unbiased, factor)
var_mul_rev = te.lang.cce.vmuls(variance, factor_reverse)
variance_updated = te.lang.cce.vadd(var_mul, var_mul_rev)
y = te.lang.cce.vadds(x_input, 0.0)
running_mean = te.lang.cce.vadds(mean, 0.0)
running_std = te.lang.cce.vsqrt(te.lang.cce.vadds(variance, epsilon))
res = [y, batch_mean, batch_std, running_mean, running_std, mean_updated, variance_updated]
res = _batchnorm_fold_compute(x_input, x_sum, x_square_sum, mean, variance, momentum, epsilon)
with tvm.target.cce():
sch = generic.auto_schedule(res)
config = {"name": kernel_name,

View File

@ -21,6 +21,7 @@ from te.platform.cce_build import build_config
from topi import generic
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from impl.bn_training_reduce import bn_training_reduce_schedule_nd
SHAPE_SIZE_LIMIT = 2147483648
@ -81,9 +82,8 @@ def batchnorm_fold2_grad_reduce(dout, x, dout_reduce, dout_x_reduce, kernel_name
util.check_kernel_name(kernel_name)
util.check_shape_rule(shape)
util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
check_list = ["float16", "float32"]
inp_dtype = x.get("dtype").lower()
if not inp_dtype in check_list:
if not inp_dtype in ["float16", "float32"]:
raise RuntimeError("Dtype of input only support float16, float32")
dout_t = tvm.placeholder(shape, name="dout", dtype=inp_dtype)
x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
@ -100,7 +100,7 @@ def batchnorm_fold2_grad_reduce(dout, x, dout_reduce, dout_x_reduce, kernel_name
te.lang.cce.cce_build_code(sch, config)
return
from impl.bn_training_reduce import bn_training_reduce_schedule_nd
sch, tensor_list = bn_training_reduce_schedule_nd(res_list)
with build_config:
tvm.build(sch, tensor_list, "cce", name=kernel_name)

View File

@ -95,12 +95,12 @@ def batchnorm_fold_grad(d_batch_mean, d_batch_std, x, batch_mean, batch_std, dx,
dtype_mean = d_batch_mean.get("dtype").lower()
if format_data == "NC1HWC0":
if len(shape_x) != 5:
raise RuntimeError("batchnorm_fold only support shape 5D"
raise RuntimeError("batchnorm_fold grad only support shape 5D"
"when input format is NC1HWC0")
shape_mean = (1, shape_x[1], 1, 1, shape_x[4])
elif format_data == "NCHW":
if len(shape_x) < 2 or len(shape_x) > 4:
raise RuntimeError("batchnorm_fold only support shape 2D to 4D")
raise RuntimeError("batchnorm_fold grad only support shape 2D to 4D")
if shape_x[1] != shape_mean[0]:
raise RuntimeError("data_format is NCHW, shape_bias must"
"be equal to the second axis of shape_x")

View File

@ -66,9 +66,8 @@ def correction_mul(x, batch_std, running_std, y, channel, kernel_name="correctio
util.check_kernel_name(kernel_name)
util.check_shape_rule(shape)
util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
check_list = ["float16", "float32"]
inp_dtype = x.get("dtype").lower()
if not inp_dtype in check_list:
if not inp_dtype in ["float16", "float32"]:
raise RuntimeError("Dtype of input only support float16, float32")
x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)

View File

@ -71,10 +71,9 @@ def fake_learned_scale_quant_perchannel_compute(input_data, alpha_data, quant_ma
return res
@util.check_input_type(dict, dict, dict, dict, bool, int, str)
def fake_learned_scale_quant_perchannel(input_x, alpha, quant_max, out, neg_trunc, channel_axis,
kernel_name="fake_learned_scale_quant_perchannel"):
"""FakeLearnedScaleQuantPerChannel"""
def fake_learned_scale_quant_perchannel_param(input_x, alpha, quant_max, channel_axis,
kernel_name="fake_learned_scale_quant_perchannel"):
"""Get and check FakeLearnedScaleQuantPerChannel parameters"""
input_shape = input_x.get("shape")
input_x_shape_ = input_x.get("ori_shape")
input_x_format = input_x.get("format")
@ -113,6 +112,16 @@ def fake_learned_scale_quant_perchannel(input_x, alpha, quant_max, out, neg_trun
input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
alpha_data = tvm.placeholder(shape_c, name="alpha_data", dtype=alpha_dtype)
quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
return input_data, alpha_data, quant_max_data
@util.check_input_type(dict, dict, dict, dict, bool, int, str)
def fake_learned_scale_quant_perchannel(input_x, alpha, quant_max, out, neg_trunc, channel_axis,
kernel_name="fake_learned_scale_quant_perchannel"):
"""FakeLearnedScaleQuantPerChannel"""
input_data, alpha_data, quant_max_data = \
fake_learned_scale_quant_perchannel_param(input_x, alpha, quant_max, channel_axis, kernel_name)
res = fake_learned_scale_quant_perchannel_compute(input_data, alpha_data, quant_max_data, neg_trunc, kernel_name)
with tvm.target.cce():
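The same split is applied to every TBE op touched by this commit: parameter checking and placeholder creation move into a `*_param` helper, and the registered op becomes a thin wrapper over the helper plus the existing `*_compute` function. A hypothetical minimal outline of the pattern (all names here are illustrative, not from the source):

    def my_op_param(input_x, kernel_name="my_op"):
        """Get and check parameters, returning what the compute step needs."""
        shape = input_x.get("shape")
        dtype = input_x.get("dtype").lower()
        if dtype not in ("float16", "float32"):
            raise RuntimeError("Dtype of input only supports float16, float32")
        return shape, dtype

    def my_op(input_x, out, kernel_name="my_op"):
        """Public op: validate via the helper, then run the compute step."""
        shape, dtype = my_op_param(input_x, kernel_name)
        # ... build placeholders and call the corresponding *_compute function ...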

View File

@ -58,6 +58,36 @@ def _fake_learned_scale_quant_perchannel_grad_d_tbe():
return
def _sign_function(dtype, input_div_alpha):
"""sign function imp"""
if dtype == "float32":
data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP32, dtype=dtype)
elif dtype == "float16":
data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP16, dtype=dtype)
else:
data_min = tvm.const(1, dtype=dtype)
neg_data_min = tvm.const(-1, dtype=dtype)
vmax = te.lang.cce.vmaxs(input_div_alpha, neg_data_min)
vmin = te.lang.cce.vmins(vmax, data_min)
if dtype == "float32":
# max num of float32 is 2**126
max_support_fp32 = tvm.const(2 ** 62, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp32)
res_mul2 = te.lang.cce.vmuls(res_mul1, max_support_fp32)
sign = te.lang.cce.vmuls(res_mul2, tvm.const(2 ** 2, dtype=dtype))
elif dtype == "float16":
# max num of float16 is 2**24
# but cce can only support 2**12, so apply 2**12 twice to cover 2**24
max_support_fp16 = tvm.const(2 ** 12, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp16)
sign = te.lang.cce.vmuls(res_mul1, max_support_fp16)
else:
sign = vmin
return sign
@fusion_manager.register("fake_learned_scale_quant_perchannel_grad_d")
def fake_learned_scale_quant_perchannel_grad_d_compute(dout, input_data, alpha_data, quant_max_data, neg_trunc,
kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
@ -86,7 +116,6 @@ def fake_learned_scale_quant_perchannel_grad_d_compute(dout, input_data, alpha_d
tensor_one = tvm.const(1.0, input_div_alpha.dtype)
tensor_one = te.lang.cce.broadcast(tensor_one, shape)
#out_of_bounds = te.lang.cce.vcmpsel(te.lang.cce.vabs(input_div_alpha), 1.0, 'gt', 1.0, 0.0)
out_of_upper_bounds = te.lang.cce.vcmpsel(input_div_alpha, 1.0, 'gt', 1.0, 0.0)
if neg_trunc:
out_of_lower_bounds = te.lang.cce.vcmpsel(input_div_alpha, 0.0, 'lt', 1.0, 0.0)
@ -96,32 +125,7 @@ def fake_learned_scale_quant_perchannel_grad_d_compute(dout, input_data, alpha_d
dx = te.lang.cce.vmul(dx, te.lang.cce.vsub(tensor_one, out_of_bounds))
# sign function imp
if dtype == "float32":
data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP32, dtype=dtype)
elif dtype == "float16":
data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP16, dtype=dtype)
else:
data_min = tvm.const(1, dtype=dtype)
neg_data_min = tvm.const(-1, dtype=dtype)
vmax = te.lang.cce.vmaxs(input_div_alpha, neg_data_min)
vmin = te.lang.cce.vmins(vmax, data_min)
if dtype == "float32":
# max num of float32 is 2**126
max_support_fp32 = tvm.const(2 ** 62, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp32)
res_mul2 = te.lang.cce.vmuls(res_mul1, max_support_fp32)
sign = te.lang.cce.vmuls(res_mul2, tvm.const(2 ** 2, dtype=dtype))
elif dtype == "float16":
# max num of float16 is 2**24
# but cce can only support 2**12, so use 12/12 to adaptor 24
max_support_fp16 = tvm.const(2 ** 12, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp16)
sign = te.lang.cce.vmuls(res_mul1, max_support_fp16)
else:
sign = vmin
sign = _sign_function(dtype, input_div_alpha)
# The following lines are equivalent to:
# dalpha_each = dout * sign if out of bounds
@ -136,10 +140,9 @@ def fake_learned_scale_quant_perchannel_grad_d_compute(dout, input_data, alpha_d
return [dx, dalpha_each]
@util.check_input_type(dict, dict, dict, dict, dict, dict, bool, int, str)
def fake_learned_scale_quant_perchannel_grad_d(dout, input_x, alpha, quant_max, dx, dalpha, neg_trunc,
channel_axis, kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
"""FakeLearnedScaleQuantPerChannelGradD"""
def fake_learned_scale_quant_perchannel_grad_d_param(input_x, alpha, quant_max, channel_axis,
kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
"""Get and check FakeLearnedScaleQuantPerChannelGradD parameters"""
input_shape = input_x.get("shape")
input_x_shape_ = input_x.get("ori_shape")
input_x_format = input_x.get("format")
@ -179,6 +182,16 @@ def fake_learned_scale_quant_perchannel_grad_d(dout, input_x, alpha, quant_max,
input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
alpha_data = tvm.placeholder(shape_c, name="alpha_data", dtype=alpha_dtype)
quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
return dout_data, input_data, alpha_data, quant_max_data
@util.check_input_type(dict, dict, dict, dict, dict, dict, bool, int, str)
def fake_learned_scale_quant_perchannel_grad_d(dout, input_x, alpha, quant_max, dx, dalpha, neg_trunc,
channel_axis, kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
"""FakeLearnedScaleQuantPerChannelGradD"""
dout_data, input_data, alpha_data, quant_max_data = \
fake_learned_scale_quant_perchannel_grad_d_param(input_x, alpha, quant_max, channel_axis, kernel_name)
res = fake_learned_scale_quant_perchannel_grad_d_compute(dout_data, input_data, alpha_data, quant_max_data,
neg_trunc, kernel_name)
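A NumPy sketch of the saturating-sign trick implemented by `_sign_function` above, for the float32 path; SCALAR_MIN_FP32 is assumed to be the smallest normal float32 (2**-126):

    import numpy as np

    def saturating_sign_fp32(x):
        tiny = np.float32(2.0 ** -126)                        # assumed SCALAR_MIN_FP32
        v = np.clip(np.asarray(x, dtype=np.float32), -tiny, tiny)
        # 2**-126 * 2**62 * 2**62 * 2**2 == 1, so any |x| >= tiny saturates to +/-1
        return v * np.float32(2.0 ** 62) * np.float32(2.0 ** 62) * np.float32(4.0)

    print(saturating_sign_fp32(-3.7))  # -> -1.0

The float16 path is analogous, applying 2**12 twice to cover that dtype's 2**24 range.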

View File

@ -71,10 +71,9 @@ def fake_learned_scale_quant_perlayer_compute(input_data, alpha_data, quant_max_
return res
@util.check_input_type(dict, dict, dict, dict, bool, str)
def fake_learned_scale_quant_perlayer(input_x, alpha, quant_max, out, neg_trunc,
kernel_name="fake_learned_scale_quant_perlayer"):
"""FakeLearnedScaleQuantPerLayer"""
def fake_learned_scale_quant_perlayer_param(input_x, alpha, quant_max,
kernel_name="fake_learned_scale_quant_perlayer"):
"""Get and check FakeLearnedScaleQuantPerLayer parameters"""
input_shape = input_x.get("shape")
input_dtype = input_x.get("dtype")
alpha_shape = alpha.get("ori_shape")
@ -105,6 +104,16 @@ def fake_learned_scale_quant_perlayer(input_x, alpha, quant_max, out, neg_trunc,
input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
alpha_data = tvm.placeholder(alpha_shape, name="alpha_data", dtype=alpha_dtype)
quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
return input_data, alpha_data, quant_max_data
@util.check_input_type(dict, dict, dict, dict, bool, str)
def fake_learned_scale_quant_perlayer(input_x, alpha, quant_max, out, neg_trunc,
kernel_name="fake_learned_scale_quant_perlayer"):
"""FakeLearnedScaleQuantPerLayer"""
input_data, alpha_data, quant_max_data = \
fake_learned_scale_quant_perlayer_param(input_x, alpha, quant_max, kernel_name)
res = fake_learned_scale_quant_perlayer_compute(input_data, alpha_data, quant_max_data, neg_trunc, kernel_name)
with tvm.target.cce():

View File

@ -59,6 +59,36 @@ def _fake_learned_scale_quant_perlayer_grad_d_tbe():
return
def _sign_function(dtype, input_div_alpha):
"""sign function imp"""
if dtype == "float32":
data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP32, dtype=dtype)
elif dtype == "float16":
data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP16, dtype=dtype)
else:
data_min = tvm.const(1, dtype=dtype)
neg_data_min = tvm.const(-1, dtype=dtype)
vmax = te.lang.cce.vmaxs(input_div_alpha, neg_data_min)
vmin = te.lang.cce.vmins(vmax, data_min)
if dtype == "float32":
# max num of float32 is 2**126
max_support_fp32 = tvm.const(2 ** 62, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp32)
res_mul2 = te.lang.cce.vmuls(res_mul1, max_support_fp32)
sign = te.lang.cce.vmuls(res_mul2, tvm.const(2 ** 2, dtype=dtype))
elif dtype == "float16":
# max num of float16 is 2**24
# but cce can only support 2**12, so apply 2**12 twice to cover 2**24
max_support_fp16 = tvm.const(2 ** 12, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp16)
sign = te.lang.cce.vmuls(res_mul1, max_support_fp16)
else:
sign = vmin
return sign
@fusion_manager.register("fake_learned_scale_quant_perlayer_grad_d")
def fake_learned_scale_quant_perlayer_grad_d_compute(dout, input_data, alpha_data, quant_max_data, neg_trunc,
kernel_name="fake_learned_scale_quant_perlayer_grad_d"):
@ -87,7 +117,6 @@ def fake_learned_scale_quant_perlayer_grad_d_compute(dout, input_data, alpha_dat
tensor_one = tvm.const(1.0, input_div_alpha.dtype)
tensor_one = te.lang.cce.broadcast(tensor_one, shape)
#out_of_bounds = te.lang.cce.vcmpsel(te.lang.cce.vabs(input_div_alpha), 1.0, 'gt', 1.0, 0.0)
out_of_upper_bounds = te.lang.cce.vcmpsel(input_div_alpha, 1.0, 'gt', 1.0, 0.0)
if neg_trunc:
out_of_lower_bounds = te.lang.cce.vcmpsel(input_div_alpha, 0.0, 'lt', 1.0, 0.0)
@ -97,32 +126,7 @@ def fake_learned_scale_quant_perlayer_grad_d_compute(dout, input_data, alpha_dat
dx = te.lang.cce.vmul(dx, te.lang.cce.vsub(tensor_one, out_of_bounds))
# sign function imp
if dtype == "float32":
data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP32, dtype=dtype)
elif dtype == "float16":
data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype)
neg_data_min = tvm.const(NEG_SCALAR_MIN_FP16, dtype=dtype)
else:
data_min = tvm.const(1, dtype=dtype)
neg_data_min = tvm.const(-1, dtype=dtype)
vmax = te.lang.cce.vmaxs(input_div_alpha, neg_data_min)
vmin = te.lang.cce.vmins(vmax, data_min)
if dtype == "float32":
# max num of float32 is 2**126
max_support_fp32 = tvm.const(2 ** 62, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp32)
res_mul2 = te.lang.cce.vmuls(res_mul1, max_support_fp32)
sign = te.lang.cce.vmuls(res_mul2, tvm.const(2 ** 2, dtype=dtype))
elif dtype == "float16":
# max num of float16 is 2**24
# but cce can only support 2**12, so use 12/12 to adaptor 24
max_support_fp16 = tvm.const(2 ** 12, dtype=dtype)
res_mul1 = te.lang.cce.vmuls(vmin, max_support_fp16)
sign = te.lang.cce.vmuls(res_mul1, max_support_fp16)
else:
sign = vmin
sign = _sign_function(dtype, input_div_alpha)
# The following lines are equivalent to:
# dalpha_each = dout * sign if out of bounds
@ -137,10 +141,9 @@ def fake_learned_scale_quant_perlayer_grad_d_compute(dout, input_data, alpha_dat
return [dx, dalpha_each]
@util.check_input_type(dict, dict, dict, dict, dict, dict, bool, str)
def fake_learned_scale_quant_perlayer_grad_d(dout, input_x, alpha, quant_max, dx, dalpha, neg_trunc,
kernel_name="fake_learned_scale_quant_perlayer_grad_d"):
"""FakeLearnedScaleQuantPerLayerGradD"""
def fake_learned_scale_quant_perlayer_grad_d_param(input_x, alpha, quant_max,
kernel_name="fake_learned_scale_quant_perlayer_grad_d"):
"""Get and check FakeLearnedScaleQuantPerLayerGradD parameters"""
input_shape = input_x.get("shape")
input_dtype = input_x.get("dtype")
alpha_shape = alpha.get("ori_shape")
@ -172,6 +175,15 @@ def fake_learned_scale_quant_perlayer_grad_d(dout, input_x, alpha, quant_max, dx
input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
alpha_data = tvm.placeholder(alpha_shape, name="alpha_data", dtype=alpha_dtype)
quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data", dtype=quant_max_dtype)
return dout_data, input_data, alpha_data, quant_max_data
@util.check_input_type(dict, dict, dict, dict, dict, dict, bool, str)
def fake_learned_scale_quant_perlayer_grad_d(dout, input_x, alpha, quant_max, dx, dalpha, neg_trunc,
kernel_name="fake_learned_scale_quant_perlayer_grad_d"):
"""FakeLearnedScaleQuantPerLayerGradD"""
dout_data, input_data, alpha_data, quant_max_data = \
fake_learned_scale_quant_perlayer_grad_d_param(input_x, alpha, quant_max, kernel_name)
res = fake_learned_scale_quant_perlayer_grad_d_compute(dout_data, input_data, alpha_data, quant_max_data,
neg_trunc, kernel_name)

View File

@ -86,11 +86,9 @@ def fake_quant_perchannel_compute(x, min_val, max_val, y, quant_min, quant_max,
return res
@util.check_input_type(dict, dict, dict, dict, bool, bool, int, int, str)
def fake_quant_perchannel(x, min_val, max_val, y,
symmetric, narrow_range, num_bits, channel_axis,
kernel_name="fake_quant_perchannel"):
"""FakeQuantPerChannel"""
def fake_quant_perchannel_param(x, min_val, max_val, channel_axis,
kernel_name="fake_quant_perchannel"):
"""Get and check fake_quant_perchannel parameters"""
x_shape = x.get("shape")
x_shape_ = x.get("ori_shape")
x_format = x.get("format")
@ -120,15 +118,25 @@ def fake_quant_perchannel(x, min_val, max_val, y,
util.check_dtype_rule(min_dtype, check_list)
util.check_dtype_rule(max_dtype, check_list)
shape_c = [1] * len(x_shape)
shape_c[channel_axis_] = min_val.get("ori_shape")[0]
if x_format == "NC1HWC0" and channel_axis_ == 1:
shape_c = min_val.get("shape")
return x_shape, shape_c, x_dtype
@util.check_input_type(dict, dict, dict, dict, bool, bool, int, int, str)
def fake_quant_perchannel(x, min_val, max_val, y,
symmetric, narrow_range, num_bits, channel_axis,
kernel_name="fake_quant_perchannel"):
"""FakeQuantPerChannel"""
quant_min = 0
quant_max = 2 ** num_bits - 1
if narrow_range:
quant_min = quant_min + 1
shape_c = [1] * len(x_shape)
shape_c[channel_axis_] = min_val.get("ori_shape")[0]
if x_format == "NC1HWC0" and channel_axis_ == 1:
shape_c = min_val.get("shape")
x_shape, shape_c, x_dtype = fake_quant_perchannel_param(x, min_val, max_val,
channel_axis, kernel_name)
input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)

View File

@ -110,11 +110,9 @@ def fake_quant_perchannel_grad_compute(dout, x, min_val, max_val, quant_min, qua
return res
@util.check_input_type(dict, dict, dict, dict, dict, bool, bool, int, int, str)
def fake_quant_perchannel_grad(dout, x, min_val, max_val, dx,
symmetric, narrow_range, num_bits, channel_axis,
kernel_name="fake_quant_perchannel_grad"):
"""FakeQuantPerChannelGrad"""
def fake_quant_perchannel_grad_param(x, min_val, max_val, channel_axis,
kernel_name="fake_quant_perchannel_grad"):
"""Get and check FakeQuantPerChannelGrad parameters"""
x_shape = x.get("shape")
x_shape_ = x.get("ori_shape")
x_format = x.get("format")
@ -144,6 +142,18 @@ def fake_quant_perchannel_grad(dout, x, min_val, max_val, dx,
util.check_dtype_rule(min_dtype, check_list)
util.check_dtype_rule(max_dtype, check_list)
shape_c = [1] * len(x_shape)
shape_c[channel_axis_] = min_val.get("ori_shape")[0]
if x_format == "NC1HWC0" and channel_axis_ == 1:
shape_c = min_val.get("shape")
return x_shape, shape_c, x_dtype
@util.check_input_type(dict, dict, dict, dict, dict, bool, bool, int, int, str)
def fake_quant_perchannel_grad(dout, x, min_val, max_val, dx,
symmetric, narrow_range, num_bits, channel_axis,
kernel_name="fake_quant_perchannel_grad"):
"""FakeQuantPerChannelGrad"""
if symmetric:
quant_min = 0 - 2 ** (num_bits - 1)
quant_max = 2 ** (num_bits - 1) - 1
@ -153,10 +163,8 @@ def fake_quant_perchannel_grad(dout, x, min_val, max_val, dx,
if narrow_range:
quant_min = quant_min + 1
shape_c = [1] * len(x_shape)
shape_c[channel_axis_] = min_val.get("ori_shape")[0]
if x_format == "NC1HWC0" and channel_axis_ == 1:
shape_c = min_val.get("shape")
x_shape, shape_c, x_dtype = fake_quant_perchannel_grad_param(x, min_val, max_val,
channel_axis, kernel_name)
dout_data = tvm.placeholder(x_shape, name="dout", dtype=x_dtype)
input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
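Combining the two branches shown above (the symmetric case from the grad op, the asymmetric case from the forward op), the integer range used by these fake-quant ops works out to the following small helper, written here for illustration:

    def quant_range(num_bits, symmetric, narrow_range):
        if symmetric:
            quant_min = 0 - 2 ** (num_bits - 1)
            quant_max = 2 ** (num_bits - 1) - 1
        else:
            quant_min = 0
            quant_max = 2 ** num_bits - 1
        if narrow_range:
            quant_min = quant_min + 1
        return quant_min, quant_max

    print(quant_range(8, symmetric=True, narrow_range=True))    # (-127, 127)
    print(quant_range(8, symmetric=False, narrow_range=False))  # (0, 255)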

View File

@ -82,7 +82,6 @@ def minmax_update_perchannel(x, min_val, max_val, min_up, max_up,
kernel_name="minmax_update_perchannel"):
"""MinMaxUpdatePerChannel op"""
x_shape = x.get("ori_shape")
x_format = x.get("format")
x_dtype = x.get("dtype")
min_shape = min_val.get("ori_shape")
min_dtype = min_val.get("dtype")

View File

@ -6,4 +6,4 @@
bprop.10:x*
bprop.10:out*
bprop.10:dout2
bprop.10:[CNode]12:2:€027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb365c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4eca0593a639478ea8dfad17fdbe39f66855cc459eb58bcaf5eac44185e03b16374a6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c0606bdbf14ec1b2b2d86ab82b5eb2ac71f1d3d0ba743f7cee45a1d9a0a2d82ac414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260
bprop.10:[CNode]12:2:€027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb3565f906930f68ca2413e9ad958d105e129e717cd183b95d11d65a8b0b030fc0d65c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4eca0593a639478ea8dfad17fdbe39f66855cc459eb58bcaf5eac44185e03b16374a6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260

View File

@ -8,4 +8,4 @@
bprop.2:x*
bprop.2:out*
bprop.2:dout2
bprop.2:[CNode]4:3:€027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb365c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4eca0593a639478ea8dfad17fdbe39f66855cc459eb58bcaf5eac44185e03b16374a6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c0606bdbf14ec1b2b2d86ab82b5eb2ac71f1d3d0ba743f7cee45a1d9a0a2d82ac414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260
bprop.2:[CNode]4:3:€027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb3565f906930f68ca2413e9ad958d105e129e717cd183b95d11d65a8b0b030fc0d65c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4eca0593a639478ea8dfad17fdbe39f66855cc459eb58bcaf5eac44185e03b16374a6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260