From 7c9fb3424f7793070477d7508744086023f6e716 Mon Sep 17 00:00:00 2001
From: zhaojichen
Date: Wed, 20 May 2020 23:58:18 -0400
Subject: [PATCH] add epsilon parameter for layernorm

---
 mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc      |  4 ++++
 mindspore/ccsrc/transform/op_declare.cc             |  3 ++-
 mindspore/nn/layer/normalization.py                 |  6 +++++-
 mindspore/nn/loss/loss.py                           | 14 ++++++--------
 mindspore/ops/_op_impl/tbe/layer_norm.py            |  1 +
 mindspore/ops/operations/nn_ops.py                  |  4 +++-
 .../st/networks/models/bert/bert_tdt_lossscale.py   |  4 ++--
 7 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc b/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc
index 73c5ff79707..1322c81d664 100644
--- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc
+++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.cc
@@ -368,6 +368,7 @@ bool TbeKernelJsonCreator::GenTbeAttrJson(const std::shared_ptr &anf_no
   MS_EXCEPTION_IF_NULL(op_info);
   MS_EXCEPTION_IF_NULL(attrs_json);
   auto attrs_ptr = op_info->attrs_ptr();
+  std::string op_name = AnfAlgo::GetCNodeName(anf_node);
   if (TbeAdapter::RunAttrPass(anf_node, attrs_ptr, attrs_json)) {
     return true;
   }
@@ -377,6 +378,9 @@
     std::string attr_name = attr_ptr->name();
     nlohmann::json attr_obj;
     attr_obj["name"] = attr_name;
+    if (op_name == "LayerNorm" && attr_obj["name"] == "epsilon" && creater_type_ == OP_SELECT_FORMAT) {
+      continue;
+    }
     if (primitive->GetAttr(attr_name) != nullptr) {
       auto value = primitive->GetAttr(attr_name);
       std::string type = attr_ptr->type();
diff --git a/mindspore/ccsrc/transform/op_declare.cc b/mindspore/ccsrc/transform/op_declare.cc
index 92743b9e888..4cad5aa1919 100644
--- a/mindspore/ccsrc/transform/op_declare.cc
+++ b/mindspore/ccsrc/transform/op_declare.cc
@@ -1085,7 +1085,8 @@ OUTPUT_MAP(SGD) = {{0, OUTPUT_DESC(parameters)}};
 // LayerNorm
 INPUT_MAP(LayerNorm) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(gamma)}, {3, INPUT_DESC(beta)}};
 ATTR_MAP(LayerNorm) = {{"begin_norm_axis", ATTR_DESC(begin_norm_axis, AnyTraits())},
-                       {"begin_params_axis", ATTR_DESC(begin_params_axis, AnyTraits())}};
+                       {"begin_params_axis", ATTR_DESC(begin_params_axis, AnyTraits())},
+                       {"epsilon", ATTR_DESC(epsilon, AnyTraits())}};
 OUTPUT_MAP(LayerNorm) = {{0, OUTPUT_DESC(y)}, {1, OUTPUT_DESC(mean)}, {2, OUTPUT_DESC(variance)}};
 
 // LayerNormGrad
diff --git a/mindspore/nn/layer/normalization.py b/mindspore/nn/layer/normalization.py
index e5015ef324a..16124c126a0 100644
--- a/mindspore/nn/layer/normalization.py
+++ b/mindspore/nn/layer/normalization.py
@@ -449,6 +449,7 @@ class LayerNorm(Cell):
         beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
             The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
             'he_uniform', etc. Default: 'zeros'.
+        epsilon (float): A value added to the denominator for numerical stability. Default: 1e-7.
 
     Inputs:
         - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
@@ -469,6 +470,7 @@ class LayerNorm(Cell):
                  begin_params_axis=-1,
                  gamma_init='ones',
                  beta_init='zeros',
+                 epsilon=1e-7
                  ):
         super(LayerNorm, self).__init__()
         if not isinstance(normalized_shape, (tuple, list)):
@@ -477,11 +479,13 @@
         self.normalized_shape = normalized_shape
         self.begin_norm_axis = begin_norm_axis
         self.begin_params_axis = begin_params_axis
+        self.epsilon = epsilon
         self.gamma = Parameter(initializer(
             gamma_init, normalized_shape), name="gamma")
         self.beta = Parameter(initializer(
             beta_init, normalized_shape), name="beta")
-        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
+        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis,
+                                      epsilon=self.epsilon)
 
     def construct(self, input_x):
         y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py
index 737ae78ec17..ac419c32c3f 100644
--- a/mindspore/nn/loss/loss.py
+++ b/mindspore/nn/loss/loss.py
@@ -198,14 +198,12 @@ class SoftmaxCrossEntropyWithLogits(_Loss):
     Scores Tensor :math:`x` is of shape :math:`(N, C)` and target Tensor :math:`t` is a
     Tensor of shape :math:`(N, C)` which contains one-hot labels of length :math:`C`.
 
-    For each batch :math:`N_i`, the loss is given as:
+    For each instance :math:`N_i`, the loss is given as:
 
     .. math::
-        \ell(x_i, t_i) = -w_{t_i} \log\left(\frac{\exp(x_{t_i})}{\sum_j \exp(x_j)}\right)
-        = w_{t_i} \left(-x_{t_i} + \log\left(\sum_j \exp(x_i)\right)\right),
-    where :math:`x_i` is a 1D score Tensor, :math:`t_i` is the target class and
-    :math:`w` is a weight Tensor to generate weighted loss for each class. When not specified,
-    weight Tensor is set to be None and weight is the same (:math:`1`) for all class.
+        \ell(x_i, t_i) = - \log\left(\frac{\exp(x_{t_i})}{\sum_j \exp(x_j)}\right)
+        = -x_{t_i} + \log\left(\sum_j \exp(x_j)\right),
+    where :math:`x_i` is a 1D score Tensor and :math:`t_i` is the target class (a scalar).
 
     Note:
         While the target classes are mutually exclusive, i.e., only one class is positive in the target, the predicted
@@ -221,8 +219,8 @@
         num_classes (int): The number of classes in the task. It is a optional input Default: 2.
 
     Inputs:
-        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`.
-        - **labels** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`. If `sparse` is True, The type of
+        - **logits** (Tensor) - Tensor of shape (N, C).
+        - **labels** (Tensor) - Tensor of shape (N, ). If `sparse` is True, the type of
           `labels` is mindspore.int32. If `sparse` is False, the type of
           `labels` is same as the type of `logits`.
 
     Outputs:
diff --git a/mindspore/ops/_op_impl/tbe/layer_norm.py b/mindspore/ops/_op_impl/tbe/layer_norm.py
index bc71fa87d32..c52be2d4eff 100644
--- a/mindspore/ops/_op_impl/tbe/layer_norm.py
+++ b/mindspore/ops/_op_impl/tbe/layer_norm.py
@@ -25,6 +25,7 @@ layer_norm_op_info = TBERegOp("LayerNorm") \
     .partial_flag(True) \
     .attr("begin_norm_axis", "required", "int", "all") \
     .attr("begin_params_axis", "required", "int", "all") \
+    .attr("epsilon", "optional", "float", "all") \
     .input(0, "x", False, "required", "all") \
     .input(1, "gamma", False, "required", "all") \
     .input(2, "beta", False, "required", "all") \
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index dcc58101054..02adef9d34d 100644
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -1845,6 +1845,7 @@ class LayerNorm(Primitive):
             the value should be in [-1, rank(input)). Default: 1.
         begin_params_axis (int): The begin axis of the parameter input (`gamma`, `beta`) to
             apply LayerNorm, the value should be in [-1, rank(input)). Default: 1.
+        epsilon (float): A value added to the denominator for numerical stability. Default: 1e-7.
 
     Inputs:
         - **input_x** (Tensor) - Tensor of shape :math:`(N, \ldots)`.
@@ -1873,9 +1874,10 @@ class LayerNorm(Primitive):
     """
 
     @prim_attr_register
-    def __init__(self, begin_norm_axis=1, begin_params_axis=1):
+    def __init__(self, begin_norm_axis=1, begin_params_axis=1, epsilon=1e-7):
         validator.check_value_type('begin_norm_axis', begin_norm_axis, [int], self.name)
         validator.check_value_type('begin_params_axis', begin_params_axis, [int], self.name)
+        validator.check_value_type('epsilon', epsilon, [float], self.name)
 
 
 class L2Normalize(PrimitiveWithInfer):
diff --git a/tests/st/networks/models/bert/bert_tdt_lossscale.py b/tests/st/networks/models/bert/bert_tdt_lossscale.py
index 65679b9d523..caacd9f16cb 100644
--- a/tests/st/networks/models/bert/bert_tdt_lossscale.py
+++ b/tests/st/networks/models/bert/bert_tdt_lossscale.py
@@ -171,8 +171,8 @@ def test_bert_tdt():
 
     # assertion occurs while the loss value, overflow state or loss_scale value is wrong
     loss_value = np.array(callback.loss_list)
-    expect_loss_value = [12.207201, 11.980862, 11.984737, 11.879344, 11.832838, 12.411388,
-                         12.009449, 12.621273, 12.223175, 12.427313]
+    expect_loss_value = [12.207198, 11.980881, 11.984844, 11.879381, 11.832978, 12.411333, 12.009284,
+                         12.621277, 12.223178, 12.427385]
     print("loss value: {}".format(loss_value))
     assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
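
Below is a brief, illustrative usage sketch of the `epsilon` keyword introduced by this patch. It is not part of the change itself; the input shape, the 1e-5 value, and the eager-style call are assumptions chosen only to show the new parameter, while the nn.LayerNorm signature follows mindspore/nn/layer/normalization.py as modified above.

    # Minimal sketch: constructing nn.LayerNorm with an explicit epsilon.
    import numpy as np
    import mindspore
    import mindspore.nn as nn
    from mindspore import Tensor

    x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
    # Normalize over the last three axes; gamma and beta take shape (5, 10, 10).
    # epsilon now defaults to 1e-7 and can be overridden, as done here.
    layer_norm = nn.LayerNorm((5, 10, 10), begin_norm_axis=1, begin_params_axis=1,
                              epsilon=1e-5)
    y = layer_norm(x)  # y has the same shape as x: (20, 5, 10, 10)

The value is forwarded to P.LayerNorm(begin_norm_axis, begin_params_axis, epsilon) and from there to the TBE and GE backends via the attribute registrations changed in this patch.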