add epsilon parameter for layernorm

zhaojichen 2020-05-20 23:58:18 -04:00
parent 889696bcab
commit 7c9fb3424f
7 changed files with 23 additions and 13 deletions

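For context on what the new attribute does: epsilon is the small constant added to the variance under the square root so layer normalization never divides by zero. The sketch below is only a NumPy reference of that math, not the MindSpore kernels touched by this commit; the function name layer_norm_reference and the sample values are illustrative.

import numpy as np

def layer_norm_reference(x, gamma, beta, epsilon=1e-7):
    # Normalize over the last axis, then apply the learned affine transform.
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return gamma * (x - mean) / np.sqrt(var + epsilon) + beta

x = np.random.randn(2, 4).astype(np.float32)
y = layer_norm_reference(x, gamma=np.ones(4, np.float32), beta=np.zeros(4, np.float32))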
View File

@@ -368,6 +368,7 @@ bool TbeKernelJsonCreator::GenTbeAttrJson(const std::shared_ptr<AnfNode> &anf_no
   MS_EXCEPTION_IF_NULL(op_info);
   MS_EXCEPTION_IF_NULL(attrs_json);
   auto attrs_ptr = op_info->attrs_ptr();
+  std::string op_name = AnfAlgo::GetCNodeName(anf_node);
   if (TbeAdapter::RunAttrPass(anf_node, attrs_ptr, attrs_json)) {
     return true;
   }
@@ -377,6 +378,9 @@ bool TbeKernelJsonCreator::GenTbeAttrJson(const std::shared_ptr<AnfNode> &anf_no
     std::string attr_name = attr_ptr->name();
     nlohmann::json attr_obj;
     attr_obj["name"] = attr_name;
+    if (op_name == "LayerNorm" && attr_obj["name"] == "epsilon" && creater_type_ == OP_SELECT_FORMAT) {
+      continue;
+    }
     if (primitive->GetAttr(attr_name) != nullptr) {
       auto value = primitive->GetAttr(attr_name);
       std::string type = attr_ptr->type();

View File

@@ -1085,7 +1085,8 @@ OUTPUT_MAP(SGD) = {{0, OUTPUT_DESC(parameters)}};
 // LayerNorm
 INPUT_MAP(LayerNorm) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(gamma)}, {3, INPUT_DESC(beta)}};
 ATTR_MAP(LayerNorm) = {{"begin_norm_axis", ATTR_DESC(begin_norm_axis, AnyTraits<int>())},
-                       {"begin_params_axis", ATTR_DESC(begin_params_axis, AnyTraits<int>())}};
+                       {"begin_params_axis", ATTR_DESC(begin_params_axis, AnyTraits<int>())},
+                       {"epsilon", ATTR_DESC(epsilon, AnyTraits<float>())}};
 OUTPUT_MAP(LayerNorm) = {{0, OUTPUT_DESC(y)}, {1, OUTPUT_DESC(mean)}, {2, OUTPUT_DESC(variance)}};

 // LayerNormGrad

View File

@@ -449,6 +449,7 @@ class LayerNorm(Cell):
         beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
             The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
             'he_uniform', etc. Default: 'zeros'.
+        epsilon (float): A value added to the denominator for numerical stability. Default: 1e-7.

     Inputs:
         - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
@@ -469,6 +470,7 @@ class LayerNorm(Cell):
                  begin_params_axis=-1,
                  gamma_init='ones',
                  beta_init='zeros',
+                 epsilon=1e-7
                  ):
         super(LayerNorm, self).__init__()
         if not isinstance(normalized_shape, (tuple, list)):
@@ -477,11 +479,13 @@ class LayerNorm(Cell):
         self.normalized_shape = normalized_shape
         self.begin_norm_axis = begin_norm_axis
         self.begin_params_axis = begin_params_axis
+        self.epsilon = epsilon
         self.gamma = Parameter(initializer(
             gamma_init, normalized_shape), name="gamma")
         self.beta = Parameter(initializer(
             beta_init, normalized_shape), name="beta")
-        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
+        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis,
+                                      epsilon=self.epsilon)

     def construct(self, input_x):
         y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)

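A usage sketch of the updated Cell interface in the hunk above. The constructor arguments and the 1e-7 default come from the diff; the input shape and values are illustrative, and running a Cell eagerly like this assumes an environment where PyNative-style execution is available.

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

x = Tensor(np.ones([20, 5, 10, 10]).astype(np.float32))
# Normalize over the last three axes; epsilon can now be tuned per layer.
layer_norm = nn.LayerNorm((5, 10, 10), begin_norm_axis=1, begin_params_axis=1, epsilon=1e-7)
y = layer_norm(x)  # same shape as x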
View File

@@ -198,14 +198,12 @@ class SoftmaxCrossEntropyWithLogits(_Loss):
     Scores Tensor :math:`x` is of shape :math:`(N, C)` and target Tensor :math:`t` is a
     Tensor of shape :math:`(N, C)` which contains one-hot labels of length :math:`C`.

-    For each batch :math:`N_i`, the loss is given as:
+    For each instance :math:`N_i`, the loss is given as:

     .. math::
-        \ell(x_i, t_i) = -w_{t_i} \log\left(\frac{\exp(x_{t_i})}{\sum_j \exp(x_j)}\right)
-        = w_{t_i} \left(-x_{t_i} + \log\left(\sum_j \exp(x_i)\right)\right),
-    where :math:`x_i` is a 1D score Tensor, :math:`t_i` is the target class and
-    :math:`w` is a weight Tensor to generate weighted loss for each class. When not specified,
-    weight Tensor is set to be None and weight is the same (:math:`1`) for all class.
+        \ell(x_i, t_i) = - \log\left(\frac{\exp(x_{t_i})}{\sum_j \exp(x_j)}\right)
+        = -x_{t_i} + \log\left(\sum_j \exp(x_i)\right),
+    where :math:`x_i` is a 1D score Tensor, :math:`t_i` is a scalar.

     Note:
         While the target classes are mutually exclusive, i.e., only one class is positive in the target, the predicted
@@ -221,8 +219,8 @@ class SoftmaxCrossEntropyWithLogits(_Loss):
         num_classes (int): The number of classes in the task. It is a optional input Default: 2.

     Inputs:
-        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`.
-        - **labels** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`. If `sparse` is True, The type of
+        - **logits** (Tensor) - Tensor of shape (N, C).
+        - **labels** (Tensor) - Tensor of shape (N, ). If `sparse` is True, The type of
           `labels` is mindspore.int32. If `sparse` is False, the type of `labels` is same as the type of `logits`.

     Outputs:

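The rewritten docstring above drops the per-class weight term from the loss formula. The NumPy snippet below only checks that the two lines of the remaining .. math:: block agree, i.e. that -log(exp(x_t) / sum_j exp(x_j)) equals -x_t + log(sum_j exp(x_j)); the sample scores and target index are illustrative (note the hunk keeps exp(x_i) inside the sum, where exp(x_j) appears to be intended).

import numpy as np

x = np.array([2.0, 1.0, 0.1])  # unnormalized scores for one instance
t = 0                          # target class index

log_softmax_form = -np.log(np.exp(x[t]) / np.sum(np.exp(x)))
expanded_form = -x[t] + np.log(np.sum(np.exp(x)))
assert np.isclose(log_softmax_form, expanded_form)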
View File

@@ -25,6 +25,7 @@ layer_norm_op_info = TBERegOp("LayerNorm") \
     .partial_flag(True) \
     .attr("begin_norm_axis", "required", "int", "all") \
     .attr("begin_params_axis", "required", "int", "all") \
+    .attr("epsilon", "optional", "float", "all") \
     .input(0, "x", False, "required", "all") \
     .input(1, "gamma", False, "required", "all") \
     .input(2, "beta", False, "required", "all") \

View File

@@ -1845,6 +1845,7 @@ class LayerNorm(Primitive):
             the value should be in [-1, rank(input)). Default: 1.
         begin_params_axis (int): The begin axis of the parameter input (`gamma`, `beta`) to
             apply LayerNorm, the value should be in [-1, rank(input)). Default: 1.
+        epsilon (float): A value added to the denominator for numerical stability. Default: 1e-7.

     Inputs:
         - **input_x** (Tensor) - Tensor of shape :math:`(N, \ldots)`.
@@ -1873,9 +1874,10 @@ class LayerNorm(Primitive):
     """

     @prim_attr_register
-    def __init__(self, begin_norm_axis=1, begin_params_axis=1):
+    def __init__(self, begin_norm_axis=1, begin_params_axis=1, epsilon=1e-7):
         validator.check_value_type('begin_norm_axis', begin_norm_axis, [int], self.name)
         validator.check_value_type('begin_params_axis', begin_params_axis, [int], self.name)
+        validator.check_value_type('epsilon', epsilon, [float], self.name)


 class L2Normalize(PrimitiveWithInfer):

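A sketch of driving the updated primitive directly, mirroring how the nn.LayerNorm Cell constructs it in the earlier hunk. The three outputs and the epsilon keyword come from the diff; gamma/beta shapes follow the begin_params_axis=1 convention, the import path and concrete values are assumptions for illustration.

import numpy as np
from mindspore import Tensor
from mindspore.ops import operations as P

input_x = Tensor(np.ones([4, 3]).astype(np.float32))
gamma = Tensor(np.ones([3]).astype(np.float32))
beta = Tensor(np.zeros([3]).astype(np.float32))

layer_norm = P.LayerNorm(begin_norm_axis=1, begin_params_axis=1, epsilon=1e-7)
y, mean, variance = layer_norm(input_x, gamma, beta)  # three outputs, as in OUTPUT_MAP above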
View File

@@ -171,8 +171,8 @@ def test_bert_tdt():
     # assertion occurs while the loss value, overflow state or loss_scale value is wrong
     loss_value = np.array(callback.loss_list)
-    expect_loss_value = [12.207201, 11.980862, 11.984737, 11.879344, 11.832838, 12.411388,
-                         12.009449, 12.621273, 12.223175, 12.427313]
+    expect_loss_value = [12.207198, 11.980881, 11.984844, 11.879381, 11.832978, 12.411333, 12.009284,
+                         12.621277, 12.223178, 12.427385]
     print("loss value: {}".format(loss_value))
     assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)