forked from mindspore-Ecosystem/mindspore

commit 7c9fb3424f (parent 889696bcab)

    add epsilon parameter for layernorm
@@ -368,6 +368,7 @@ bool TbeKernelJsonCreator::GenTbeAttrJson(const std::shared_ptr<AnfNode> &anf_no
   MS_EXCEPTION_IF_NULL(op_info);
   MS_EXCEPTION_IF_NULL(attrs_json);
   auto attrs_ptr = op_info->attrs_ptr();
+  std::string op_name = AnfAlgo::GetCNodeName(anf_node);
   if (TbeAdapter::RunAttrPass(anf_node, attrs_ptr, attrs_json)) {
     return true;
   }
@@ -377,6 +378,9 @@ bool TbeKernelJsonCreator::GenTbeAttrJson(const std::shared_ptr<AnfNode> &anf_no
     std::string attr_name = attr_ptr->name();
     nlohmann::json attr_obj;
     attr_obj["name"] = attr_name;
+    if (op_name == "LayerNorm" && attr_obj["name"] == "epsilon" && creater_type_ == OP_SELECT_FORMAT) {
+      continue;
+    }
     if (primitive->GetAttr(attr_name) != nullptr) {
       auto value = primitive->GetAttr(attr_name);
       std::string type = attr_ptr->type();
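Together, the two hunks above record the node's op name and skip LayerNorm's new `epsilon` attribute when the attribute JSON is built for an op-select-format query, so only the regular kernel-build JSON carries it. A rough Python sketch of that filtering idea (illustrative names only, not the actual kernel-compiler code):

import json

def gen_attr_json(op_name, attrs, creater_type):
    """Sketch: drop LayerNorm's 'epsilon' when generating JSON for format
    selection, keep it for normal kernel compilation."""
    attrs_json = []
    for attr in attrs:
        if op_name == "LayerNorm" and attr["name"] == "epsilon" and creater_type == "OP_SELECT_FORMAT":
            continue
        attrs_json.append({"name": attr["name"], "value": attr["value"]})
    return json.dumps(attrs_json)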
@@ -1085,7 +1085,8 @@ OUTPUT_MAP(SGD) = {{0, OUTPUT_DESC(parameters)}};
 // LayerNorm
 INPUT_MAP(LayerNorm) = {{1, INPUT_DESC(x)}, {2, INPUT_DESC(gamma)}, {3, INPUT_DESC(beta)}};
 ATTR_MAP(LayerNorm) = {{"begin_norm_axis", ATTR_DESC(begin_norm_axis, AnyTraits<int>())},
-                       {"begin_params_axis", ATTR_DESC(begin_params_axis, AnyTraits<int>())}};
+                       {"begin_params_axis", ATTR_DESC(begin_params_axis, AnyTraits<int>())},
+                       {"epsilon", ATTR_DESC(epsilon, AnyTraits<float>())}};
 OUTPUT_MAP(LayerNorm) = {{0, OUTPUT_DESC(y)}, {1, OUTPUT_DESC(mean)}, {2, OUTPUT_DESC(variance)}};

 // LayerNormGrad
@@ -449,6 +449,7 @@ class LayerNorm(Cell):
         beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
             The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
             'he_uniform', etc. Default: 'zeros'.
+        epsilon (float): A value added to the denominator for numerical stability. Default: 1e-7.

     Inputs:
         - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
@@ -469,6 +470,7 @@ class LayerNorm(Cell):
                  begin_params_axis=-1,
                  gamma_init='ones',
                  beta_init='zeros',
+                 epsilon=1e-7
                  ):
         super(LayerNorm, self).__init__()
         if not isinstance(normalized_shape, (tuple, list)):
@@ -477,11 +479,13 @@ class LayerNorm(Cell):
         self.normalized_shape = normalized_shape
         self.begin_norm_axis = begin_norm_axis
         self.begin_params_axis = begin_params_axis
+        self.epsilon = epsilon
         self.gamma = Parameter(initializer(
             gamma_init, normalized_shape), name="gamma")
         self.beta = Parameter(initializer(
             beta_init, normalized_shape), name="beta")
-        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
+        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis,
+                                      epsilon=self.epsilon)

     def construct(self, input_x):
         y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
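With the constructor change above, the cell can be built with a caller-chosen epsilon that is forwarded to P.LayerNorm. A hypothetical usage sketch (the input shape and epsilon value are made up for illustration; imports follow the usual MindSpore conventions):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

# Normalize everything after the batch axis of a (20, 5, 10, 10) input.
x = Tensor(np.ones([20, 5, 10, 10]).astype(np.float32))
m = nn.LayerNorm((5, 10, 10), begin_norm_axis=1, begin_params_axis=1, epsilon=1e-5)
y = m(x)  # same shape as x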
@@ -198,14 +198,12 @@ class SoftmaxCrossEntropyWithLogits(_Loss):
     Scores Tensor :math:`x` is of shape :math:`(N, C)` and target Tensor :math:`t` is a
     Tensor of shape :math:`(N, C)` which contains one-hot labels of length :math:`C`.

-    For each batch :math:`N_i`, the loss is given as:
+    For each instance :math:`N_i`, the loss is given as:

     .. math::
-        \ell(x_i, t_i) = -w_{t_i} \log\left(\frac{\exp(x_{t_i})}{\sum_j \exp(x_j)}\right)
-        = w_{t_i} \left(-x_{t_i} + \log\left(\sum_j \exp(x_i)\right)\right),
-    where :math:`x_i` is a 1D score Tensor, :math:`t_i` is the target class and
-    :math:`w` is a weight Tensor to generate weighted loss for each class. When not specified,
-    weight Tensor is set to be None and weight is the same (:math:`1`) for all class.
+        \ell(x_i, t_i) = - \log\left(\frac{\exp(x_{t_i})}{\sum_j \exp(x_j)}\right)
+        = -x_{t_i} + \log\left(\sum_j \exp(x_i)\right),
+    where :math:`x_i` is a 1D score Tensor, :math:`t_i` is a scalar.

     Note:
         While the target classes are mutually exclusive, i.e., only one class is positive in the target, the predicted
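For reference, a quick NumPy check of the unweighted form stated in the revised docstring, using made-up scores for a single instance (the second equality is the usual log-sum-exp identity):

import numpy as np

logits = np.array([2.0, 1.0, 0.1])  # x_i: scores for C = 3 classes
target = 0                          # t_i: index of the true class

loss_softmax = -np.log(np.exp(logits[target]) / np.sum(np.exp(logits)))
loss_lse = -logits[target] + np.log(np.sum(np.exp(logits)))

assert np.isclose(loss_softmax, loss_lse)  # both ~0.417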
@@ -221,8 +219,8 @@ class SoftmaxCrossEntropyWithLogits(_Loss):
         num_classes (int): The number of classes in the task. It is a optional input Default: 2.

     Inputs:
-        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`.
-        - **labels** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`. If `sparse` is True, The type of
+        - **logits** (Tensor) - Tensor of shape (N, C).
+        - **labels** (Tensor) - Tensor of shape (N, ). If `sparse` is True, The type of
           `labels` is mindspore.int32. If `sparse` is False, the type of `labels` is same as the type of `logits`.

     Outputs:
@@ -25,6 +25,7 @@ layer_norm_op_info = TBERegOp("LayerNorm") \
     .partial_flag(True) \
     .attr("begin_norm_axis", "required", "int", "all") \
     .attr("begin_params_axis", "required", "int", "all") \
+    .attr("epsilon", "optional", "float", "all") \
     .input(0, "x", False, "required", "all") \
     .input(1, "gamma", False, "required", "all") \
     .input(2, "beta", False, "required", "all") \
@@ -1845,6 +1845,7 @@ class LayerNorm(Primitive):
             the value should be in [-1, rank(input)). Default: 1.
         begin_params_axis (int): The begin axis of the parameter input (`gamma`, `beta`) to
             apply LayerNorm, the value should be in [-1, rank(input)). Default: 1.
+        epsilon (float): A value added to the denominator for numerical stability. Default: 1e-7.

     Inputs:
         - **input_x** (Tensor) - Tensor of shape :math:`(N, \ldots)`.
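The docstring addition describes where epsilon enters the computation: it is added to the variance under the square root in the normalization denominator. A minimal NumPy reference of that computation (a sketch for illustration, not the kernel implementation):

import numpy as np

def layer_norm_ref(x, gamma, beta, epsilon=1e-7, axis=-1):
    # Normalize over the given axis, then rescale and shift;
    # epsilon keeps the denominator away from zero.
    mean = x.mean(axis=axis, keepdims=True)
    var = x.var(axis=axis, keepdims=True)
    return gamma * (x - mean) / np.sqrt(var + epsilon) + beta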
@@ -1873,9 +1874,10 @@ class LayerNorm(Primitive):
     """

     @prim_attr_register
-    def __init__(self, begin_norm_axis=1, begin_params_axis=1):
+    def __init__(self, begin_norm_axis=1, begin_params_axis=1, epsilon=1e-7):
         validator.check_value_type('begin_norm_axis', begin_norm_axis, [int], self.name)
         validator.check_value_type('begin_params_axis', begin_params_axis, [int], self.name)
+        validator.check_value_type('epsilon', epsilon, [float], self.name)


 class L2Normalize(PrimitiveWithInfer):
@@ -171,8 +171,8 @@ def test_bert_tdt():

     # assertion occurs while the loss value, overflow state or loss_scale value is wrong
     loss_value = np.array(callback.loss_list)
-    expect_loss_value = [12.207201, 11.980862, 11.984737, 11.879344, 11.832838, 12.411388,
-                         12.009449, 12.621273, 12.223175, 12.427313]
+    expect_loss_value = [12.207198, 11.980881, 11.984844, 11.879381, 11.832978, 12.411333, 12.009284,
+                         12.621277, 12.223178, 12.427385]
     print("loss value: {}".format(loss_value))
     assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
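The test's expected losses are updated to match the new numerics. Note that the assertion passes rtol and atol positionally, so the comparison is purely absolute, within 5e-4 per element; a small standalone check of that semantics (values are illustrative):

import numpy as np

# np.allclose(a, b, rtol, atol) checks |a - b| <= atol + rtol * |b| elementwise;
# with rtol=0 and atol=0.0005 only the absolute difference matters.
a = np.array([12.207198, 11.980881])
b = np.array([12.207201, 11.980862])
assert np.allclose(a, b, 0, 0.0005)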