forked from mindspore-Ecosystem/mindspore

fix input value check for SparseApplyFtrl and SparseApplyAdagrad

parent ea37dc76f0
commit df63a3195d
@@ -74,6 +74,7 @@ static std::map<string, string> tbe_func_adapter_map = {
   {"apply_adadelta", "apply_adadelta_d"},
   {"apply_adagrad", "apply_adagrad_d"},
   {"apply_adagrad_v2", "apply_adagradv2_d"},
+  {"sparse_apply_adagrad", "sparse_apply_adagrad_d"},
   {"transpose", "transpose_d"},
   {"fill", "fill_d"},
   {"unsorted_segment_sum", "unsorted_segment_sum_d"},
@@ -18,6 +18,7 @@ from mindspore.common.parameter import Parameter
 from mindspore.common.tensor import Tensor
 import mindspore.common.dtype as mstype
 from mindspore._checkparam import check_bool
+from mindspore._checkparam import Validator as validator
 from .optimizer import Optimizer

 momentum_opt = C.MultitypeFuncGraph("momentum_opt")
@@ -65,16 +66,18 @@ class Momentum(Optimizer):
             in the value of 'order_params' but not in any group will use default learning rate and default weight
             decay.

-        learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
+        learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
             use dynamic learning rate, then the i-th step will
             take the i-th value as the learning rate.
-            When the learning_rate is float or learning_rate is a Tensor
-            but the dims of the Tensor is 0, use fixed learning rate.
-            Other cases are not supported.
+            When the learning_rate is float or learning_rate is a
+            Tensor but the dims of the Tensor is 0, use fixed learning
+            rate. Other cases are not supported. It should be equal to
+            or greater than 0.0.
         momentum (float): Hyperparameter of type float, means momentum for the moving average.
-        weight_decay (float): Weight decay (L2 penalty). Default: 0.0.
-        loss_scale (float): A floating point value for the loss scale. Default: 1.0.
+            It should be at least 0.0.
+        weight_decay (int, float): Weight decay (L2 penalty). It should be equal to or greater than 0.0. Default: 0.0.
+        loss_scale (int, float): A floating point value for the loss scale. It should be greater than 0.0. Default: 1.0.
         use_nesterov (bool): Enable Nesterov momentum. Default: False.

     Inputs:
@@ -109,6 +112,7 @@ class Momentum(Optimizer):
     """
     def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0, use_nesterov=False):
        super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale)
+        validator.check_value_type("momentum", momentum, [float], self.cls_name)
         if isinstance(momentum, float) and momentum < 0.0:
             raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
         self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
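The two Momentum hunks above tighten the optimizer's contract: the docstring now spells out the accepted value ranges, and the constructor rejects anything other than a Python float for `momentum`. A minimal sketch of the resulting behaviour (not part of the commit; the Dense layer is only a stand-in network, and the exception type is assumed to come from Validator.check_value_type):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

net = nn.Dense(3, 2)  # stand-in network with trainable parameters

# Accepted: momentum is a plain float and >= 0.0.
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# Rejected after this commit: momentum passed as a Tensor instead of a float,
# which is why the LeNet/NCCL tests further down switch from Tensor([0.9]) to 0.9.
try:
    nn.Momentum(net.trainable_params(), learning_rate=0.01,
                momentum=Tensor(np.array([0.9]).astype(np.float32)))
except (TypeError, ValueError) as err:
    print(err)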
@@ -13,15 +13,15 @@
 # limitations under the License.
 # ============================================================================

-"""SparseApplyAdagrad op"""
+"""SparseApplyAdagradD op"""
 from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

-sparse_apply_adagrad_op_info = TBERegOp("SparseApplyAdagrad") \
+sparse_apply_adagrad_d_op_info = TBERegOp("SparseApplyAdagrad") \
     .fusion_type("OPAQUE") \
     .async_flag(False) \
-    .binfile_name("sparse_apply_adagrad.so") \
+    .binfile_name("sparse_apply_adagrad_d.so") \
     .compute_cost(10) \
-    .kernel_name("sparse_apply_adagrad") \
+    .kernel_name("sparse_apply_adagrad_d") \
     .partial_flag(True) \
     .attr("lr", "required", "float", "all") \
     .attr("update_slots", "optional", "bool", "all") \
@@ -31,14 +31,17 @@ sparse_apply_adagrad_op_info = TBERegOp("SparseApplyAdagrad") \
     .input(2, "grad", False, "required", "all") \
     .input(3, "indices", False, "required", "all") \
     .output(0, "var", False, "required", "all") \
-    .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.I32_NCHW, DataType.F32_NCHW) \
-    .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC, DataType.F32_NHWC) \
+    .output(1, "accum", False, "required", "all") \
+    .dtype_format(DataType.F32_NCHW, DataType.F32_NCHW, DataType.F32_NCHW, DataType.I32_NCHW,
+                  DataType.F32_NCHW, DataType.F32_NCHW) \
+    .dtype_format(DataType.F32_NHWC, DataType.F32_NHWC, DataType.F32_NHWC, DataType.I32_NHWC,
+                  DataType.F32_NHWC, DataType.F32_NHWC) \
     .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.I32_Default,
-                  DataType.F32_Default) \
+                  DataType.F32_Default, DataType.F32_Default) \
     .get_op_info()


-@op_info_register(sparse_apply_adagrad_op_info)
+@op_info_register(sparse_apply_adagrad_d_op_info)
 def _sparse_apply_adagrad_tbe():
-    """SparseApplyAdagrad TBE register"""
+    """SparseApplyAdagradD TBE register"""
     return
@@ -3184,9 +3184,9 @@ class ApplyAdadelta(PrimitiveWithInfer):
     .. math::
             accum = \rho * accum + (1 - \rho) * grad^2
     .. math::
-            update = \sqrt{accum_update + \esilon} * \rsqrt{accum + \epsilon} * grad
+            \text{update} = \sqrt{\text{accum_update} + \epsilon} * \frac{grad}{\sqrt{accum + \epsilon}}
     .. math::
-            accum_update = \rho * accum_update + (1 - \rho) * update^2
+            \text{accum_update} = \rho * \text{accum_update} + (1 - \rho) * update^2
     .. math::
             var -= lr * update

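For reference, a hedged NumPy restatement of the corrected ApplyAdadelta formulas above (the fix keeps epsilon inside both square roots and removes the stray \esilon/\rsqrt typos); this is an illustration of the docstring math, not code from the commit:

import numpy as np

def adadelta_reference(var, accum, accum_update, grad, lr, rho, epsilon):
    # accum = rho * accum + (1 - rho) * grad^2
    accum = rho * accum + (1.0 - rho) * grad ** 2
    # update = sqrt(accum_update + eps) * grad / sqrt(accum + eps)
    update = np.sqrt(accum_update + epsilon) * grad / np.sqrt(accum + epsilon)
    # accum_update = rho * accum_update + (1 - rho) * update^2
    accum_update = rho * accum_update + (1.0 - rho) * update ** 2
    # var -= lr * update
    var = var - lr * update
    return var, accum, accum_update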
@@ -3377,11 +3377,12 @@ class SparseApplyAdagrad(PrimitiveWithInfer):

     Args:
         lr (float): Learning rate.
+        update_slots (bool): If `True`, `accum` will be updated. Default: True.
         use_locking (bool): If True, updating of the var and accum tensors will be protected. Default: False.

     Inputs:
-        - **var** (Tensor) - Variable to be updated. The type must be float32.
-        - **accum** (Tensor) - Accum to be updated. The shape must be the same as `var`'s shape,
+        - **var** (Parameter) - Variable to be updated. The type must be float32.
+        - **accum** (Parameter) - Accum to be updated. The shape must be the same as `var`'s shape,
           the type must be float32.
         - **grad** (Tensor) - Gradient. The shape must be the same as `var`'s shape
           except first dimension, the type must be float32.
@@ -3389,21 +3390,45 @@ class SparseApplyAdagrad(PrimitiveWithInfer):
           The shape of `indices` must be the same as `grad` in first dimension, the type must be int32.

     Outputs:
-        Tensor, has the same shape and type as `var`.
+        Tuple of 2 Tensor, the updated parameters.
+
+        - **var** (Tensor) - The same shape and data type as `var`.
+        - **accum** (Tensor) - The same shape and data type as `accum`.

     Examples:
-        >>> var = Tensor(np.random.random((3, 3)), mindspore.float32)
-        >>> accum = Tensor(np.random.random((3, 3)), mindspore.float32)
-        >>> grad = Tensor(np.random.random((3, 3)), mindspore.float32)
-        >>> indices = Tensor(np.ones((3,), np.int32))
-        >>> sparse_apply_ada_grad = P.SparseApplyAdagrad(0.5)
-        >>> sparse_apply_ada_grad(var, accum, grad, indices)
+        >>> import numpy as np
+        >>> import mindspore.nn as nn
+        >>> from mindspore import Tensor, Parameter
+        >>> from mindspore.ops import operations as P
+        >>> import mindspore.common.dtype as mstype
+        >>> class Net(nn.Cell):
+        >>>     def __init__(self):
+        >>>         super(Net, self).__init__()
+        >>>         self.sparse_apply_adagrad = P.SparseApplyAdagrad(lr=1e-8)
+        >>>         self.var = Parameter(Tensor(np.ones([3, 3, 3]).astype(np.float32)), name="var")
+        >>>         self.accum = Parameter(Tensor(np.ones([3, 3, 3]).astype(np.float32)), name="accum")
+        >>>     def construct(self, grad, indices):
+        >>>         out = self.sparse_apply_adagrad(self.var, self.accum, grad, indices)
+        >>>         return out
+        >>> net = Net()
+        >>> grad = Tensor(np.random.rand(3, 3, 3).astype(np.float32))
+        >>> indices = Tensor([0, 1, 2], mstype.int32)
+        >>> result = net(grad, indices)
     """

+    __mindspore_signature__ = (
+        ('var', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T),
+        ('accum', sig_rw.RW_WRITE, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T),
+        ('grad', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T),
+        ('indices', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD, sig_kind.KIND_EMPTY_DEFAULT_VALUE, sig_dtype.T1)
+    )
+
     @prim_attr_register
-    def __init__(self, lr, use_locking=False):
-        self.lr = validator.check_value_type("lr", lr, [float], self.name)
-        self.use_locking = validator.check_value_type("use_locking", use_locking, [bool], self.name)
+    def __init__(self, lr, update_slots=True, use_locking=False):
+        validator.check_value_type("lr", lr, [float], self.name)
+        validator.check_number_range("lr", lr, float("-inf"), float("inf"), Rel.INC_NEITHER, self.name)
+        validator.check_value_type("update_slots", update_slots, [bool], self.name)
+        validator.check_value_type("use_locking", use_locking, [bool], self.name)

     def infer_shape(self, var_shape, accum_shape, grad_shape, indices_shape):
         validator.check('var shape', var_shape, 'accum shape', accum_shape, Rel.EQ, self.name)
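To make the reworked SparseApplyAdagrad contract concrete, here is a hedged NumPy sketch of the row-wise Adagrad update described by the docstring above, returning the (var, accum) pair the primitive now outputs; it is a reference illustration under the stated semantics, not the MindSpore kernel:

import numpy as np

def sparse_adagrad_reference(var, accum, grad, indices, lr, update_slots=True):
    # Only the rows of var/accum addressed by `indices` are touched.
    var, accum = var.copy(), accum.copy()
    for i, row in enumerate(indices):
        if update_slots:
            accum[row] += grad[i] * grad[i]
        var[row] -= lr * grad[i] / np.sqrt(accum[row])
    return var, accum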
@@ -3757,8 +3782,8 @@ class SparseApplyFtrl(PrimitiveWithInfer):
         validator.check_value_type("l2", l2, [float], self.name)
         validator.check_value_type("lr_power", lr_power, [float], self.name)
         self.lr = validator.check_number_range("lr", lr, 0.0, float("inf"), Rel.INC_NEITHER, self.name)
-        self.l1 = validator.check_number("l1", l1, 0.0, Rel.GE, self.name)
-        self.l2 = validator.check_number("l2", l2, 0.0, Rel.GE, self.name)
+        self.l1 = validator.check_number_range("l1", l1, 0.0, float("inf"), Rel.INC_LEFT, self.name)
+        self.l2 = validator.check_number_range("l2", l2, 0.0, float("inf"), Rel.INC_LEFT, self.name)
         self.lr_power = validator.check_number("lr_power", lr_power, 0, Rel.LE, self.name)
         self.use_locking = validator.check_value_type("use_locking", use_locking, [bool], self.name)

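The SparseApplyFtrl hunk swaps check_number for check_number_range, so l1 and l2 must now lie in [0.0, inf) while lr stays in (0.0, inf). A minimal sketch of the effect at construction time (parameter values are illustrative only; the ValueError is assumed to come from check_number_range):

from mindspore.ops import operations as P

# Accepted: lr > 0.0, l1 >= 0.0, l2 >= 0.0, lr_power <= 0.
ftrl = P.SparseApplyFtrl(lr=0.01, l1=0.0, l2=0.0, lr_power=-0.5)

# Rejected after this commit: a negative l1 falls outside [0.0, inf).
try:
    P.SparseApplyFtrl(lr=0.01, l1=-1.0, l2=0.0, lr_power=-0.5)
except ValueError as err:
    print(err)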
@@ -82,7 +82,7 @@ def test_lenet_nccl():
     net.set_train()

     learning_rate = multisteplr(epoch, 2)
-    momentum = Tensor(np.array([0.9]).astype(np.float32))
+    momentum = 0.9
     mom_optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum)
     criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
     net_with_criterion = WithLossCell(net, criterion)
@@ -25,7 +25,6 @@ import mindspore.dataset.transforms.vision.c_transforms as CV
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.common import dtype as mstype
-from mindspore.common.initializer import initializer
 from mindspore.dataset.transforms.vision import Inter
 from mindspore.model_zoo.lenet import LeNet5
 from mindspore.nn import Dense, TrainOneStepCell, WithLossCell
@@ -84,7 +83,7 @@ def multisteplr(total_steps, gap, base_lr=0.9, gamma=0.1, dtype=mstype.float32):
 def test_train_lenet():
     epoch = 100
     net = LeNet()
-    momentum = initializer(Tensor(np.array([0.9]).astype(np.float32)), [1])
+    momentum = 0.9
     learning_rate = multisteplr(epoch, 30)

     optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum)
@@ -49,7 +49,7 @@ def test_momentum():
     epoch = 3
     net = NetMomentum()
     learning_rate = initializer(Tensor(np.array([0.01]).astype(np.float32)), [1])
-    momentum = initializer(Tensor(np.array([0.9]).astype(np.float32)), [1])
+    momentum = 0.9

     optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum)
     criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
@@ -351,6 +351,17 @@ class ApplyAdagradV2Net(nn.Cell):
         return out


+class SparseApplyAdagradNet(nn.Cell):
+    def __init__(self):
+        super(SparseApplyAdagradNet, self).__init__()
+        self.sparse_apply_adagrad = P.SparseApplyAdagrad(lr=0.01)
+        self.var = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="var")
+        self.accum = Parameter(Tensor(np.random.rand(3, 3).astype(np.float32)), name="accum")
+
+    def construct(self, grad, indices):
+        out = self.sparse_apply_adagrad(self.var, self.accum, grad, indices)
+        return out
+
 class ApplyRMSNet(nn.Cell):
     def __init__(self):
         super(ApplyRMSNet, self).__init__()
@@ -1181,8 +1192,8 @@ test_case_nn_ops = [
         'desc_inputs': [[1, 2, 3], [1, 2, 3], [1, 2, 3]],
         'desc_bprop': []}),
     ('SparseApplyAdagrad', {
-        'block': P.SparseApplyAdagrad(0.5),
-        'desc_inputs': [[3, 3], [3, 3], [3, 3], Tensor(np.ones((3,), np.int32))],
+        'block': SparseApplyAdagradNet(),
+        'desc_inputs': [[3, 3], Tensor(np.ones((3,), np.int32))],
         'desc_bprop': [[3, 3], [3, 3]],
         'skip': ['backward']}),
     ('SparseApplyFtrl', {
@@ -1332,13 +1343,6 @@ test_case_nn_ops = [
                         Tensor([[-1.4, -0.7], [0.9, 0.7]], mstype.float16)],
         'desc_bprop': [],
         'skip': ['backward']}),
-    ('SparseApplyAdagrad', {
-        'block': P.SparseApplyAdagrad(0.5),
-        'desc_inputs': [Tensor([[0.7, 0.2], [0.1, 0.07]], mstype.float32),
-                        Tensor([[0.2, 0.2], [0.1, 0.4]], mstype.float32),
-                        Tensor([[0.5, 0.4], [0.6, 0.1]], mstype.float32), Tensor([1, 1], mstype.int32)],
-        'desc_bprop': [Tensor([[0.7, 0.2], [0.1, 0.07]], mstype.float32)],
-        'skip': ['backward']}),
     ('DataFormatDimMap', {
         'block': P.DataFormatDimMap(),
         'desc_inputs': [Tensor([0, 1, 2, 3], mstype.int32)],