forked from mindspore-Ecosystem/mindspore
change order param same as group params
This commit is contained in:
parent 71fd4321c6
commit 652093642e
@@ -181,8 +181,7 @@ class Adam(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' but not in any group will use default learning rate and default weight
decay.
in the value of 'order_params' should be in one of group parameters.

learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
@@ -220,16 +219,14 @@ class Adam(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
>>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
>>>                 {'params': bias_params, 'lr': 0.01},
>>>                 {'params': no_conv_params, 'lr': 0.01},
>>>                 {'order_params': net.trainable_params()}]
>>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
>>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
>>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
>>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
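
The updated example assumes the new contract that the value of 'order_params' covers exactly the parameters that appear in the groups. As a rough illustration only (a sketch, not part of this diff), one way to sanity-check such a group_params list before handing it to an optimizer is to compare the name sets; the names group_params, conv_params, bias_params and no_conv_params are taken from the example above:

# Sketch only: confirm that 'order_params' matches the union of all grouped
# parameters, which is what the reworked optimizers now expect.
grouped = [p for g in group_params if 'params' in g for p in g['params']]
ordered = next(g['order_params'] for g in group_params if 'order_params' in g)
assert {p.name for p in grouped} == {p.name for p in ordered}, \
    "every parameter in 'order_params' must belong to one of the groups"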
@@ -109,6 +109,10 @@ class LazyAdam(Optimizer):
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.

- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' should be in one of group parameters.

learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
use dynamic learning rate, then the i-th step will
@@ -146,12 +150,13 @@ class LazyAdam(Optimizer):
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
>>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
>>>                 {'params': no_conv_params}]
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
>>>                 {'params': no_conv_params, 'lr': 0.01},
>>>                 {'order_params': net.trainable_params()}]
>>> opt = nn.LazyAdam(group_params, learning_rate=0.1, weight_decay=0.0)
>>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
>>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
>>> # learning rate of 0.1 and a weight decay of 0.0.
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
>>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -64,8 +64,7 @@ class Momentum(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' but not in any group will use default learning rate and default weight
decay.
in the value of 'order_params' should be in one of group parameters.

learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
@@ -97,16 +96,14 @@ class Momentum(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
>>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
>>>                 {'params': bias_params, 'lr': 0.01},
>>>                 {'params': no_conv_params, 'lr': 0.01},
>>>                 {'order_params': net.trainable_params()}]
>>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
>>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
>>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
>>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
@@ -77,8 +77,7 @@ class Optimizer(Cell):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' but not in any group will use default learning rate and default weight
decay.
in the value of 'order_params' should be in one of group parameters.

weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
@@ -351,16 +350,18 @@ class Optimizer(Cell):
            self.group_weight_decay.append(weight_decay_)

        if self.is_group_params_ordered:
            self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay)
            self._order_and_adjust_group_params(ordered_parameters)

    def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay):
    def _order_and_adjust_group_params(self, ordered_parameters):
        """
        Order group parameter, learning rate and weight decay in group params. And assign the parameters
        which in the value of 'order_params' but not in any group to default value.
        Order group parameter, learning rate and weight decay in group params.
        """
        params_length = len(ordered_parameters)
        ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters]
        ordered_weight_decay = [weight_decay * self.loss_scale] * params_length
        params_length = len(self.group_params)
        if len(ordered_parameters) != len(self.group_params):
            raise ValueError(f"The value of 'order_params' should be same with all group parameters.")

        ordered_learning_rate = [None] * params_length
        ordered_weight_decay = [None] * params_length
        params_name = [param.name for param in ordered_parameters]

        for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay):
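
To make the intent of the reworked _order_and_adjust_group_params easier to follow, here is a minimal, framework-free sketch of the same idea in plain Python (hypothetical names, not the MindSpore implementation): the ordered list must have exactly as many entries as the grouped parameters, and the per-parameter learning rates and weight decays are then rebuilt in that order.

def reorder_groups(ordered_names, group_names, group_lr, group_wd):
    """Rebuild per-parameter lr/weight-decay lists in the order given by ordered_names.

    group_names, group_lr and group_wd are parallel lists describing the grouped
    parameters; ordered_names plays the role of 'order_params'. Sketch only.
    """
    if len(ordered_names) != len(group_names):
        raise ValueError("The value of 'order_params' should be same with all group parameters.")
    ordered_lr = [None] * len(ordered_names)
    ordered_wd = [None] * len(ordered_names)
    for name, lr, wd in zip(group_names, group_lr, group_wd):
        index = ordered_names.index(name)   # position this parameter should end up in
        ordered_lr[index] = lr
        ordered_wd[index] = wd
    return ordered_lr, ordered_wd

# Example: two groups, reordered so that 'bias' comes first.
lrs, wds = reorder_groups(['bias', 'conv'], ['conv', 'bias'], [0.1, 0.01], [0.01, 0.0])
assert lrs == [0.01, 0.1] and wds == [0.0, 0.01]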
@@ -107,8 +107,7 @@ class RMSProp(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' but not in any group will use default learning rate and default weight
decay.
in the value of 'order_params' should be in one of group parameters.

learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
@@ -140,16 +139,14 @@ class RMSProp(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
>>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
>>>                 {'params': bias_params, 'lr': 0.01},
>>>                 {'params': no_conv_params, 'lr': 0.01},
>>>                 {'order_params': net.trainable_params()}]
>>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
>>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
>>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
>>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -64,8 +64,7 @@ class SGD(Optimizer):
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' but not in any group will use default learning rate and default weight
decay.
in the value of 'order_params' should be in one of group parameters.

learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
Iterable or a Tensor and the dims of the Tensor is 1,
@@ -98,16 +97,14 @@ class SGD(Optimizer):
>>>
>>> #2) Use parameter groups and set different values
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
>>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
>>>                 {'params': bias_params, 'lr': 0.01},
>>>                 {'params': no_conv_params, 'lr': 0.01},
>>>                 {'order_params': net.trainable_params()}]
>>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
>>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
>>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
>>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
>>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
>>> # of default value 0.1 and a weight decay of default value 0.0.
>>>
>>> loss = nn.SoftmaxCrossEntropyWithLogits()
>>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -250,8 +250,9 @@ def test_get_lr_parameter_with_order_group():
    net = LeNet5()
    conv_lr = 0.1
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'lr': conv_lr},
                    {'order_params': net.trainable_params()}]
                    {'params': no_conv_params}]
    opt = SGD(group_params)
    assert opt.is_group_lr is True
    for param in opt.parameters:
@@ -278,65 +279,19 @@ def test_get_lr_parameter_with_no_group():
        opt.get_lr_parameter(params_error)


def test_order_params_lr():
    net = LeNet5()
    conv_lr = 0.01
    default_lr = 0.1
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'lr': conv_lr},
                    {'order_params': net.trainable_params()}]
    opt = SGD(group_params, learning_rate=default_lr)
    assert opt.is_group is True
    assert opt.is_group_lr is True
    assert opt.is_group_params_ordered is True
    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
        else:
            assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())

        assert param.name == order_param.name
        assert lr.name == 'lr_' + param.name


def test_order_params_weight_decay():
    net = LeNet5()
    conv_weight_decay = 0.01
    default_wd = 0.0
    default_lr = 0.1
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
                    {'order_params': net.trainable_params()}]
    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
    assert opt.is_group is True
    assert opt.is_group_lr is False
    assert opt.is_group_params_ordered is True
    assert opt.learning_rate.name == "learning_rate"
    assert np.all(opt.learning_rate.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
    for weight_decay, decay_flags, param, order_param in zip(
            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
        if param in conv_params:
            assert weight_decay == conv_weight_decay
            assert decay_flags is True
        else:
            assert weight_decay == default_wd
            assert decay_flags is False
        assert param.name == order_param.name


def test_order_params_all_1():
def test_order_params_1():
    net = LeNet5()
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': 0.01},
                    {'params': bias_params, 'lr': 0.01},
                    {'order_params': net.trainable_params()}]
                    {'order_params': bias_params+conv_params}]
    opt = SGD(group_params, learning_rate=0.1, weight_decay=0.0)
    assert opt.is_group is True
    assert opt.is_group_lr is True
    assert opt.is_group_params_ordered is True
    for weight_decay, decay_flags, lr, param, order_param in zip(
            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, bias_params+conv_params):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
            assert weight_decay == 0.01
@@ -354,7 +309,7 @@ def test_order_params_all_1():
        assert lr.name == 'lr_' + param.name


def test_order_params_all_2():
def test_order_params_2():
    net = LeNet5()
    conv_weight_decay = 0.01
    fc1_lr = (0.5, 0.4, 0.3)
@@ -364,13 +319,13 @@ def test_order_params_all_2():
    fc1_params = list(filter(lambda x: 'fc1' in x.name, net.trainable_params()))
    group_params = [{'params': fc1_params, 'lr': fc1_lr},
                    {'params': conv_params, 'weight_decay': conv_weight_decay},
                    {'order_params': net.trainable_params()}]
                    {'order_params': fc1_params+conv_params}]
    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
    assert opt.is_group is True
    assert opt.is_group_lr is True
    assert opt.is_group_params_ordered is True
    for weight_decay, decay_flags, lr, param, order_param in zip(
            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, fc1_params+conv_params):
        if param in conv_params:
            assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
            assert weight_decay == conv_weight_decay
@@ -388,7 +343,7 @@ def test_order_params_all_2():
        assert lr.name == 'lr_' + param.name


def test_get_order_params_with_not_include():
def test_get_order_params_with_not_same():
    net = LeNet5()
    conv_weight_decay = 0.8
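
The body of the renamed test is truncated in this hunk. Given the ValueError added in Optimizer above, a check of the following shape would be a natural counterpart; this is only a hedged sketch, assuming the test file keeps its existing LeNet5 fixture, SGD import and pytest usage, and is not the code from the commit.

def test_get_order_params_with_not_same():
    # Sketch: 'order_params' lists only the conv parameters, so it is not the
    # same set as all grouped parameters and the optimizer should reject it.
    net = LeNet5()
    conv_weight_decay = 0.8
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    other_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
                    {'params': other_params},
                    {'order_params': conv_params}]
    with pytest.raises(ValueError):
        SGD(group_params, learning_rate=0.1)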