forked from mindspore-Ecosystem/mindspore
add sparse proximal ada grad optimizer
parent 6b2724286b
commit 497067d7b2
@@ -157,7 +157,7 @@ class Adam(Optimizer):
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
         `sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse
-        behavior is currently performed on the CPU, weight decay and loss scale is not supported.
+        behavior is currently performed on the CPU, weight decay is not supported.

     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
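The sparse path described in these notes requires two things on the model side: the Parameter is created with sparse_grad=True and its lookup in the forward network goes through SparseGatherV2. A minimal sketch of such a cell, mirroring the test network added at the end of this commit (the cell and parameter names here are illustrative, not part of the commit):

import numpy as np
import mindspore.nn as nn
from mindspore import Parameter, Tensor
from mindspore.ops import operations as P

class SparseLookup(nn.Cell):
    """Toy cell whose weight receives a sparse (indices, values) gradient."""
    def __init__(self):
        super(SparseLookup, self).__init__()
        # sparse_grad=True marks the parameter so its gradient is kept sparse.
        self.table = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
                               name="table", sparse_grad=True)
        self.gather = P.SparseGatherV2()

    def construct(self, indices):
        # Using SparseGatherV2 in the forward pass is what enables the sparse path.
        return self.gather(self.table, indices, 0)

Adam, FTRL, LazyAdam, and (with this commit) ProximalAdagrad then dispatch such parameters to their sparse kernels, which run on the CPU as the notes above state.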
@@ -73,7 +73,7 @@ class FTRL(Optimizer):
     Note:
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
         `sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse
-        behavior is currently performed on the CPU, weight decay and loss scale is not supported.
+        behavior is currently performed on the CPU, weight decay is not supported.

     Args:
         params (list[Parameter]): A list of parameter, which will be updated. The element in `params`
@@ -94,8 +94,7 @@ class LazyAdam(Optimizer):
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
         `sparse_grad` of `Parameter` being set as True. The sparse behavior, to be notice, is not equivalent to the
         original Adam algorithm, as only the current indices parames will be updated. The sparse feature is under
-        continuous development. The sparse behavior is currently performed on the CPU, weight decay and loss scale is
-        not supported.
+        continuous development. The sparse behavior is currently performed on the CPU, weight decay is not supported.

     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
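The caveat in the LazyAdam note above is the behavioral one to keep in mind: only the rows that appear in the current step's gathered indices are updated. A toy illustration of the consequence (hypothetical numbers, not from this repository):

# Suppose the embedding table has 3 rows and this step only gathers rows [0, 1]:
#   - original (dense) Adam still updates row 2, because its moment estimates
#     decay every step even when that row's incoming gradient is zero;
#   - the lazy sparse path leaves row 2's weight and moments untouched,
#     which is why the docstring says it is not equivalent to the original Adam.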
@@ -22,9 +22,16 @@ from .optimizer import Optimizer

 _proximal_ada_grad_opt = C.MultitypeFuncGraph("proximal_ada_grad_opt")

+@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tuple", "Tensor", "Tensor")
+def _tensor_run_opt_with_sparse(opt, sparse_opt, learning_rate, l1, l2, gradient, weight, accum):
+    """Apply sparse proximal_ada_grad optimizer to the weight parameter."""
+    success = True
+    success = F.depend(success, sparse_opt(weight, accum, learning_rate, l1, l2, gradient[1], gradient[0]))
+    return success
+
-@_proximal_ada_grad_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
-def _tensor_run_opt(opt, learning_rate, l1, l2, gradient, weight, accum):
+@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
+def _tensor_run_opt(opt, sparse_opt, learning_rate, l1, l2, gradient, weight, accum):
     """Apply proximal_ada_grad optimizer to the weight parameter."""
     success = True
     success = F.depend(success, opt(weight, accum, learning_rate, l1, l2, gradient))
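The two registrations above differ only in the declared type of the gradient slot ("Tuple" versus "Tensor"), which is what selects the branch when the optimizer maps over its parameters. A sketch of the dispatch (illustrative comments, not code from the commit):

# dense parameter: the gradient is a Tensor, so the all-"Tensor" signature matches
#     _tensor_run_opt(opt, sparse_opt, lr, l1, l2, grad, weight, accum)
#         -> opt(weight, accum, lr, l1, l2, grad)                      # P.ApplyProximalAdagrad
#
# sparse parameter (sparse_grad=True): the gradient arrives as an (indices, values)
# tuple, so the signature with "Tuple" in the gradient slot matches
#     _tensor_run_opt_with_sparse(opt, sparse_opt, lr, l1, l2, grad, weight, accum)
#         -> sparse_opt(weight, accum, lr, l1, l2, grad[1], grad[0])   # P.SparseApplyProximalAdagrad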
@@ -50,6 +57,11 @@ class ProximalAdagrad(Optimizer):
     Refer to paper `Efficient Learning using Forward-Backward Splitting
     <http://papers.nips.cc//paper/3793-efficient-learning-using-forward-backward-splitting.pdf>`_.

+    Note:
+        The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
+        `sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse
+        behavior is currently performed on the CPU, weight decay is not supported.
+
     Args:
         params (list[Parameter]): A list of parameter, which will be updated. The element in `params`
             should be Parameter.
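For orientation, the proximal Adagrad step that ApplyProximalAdagrad and its sparse counterpart implement is commonly written as follows (standard formulation of the Forward-Backward Splitting update, stated here as background rather than quoted from this repository):

\begin{aligned}
accum_t &= accum_{t-1} + g_t^2 \\
\tilde{w} &= w_{t-1} - \frac{\gamma\, g_t}{\sqrt{accum_t}} \\
w_t &= \frac{\operatorname{sign}(\tilde{w})}{1 + \gamma \lambda_2}\,\max\!\left(\lvert\tilde{w}\rvert - \gamma \lambda_1,\; 0\right)
\end{aligned}

where \gamma is the learning rate and \lambda_1, \lambda_2 are the l1/l2 regularization strengths; the sparse variant applies the same step only to the rows selected by the gradient indices.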
@@ -87,6 +99,7 @@ class ProximalAdagrad(Optimizer):
         self.weight_decay = weight_decay
         self.hyper_map = C.HyperMap()
         self.opt = P.ApplyProximalAdagrad(use_locking=use_locking)
+        self.sparse_opt = P.SparseApplyProximalAdagrad(use_locking=use_locking)

     def construct(self, grads):
         params = self.parameters
@@ -94,6 +107,6 @@ class ProximalAdagrad(Optimizer):
         grads = self.decay_weight(grads)
         grads = self.scale_grad(grads)
         lr = self.learning_rate
-        success = self.hyper_map(F.partial(_proximal_ada_grad_opt, self.opt, lr, self.l1, self.l2),
-                                 grads, params, accum)
+        success = self.map_(F.partial(_proximal_ada_grad_opt, self.opt, self.sparse_opt, lr, self.l1, self.l2),
+                            grads, params, accum)
         return success
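As a plain-NumPy illustration of what the mapped update amounts to per parameter after this change (an approximation of the standard proximal Adagrad step sketched above, not the MindSpore kernels):

import numpy as np

def proximal_adagrad_dense(var, accum, lr, l1, l2, grad):
    """Reference dense step (approximates what P.ApplyProximalAdagrad computes)."""
    accum += grad * grad
    prox = var - lr * grad / np.sqrt(accum)
    var[...] = np.sign(prox) * np.maximum(np.abs(prox) - lr * l1, 0.0) / (1.0 + lr * l2)

def proximal_adagrad_sparse(var, accum, lr, l1, l2, values, indices):
    """Reference sparse step: only the rows named in `indices` are touched."""
    for row, g in zip(indices, values):
        proximal_adagrad_dense(var[row], accum[row], lr, l1, l2, g)

With this commit, construct() routes each parameter to one of these two behaviors through self.map_ and the type signatures registered at the top of the file.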
@@ -36,6 +36,18 @@ class Net(nn.Cell):
         x = self.biasAdd(self.matmul(x, self.weight), self.bias)
         return x

+class NetWithSparseGatherV2(nn.Cell):
+    """ NetWithSparseGatherV2 definition """
+    def __init__(self):
+        super(NetWithSparseGatherV2, self).__init__()
+        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
+        self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="weight2")
+        self.axis = 0
+        self.gather = P.SparseGatherV2()
+
+    def construct(self, indices, label):
+        return self.gather(self.weight1, indices, self.axis) + self.weight2
+
+
 def test_proximal_ada_grad():
     """ test_proximal_ada_grad """
@@ -48,3 +60,15 @@ def test_proximal_ada_grad():
     net_with_loss = WithLossCell(net, loss)
     train_network = TrainOneStepCell(net_with_loss, optimizer)
     _executor.compile(train_network, inputs, label)
+
+
+def test_spares_proximal_ada_grad_compile():
+    """ test sparse proximal_ada_grad compile """
+    indices = Tensor(np.array([0, 1]).astype(np.int32))
+    label = Tensor(np.zeros([2, 1, 2]).astype(np.float32))
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
+    optimizer = ProximalAdagrad(net.trainable_params(), loss_scale=2.0)
+    train_network = TrainOneStepCell(net, optimizer)
+    _executor.compile(train_network, indices, label)