diff --git a/mindspore/nn/optim/ada_grad.py b/mindspore/nn/optim/ada_grad.py index 7ebb38788e..3b1253c32a 100644 --- a/mindspore/nn/optim/ada_grad.py +++ b/mindspore/nn/optim/ada_grad.py @@ -36,12 +36,22 @@ def _check_param_value(accum, update_slots, prim_name=None): class Adagrad(Optimizer): - """ + r""" Implements the Adagrad algorithm with ApplyAdagrad Operator. Adagrad is an online Learning and Stochastic Optimization. Refer to paper `Efficient Learning using Forward-Backward Splitting `_. + The updating formulas are as follows, + + .. math:: + \begin{array}{ll} \\ + h_{t} = h_{t-1} + g*g\\ + w_{t} = w_{t-1} - lr*\frac{1}{\sqrt{h_{t}}}*g + \end{array} + + :math:`h` represents the cumulative sum of squared gradients, :math:`g` represents `gradients`. + :math:`lr` represents `learning_rate`, :math:`w` represents `params`. Note: When separating parameter groups, the weight decay in each group will be applied on the parameters if the