diff --git a/mindspore/nn/layer/thor_layer.py b/mindspore/nn/layer/thor_layer.py
index ef43da1e576..61ec31dab90 100644
--- a/mindspore/nn/layer/thor_layer.py
+++ b/mindspore/nn/layer/thor_layer.py
@@ -552,7 +552,7 @@ class EmbeddingThor(Cell):
         Tensor of output shape :math:`(\text{batch_size}, \text{input_length}, \text{embedding_size})`.
 
     Examples:
-        >>> net = nn.Embedding(20000, 768, True)
+        >>> net = nn.EmbeddingThor(20000, 768, True)
         >>> input_data = Tensor(np.ones([8, 128]), mindspore.int32)
        >>>
         >>> # Maps the input word IDs to word embedding.
diff --git a/mindspore/nn/optim/thor.py b/mindspore/nn/optim/thor.py
index d39808ce1cb..67091f618c3 100644
--- a/mindspore/nn/optim/thor.py
+++ b/mindspore/nn/optim/thor.py
@@ -27,7 +27,7 @@ from mindspore import context
 from mindspore.context import ParallelMode
 from mindspore.nn.layer import DenseThor, Conv2dThor, EmbeddingThor
 from mindspore.nn.wrap import DistributedGradReducer
-from mindspore.train.train_thor.convert_utils import ConvertNetUntils
+from mindspore.train.train_thor.convert_utils import ConvertNetUtils
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
 
 # Enumerates types of Layer
@@ -101,6 +101,17 @@ def clip_gradient(enable_clip_grad, gradients):
 C0 = 16
 
 
+def _check_param(momentum, frequency, lr, cls_name):
+    """Check param."""
+    Validator.check_value_type("momentum", momentum, [float], cls_name)
+    if isinstance(momentum, float) and momentum < 0.0:
+        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
+    Validator.check_value_type("frequency", frequency, [int], cls_name)
+    if isinstance(frequency, int) and frequency < 2:
+        raise ValueError("frequency should be at least 2, but got frequency {}".format(frequency))
+    Validator.check_value_type("learning rate", lr, [Tensor], cls_name)
+
+
 def caculate_device_shape(matrix_dim, channel, is_a):
     ll = (0)
     if is_a:
@@ -188,13 +199,103 @@ def thor(net, learning_rate, damping, momentum, weight_decay=0.0, loss_scale=1.0
          use_nesterov=False, decay_filter=lambda x: x.name not in [],
          split_indices=None, enable_clip_grad=False, frequency=100):
     r"""
-    Updates gradients by the THOR algorithm.
+    Updates gradients by the second-order optimization algorithm THOR.
+
+    The Trace-based Hardware-driven layer-ORiented Natural Gradient Descent Computation (THOR) algorithm is
+    proposed in:
+
+    `THOR: Trace-based Hardware-driven layer-ORiented Natural Gradient Descent Computation `_
+
+    The updating formulas are as follows,
+
+    .. math::
+        \begin{array}{ll} \\
+            A_i = a_i{a_i}^T \\
+            G_i = D_{s_i}{ D_{s_i}}^T \\
+            m_i = \beta * m_i + ({G_i^{(k)}}+\lambda I)^{-1} g_i ({\overline A_{i-1}^{(k)}}+\lambda I)^{-1} \\
+            w_i = w_i - \alpha * m_i \\
+        \end{array}
+
+    :math:`D_{s_i}` represents the derivative of the loss function with respect to the output of the i-th layer,
+    :math:`a_{i-1}` represents the input of the i-th layer, which is the activation of the previous layer,
+    :math:`\beta` represents momentum, :math:`I` represents the identity matrix,
+    :math:`\overline A` represents the transpose of matrix A,
+    :math:`\lambda` represents 'damping', :math:`g_i` represents the gradients of the i-th layer,
+    :math:`\otimes` represents the Kronecker product, :math:`\alpha` represents the 'learning rate'.
+
+    Note:
+        When separating parameter groups, the weight decay in each group will be applied to the parameters if the
+        weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
+        to the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive.
+
+        When separating parameter groups, if you want to centralize the gradient, set grad_centralization to True,
+        but the gradient centralization can only be applied to the parameters of convolution layers.
+        If the parameters of non-convolution layers are set to True, an error will be reported.
+
+        To improve the performance of parameter groups, the customized order of parameters is supported.
+
+    Args:
+        net (Cell): The training network.
+
+        learning_rate (Tensor): A value for the learning rate.
+
+        damping (Tensor): A value for the damping.
+
+        momentum (float): Hyper-parameter of type float, means momentum for the moving average.
+            It must be at least 0.0.
+
+        weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0.0. Default: 0.0.
+
+        loss_scale (float): A value for the loss scale. It must be greater than 0.0. In general, use the
+            default value. Default: 1.0.
+
+        batch_size (int): The size of a batch. Default: 32.
+
+        use_nesterov (bool): Enable Nesterov momentum. Default: False.
+
+        decay_filter (function): A function to determine the layers to which the weight decay is applied. It
+            only works when weight_decay > 0. Default: lambda x: x.name not in [].
+
+        split_indices (list): Set the allreduce fusion strategy by A/G layer indices. It only works in distributed
+            computing. Taking ResNet50 as an example, there are 54 layers of A/G respectively; when split_indices
+            is set to [26, 53], it means A/G is divided into two groups to allreduce, one is layers 0~26, and the
+            other is layers 27~53. Default: None.
+
+        enable_clip_grad (bool): Whether to clip the gradients. Default: False.
+
+        frequency (int): The update interval of A/G and :math:`A^{-1}/G^{-1}`. When frequency equals N (N is greater
+            than 1), A/G and :math:`A^{-1}/G^{-1}` will be updated every N steps, and the other steps will use the
+            stale A/G and :math:`A^{-1}/G^{-1}` to update weights. Default: 100.
+
+    Inputs:
+        - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
+
+    Outputs:
+        tuple[bool], all elements are True.
+
+    Raises:
+        TypeError: If `learning_rate` is not a Tensor.
+        TypeError: If `loss_scale` or `momentum` is not a float.
+        TypeError: If `frequency` is not an int.
+        TypeError: If `weight_decay` is neither float nor int.
+        TypeError: If `use_nesterov` is not a bool.
+        ValueError: If `loss_scale` is less than or equal to 0.
+        ValueError: If `weight_decay` or `momentum` is less than 0.
+        ValueError: If `frequency` is less than 2.
+
+    Supported Platforms:
+        ``Ascend`` ``GPU``
+
+    Examples:
+        >>> net = Net()
+        >>> optim = thor(net, learning_rate=Tensor(1e-3), damping=Tensor(1e-3), momentum=0.9)
+        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+        >>> model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=optim,
+        ... 
loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) + >>> model.train(config.epoch_size, dataset, callbacks=cb, sink_size=100, dataset_sink_mode=True) + """ context.set_context(max_call_depth=10000) - ConvertNetUntils().convert_to_thor_net(net) + ConvertNetUtils().convert_to_thor_net(net) if context.get_context("device_target") == "Ascend": return ThorAscend(net, learning_rate, damping, momentum, weight_decay, loss_scale, batch_size, decay_filter, split_indices=split_indices, enable_clip_grad=enable_clip_grad, frequency=frequency) @@ -212,9 +313,7 @@ class ThorGpu(Optimizer): enable_clip_grad=False, frequency=100): params = filter(lambda x: x.requires_grad, net.get_parameters()) super(ThorGpu, self).__init__(learning_rate, params, weight_decay, loss_scale) - Validator.check_value_type("momentum", momentum, [float], self.cls_name) - if isinstance(momentum, float) and momentum < 0.0: - raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) + _check_param(momentum, frequency, learning_rate, self.__class__.__name__) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") self.params = self.parameters self.use_nesterov = Validator.check_bool(use_nesterov) @@ -448,18 +547,14 @@ class ThorGpu(Optimizer): matrix_a = matrix_a_allreduce[thor_layer_count] matrix_g = matrix_g_allreduce[thor_layer_count] g = self.update_gradient(matrix_g, g, matrix_a) - fake_a = self.assign(self.matrix_a[thor_layer_count], matrix_a) - fake_g = self.assign(self.matrix_g[thor_layer_count], matrix_g) - g = F.depend(g, fake_a) - g = F.depend(g, fake_g) + self.assign(self.matrix_a[thor_layer_count], matrix_a) + self.assign(self.matrix_g[thor_layer_count], matrix_g) g = self._reshape_gradient(conv_layer_count, g, g_shape) elif layer_type == Embedding: matrix_a = matrix_a_allreduce[thor_layer_count] matrix_g = matrix_g_allreduce[thor_layer_count] - fake_a = self.assign(self.matrix_a[thor_layer_count], matrix_a) - fake_g = self.assign(self.matrix_g[thor_layer_count], matrix_g) - g = F.depend(g, fake_a) - g = F.depend(g, fake_g) + self.assign(self.matrix_a[thor_layer_count], matrix_a) + self.assign(self.matrix_g[thor_layer_count], matrix_g) temp_a = self.expand(matrix_a, 1) g = self.mul(temp_a, g) g = self.matmul(g, matrix_g) @@ -507,8 +602,7 @@ class ThorAscend(Optimizer): decay_filter=lambda x: x.name not in [], split_indices=None, enable_clip_grad=False, frequency=100): params = filter(lambda x: x.requires_grad, net.get_parameters()) super(ThorAscend, self).__init__(learning_rate, params, weight_decay, loss_scale) - if isinstance(momentum, float) and momentum < 0.0: - raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) + _check_param(momentum, frequency, learning_rate, self.__class__.__name__) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") self.params = self.parameters self.moments = self.params.clone(prefix="moments", init='zeros') @@ -845,10 +939,8 @@ class ThorAscend(Optimizer): """process thor graph fc layer""" temp_a = matrix_a_allreduce[thor_layer_count] temp_g = matrix_g_allreduce[thor_layer_count] - fake_a = self.assign(self.matrix_a_cov[thor_layer_count], temp_a) - fake_g = self.assign(self.matrix_g_cov[thor_layer_count], temp_g) - g = F.depend(g, fake_a) - g = F.depend(g, fake_g) + self.assign(self.matrix_a_cov[thor_layer_count], temp_a) + self.assign(self.matrix_g_cov[thor_layer_count], temp_g) temp_a = self.cast(temp_a, mstype.float16) temp_g = 
self.cast(temp_g, mstype.float16) g = self.cast(g, mstype.float16) @@ -925,10 +1017,8 @@ class ThorAscend(Optimizer): if layer_type == Embedding: temp_a_ori = matrix_a_allreduce[thor_layer_count] temp_g = matrix_g_allreduce[thor_layer_count] - fake_a = self.assign(self.matrix_a_cov[thor_layer_count], temp_a_ori) - fake_g = self.assign(self.matrix_g_cov[thor_layer_count], temp_g) - g = F.depend(g, fake_a) - g = F.depend(g, fake_g) + self.assign(self.matrix_a_cov[thor_layer_count], temp_a_ori) + self.assign(self.matrix_g_cov[thor_layer_count], temp_g) temp_a = self.expand(temp_a_ori, 1) g = self.mul(temp_a, g) temp_g = self.cast(temp_g, mstype.float16) @@ -982,12 +1072,9 @@ class ThorAscend(Optimizer): temp_a = self.cast(temp_a, mstype.float16) temp_g = self.cast(temp_g, mstype.float16) g, temp_max = self._get_second_grad_by_matmul(i, temp_a, temp_g, g, temp_max) - fake_a = self.assign(self.matrix_a[thor_layer_count], temp_a) - fake_g = self.assign(self.matrix_g[thor_layer_count], temp_g) - fake_max = self.assign(self.matrix_max_inv[thor_layer_count], temp_max) - g = F.depend(g, fake_a) - g = F.depend(g, fake_g) - g = F.depend(g, fake_max) + self.assign(self.matrix_a[thor_layer_count], temp_a) + self.assign(self.matrix_g[thor_layer_count], temp_g) + self.assign(self.matrix_max_inv[thor_layer_count], temp_max) new_grads = new_grads + (g,) gradients = new_grads else: diff --git a/mindspore/ops/operations/_thor_ops.py b/mindspore/ops/operations/_thor_ops.py index caf738fe429..537560d0ca2 100644 --- a/mindspore/ops/operations/_thor_ops.py +++ b/mindspore/ops/operations/_thor_ops.py @@ -85,7 +85,7 @@ class CusCholeskyTrsm(PrimitiveWithInfer): Examples: >>> input_x = Tensor(np.ones(shape=[256, 256]), mindspore.float32) >>> cus_choleskytrsm = ops.CusCholeskyTrsm() - >>> output = matmul(input_x) + >>> output = cus_choleskytrsm(input_x) """ @prim_attr_register diff --git a/mindspore/train/train_thor/__init__.py b/mindspore/train/train_thor/__init__.py index 1122ff25dbb..44c680b218d 100644 --- a/mindspore/train/train_thor/__init__.py +++ b/mindspore/train/train_thor/__init__.py @@ -14,6 +14,6 @@ # ============================================================================ """convert to second order related classes and functions.""" -from .convert_utils import ConvertNetUntils, ConvertModelUtils +from .convert_utils import ConvertNetUtils, ConvertModelUtils -__all__ = ["ConvertNetUntils", "ConvertModelUtils"] +__all__ = ["ConvertNetUtils", "ConvertModelUtils"] diff --git a/mindspore/train/train_thor/convert_utils.py b/mindspore/train/train_thor/convert_utils.py index d5d71123682..fd5609e9928 100644 --- a/mindspore/train/train_thor/convert_utils.py +++ b/mindspore/train/train_thor/convert_utils.py @@ -13,29 +13,28 @@ # limitations under the License. 
 # ============================================================================
 """
-convert utils for second order optimizer: thor
+Conversion interface for the second-order optimizer thor
 """
 import mindspore.nn as nn
 import mindspore.common.dtype as mstype
 from mindspore import context
 
 
-class ConvertNetUntils():
+class ConvertNetUtils():
     """
     Convert net to thor layer net
     """
 
     def __init__(self):
-        self._convert_method_map = {nn.Dense: ConvertNetUntils._convert_dense,
-                                    nn.Embedding: ConvertNetUntils._convert_embedding,
-                                    nn.Conv2d: ConvertNetUntils._convert_conv2d}
+        self._convert_method_map = {nn.Dense: ConvertNetUtils._convert_dense,
+                                    nn.Embedding: ConvertNetUtils._convert_embedding,
+                                    nn.Conv2d: ConvertNetUtils._convert_conv2d}
 
     @staticmethod
     def _convert_dense(subcell):
         """
-        convert dense cell to second_order cell
+        Convert dense cell to second-order cell
         """
-
         weight = subcell.weight
         act_name = None
         if subcell.activation_flag:
@@ -69,7 +68,7 @@ class ConvertNetUntils():
     @staticmethod
     def _convert_embedding(subcell):
         """
-        convert embedding cell to second_order cell
+        Convert embedding cell to second-order cell
         """
         new_subcell = nn.EmbeddingThor(vocab_size=subcell.vocab_size,
                                        embedding_size=subcell.embedding_size,
@@ -81,7 +80,7 @@ class ConvertNetUntils():
     @staticmethod
     def _convert_conv2d(subcell):
         """
-        convert conv2d cell to second_order cell
+        Convert conv2d cell to second-order cell
         """
         out_channel = subcell.out_channels
         in_channel = subcell.in_channels
@@ -99,7 +98,7 @@ class ConvertNetUntils():
 
     def _convert_to_thor_net(self, net):
         """
-        convert net to thor net
+        Convert net to thor net
         """
         cells = net.name_cells()
         change = False
@@ -131,25 +130,76 @@ class ConvertNetUntils():
 
     def convert_to_thor_net(self, net):
         """
-        api for convert net to thor net
+        This interface is used to convert a network to a thor layer network, in order to calculate and store the
+        second-order information matrix.
+
+        Note:
+            This interface is automatically called by the second-order optimizer thor.
+
+        Args:
+            net (Cell): Network to be trained by the second-order optimizer thor.
+
+        Examples:
+            >>> ConvertNetUtils().convert_to_thor_net(net)
         """
+        net.update_cell_prefix()
         self._convert_to_thor_net(net)
-        net.update_cell_type("second_order")
+        net.update_cell_type("second-order")
 
 
 class ConvertModelUtils():
     """
-    convert model to thor model utils
+    Convert model to thor model.
     """
-
     @staticmethod
     def convert_to_thor_model(model, network, loss_fn=None, optimizer=None, metrics=None, amp_level="O0",
                               loss_scale_manager=None, keep_batchnorm_fp32=False):
+        """
+        This interface is used to convert a model to a thor model.
+
+        Args:
+            model (Object): High-level API for training.
+                `Model` groups layers into an object with training features.
+            network (Cell): A training network.
+            loss_fn (Cell): Objective function. Default: None.
+            optimizer (Cell): Optimizer used to update the weights. Default: None.
+            metrics (Union[dict, set]): A dictionary or a set of metrics to be evaluated by the model during
+                training, e.g. {'accuracy', 'recall'}. Default: None.
+            amp_level (str): Level for mixed precision training. Supports ["O0", "O2", "O3", "auto"]. Default: "O0".
+
+                - O0: Do not change.
+                - O2: Cast the network to float16, keep BatchNorm running in float32, and use dynamic loss scaling.
+                - O3: Cast the network to float16 with the additional property 'keep_batchnorm_fp32=False'.
+                - auto: Set the level to the recommended level for the device. O2 is recommended on GPU, O3 is
+                  recommended on Ascend. The recommended level is based on expert experience and may not
+                  generalize to every network; users should specify the level explicitly for special networks.
+            loss_scale_manager (Union[None, LossScaleManager]): If None, the loss is not scaled.
+                Otherwise, the loss is scaled by the LossScaleManager, and the optimizer must not be None.
+                It is a keyword argument, e.g. `loss_scale_manager=None`. Default: None.
+            keep_batchnorm_fp32 (bool): Keep BatchNorm running in `float32`. If True, the previous level setting
+                will be overwritten. Default: False.
+
+        Returns:
+            model (Object): High-level API for training.
+                `Model` groups layers into an object with training features.
+
+        Examples:
+            >>> from mindspore.nn.optim import thor
+            >>> from mindspore.train.model import Model
+            >>> from mindspore.train.loss_scale_manager import FixedLossScaleManager
+            >>>
+            >>> net = Net()
+            >>> loss_manager = FixedLossScaleManager(128, drop_overflow_update=False)
+            >>> opt = thor(net, lr, damping, momentum=0.9, weight_decay=1e-4, loss_scale=128, batch_size=32,
            ...            frequency=100)
+            >>> model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_manager, metrics={"acc"},
            ...               amp_level="O2", keep_batchnorm_fp32=False)
+            >>> model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
            ...                                                   metrics={'acc'}, amp_level="O2",
            ...                                                   loss_scale_manager=loss_manager,
            ...                                                   keep_batchnorm_fp32=False)
         """
-        api for convert model to thor model
-        """
+
         optim_name = type(optimizer).__name__
         if optim_name in ("ThorAscend", "ThorGpu"):
             from .model_thor import ModelThor
diff --git a/mindspore/train/train_thor/dataset_helper.py b/mindspore/train/train_thor/dataset_helper.py
index b5811946e33..bb19e9ce367 100644
--- a/mindspore/train/train_thor/dataset_helper.py
+++ b/mindspore/train/train_thor/dataset_helper.py
@@ -53,6 +53,7 @@ class DatasetHelper:
             If sink_size=-1, sink the complete dataset for each epoch.
             If sink_size>0, sink sink_size data for each epoch. Default: -1.
         epoch_num (int): Control the number of epoch data to send. Default: 1.
+        iter_first_order (int): Control the number of steps that the first-order graph executes. Default: 1. 
Examples: >>> dataset_helper = DatasetHelper(dataset) diff --git a/tests/st/networks/models/resnet50/src_thor/thor.py b/tests/st/networks/models/resnet50/src_thor/thor.py index 6dda7c46a51..a8ddb37c0d7 100644 --- a/tests/st/networks/models/resnet50/src_thor/thor.py +++ b/tests/st/networks/models/resnet50/src_thor/thor.py @@ -26,7 +26,7 @@ from mindspore import context from mindspore.context import ParallelMode from mindspore.nn.layer import DenseThor, Conv2dThor, EmbeddingThor from mindspore.nn.wrap import DistributedGradReducer -from mindspore.train.train_thor.convert_utils import ConvertNetUntils +from mindspore.train.train_thor.convert_utils import ConvertNetUtils from mindspore.parallel._auto_parallel_context import auto_parallel_context # Enumerates types of Layer @@ -147,7 +147,7 @@ def get_layer_counter(layer_type, layer_counter, params, idx): def THOR(net, learning_rate, damping, momentum, weight_decay=0.0, loss_scale=1.0, batch_size=32, use_nesterov=False, decay_filter=lambda x: x.name not in [], split_indices=None): context.set_context(max_call_depth=10000) - ConvertNetUntils().convert_to_thor_net(net) + ConvertNetUtils().convert_to_thor_net(net) return THOR_Ascend(net, learning_rate, damping, momentum, weight_decay, loss_scale, batch_size, decay_filter, split_indices=split_indices) @@ -486,12 +486,9 @@ class THOR_Ascend(Optimizer): temp_max = self.mul(temp_max, self.batch_size / A_normalizer) g = self.cube_matmul_left(temp_g, g) g = self.cube_matmul_right_mul(g, temp_a, temp_max) - fake_A = self.assign(self.matrix_A[thor_layer_count], temp_a) - fake_G = self.assign(self.matrix_G[thor_layer_count], temp_g) - fake_max = self.assign(self.matrix_max_inv[thor_layer_count], temp_max) - g = F.depend(g, fake_A) - g = F.depend(g, fake_G) - g = F.depend(g, fake_max) + self.assign(self.matrix_A[thor_layer_count], temp_a) + self.assign(self.matrix_G[thor_layer_count], temp_g) + self.assign(self.matrix_max_inv[thor_layer_count], temp_max) if i == 159: new_grads = new_grads + (g, gradients[i + 1]) else:
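
For context, the pieces touched by this patch fit together as follows: `thor()` converts the network through the renamed `ConvertNetUtils` and returns a `ThorAscend`/`ThorGpu` optimizer, and `ConvertModelUtils.convert_to_thor_model()` then wraps the `Model` so training uses the THOR-specific path. The sketch below is not part of the diff; it strings these calls together under the docstrings added above. `Net()`, `dataset`, and the hyper-parameter values are assumed placeholders, while the call signatures follow the code in this patch.

# Reviewer aid, not part of the patch: a minimal end-to-end sketch of the intended workflow.
# `Net()` and `dataset` are placeholders for a user-defined network and a MindSpore dataset;
# the hyper-parameter values are illustrative only.
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.nn.optim import thor
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.train_thor import ConvertModelUtils

net = Net()                                            # placeholder network
loss = nn.SoftmaxCrossEntropyWithLogits()
loss_manager = FixedLossScaleManager(128, drop_overflow_update=False)

# thor() calls ConvertNetUtils().convert_to_thor_net(net) internally, so the network
# does not need to be converted by hand before building the optimizer.
opt = thor(net, learning_rate=Tensor(1e-3), damping=Tensor(1e-3), momentum=0.9,
           weight_decay=1e-4, loss_scale=128, batch_size=32, frequency=100)

model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_manager,
              metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False)
# convert_to_thor_model swaps in ModelThor when the optimizer is ThorAscend or ThorGpu.
model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss,
                                                  optimizer=opt, metrics={'acc'}, amp_level="O2",
                                                  loss_scale_manager=loss_manager,
                                                  keep_batchnorm_fp32=False)
model.train(1, dataset, dataset_sink_mode=True)        # `dataset` is a placeholder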