diff --git a/mindspore/boost/adasum.py b/mindspore/boost/adasum.py
index 5abc0ddcad9..134728a95fb 100644
--- a/mindspore/boost/adasum.py
+++ b/mindspore/boost/adasum.py
@@ -150,9 +150,10 @@ class AdaSum(Cell):
     parallel training of Deep Learning models.
 
     Args:
-        network (Cell): The training network. The network only supports single output.
-        optimizer (Union[Cell]): Optimizer for updating the weights.
-        sens (numbers.Number): The scaling number to be filled as the input of backpropagation. Default value is 1.0.
+        rank (int): Rank number.
+        device_number (int): Device number.
+        group_number (int): Group number.
+        parameter_tuple (Tuple(Parameter)): Tuple of parameters.
 
     Inputs:
         - **delta_weights** (Tuple(Tensor)) - Tuple of gradients.
diff --git a/mindspore/boost/base.py b/mindspore/boost/base.py
index 9bc4270358d..decbc68819a 100644
--- a/mindspore/boost/base.py
+++ b/mindspore/boost/base.py
@@ -25,7 +25,7 @@ __all__ = ["OptimizerProcess", "ParameterProcess"]
 
 
 class OptimizerProcess:
-    """
+    r"""
     Process optimizer for Boost.
 
     Currently, this class supports adding GC(grad centralization) tags and creating new optimizers.
@@ -68,7 +68,12 @@ class OptimizerProcess:
         self.origin_params = opt.init_params["params"]
 
     def build_params_dict(self, network):
-        """Build the params dict of the network"""
+        r"""
+        Build the parameter dict of the network.
+
+        Inputs:
+            - **network** (Cell) - The training network.
+        """
         cells = network.cells_and_names()
         params_dict = {}
         for _, cell in cells:
@@ -77,7 +82,13 @@ class OptimizerProcess:
         return params_dict
 
     def build_gc_params_group(self, params_dict, parameters):
-        """Build the params group that needs gc"""
+        r"""
+        Build the parameter groups that use grad centralization.
+
+        Inputs:
+            - **params_dict** (dict) - The network's parameter dict.
+            - **parameters** (list) - The network's parameter list.
+        """
         group_params = []
         for group_param in parameters:
             if 'order_params' in group_param.keys():
@@ -107,7 +118,12 @@ class OptimizerProcess:
         return group_params
 
     def add_grad_centralization(self, network):
-        """Add gradient centralization."""
+        r"""
+        Add gradient centralization.
+
+        Inputs:
+            - **network** (Cell) - The training network.
+        """
         params_dict = self.build_params_dict(network)
 
         parameters = self.origin_params
@@ -137,7 +153,7 @@ class OptimizerProcess:
 
 
 class ParameterProcess:
-    """
+    r"""
     Process parameter for Boost.
 
     Currently, this class supports creating group parameters and automatically setting gradient segmentation point.
@@ -171,7 +187,13 @@ class ParameterProcess:
         self._parameter_indices = 1
 
     def assign_parameter_group(self, parameters, split_point=None):
-        """Assign parameter group."""
+        r"""
+        Assign parameter group.
+
+        Inputs:
+            - **parameters** (list) - The network's parameter list.
+            - **split_point** (list) - The gradient split point of this network. Default: None.
+        """
         if not isinstance(parameters, (list, tuple)) or not parameters:
             return parameters
 
@@ -187,7 +209,13 @@ class ParameterProcess:
         return parameters
 
     def generate_group_params(self, parameters, origin_params):
-        """Generate group parameters."""
+        r"""
+        Generate group parameters.
+
+        Inputs:
+            - **parameters** (list) - The network's parameter list.
+            - **origin_params** (list) - The network's original parameter list.
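+
+        Examples:
+            >>> # A minimal usage sketch: `network` is assumed to be an already constructed
+            >>> # nn.Cell whose trainable parameters will be grouped.
+            >>> parameter_process = ParameterProcess()
+            >>> group_params = parameter_process.generate_group_params(network.trainable_params()[:1],
+            ...                                                        network.trainable_params())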
+        """
         origin_params_copy = origin_params
         if origin_params_copy is not None:
             if not isinstance(origin_params_copy, list):
diff --git a/mindspore/boost/boost.py b/mindspore/boost/boost.py
index a8a8d077705..856bec33382 100644
--- a/mindspore/boost/boost.py
+++ b/mindspore/boost/boost.py
@@ -37,7 +37,7 @@ _boost_config_level = {
 
 
 class AutoBoost:
-    """
+    r"""
     Provide auto accelerating for network.
 
     Args:
@@ -68,7 +68,13 @@ class AutoBoost:
             self._boost_config_func_map[key](self, val)
 
     def network_auto_process_train(self, network, optimizer):
-        """Network train."""
+        r"""
+        Boost the network for training.
+
+        Inputs:
+            - **network** (Cell) - The training network.
+            - **optimizer** (Cell) - Optimizer for updating the weights.
+        """
         if self._boost_config["less_bn"]:
             network = LessBN(network, fn_flag=self._fn_flag)
             optimizer_process = OptimizerProcess(optimizer)
@@ -90,7 +96,12 @@ class AutoBoost:
         return network, optimizer
 
     def network_auto_process_eval(self, network):
-        """Network eval."""
+        r"""
+        Boost the network for evaluation.
+
+        Inputs:
+            - **network** (Cell) - The inference network.
+        """
         if self._boost_config["less_bn"]:
             network = LessBN(network)
 
diff --git a/mindspore/boost/boost_cell_wrapper.py b/mindspore/boost/boost_cell_wrapper.py
index 122698fe9d0..21ce50a177d 100644
--- a/mindspore/boost/boost_cell_wrapper.py
+++ b/mindspore/boost/boost_cell_wrapper.py
@@ -209,7 +209,12 @@ class BoostTrainOneStepCell(TrainOneStepCell):
         return loss
 
     def gradient_freeze_process(self, *inputs):
-        """gradient freeze algorithm process."""
+        r"""
+        Gradient freeze algorithm process.
+
+        Inputs:
+            - **(*inputs)** (Tuple(Tensor)) - Tuple of input tensors with shape :math:`(N, \ldots)`.
+        """
         if self.train_strategy is None:
             step = self.step
             max_index = len(self.freeze_nets)
@@ -224,7 +229,13 @@ class BoostTrainOneStepCell(TrainOneStepCell):
         return loss
 
     def gradient_accumulation_process(self, loss, grads):
-        """gradient accumulation algorithm process."""
+        r"""
+        Gradient accumulation algorithm process.
+
+        Inputs:
+            - **loss** (Tensor) - Tensor with shape :math:`()`.
+            - **grads** (Tuple(Tensor)) - Tuple of gradient tensors.
+        """
         loss = F.depend(loss, self.hyper_map(F.partial(gradient_accumulation_op, self.max_accumulation_step),
                                              self.grad_accumulation, grads))
         self.accumulation_step += 1
@@ -242,7 +253,13 @@ class BoostTrainOneStepCell(TrainOneStepCell):
         return loss
 
     def adasum_process(self, loss, grads):
-        """adasum algorithm process."""
+        r"""
+        AdaSum algorithm process.
+
+        Inputs:
+            - **loss** (Tensor) - Tensor with shape :math:`()`.
+            - **grads** (Tuple(Tensor)) - Tuple of gradient tensors.
+        """
         loss = F.depend(loss, self.optimizer(grads))
         rank_weights = self.weights[self.start[self.server_rank]: self.end[self.server_rank]]
         grad_clone = F.depend(self.grad_clone, loss)
@@ -261,7 +278,13 @@ class BoostTrainOneStepCell(TrainOneStepCell):
         return loss
 
     def check_adasum_enable(self, optimizer, reducer_flag):
-        """check adasum enable."""
+        r"""
+        Check whether the AdaSum algorithm is enabled.
+
+        Inputs:
+            - **optimizer** (Union[Cell]) - Optimizer for updating the weights.
+            - **reducer_flag** (bool) - Whether the distributed gradient reducer is enabled.
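+
+        Examples:
+            >>> # Sketch of how `reducer_flag` is typically derived from the parallel mode
+            >>> # before this check is called (requires the distributed context to be configured).
+            >>> from mindspore import context
+            >>> from mindspore.context import ParallelMode
+            >>> parallel_mode = context.get_auto_parallel_context("parallel_mode")
+            >>> reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)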
+        """
         if not getattr(optimizer, "adasum", None) or not reducer_flag:
             return False
         _rank_size = get_group_size()
@@ -280,7 +303,7 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell):
     BoostTrainOneStepWithLossScaleCell will be compiled to be graph which takes `*inputs` as input data.
     The Tensor type of `scale_sense` is acting as loss scaling value. If you want to update it on host side,
     the value must be provided. If the Tensor type of `scale_sense` is not given, the loss scale update logic
-    must be provied by Cell type of `scale_sense`.
+    must be provided by Cell type of `scale_sense`.
 
     Args:
         network (Cell): The training network. The network only supports single output.
diff --git a/mindspore/boost/grad_freeze.py b/mindspore/boost/grad_freeze.py
index 60d1a54d2bd..6dd692b0606 100644
--- a/mindspore/boost/grad_freeze.py
+++ b/mindspore/boost/grad_freeze.py
@@ -114,7 +114,7 @@ class FreezeOpt(Cell):
 
 
 class _TrainFreezeCell(Cell):
-    """
+    r"""
     Gradient freezing training network.
 
     Args:
@@ -157,7 +157,7 @@ class _TrainFreezeCell(Cell):
 
 
 class GradientFreeze:
-    """
+    r"""
     Freezing the gradients of some layers randomly. The number and
     probability of frozen layers can be configured by users
 
@@ -180,7 +180,13 @@ class GradientFreeze:
         self._param_processer = ParameterProcess()
 
     def split_parameters_groups(self, net, freeze_para_groups_number):
-        """Split parameter groups for gradients freezing training."""
+        r"""
+        Split parameter groups for gradients freezing training.
+
+        Inputs:
+            - **net** (Cell) - The training network.
+            - **freeze_para_groups_number** (int) - The number of gradient freeze groups.
+        """
         grouped_params = []
         tmp = []
         for para in net.trainable_params():
@@ -201,7 +207,15 @@ class GradientFreeze:
         return freeze_grouped_params
 
     def generate_freeze_index_sequence(self, parameter_groups_number, freeze_strategy, freeze_p, total_steps):
-        """Generate index sequence for gradient freezing training."""
+        r"""
+        Generate index sequence for gradient freezing training.
+
+        Inputs:
+            - **parameter_groups_number** (int) - The number of parameter groups.
+            - **freeze_strategy** (int) - Gradient freeze grouping strategy, select from [0, 1].
+            - **freeze_p** (float) - Gradient freezing probability.
+            - **total_steps** (int) - Total training steps.
+        """
         total_step = int(total_steps * 1.01)
         if parameter_groups_number <= 1:
             return [0 for _ in range(total_step)]
@@ -235,7 +249,13 @@ class GradientFreeze:
             f"Unsupported freezing training strategy '{freeze_strategy}'")
 
     def freeze_generate(self, network, optimizer):
-        """Generate freeze network and optimizer."""
+        r"""
+        Generate the freeze network and optimizer.
+
+        Inputs:
+            - **network** (Cell) - The training network.
+            - **optimizer** (Cell) - Optimizer for updating the weights.
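+
+        Examples:
+            >>> # A minimal sketch: `network` and `optimizer` are assumed to be an existing
+            >>> # training Cell and its optimizer; the arguments below configure 10 groups,
+            >>> # strategy 1, freeze probability 0.5 and 2000 total steps.
+            >>> gradient_freeze = GradientFreeze(10, 1, 0.5, 2000)
+            >>> network, optimizer = gradient_freeze.freeze_generate(network, optimizer)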
+        """
         train_para_groups = self.split_parameters_groups(
             network, self._param_groups)
         for i in range(self._param_groups):
@@ -250,7 +270,43 @@
 
 def freeze_cell(reducer_flag, network, optimizer, sens, grad, use_grad_accumulation, mean=None, degree=None,
                 max_accumulation_step=1):
-    """Provide freeze network cell."""
+    r"""
+    Provide the gradient freezing training cells.
+
+    Inputs:
+        - **reducer_flag** (bool) - Reducer flag.
+        - **network** (Cell) - The training network.
+        - **optimizer** (Cell) - Optimizer for updating the weights.
+        - **sens** (Tensor) - Tensor with shape :math:`()`.
+        - **grad** (Tuple(Tensor)) - Tuple of gradient tensors.
+        - **use_grad_accumulation** (bool) - Use gradient accumulation flag.
+        - **mean** (bool) - Gradients mean flag. Default: None.
+        - **degree** (int) - Device number. Default: None.
+        - **max_accumulation_step** (int) - Max accumulation steps. Default: 1.
+
+    Examples:
+        >>> import numpy as np
+        >>> from mindspore import Tensor, Parameter, nn
+        >>> import mindspore.ops as ops
+        >>> from mindspore.boost.grad_freeze import freeze_cell
+        >>>
+        >>> class Net(nn.Cell):
+        ...     def __init__(self, in_features, out_features):
+        ...         super(Net, self).__init__()
+        ...         self.weight = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)),
+        ...                                 name='weight')
+        ...         self.matmul = ops.MatMul()
+        ...
+        ...     def construct(self, x):
+        ...         output = self.matmul(x, self.weight)
+        ...         return output
+        ...
+        >>> in_features, out_features = 16, 10
+        >>> network = Net(in_features, out_features)
+        >>> optimizer = nn.Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
+        >>> grad = ops.GradOperation(get_by_list=True, sens_param=True)
+        >>> freeze_nets = freeze_cell(False, network, optimizer, 1.0, grad, False, None, None, 1)
+    """
     if reducer_flag:
         param_processer = ParameterProcess()
         grad_reducers = (DistributedGradReducer(param_processer.assign_parameter_group(opt.parameters),
diff --git a/mindspore/nn/wrap/loss_scale.py b/mindspore/nn/wrap/loss_scale.py
index 3488a6c4b61..2d5cfedc89c 100644
--- a/mindspore/nn/wrap/loss_scale.py
+++ b/mindspore/nn/wrap/loss_scale.py
@@ -232,7 +232,7 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell):
     TrainOneStepWithLossScaleCell will be compiled to be graph which takes `*inputs` as input data.
     The Tensor type of `scale_sense` is acting as loss scaling value. If you want to update it on host side,
     the value must be provided. If the Tensor type of `scale_sense` is not given, the loss scale update logic
-    must be provied by Cell type of `scale_sense`.
+    must be provided by Cell type of `scale_sense`.
 
     Args:
         network (Cell): The training network. The network only supports single output.