forked from mindspore-Ecosystem/mindspore
!18667 Remove redundant depend
Merge pull request !18667 from huangbingjian/remove_redundant_depend
commit 8a8851dc52
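Every hunk below applies the same mechanical rewrite across the model zoo and test scripts: instead of threading the optimizer update into the return value with F.depend (or ops.depend), the cell now calls the optimizer for its side effect and returns the loss (or result tuple) directly, the explicit depend edge being treated as redundant. A minimal sketch of the before/after shape of a train-step cell, assuming the usual MindSpore APIs; the class names here are illustrative and do not come from any specific file in this diff:

import mindspore.nn as nn
from mindspore.ops import composite as C
from mindspore.ops import functional as F


class TrainStepOld(nn.Cell):
    """Old pattern: the update is chained to the output through F.depend."""
    def __init__(self, network, optimizer):
        super().__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer
        self.weights = optimizer.parameters
        self.grad = C.GradOperation(get_by_list=True)

    def construct(self, *inputs):
        loss = self.network(*inputs)
        grads = self.grad(self.network, self.weights)(*inputs)
        # F.depend makes the returned loss depend on the optimizer call,
        # which keeps the parameter update in the executed graph.
        return F.depend(loss, self.optimizer(grads))


class TrainStepNew(nn.Cell):
    """New pattern from this commit: call the optimizer, return the loss."""
    def __init__(self, network, optimizer):
        super().__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer
        self.weights = optimizer.parameters
        self.grad = C.GradOperation(get_by_list=True)

    def construct(self, *inputs):
        loss = self.network(*inputs)
        grads = self.grad(self.network, self.weights)(*inputs)
        self.optimizer(grads)   # executed for its side effect only
        return loss

Where F.depend was the only use of the functional module, the corresponding `from mindspore.ops import functional as F` import is dropped as well, which is why several hunks below touch only the import block.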
@@ -310,8 +310,8 @@ class TrainingWrapper(nn.Cell):
         else:
             cond = self.less_equal(self.base, flag_sum)

-        ret = (loss, cond, sens)
-        return F.depend(ret, self.optimizer(grads))
+        self.optimizer(grads)
+        return (loss, cond, sens)


 class CenterFaceWithNms(nn.Cell):
@@ -135,10 +135,8 @@ class CNNCTCTrainOneStepWithLossScaleCell(nn.Cell):
         #apply grad reducer on grads
         grads = self.grad_reducer(grads)

-        success = self.optimizer(grads)
-
-        ret = (loss, scaling_sens)
-        return F.depend(ret, success)
+        self.optimizer(grads)
+        return (loss, scaling_sens)

 class CNNCTC_Model(nn.Cell):

@@ -108,4 +108,5 @@ class TrainOneStepCellWithGradClip(Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -184,4 +184,5 @@ class TrainingWrapper(nn.Cell):
         grads = self.grad(self.network, weights)(*args, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -18,7 +18,6 @@ import time
 import numpy as np
 import mindspore.nn as nn
 from mindspore.common.tensor import Tensor
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore import ParameterTuple
 from mindspore.train.callback import Callback

@@ -140,4 +139,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(x, gt_bbox, gt_label, gt_num, img_shape, self.sens)
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -18,7 +18,6 @@ import time
 import numpy as np
 import mindspore.nn as nn
 from mindspore.common.tensor import Tensor
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore import ParameterTuple
 from mindspore.train.callback import Callback

@@ -150,4 +149,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, self.sens)
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -18,7 +18,6 @@ import time
 import numpy as np
 import mindspore.nn as nn
 from mindspore.common.tensor import Tensor
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore import ParameterTuple
 from mindspore.train.callback import Callback

@@ -147,4 +146,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, self.sens)
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -18,7 +18,6 @@ import time
 import numpy as np
 import mindspore.nn as nn
 from mindspore.common.tensor import Tensor
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore import ParameterTuple
 from mindspore.train.callback import Callback

@@ -146,5 +145,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, gt_mask, self.sens)
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
-
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -177,7 +177,8 @@ class TrainOneStepCell(nn.Cell):
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

 class MaskRcnn_Mobilenetv1_Infer(nn.Cell):
     def __init__(self, config):
@@ -934,4 +934,5 @@ class NASNetAMobileTrainOneStepWithClipGradient(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -199,4 +199,5 @@ class TrainOneStepWithClipGradientCell(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -23,7 +23,6 @@ from mindspore import ParameterTuple
 from mindspore.common.tensor import Tensor
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.train.callback import Callback

 __all__ = ['LossCallBack', 'WithLossCell', 'TrainOneStepCell']

@@ -144,4 +143,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(img, gt_text, gt_kernels, training_mask, self.sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -19,7 +19,6 @@ import numpy as np

 import mindspore
 import mindspore.nn as nn
-from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore import context, Tensor

@@ -524,4 +523,5 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -316,7 +316,8 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

 class resnet(nn.Cell):
     """

@@ -525,7 +525,8 @@ class TrainingWrapper(nn.Cell):
         if self.use_global_norm:
             grads = self.hyper_map(F.partial(grad_scale, F.scalar_to_array(self.sens)), grads)
             grads = C.clip_by_global_norm(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class SSDWithMobileNetV2(nn.Cell):
@@ -105,4 +105,5 @@ class TrainOneStepCellWithGradClip(Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -444,4 +444,5 @@ class TrainingWrapper(nn.Cell):
         grads = self.grad(self.network, weights)(*args, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -436,4 +436,5 @@ class TrainingWrapper(nn.Cell):
         grads = self.grad(self.network, weights)(*args, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -672,7 +672,8 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class YoloBoxScores(nn.Cell):

@@ -515,7 +515,8 @@ class TrainingWrapper(nn.Cell):
         grads = self.grad(self.network, weights)(*args, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class Giou(nn.Cell):

@@ -427,7 +427,8 @@ class TrainingWrapper(nn.Cell):
         grads = self.grad(self.network, weights)(*args, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class Giou(nn.Cell):
@@ -18,7 +18,6 @@ from mindspore.common.parameter import ParameterTuple
 from mindspore import Tensor
 from mindspore.common import dtype as mstype
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.ops import operations as P


@@ -150,7 +149,8 @@ class TrainOneStepCell(nn.Cell):
         loss = self.network(feature, biases)
         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
         grads = self.grad(self.network, weights)(feature, biases, sens)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class TrainGAT(nn.Cell):
@@ -152,12 +152,9 @@ class BertFinetuneCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)

 class BertSquadCell(nn.Cell):
     """

@@ -245,12 +242,9 @@ class BertSquadCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)

 class BertCLS(nn.Cell):
     """
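The loss-scale cells above follow a second variant of the same cleanup: the succ placeholder and the F.depend(ret, succ) at the end are replaced by a plain conditional update. A small illustrative helper, not taken verbatim from the diff, that captures the rewritten control flow:

def finish_step(optimizer, grads, loss, cond, overflow):
    """Sketch of the rewritten overflow branch; all arguments are assumed to
    have been computed earlier in construct(), as in the cells above."""
    # Old shape:
    #     if overflow:
    #         succ = False
    #     else:
    #         succ = self.optimizer(grads)
    #     ret = (loss, cond)
    #     return F.depend(ret, succ)
    # New shape: skip the update on overflow and return the tuple directly.
    if not overflow:
        optimizer(grads)
    return (loss, cond)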
@@ -311,8 +311,8 @@ class BertTrainOneStepCell(nn.TrainOneStepCell):
         if self.enable_clip_grad:
             grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         grads = self.grad_reducer(grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss


 grad_scale = C.MultitypeFuncGraph("grad_scale")

@@ -400,12 +400,9 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell):

@@ -475,9 +472,8 @@ class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if self.loss_scaling_manager is not None:
             overflow = self.loss_scaling_manager(scaling_sens, cond)
-        succ = self.optimizer(grads, overflow)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        self.optimizer(grads, overflow)
+        return (loss, cond, scaling_sens)

 cast = P.Cast()
 add_grads = C.MultitypeFuncGraph("add_grads")

@@ -634,9 +630,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         accu_overflow = self.select(overflow, self.one, self.zero)
         self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)

-        if is_accu_step:
-            succ = False
-        else:
+        if not is_accu_step:
             # apply grad reducer on grads
             grads = self.grad_reducer(self.accu_grads)
             scaling = scaling_sens * self.degree * self.accumulation_steps

@@ -653,13 +647,10 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         overflow = self.reshape(overflow, (()))
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, overflow)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
+        if not overflow:
+            self.optimizer(grads)

-        ret = (mean_loss, overflow, scaling_sens)
-        return F.depend(ret, succ)
+        return (mean_loss, overflow, scaling_sens)


 class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell):
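The gradient-accumulation cells get an analogous control-flow cleanup: the dummy succ = False branch for accumulation steps disappears, and the reducer only runs when a real update step is reached. A hedged sketch of the new branch (the attribute names mirror the hunks above; the helper itself is illustrative):

def reduce_if_update_step(cell, is_accu_step, scaling_sens):
    """Return reduced grads and the effective scaling on a real update step."""
    if not is_accu_step:
        # apply grad reducer on grads only when the accumulated step is flushed
        grads = cell.grad_reducer(cell.accu_grads)
        scaling = scaling_sens * cell.degree * cell.accumulation_steps
        return grads, scaling
    return cell.accu_grads, scaling_sens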
@@ -311,8 +311,8 @@ class BertTrainOneStepCell(nn.TrainOneStepCell):
         if self.enable_clip_grad:
             grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         grads = self.grad_reducer(grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss


 grad_scale = C.MultitypeFuncGraph("grad_scale")

@@ -400,12 +400,9 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell):

@@ -475,9 +472,8 @@ class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if self.loss_scaling_manager is not None:
             overflow = self.loss_scaling_manager(scaling_sens, cond)
-        succ = self.optimizer(grads, overflow)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        self.optimizer(grads, overflow)
+        return (loss, cond, scaling_sens)

 cast = P.Cast()
 add_grads = C.MultitypeFuncGraph("add_grads")

@@ -634,9 +630,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         accu_overflow = self.select(overflow, self.one, self.zero)
         self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)

-        if is_accu_step:
-            succ = False
-        else:
+        if not is_accu_step:
             # apply grad reducer on grads
             grads = self.grad_reducer(self.accu_grads)
             scaling = scaling_sens * self.degree * self.accumulation_steps

@@ -653,13 +647,10 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         overflow = self.reshape(overflow, (()))
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, overflow)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
+        if not overflow:
+            self.optimizer(grads)

-        ret = (mean_loss, overflow, scaling_sens)
-        return F.depend(ret, succ)
+        return (mean_loss, overflow, scaling_sens)


 class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell):
@@ -254,11 +254,9 @@ class CPMTrainOneStepWithLossScaleCell(TrainOneStepWithLossScaleCell):

         cond = self.get_overflow_status(status, grads)
         overflow = self.process_loss_scale(cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        return F.depend(loss, succ), cond, scaling_sens
+        if not overflow:
+            self.optimizer(grads)
+        return loss, cond, scaling_sens


 cast = P.Cast()

@@ -352,7 +350,6 @@ class CPMTrainAccuStepsWithLossScaleCell(TrainOneStepWithLossScaleCell):
         accu_overflow = self.select(overflow, self.one, self.zero)

         if self.accumulation:
-            succ = False
             self.accu_overflow = accu_overflow
         else:
             my_zero = F.depend(self.zero, accu_overflow)

@@ -378,9 +375,7 @@ class CPMTrainAccuStepsWithLossScaleCell(TrainOneStepWithLossScaleCell):
         overflow = self.reshape(overflow, (()))
         overflow = self.process_loss_scale(overflow)

-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
+        if not overflow:
+            self.optimizer(grads)

-        return F.depend(loss, succ), overflow, scaling_sens
+        return loss, overflow, scaling_sens
@@ -152,12 +152,9 @@ class BertFinetuneCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)

 class BertSquadCell(nn.Cell):
     """

@@ -245,12 +242,9 @@ class BertSquadCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)

 class BertCLS(nn.Cell):
     """
@@ -308,8 +308,8 @@ class BertTrainOneStepCell(nn.TrainOneStepCell):
                                                      mstype.float32))
         grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         grads = self.grad_reducer(grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss


 grad_scale = C.MultitypeFuncGraph("grad_scale")

@@ -397,12 +397,9 @@ class BertTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell):

@@ -472,9 +469,8 @@ class BertTrainOneStepWithLossScaleCellForAdam(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if self.loss_scaling_manager is not None:
             overflow = self.loss_scaling_manager(scaling_sens, cond)
-        succ = self.optimizer(grads, overflow)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        self.optimizer(grads, overflow)
+        return (loss, cond, scaling_sens)

 cast = P.Cast()
 add_grads = C.MultitypeFuncGraph("add_grads")

@@ -631,9 +627,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         accu_overflow = self.select(overflow, self.one, self.zero)
         self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)

-        if is_accu_step:
-            succ = False
-        else:
+        if not is_accu_step:
             # apply grad reducer on grads
             grads = self.grad_reducer(self.accu_grads)
             scaling = scaling_sens * self.degree * self.accumulation_steps

@@ -650,13 +644,10 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         overflow = self.reshape(overflow, (()))
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, overflow)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
+        if not overflow:
+            self.optimizer(grads)

-        ret = (mean_loss, overflow, scaling_sens)
-        return F.depend(ret, succ)
+        return (mean_loss, overflow, scaling_sens)


 class BertTrainAccumulationAllReduceEachWithLossScaleCell(nn.Cell):
@@ -172,12 +172,9 @@ class ErnieFinetuneCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)

 class ErnieCLS(nn.Cell):
     """

@@ -138,5 +138,5 @@ class FastTextTrainOneStepCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)

-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss
@@ -284,9 +284,6 @@ class GNMTTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)

@@ -151,9 +151,6 @@ class GPTTrainOneStepWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)
@@ -234,12 +234,9 @@ class GRUTrainOneStepWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)

 class GRUTrainOneStepCell(nn.TrainOneStepCell):
     """

@@ -368,10 +368,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
+        if not overflow:
+            self.optimizer(grads)

-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        return (loss, cond, scaling_sens)
@@ -147,11 +147,9 @@ class PanguAlphaTrainOneStepWithLossScaleCell(TrainOneStepWithLossScaleCell):
         overflow = self.process_loss_scale(cond)
         # If overflow, surpass weights update
         # if not, update weights
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        return F.depend(loss, succ), cond, scaling_sens
+        if not overflow:
+            self.optimizer(grads)
+        return loss, cond, scaling_sens

 class PanguAlphaTrainPipelineWithLossScaleCell(nn.Cell):
     """

@@ -255,9 +253,6 @@ class PanguAlphaTrainPipelineWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, overflow, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, overflow, scaling_sens)
@@ -212,12 +212,9 @@ class BertTrainWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class BertTrainCell(nn.Cell):

@@ -271,8 +268,8 @@ class BertTrainCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss


 class BertNetworkWithLoss_td(nn.Cell):

@@ -451,12 +448,9 @@ class BertEvaluationWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)


 class BertEvaluationCell(nn.Cell):

@@ -507,5 +501,5 @@ class BertEvaluationCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss
@@ -285,12 +285,9 @@ class BertTrainWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)

 class BertTrainCell(nn.Cell):
     """

@@ -343,8 +340,8 @@ class BertTrainCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss

 class BertNetworkWithLoss_td(nn.Cell):
     """

@@ -551,12 +548,9 @@ class BertEvaluationWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class BertEvaluationCell(nn.Cell):

@@ -606,5 +600,5 @@ class BertEvaluationCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss
@@ -187,8 +187,8 @@ class TransformerTrainOneStepCell(nn.TrainOneStepCell):
         grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss


 grad_scale = C.MultitypeFuncGraph("grad_scale")

@@ -277,12 +277,9 @@ class TransformerTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 cast = P.Cast()

@@ -444,9 +441,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         accu_overflow = self.select(overflow, self.one, self.zero)
         self.accu_overflow = self.select(is_accu_step, accu_overflow, self.zero)

-        if is_accu_step:
-            succ = False
-        else:
+        if not is_accu_step:
             # apply grad reducer on grads
             grads = self.grad_reducer(self.accu_grads)
             scaling = scaling_sens * self.degree * self.accumulation_steps

@@ -463,10 +458,7 @@ class TransformerTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
         overflow = self.reshape(overflow, (()))
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, overflow)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
+        if not overflow:
+            self.optimizer(grads)

-        ret = (mean_loss, overflow, scaling_sens)
-        return F.depend(ret, succ)
+        return (mean_loss, overflow, scaling_sens)
@@ -20,7 +20,6 @@ from mindspore.nn.layer.activation import get_activation
 import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
 from mindspore.common.initializer import initializer
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore.context import ParallelMode
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer

@@ -261,7 +260,8 @@ class TrainStepWrap(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class PredictWithSigmoid(nn.Cell):
@@ -15,7 +15,6 @@
 """define network"""

 import mindspore.nn as nn
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore import ParameterTuple
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer

@@ -83,4 +82,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.net_with_loss, weights)(data3, data2, data1, label)
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

@@ -14,7 +14,6 @@
 # ============================================================================
 """define pretrain network"""
 import mindspore.nn as nn
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
 from mindspore import ParameterTuple

@@ -85,4 +84,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.net_with_loss, weights)(data1, data2, data3, label)
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -14,7 +14,6 @@
 # ============================================================================
 """define training network"""
 import mindspore.nn as nn
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
 from mindspore import ParameterTuple

@@ -84,4 +83,5 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.net_with_loss, weights)(data, label)
         if self.reduce_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -116,7 +116,8 @@ class TrainOneStepCellGen(nn.Cell):
         grads = self.grad(self.network, weights)(img_a, att_a, att_a_, att_b, att_b_, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads)), gf_loss, gc_loss, gr_loss
+        self.optimizer(grads)
+        return loss, gf_loss, gc_loss, gr_loss


 class TrainOneStepCellDis(nn.Cell):

@@ -152,4 +153,5 @@ class TrainOneStepCellDis(nn.Cell):
         if self.reducer_flag:
             grads = self.grad_reducer(grads)

-        return F.depend(loss, self.optimizer(grads)), d_real_loss, d_fake_loss, dc_loss, df_gp
+        self.optimizer(grads)
+        return loss, d_real_loss, d_fake_loss, dc_loss, df_gp
@@ -138,10 +138,8 @@ class TrainOneStepWithLossScaleCell(nn.Cell):
         else:
             cond = self.less_equal(self.base, flag_sum)

-        opt = self.optimizer(grads)
-
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, opt)
+        self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class BuildTrainNetworkV2(nn.Cell):
@@ -144,12 +144,9 @@ class IPTTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class SupConLoss(nn.Cell):
@@ -23,7 +23,6 @@ from mindspore.common import dtype as mstype
 from mindspore.context import ParallelMode
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 from mindspore.parallel._utils import _get_parallel_mode
 from mindspore.train.serialization import save_checkpoint

@@ -82,7 +81,8 @@ class MyTrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(*args, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 def sub_mean(x):
@@ -225,11 +225,7 @@ class GNMTTrainOneStepWithLossScaleCell(nn.Cell):

         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
+        if not overflow:
+            self.optimizer(grads)
         self.loss_scalar("loss", loss)
-        ret = (loss, cond, scaling_sens)
-
-        return F.depend(ret, succ)
+        return (loss, cond, scaling_sens)
@@ -22,7 +22,6 @@ from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits, L1Loss
 from mindspore.nn import Momentum
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.common.initializer import HeNormal
 from mindspore.common.initializer import Normal
 from mindspore import Tensor

@@ -382,7 +381,8 @@ class TrainStepWrap(nn.Cell):
         if not self.is_train:
             return loss
         grads = self.grad(self.network, weights)(x, labels1, labels2)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class TestStepWrap(nn.Cell):
@@ -59,7 +59,8 @@ class TrainOneStepD(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads_d = self.grad_reducer(grads_d)
-        return ops.depend(ld, self.optimizer(grads_d))
+        self.optimizer(grads_d)
+        return ld

 class TrainOnestepG(nn.Cell):
     """

@@ -103,4 +104,5 @@ class TrainOnestepG(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads_g = self.grad_reducer(grads_g)
-        return ops.depend(lg, self.optimizer(grads_g))
+        self.optimizer(grads_g)
+        return lg

@@ -59,5 +59,6 @@ class TrainOnestepPSNR(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return ops.depend(psnr_loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return psnr_loss
@@ -413,7 +413,8 @@ class TrainOneStepGenerator(nn.Cell):
         grads = self.grad(self.network, self.weights)(real_x, c_org, c_trg,
                                                       attr_diff, sens)
         grads = self.grad_reducer(grads)
-        return (ops.depend(loss_G, self.optimizer(grads)), fake_x, loss_G,
+        self.optimizer(grads)
+        return (loss_G, fake_x, loss_G,
                 loss_fake_G, loss_cls_G, loss_rec_G, loss_adv_G)


@@ -451,5 +452,6 @@ class TrainOneStepDiscriminator(nn.Cell):
         grads = self.grad(self.network, self.weights)(real_x, c_org, c_trg,
                                                       attr_diff, alpha, sens)
         grads = self.grad_reducer(grads)
-        return (ops.depend(loss_D, self.optimizer(grads)), loss_D, loss_real_D,
+        self.optimizer(grads)
+        return (loss_D, loss_D, loss_real_D,
                 loss_fake_D, loss_cls_D, loss_gp_D, loss_adv_D, attr_diff)
@@ -19,7 +19,6 @@ import mindspore
 import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.ops import ResizeNearestNeighbor
 from mindspore import Tensor, ParameterTuple, Parameter
 from mindspore.common.initializer import initializer, TruncatedNormal

@@ -410,7 +409,8 @@ class TrainStepWrap(nn.Cell):
         loss = self.network(image, label)
         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
         grads = self.grad(self.network, weights)(image, label, sens)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 def get_AdvancedEast_net(args):
@@ -232,9 +232,8 @@ class CenterNetWithoutLossScaleCell(nn.Cell):
         grads = self.grad(self.network, weights)(image, hm, reg_mask, ind, wh, kps,
                                                  kps_mask, reg, hm_hp, hp_offset,
                                                  hp_ind, hp_mask)
-        succ = self.optimizer(grads)
-        ret = loss
-        return ops.depend(ret, succ)
+        self.optimizer(grads)
+        return loss


 class CenterNetWithLossScaleCell(nn.Cell):

@@ -309,9 +308,8 @@ class CenterNetWithLossScaleCell(nn.Cell):
         else:
             cond = self.less_equal(self.base, flag_sum)

-        succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return ops.depend(ret, succ)
+        self.optimizer(grads)
+        return (loss, cond, scaling_sens)

 class CenterNetMultiPoseEval(nn.Cell):
     """
@@ -250,9 +250,8 @@ class CenterNetWithoutLossScaleCell(nn.Cell):
         weights = self.weights
         loss = self.network(image, hm, reg_mask, ind, wh, reg)
         grads = self.grad(self.network, weights)(image, hm, reg_mask, ind, wh, reg)
-        succ = self.optimizer(grads)
-        ret = loss
-        return ops.depend(ret, succ)
+        self.optimizer(grads)
+        return loss


 class CenterNetWithLossScaleCell(nn.Cell):

@@ -320,12 +319,9 @@ class CenterNetWithLossScaleCell(nn.Cell):
         else:
             cond = self.less_equal(self.base, flag_sum)
         overflow = cond
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return ops.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class CenterNetDetEval(nn.Cell):
@@ -208,9 +208,8 @@ class CenterNetWithoutLossScaleCell(nn.Cell):
         weights = self.weights
         loss = self.network(image, hm, reg_mask, ind, wh, reg)
         grads = self.grad(self.network, weights)(image, hm, reg_mask, ind, wh, reg)
-        succ = self.optimizer(grads)
-        ret = loss
-        return ops.depend(ret, succ)
+        self.optimizer(grads)
+        return loss


 class CenterNetWithLossScaleCell(nn.Cell):

@@ -279,12 +278,9 @@ class CenterNetWithLossScaleCell(nn.Cell):
         else:
             cond = self.less_equal(self.base, flag_sum)
         overflow = cond
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return ops.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class CenterNetDetEval(nn.Cell):
@@ -125,4 +125,5 @@ class MyTrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(*inputs, sens)
         grads = self.grad_reducer(grads)
         grads = ops.clip_by_global_norm(grads, 0.2)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -22,7 +22,6 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.ops.operations import Add, Split, Concat
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
-from mindspore.ops import functional as F
 from src.custom_op import SEBlock, GroupConv
 from src.blocks_ms import Interpolate, FeatureFusionBlock
 from src.loss import ScaleAndShiftInvariantLoss

@@ -390,4 +389,5 @@ class TrainOneStepCell(nn.Cell):
         if self.reduce_flag:
             grads = self.grad_reducer(grads)

-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -246,7 +246,8 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class retinanetInferWithDecoder(nn.Cell):

@@ -246,7 +246,8 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class retinanetInferWithDecoder(nn.Cell):
@@ -591,7 +591,8 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class SSDWithGhostNet(nn.Cell):

@@ -388,7 +388,8 @@ class TrainingWrapper(nn.Cell):
         if self.use_global_norm:
             grads = self.hyper_map(F.partial(grad_scale, F.scalar_to_array(self.sens)), grads)
             grads = C.clip_by_global_norm(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class SSDWithMobileNetV2(nn.Cell):
@@ -296,7 +296,8 @@ class TrainingWrapper(nn.Cell):
         if self.use_global_norm:
             grads = self.hyper_map(F.partial(grad_scale, F.scalar_to_array(self.sens)), grads)
             grads = C.clip_by_global_norm(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


@@ -457,7 +457,8 @@ class TrainingWrapper(nn.Cell):
         if self.use_global_norm:
             grads = self.hyper_map(F.partial(grad_scale, F.scalar_to_array(self.sens)), grads)
             grads = C.clip_by_global_norm(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss

 class SsdInferWithDecoder(nn.Cell):
     """
@@ -160,12 +160,9 @@ class GPT2FinetuneCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)


 class GPT2LM(nn.Cell):

@@ -365,12 +365,8 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-
+        if not overflow:
+            self.optimizer(grads)
         self.loss_scalar("loss", loss)

-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        return (loss, cond, scaling_sens)
@@ -21,7 +21,6 @@ from mindspore.common.initializer import initializer

 import mindspore.ops as P
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F

 from mindspore import Parameter, ParameterTuple
 from mindspore import Tensor

@@ -351,7 +350,8 @@ class TrainStepWrap(nn.Cell):
         grads = self.grad(self.network, weights)(cats_vals, num_vals, label, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class ModelBuilder:
@@ -18,7 +18,6 @@ import os
 import numpy as np
 from sklearn.metrics import roc_auc_score
 import mindspore.common.dtype as mstype
-from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
 from mindspore.nn import Dropout

@@ -333,7 +332,8 @@ class TrainStepWrap(nn.Cell):
         loss = self.network(batch_ids, batch_wts, label)
         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) #
         grads = self.grad(self.network, weights)(batch_ids, batch_wts, label, sens)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class PredictWithSigmoid(nn.Cell):
@@ -295,5 +295,5 @@ class NetworkNoClientTrainCell(nn.Cell):
                                                  self.cast(F.tuple_to_array((self.sens,)),
                                                            mstype.float32))
         grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)
-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss
@@ -671,7 +671,8 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 class YoloBoxScores(nn.Cell):
@@ -59,7 +59,7 @@ class YoloBlock(nn.Cell):

     Args:
         in_channels: Integer. Input channel.
-        out_chls: Interger. Middle channel.
+        out_chls: Integer. Middle channel.
         out_channels: Integer. Output channel.

     Returns:

@@ -108,7 +108,7 @@ class YOLOv3(nn.Cell):
     Args:
         backbone_shape: List. Darknet output channels shape.
         backbone: Cell. Backbone Network.
-        out_channel: Interger. Output channel.
+        out_channel: Integer. Output channel.

     Returns:
         Tensor, output tensor.

@@ -436,4 +436,5 @@ class TrainingWrapper(nn.Cell):
         grads = self.grad(self.network, weights)(*args, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss
@@ -321,8 +321,8 @@ class BertTrainOneStepCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)

-        succ = self.optimizer(grads)
-        return F.depend(loss, succ)
+        self.optimizer(grads)
+        return loss


 grad_scale = C.MultitypeFuncGraph("grad_scale")

@@ -431,9 +431,6 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)
@@ -122,12 +122,9 @@ class BertFinetuneCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond)

 class BertCLSModel(nn.Cell):
     """
@@ -20,7 +20,6 @@ from mindspore.common.parameter import ParameterTuple, Parameter
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore.nn.optim import Momentum
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.ops import operations as P


@@ -67,10 +66,11 @@ class TrainOneStepWithLarsCell(nn.Cell):
         bias_grads = grads[self.slice_index: self.params_len]
         lars_grads = self.lars(non_bias_weights, non_bias_grads, self.weight_decay)
         new_grads = lars_grads + bias_grads
-        return F.depend(loss, self.optimizer(new_grads))
+        self.optimizer(new_grads)
+        return loss


-# fn is a funcation use i as input
+# fn is a function use i as input
 def lr_gen(fn, epoch_size):
     for i in range(epoch_size):
         yield fn(i)
@@ -21,7 +21,7 @@ from mindspore import context
 from mindspore.common.parameter import Parameter, ParameterTuple
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore.nn.optim.momentum import Momentum
-from mindspore.ops import composite as C, functional as F, operations as P
+from mindspore.ops import composite as C, operations as P
 from mindspore.train import Model
 from mindspore.context import ParallelMode
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager

@@ -114,7 +114,8 @@ class TrainOneStepCell(nn.Cell):
         weights = self.weights
         loss = self.network(data)
         grads = self.grad(self.network, weights)(data, sens)
-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 def loss_scale_manager_sens(strategy1, sens):
@@ -25,7 +25,6 @@ from mindspore.nn import Dense, Cell
 from mindspore.nn.loss.loss import LossBase
 from mindspore.nn.optim import Momentum
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 from mindspore.train import Model
 from mindspore.context import ParallelMode

@@ -121,7 +120,8 @@ class TrainOneStepCell(Cell):
         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
         grads = self.grad(self.network, weights)(data, sens)

-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 def net_trains(criterion, rank):
@@ -105,12 +105,9 @@ class TrainOneStepWithLossScaleCell(nn.Cell):
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if overflow:
-            succ = False
-        else:
-            succ = self.optimizer(grads)
-        ret = (loss, cond, scaling_sens)
-        return F.depend(ret, succ)
+        if not overflow:
+            self.optimizer(grads)
+        return (loss, cond, scaling_sens)


 class DatasetLenet(MindData):
@@ -24,7 +24,6 @@ from mindspore.common.parameter import ParameterTuple
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore.nn.optim.momentum import Momentum
 from mindspore.ops import composite as C
-from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 from mindspore.nn.wrap.cell_wrapper import _VirtualDatasetCell
 from mindspore.parallel import set_algo_parameters

@@ -419,7 +418,8 @@ class TrainOneStepCell(nn.Cell):
         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
         grads = self.grad(self.network, weights)(data, sens)

-        return F.depend(loss, self.optimizer(grads))
+        self.optimizer(grads)
+        return loss


 def reshape_common2(parallel_mode, net):