diff --git a/model_zoo/official/cv/posenet/README_CN.md b/model_zoo/official/cv/posenet/README_CN.md index 93253f0a743..aef7e640550 100644 --- a/model_zoo/official/cv/posenet/README_CN.md +++ b/model_zoo/official/cv/posenet/README_CN.md @@ -79,7 +79,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度 bash run_standalone_train.sh [DATASET_NAME] [DEVICE_ID] # 运行分布式训练示例 - bash run_distribute_train.sh [DATASET_NAME] [RANK_SIZE] + bash run_distribute_train.sh [DATASET_NAME] [RANK_TABLE] # 运行评估示例 bash run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH] @@ -100,7 +100,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度 bash run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID] # 运行分布式训练示例 - bash run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE] + bash run_distribute_train_gpu.sh [DATASET_NAME] [RANK_TABLE] # 运行评估示例 bash run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH] @@ -211,7 +211,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度 - Ascend处理器环境运行 ```bash - bash run_distribute_train.sh [DATASET_NAME] [RANK_SIZE] + bash run_distribute_train.sh [DATASET_NAME] [RANK_TABLE] ``` 上述shell脚本将在后台运行分布训练。您可以通过device[X]/log文件查看结果。采用以下方式达到损失值: diff --git a/model_zoo/official/cv/posenet/eval.py b/model_zoo/official/cv/posenet/eval.py index c74a389acfe..dda86fc76f0 100644 --- a/model_zoo/official/cv/posenet/eval.py +++ b/model_zoo/official/cv/posenet/eval.py @@ -41,6 +41,9 @@ if __name__ == '__main__': parser.add_argument('--is_modelarts', type=ast.literal_eval, default=False, help='Train in Modelarts.') parser.add_argument('--data_url', default=None, help='Location of data.') parser.add_argument('--train_url', default=None, help='Location of training outputs.') + parser.add_argument('--device_target', type=str, default='Ascend', + choices=['Ascend', 'GPU'], + help='Name of device target.') args_opt = parser.parse_args() cfg = common_config @@ -49,8 +52,8 @@ if __name__ == '__main__': elif args_opt.dataset == "StMarysChurch": dataset_cfg = StMarysChurch - device_target = cfg.device_target - 
context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target) + device_target = args_opt.device_target + context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) if args_opt.device_id is not None: context.set_context(device_id=args_opt.device_id) diff --git a/model_zoo/official/cv/posenet/scripts/run_distribute_train.sh b/model_zoo/official/cv/posenet/scripts/run_distribute_train.sh index 55f7a800caa..df941706476 100644 --- a/model_zoo/official/cv/posenet/scripts/run_distribute_train.sh +++ b/model_zoo/official/cv/posenet/scripts/run_distribute_train.sh @@ -17,37 +17,30 @@ echo "==============================================================================================================" echo "Please run the script as: " echo "bash run.sh DATASET_NAME RANK_SIZE" -echo "For example: bash run_distribute.sh dataset_name 8" +echo "For example: bash run_distribute.sh dataset_name rank_table" echo "It is better to use the absolute path." echo "==============================================================================================================" set -e +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} DATASET_NAME=$1 -RANK_SIZE=$2 export DATASET_NAME -export RANK_SIZE +export RANK_TABLE_FILE=$(get_real_path $2) +export RANK_SIZE=8 EXEC_PATH=$(pwd) echo "$EXEC_PATH" -test_dist_8pcs() -{ - export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json - export RANK_SIZE=8 -} - -test_dist_2pcs() -{ - export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json - export RANK_SIZE=2 -} - -test_dist_${RANK_SIZE}pcs - export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python cd ../ -for((i=0;i<${RANK_SIZE};i++)) +for((i=0;i<8;i++)) do rm -rf device$i mkdir device$i @@ -61,7 +54,10 @@ do export RANK_ID=$i echo "start training for device $i" env > env$i.log - python train.py --run_distribute True --dataset $1 --device_num $2 --is_modelarts False > train$i.log 2>&1 & + python train.py 
--run_distribute True \ + --dataset $1 --device_num $RANK_SIZE \ + --is_modelarts False \ + --device_target "Ascend" > train$i.log 2>&1 & echo "$i finish" cd ../ done diff --git a/model_zoo/official/cv/posenet/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/posenet/scripts/run_distribute_train_gpu.sh index 2906990d518..69877d2ebb3 100644 --- a/model_zoo/official/cv/posenet/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/posenet/scripts/run_distribute_train_gpu.sh @@ -39,5 +39,8 @@ cd ./train_parallel env > env.log echo "start training" mpirun -n $1 --allow-run-as-root \ - python train.py --device_num $1 --dataset $2 --is_modelarts False --run_distribute True > train.log 2>&1 & + python train.py --device_num $1 \ + --dataset $2 --is_modelarts False \ + --run_distribute True \ + --device_target "GPU" > train.log 2>&1 & diff --git a/model_zoo/official/cv/posenet/scripts/run_eval_gpu.sh b/model_zoo/official/cv/posenet/scripts/run_eval_gpu.sh index b43fe278d56..fdf84e7226f 100644 --- a/model_zoo/official/cv/posenet/scripts/run_eval_gpu.sh +++ b/model_zoo/official/cv/posenet/scripts/run_eval_gpu.sh @@ -46,7 +46,7 @@ cd ./eval env > env0.log echo "Eval begin." -python eval.py --device_id $1 --dataset $2 --ckpt_url $3 --is_modelarts False > ./eval.log 2>&1 & +python eval.py --device_id $1 --dataset $2 --ckpt_url $3 --is_modelarts False --device_target "GPU" > ./eval.log 2>&1 & if [ $? -eq 0 ];then echo "evaling success" diff --git a/model_zoo/official/cv/posenet/scripts/run_standalone_train.sh b/model_zoo/official/cv/posenet/scripts/run_standalone_train.sh index 6eb6c28c0c3..99289269512 100644 --- a/model_zoo/official/cv/posenet/scripts/run_standalone_train.sh +++ b/model_zoo/official/cv/posenet/scripts/run_standalone_train.sh @@ -45,7 +45,10 @@ cd ./train env > env0.log echo "Standalone train begin." 
-python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False > ./train_alone.log 2>&1 & +python train.py --run_distribute False \ + --device_id $2 --dataset $1 \ + --device_num 1 --is_modelarts False \ + --device_target "Ascend" > ./train_alone.log 2>&1 & if [ $? -eq 0 ];then echo "training success" diff --git a/model_zoo/official/cv/posenet/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/posenet/scripts/run_standalone_train_gpu.sh index 6eb6c28c0c3..20894964d36 100644 --- a/model_zoo/official/cv/posenet/scripts/run_standalone_train_gpu.sh +++ b/model_zoo/official/cv/posenet/scripts/run_standalone_train_gpu.sh @@ -45,7 +45,7 @@ cd ./train env > env0.log echo "Standalone train begin." -python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False > ./train_alone.log 2>&1 & +python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False --device_target "GPU" > ./train_alone.log 2>&1 & if [ $?
-eq 0 ];then echo "training success" diff --git a/model_zoo/official/cv/posenet/src/config.py b/model_zoo/official/cv/posenet/src/config.py index 0f749b92b5b..631c1a99aff 100644 --- a/model_zoo/official/cv/posenet/src/config.py +++ b/model_zoo/official/cv/posenet/src/config.py @@ -18,7 +18,6 @@ network config setting from easydict import EasyDict as edict common_config = edict({ - 'device_target': 'GPU', 'device_id': 0, 'pre_trained': True, 'max_steps': 30000, diff --git a/model_zoo/official/cv/posenet/src/loss.py b/model_zoo/official/cv/posenet/src/loss.py index 31ec464234a..65ad97e77bb 100644 --- a/model_zoo/official/cv/posenet/src/loss.py +++ b/model_zoo/official/cv/posenet/src/loss.py @@ -14,7 +14,7 @@ # ============================================================================ """define evaluation loss function for network.""" import mindspore.nn as nn -from mindspore.nn.loss.loss import _Loss +from mindspore.nn.loss.loss import LossBase from mindspore.common import dtype as mstype from mindspore.ops import operations as P from mindspore.train.serialization import load_checkpoint, load_param_into_net @@ -42,7 +42,7 @@ class EuclideanDistance(nn.Cell): return res -class PoseLoss(_Loss): +class PoseLoss(LossBase): """define loss function""" def __init__(self, w1_x, w2_x, w3_x, w1_q, w2_q, w3_q): super(PoseLoss, self).__init__() diff --git a/model_zoo/official/cv/posenet/src/posenet.py b/model_zoo/official/cv/posenet/src/posenet.py index 642b75c09ab..bed571b5dc9 100644 --- a/model_zoo/official/cv/posenet/src/posenet.py +++ b/model_zoo/official/cv/posenet/src/posenet.py @@ -16,6 +16,12 @@ import mindspore.nn as nn from mindspore.common.initializer import TruncatedNormal from mindspore.ops import operations as P +from mindspore.ops import functional as F +from mindspore.ops import composite as C +from mindspore.context import ParallelMode +from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean, + _get_parallel_mode) +from 
mindspore.nn.wrap.grad_reducer import DistributedGradReducer def weight_variable(): """Weight variable.""" @@ -172,3 +178,80 @@ class PoseNet(nn.Cell): return cls1_fc_pose_xyz, cls1_fc_pose_wpqr, \ cls2_fc_pose_xyz, cls2_fc_pose_wpqr, \ cls3_fc_pose_xyz, cls3_fc_pose_wpqr + + +GRADIENT_CLIP_TYPE = 1 +GRADIENT_CLIP_VALUE = 50.0 + +clip_grad = C.MultitypeFuncGraph("clip_grad") + +@clip_grad.register("Number", "Number", "Tensor") +def _clip_grad(clip_type, clip_value, grad): + """ + Clip gradients + Inputs: + clip_type: The way to clip, 0 for 'value', 1 for 'norm' + clip_value: Specifies how much to clip + grad: Gradients + Outputs: + tuple[Tensor], clipped gradients + """ + if clip_type not in (0, 1): + return grad + dt = F.dtype(grad) + if clip_type == 0: + new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt), + F.cast(F.tuple_to_array((clip_value,)), dt)) + else: + new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) + return new_grad + +class PoseTrainOneStepCell(nn.Cell): + r""" + Network training package class. + + Wraps the network with an optimizer. The resulting Cell is trained with input '\*inputs'. + The backward graph will be created in the construct function to update the parameter. Different + parallel modes are available for training. + + Args: + network (Cell): The training network. The network only supports single output. + optimizer (Union[Cell]): Optimizer for updating the weights. + sens (numbers.Number): The scaling number to be filled as the input of backpropagation. Default value is 1.0. + + Inputs: + - **(\*inputs)** (Tuple(Tensor)) - Tuple of input tensors with shape :math:`(N, \ldots)`. + + Outputs: + Tensor, a tensor means the loss value, the shape of which is usually :math:`()`. + + Raises: + TypeError: If `sens` is not a number. 
+ """ + + def __init__(self, network, optimizer, sens=1.0): + super(PoseTrainOneStepCell, self).__init__(auto_prefix=False) + self.network = network + self.network.set_grad() + self.optimizer = optimizer + self.weights = self.optimizer.parameters + self.grad = C.GradOperation(get_by_list=True, sens_param=True) + self.sens = sens + self.reducer_flag = False + self.grad_reducer = F.identity + self.hyper_map = C.HyperMap() + self.parallel_mode = _get_parallel_mode() + self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL) + if self.reducer_flag: + self.mean = _get_gradients_mean() + self.degree = _get_device_num() + self.grad_reducer = DistributedGradReducer(self.weights, self.mean, self.degree) + + def construct(self, *inputs): + loss = self.network(*inputs) + sens = F.fill(loss.dtype, loss.shape, self.sens) + grads = self.grad(self.network, self.weights)(*inputs, sens) + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + grads = self.grad_reducer(grads) + loss = F.depend(loss, self.optimizer(grads)) + return loss diff --git a/model_zoo/official/cv/posenet/train.py b/model_zoo/official/cv/posenet/train.py index eb2f65f2317..570e20b61da 100644 --- a/model_zoo/official/cv/posenet/train.py +++ b/model_zoo/official/cv/posenet/train.py @@ -29,6 +29,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni from src.config import common_config, KingsCollege, StMarysChurch from src.dataset import data_to_mindrecord, create_posenet_dataset from src.loss import PosenetWithLoss +from src.posenet import PoseTrainOneStepCell set_seed(1) @@ -45,6 +46,9 @@ if __name__ == '__main__': parser.add_argument('--is_modelarts', type=ast.literal_eval, default=False, help='Train in Modelarts.') parser.add_argument('--data_url', default=None, help='Location of data.') parser.add_argument('--train_url', default=None, help='Location of training outputs.') + 
parser.add_argument('--device_target', type=str, default='Ascend', + choices=['Ascend', 'GPU'], + help='Name of device target.') args_opt = parser.parse_args() cfg = common_config @@ -53,7 +57,7 @@ if __name__ == '__main__': elif args_opt.dataset == "StMarysChurch": dataset_cfg = StMarysChurch - device_target = cfg.device_target + device_target = args_opt.device_target context.set_context(mode=context.GRAPH_MODE, device_target=device_target) if args_opt.run_distribute: if device_target == "Ascend": @@ -104,7 +108,8 @@ if __name__ == '__main__': opt = Adagrad(params=net_with_loss.trainable_params(), learning_rate=dataset_cfg.lr_init, weight_decay=dataset_cfg.weight_decay) - model = Model(net_with_loss, optimizer=opt) + net_with_grad = PoseTrainOneStepCell(net_with_loss, opt) + model = Model(net_with_grad) time_cb = TimeMonitor(data_size=step_per_epoch) loss_cb = LossMonitor()