diff --git a/model_zoo/official/cv/mobilenetv2/README.md b/model_zoo/official/cv/mobilenetv2/README.md index 597d67affd6..6809ef5dd39 100644 --- a/model_zoo/official/cv/mobilenetv2/README.md +++ b/model_zoo/official/cv/mobilenetv2/README.md @@ -91,6 +91,12 @@ You can start training using python or shell scripts. The usage of shell scripts - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] - CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] +> RANK_TABLE_FILE is the HCCL configuration file used when running on Ascend. +> The common restrictions on using the distributed service are as follows. For details, see the HCCL documentation. +> +> - In a single-node system, a cluster of 1, 2, 4, or 8 devices is supported. In a multi-node system, a cluster of 8 x N devices is supported. +> - Each host has four devices numbered 0 to 3 and four devices numbered 4 to 7 deployed on two different networks. When training with 2 or 4 devices, the devices must be connected to each other, and clusters cannot be created across networks. 
+ ### Launch ```shell diff --git a/model_zoo/official/cv/mobilenetv2/README_CN.md b/model_zoo/official/cv/mobilenetv2/README_CN.md index 135b26e76b3..122336f8dde 100644 --- a/model_zoo/official/cv/mobilenetv2/README_CN.md +++ b/model_zoo/official/cv/mobilenetv2/README_CN.md @@ -100,6 +100,12 @@ MobileNetV2总体网络架构如下: - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] - CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] +> RANK_TABLE_FILE 是在Ascend上运行分布式任务时HCCL的配置文件 +> 我们列出使用分布式服务常见的使用限制,详细的可以查看HCCL对应的使用文档。 +> +> - 单机场景下支持1、2、4、8卡设备集群,多机场景下支持8*n卡设备集群。 +> - 每台机器的0-3卡和4-7卡各为1个组网,2卡和4卡训练时卡必须相连且不支持跨组网创建集群。 + ### 启动 ```shell diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh index 350e2e6d3db..2c812c57060 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh @@ -29,6 +29,13 @@ run_ascend() fi BASEPATH=$(cd "`dirname $0`" || exit; pwd) + VISIABLE_DEVICES=$3 + IFS="," read -r -a CANDIDATE_DEVICE <<< "$VISIABLE_DEVICES" + if [ ${#CANDIDATE_DEVICE[@]} -ne $2 ] + then + echo "error: DEVICE_NUM=$2 is not equal to the length of VISIABLE_DEVICES=$3" + exit 1 + fi export PYTHONPATH=${BASEPATH}:$PYTHONPATH export RANK_TABLE_FILE=$4 export RANK_SIZE=$2 @@ -40,7 +47,7 @@ run_ascend() cd ../train || exit for((i=0; i<${RANK_SIZE}; i++)) do - export DEVICE_ID=$i + export DEVICE_ID=${CANDIDATE_DEVICE[i]} export RANK_ID=$i rm -rf ./rank$i mkdir ./rank$i diff --git a/model_zoo/official/cv/mobilenetv2/src/args.py b/model_zoo/official/cv/mobilenetv2/src/args.py index d972848d0a2..39a721b8dc5 100644 --- a/model_zoo/official/cv/mobilenetv2/src/args.py +++ b/model_zoo/official/cv/mobilenetv2/src/args.py @@ -16,26 +16,6 @@ import argparse import ast -def launch_parse_args(): - - launch_parser = argparse.ArgumentParser(description="mindspore distributed training launch 
helper utilty \ - that will spawn up multiple distributed processes") - launch_parser.add_argument('--platform', type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"), \ - help='run platform, only support GPU, CPU and Ascend') - launch_parser.add_argument("--nproc_per_node", type=int, default=1, choices=(1, 2, 3, 4, 5, 6, 7, 8), \ - help="The number of processes to launch on each node, for D training, this is recommended to be set \ - to the number of D in your system so that each process can be bound to a single D.") - launch_parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", help="will use the \ - visible devices sequentially") - launch_parser.add_argument("--training_script", type=str, default="./train.py", help="The full path to \ - the single D training program/script to be launched in parallel, followed by all the arguments for \ - the training script") - - launch_args, unknown = launch_parser.parse_known_args() - launch_args.training_script_args = unknown - launch_args.training_script_args += ["--platform", launch_args.platform] - return launch_args - def train_parse_args(): train_parser = argparse.ArgumentParser(description='Image classification trian') train_parser.add_argument('--platform', type=str, default="Ascend", choices=("CPU", "GPU", "Ascend"), \ @@ -48,6 +28,8 @@ def train_parse_args(): train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') train_args = train_parser.parse_args() train_args.is_training = True + if train_args.platform == "CPU": + train_args.run_distribute = False return train_args def eval_parse_args(): diff --git a/model_zoo/official/cv/mobilenetv2/src/config.py b/model_zoo/official/cv/mobilenetv2/src/config.py index 414491838e3..ecea8e8ad3c 100644 --- a/model_zoo/official/cv/mobilenetv2/src/config.py +++ b/model_zoo/official/cv/mobilenetv2/src/config.py @@ -40,6 +40,7 @@ def set_config(args): "keep_checkpoint_max": 20, "save_checkpoint_path": 
"./", "platform": args.platform, + "run_distribute": args.run_distribute, "activation": "Softmax", "export_format": "MINDIR", "export_file": "mobilenetv2" diff --git a/model_zoo/official/cv/mobilenetv2/src/mobilenetV2.py b/model_zoo/official/cv/mobilenetv2/src/mobilenetV2.py index 08c1b68b74a..1ea94c64d05 100644 --- a/model_zoo/official/cv/mobilenetv2/src/mobilenetV2.py +++ b/model_zoo/official/cv/mobilenetv2/src/mobilenetV2.py @@ -331,7 +331,7 @@ class MobileNetV2Combine(nn.Cell): Tensor, output tensor. Examples: - >>> MobileNetV2(num_classes=1000) + >>> MobileNetV2Combine(backbone, head) """ def __init__(self, backbone, head): diff --git a/model_zoo/official/cv/mobilenetv2/src/models.py b/model_zoo/official/cv/mobilenetv2/src/models.py index d74a97f755b..5917ba914c8 100644 --- a/model_zoo/official/cv/mobilenetv2/src/models.py +++ b/model_zoo/official/cv/mobilenetv2/src/models.py @@ -114,6 +114,13 @@ def load_ckpt(network, pretrain_ckpt_path, trainable=True): incremental_learning or not """ param_dict = load_checkpoint(pretrain_ckpt_path) + if hasattr(network, "head"): + head_param = network.head.parameters_dict() + for k, v in head_param.items(): + if param_dict[k].shape != v.shape: + param_dict.pop(k) + param_dict.pop(f"moments.{k}") + print(f"Filter {k} don't load weights from checkpoint.") load_param_into_net(network, param_dict) if not trainable: for param in network.get_parameters(): diff --git a/model_zoo/official/cv/mobilenetv2/train.py b/model_zoo/official/cv/mobilenetv2/train.py index 1fa730ac638..fe244090a74 100644 --- a/model_zoo/official/cv/mobilenetv2/train.py +++ b/model_zoo/official/cv/mobilenetv2/train.py @@ -53,21 +53,14 @@ if __name__ == '__main__': # define network backbone_net, head_net, net = define_net(config, args_opt.is_training) - - if args_opt.pretrain_ckpt != "" and args_opt.freeze_layer == "backbone": - load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False) - step_size = extract_features(backbone_net, args_opt.dataset_path, 
config) - - else: - if args_opt.platform == "CPU": - raise ValueError("CPU only support fine tune the head net, doesn't support fine tune the all net") - - if args_opt.pretrain_ckpt: - load_ckpt(backbone_net, args_opt.pretrain_ckpt) - - dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) - step_size = dataset.get_dataset_size() - + dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) + step_size = dataset.get_dataset_size() + if args_opt.pretrain_ckpt: + if args_opt.freeze_layer == "backbone": + load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False) + step_size = extract_features(backbone_net, args_opt.dataset_path, config) + else: + load_ckpt(net, args_opt.pretrain_ckpt) if step_size == 0: raise ValueError("The step_size of dataset is zero. Check if the images' count of train dataset is more \ than batch_size in config.py") @@ -93,7 +86,7 @@ if __name__ == '__main__': total_epochs=epoch_size, steps_per_epoch=step_size)) - if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer == "none": + if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer != "backbone": loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, \ config.weight_decay, config.loss_scale)