forked from mindspore-Ecosystem/mindspore
!9811 fix mobilentv2 CPU not support full training
From: @zhao_ting_v Reviewed-by: @c_34,@wuxuejian Signed-off-by: @c_34
This commit is contained in:
commit
c329ed4d27
|
@ -91,6 +91,12 @@ You can start training using python or shell scripts. The usage of shell scripts
|
|||
- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER]
|
||||
- CPU: sh run_train.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER]
|
||||
|
||||
> RANK_TABLE_FILE is the HCCL configuration file used when running distributed training on Ascend.
|
||||
> The common restrictions on using the distributed service are as follows. For details, see the HCCL documentation.
|
||||
>
|
||||
> - In a single-node system, a cluster of 1, 2, 4, or 8 devices is supported. In a multi-node system, a cluster of 8 x N devices is supported.
|
||||
> - Each host has four devices numbered 0 to 3 and four devices numbered 4 to 7 deployed on two different networks. When training with 2 or 4 devices, the devices must be connected to each other, and clusters cannot be created across networks.
|
||||
|
||||
### Launch
|
||||
|
||||
```shell
|
||||
|
|
|
@ -100,6 +100,12 @@ MobileNetV2总体网络架构如下:
|
|||
- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER]
|
||||
- CPU: sh run_train.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER]
|
||||
|
||||
> RANK_TABLE_FILE 是在Ascend上运行分布式任务时HCCL的配置文件
|
||||
> 我们列出使用分布式服务常见的使用限制,详细的可以查看HCCL对应的使用文档。
|
||||
>
|
||||
> - 单机场景下支持1、2、4、8卡设备集群,多机场景下支持8*n卡设备集群。
|
||||
> - 每台机器的0-3卡和4-7卡各为1个组网,2卡和4卡训练时卡必须相连且不支持跨组网创建集群。
|
||||
|
||||
### 启动
|
||||
|
||||
```shell
|
||||
|
|
|
@ -29,6 +29,13 @@ run_ascend()
|
|||
fi
|
||||
|
||||
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
|
||||
VISIABLE_DEVICES=$3
|
||||
IFS="," read -r -a CANDIDATE_DEVICE <<< "$VISIABLE_DEVICES"
|
||||
if [ ${#CANDIDATE_DEVICE[@]} -ne $2 ]
|
||||
then
|
||||
echo "error: DEVICE_NUM=$2 is not equal to the length of VISIABLE_DEVICES=$3"
|
||||
exit 1
|
||||
fi
|
||||
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
|
||||
export RANK_TABLE_FILE=$4
|
||||
export RANK_SIZE=$2
|
||||
|
@ -40,7 +47,7 @@ run_ascend()
|
|||
cd ../train || exit
|
||||
for((i=0; i<${RANK_SIZE}; i++))
|
||||
do
|
||||
export DEVICE_ID=$i
|
||||
export DEVICE_ID=${CANDIDATE_DEVICE[i]}
|
||||
export RANK_ID=$i
|
||||
rm -rf ./rank$i
|
||||
mkdir ./rank$i
|
||||
|
|
|
@ -16,26 +16,6 @@
|
|||
import argparse
|
||||
import ast
|
||||
|
||||
def launch_parse_args():
|
||||
|
||||
launch_parser = argparse.ArgumentParser(description="mindspore distributed training launch helper utilty \
|
||||
that will spawn up multiple distributed processes")
|
||||
launch_parser.add_argument('--platform', type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"), \
|
||||
help='run platform, only support GPU, CPU and Ascend')
|
||||
launch_parser.add_argument("--nproc_per_node", type=int, default=1, choices=(1, 2, 3, 4, 5, 6, 7, 8), \
|
||||
help="The number of processes to launch on each node, for D training, this is recommended to be set \
|
||||
to the number of D in your system so that each process can be bound to a single D.")
|
||||
launch_parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", help="will use the \
|
||||
visible devices sequentially")
|
||||
launch_parser.add_argument("--training_script", type=str, default="./train.py", help="The full path to \
|
||||
the single D training program/script to be launched in parallel, followed by all the arguments for \
|
||||
the training script")
|
||||
|
||||
launch_args, unknown = launch_parser.parse_known_args()
|
||||
launch_args.training_script_args = unknown
|
||||
launch_args.training_script_args += ["--platform", launch_args.platform]
|
||||
return launch_args
|
||||
|
||||
def train_parse_args():
|
||||
train_parser = argparse.ArgumentParser(description='Image classification trian')
|
||||
train_parser.add_argument('--platform', type=str, default="Ascend", choices=("CPU", "GPU", "Ascend"), \
|
||||
|
@ -48,6 +28,8 @@ def train_parse_args():
|
|||
train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute')
|
||||
train_args = train_parser.parse_args()
|
||||
train_args.is_training = True
|
||||
if train_args.platform == "CPU":
|
||||
train_args.run_distribute = False
|
||||
return train_args
|
||||
|
||||
def eval_parse_args():
|
||||
|
|
|
@ -40,6 +40,7 @@ def set_config(args):
|
|||
"keep_checkpoint_max": 20,
|
||||
"save_checkpoint_path": "./",
|
||||
"platform": args.platform,
|
||||
"run_distribute": args.run_distribute,
|
||||
"activation": "Softmax",
|
||||
"export_format": "MINDIR",
|
||||
"export_file": "mobilenetv2"
|
||||
|
|
|
@ -331,7 +331,7 @@ class MobileNetV2Combine(nn.Cell):
|
|||
Tensor, output tensor.
|
||||
|
||||
Examples:
|
||||
>>> MobileNetV2(num_classes=1000)
|
||||
>>> MobileNetV2Combine(backbone, head)
|
||||
"""
|
||||
|
||||
def __init__(self, backbone, head):
|
||||
|
|
|
@ -114,6 +114,13 @@ def load_ckpt(network, pretrain_ckpt_path, trainable=True):
|
|||
incremental_learning or not
|
||||
"""
|
||||
param_dict = load_checkpoint(pretrain_ckpt_path)
|
||||
if hasattr(network, "head"):
|
||||
head_param = network.head.parameters_dict()
|
||||
for k, v in head_param.items():
|
||||
if param_dict[k].shape != v.shape:
|
||||
param_dict.pop(k)
|
||||
param_dict.pop(f"moments.{k}")
|
||||
print(f"Filter {k} don't load weights from checkpoint.")
|
||||
load_param_into_net(network, param_dict)
|
||||
if not trainable:
|
||||
for param in network.get_parameters():
|
||||
|
|
|
@ -53,21 +53,14 @@ if __name__ == '__main__':
|
|||
|
||||
# define network
|
||||
backbone_net, head_net, net = define_net(config, args_opt.is_training)
|
||||
|
||||
if args_opt.pretrain_ckpt != "" and args_opt.freeze_layer == "backbone":
|
||||
load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False)
|
||||
step_size = extract_features(backbone_net, args_opt.dataset_path, config)
|
||||
|
||||
else:
|
||||
if args_opt.platform == "CPU":
|
||||
raise ValueError("CPU only support fine tune the head net, doesn't support fine tune the all net")
|
||||
|
||||
if args_opt.pretrain_ckpt:
|
||||
load_ckpt(backbone_net, args_opt.pretrain_ckpt)
|
||||
|
||||
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config)
|
||||
step_size = dataset.get_dataset_size()
|
||||
|
||||
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config)
|
||||
step_size = dataset.get_dataset_size()
|
||||
if args_opt.pretrain_ckpt:
|
||||
if args_opt.freeze_layer == "backbone":
|
||||
load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False)
|
||||
step_size = extract_features(backbone_net, args_opt.dataset_path, config)
|
||||
else:
|
||||
load_ckpt(net, args_opt.pretrain_ckpt)
|
||||
if step_size == 0:
|
||||
raise ValueError("The step_size of dataset is zero. Check if the images' count of train dataset is more \
|
||||
than batch_size in config.py")
|
||||
|
@ -93,7 +86,7 @@ if __name__ == '__main__':
|
|||
total_epochs=epoch_size,
|
||||
steps_per_epoch=step_size))
|
||||
|
||||
if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer == "none":
|
||||
if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer != "backbone":
|
||||
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
|
||||
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, \
|
||||
config.weight_decay, config.loss_scale)
|
||||
|
|
Loading…
Reference in New Issue