!18868 CleanCode for cpm networks.

Merge pull request !18868 from casgj/master_0625
i-robot 2021-06-26 02:09:06 +00:00 committed by Gitee
commit a52b732a8b
12 changed files with 16 additions and 13 deletions


@@ -413,7 +413,7 @@ The finetune performance and accuracy of 4 machines and 32 cards are as follows:
 | Optimizer | Adam |
 | Accuracy | 81.4% |
 | Speed | 2740ms/step (32pcs) |
-| Loss | 0.008 |
+| Loss | 0.03 |
 | Params (M) | 2597.1 |
 | Checkpoint for inference | 57G .ckpt file |
 | Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/cpm> |


@@ -415,7 +415,7 @@ Zero-shot single-machine, two-card inference performance and accuracy are as follows:
 | Optimizer | Adam |
 | Accuracy | 81.4% |
 | Speed | 2740ms/step (32pcs) |
-| Loss | 0.08 |
+| Loss | 0.03 |
 | Params (M) | 2597.1 |
 | Checkpoint for inference | 57G .ckpt file |
 | Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/cpm> |


@@ -273,6 +273,8 @@ if __name__ == '__main__':
     parser.add_argument("--has_train_strategy", type=ast.literal_eval, default=True,
                         help='Model has distributed training strategy.')
     args_eval = parser.parse_args()
+    if args_eval.distribute:
+        set_parallel_env()
     ckpt_file_list_test = None
     if args_eval.has_train_strategy:
@@ -286,7 +288,6 @@ if __name__ == '__main__':
     result_accuracy = 0.0
     if args_eval.distribute:
-        set_parallel_env()
         print("Start validation on 2 devices with model parallel.")
         result_accuracy = run_eval(args_eval, finetune_test_distrubute, ckpt_file_list_test)
     else:
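The net effect of the two eval.py hunks above is a reordering: `set_parallel_env()` now runs immediately after argument parsing (when `--distribute` is set) instead of just before the distributed evaluation branch, so the checkpoint file list is already assembled under the parallel context. Below is a minimal, self-contained sketch of the resulting control flow; the two helper functions are placeholders standing in for the repo's own `set_parallel_env` and `create_ckpt_file_list`, not their real implementations.

```python
import argparse
import ast


def set_parallel_env():
    """Placeholder for the repo's helper that initializes the model-parallel context."""
    print("parallel environment initialized")


def create_ckpt_file_list(args):
    """Placeholder for the repo's helper that lists the sliced checkpoint files."""
    return ["{}/part_{}.ckpt".format(args.ckpt_path_doc, i) for i in range(args.ckpt_partition)]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="CPM evaluation control flow (sketch).")
    parser.add_argument("--ckpt_path_doc", type=str, default="ckpt/")
    parser.add_argument("--ckpt_partition", type=int, default=1)
    parser.add_argument("--distribute", type=ast.literal_eval, default=True)
    parser.add_argument("--has_train_strategy", type=ast.literal_eval, default=True)
    args_eval = parser.parse_args()

    # New order: the parallel context is set up right after argument parsing,
    # so everything below (including checkpoint listing) runs under it.
    if args_eval.distribute:
        set_parallel_env()

    ckpt_file_list_test = None
    if args_eval.has_train_strategy:
        ckpt_file_list_test = create_ckpt_file_list(args_eval)

    if args_eval.distribute:
        # set_parallel_env() is no longer called here; it already ran above.
        print("Start validation on 2 devices with model parallel.")
```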


@@ -57,7 +57,7 @@ if __name__ == '__main__':
     cpm_model = CPM_LAYER(config_eval)
     if not args.has_train_strategy:
-        weights = load_checkpoint(args.ckpt_path)
+        weights = load_checkpoint(args.ckpt_path_doc)
         can_be_loaded = {}
         print("+++++++loading weights+++++")
         for name, _ in weights.items():
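`load_checkpoint` in the hunk above is MindSpore's standard checkpoint loader; the change only switches which parsed argument (`ckpt_path_doc` instead of `ckpt_path`) supplies the file path. A rough sketch of the surrounding load-and-filter pattern follows, with a stand-in network and a simplified version of the `can_be_loaded` filtering; the real script builds `CPM_LAYER(config_eval)` and applies its own name matching.

```python
from mindspore import nn
from mindspore.train.serialization import load_checkpoint, load_param_into_net


class TinyNet(nn.Cell):
    """Stand-in network; the real script builds CPM_LAYER(config_eval)."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Dense(8, 2)

    def construct(self, x):
        return self.fc(x)


def load_weights(net, ckpt_path_doc):
    # load_checkpoint returns a dict mapping parameter names to Parameter objects.
    weights = load_checkpoint(ckpt_path_doc)
    # Keep only parameters that exist in the target network, mirroring the
    # "can_be_loaded" filtering done in the original script (simplified here).
    net_param_names = {p.name for p in net.get_parameters()}
    can_be_loaded = {name: value for name, value in weights.items()
                     if name in net_param_names}
    load_param_into_net(net, can_be_loaded)
    return net
```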


@@ -59,7 +59,7 @@ do
     export DEVICE_ID=$i
     echo "start eval for rank $RANK_ID, device $DEVICE_ID"
     env > env.log
-    python ../../eval.py --dataset $DATASET --data_path $LABEL --ckpt_path $MODEL_CKPT --ckpt_partition $CKPT_NUMBER --distribute True --has_train_strategy True> log_cpm.log 2>&1 &
+    python ../../eval.py --dataset $DATASET --data_path $LABEL --ckpt_path_doc $MODEL_CKPT --ckpt_partition $CKPT_NUMBER --distribute True --has_train_strategy True> log_cpm.log 2>&1 &
     cd ${current_exec_path}
 done
 cd ${current_exec_path}
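This and the remaining script hunks only rename the flag passed to the Python entry points from `--ckpt_path` to `--ckpt_path_doc`; the rename only works if those entry points declare a matching argument (the Python hunks above show it being consumed via `args.ckpt_path_doc`). A small argparse sketch of that contract is shown below, using the flags visible in these hunks; the help strings, defaults, and example path are illustrative, not taken from the repo.

```python
import argparse
import ast

parser = argparse.ArgumentParser(description="CPM evaluation entry point (argument sketch).")
# The shell scripts now pass --ckpt_path_doc $MODEL_CKPT, so the entry point
# must expose an argument with exactly that name.
parser.add_argument("--ckpt_path_doc", type=str, default="",
                    help="Path of the checkpoint (or checkpoint directory) to evaluate.")
parser.add_argument("--ckpt_partition", type=int, default=1,
                    help="Number of checkpoint partitions.")
parser.add_argument("--distribute", type=ast.literal_eval, default=True,
                    help="Run distributed evaluation.")
parser.add_argument("--has_train_strategy", type=ast.literal_eval, default=True,
                    help="Whether the checkpoints carry a distributed training strategy.")

# Example: the same style of invocation the script performs, parsed explicitly here.
args = parser.parse_args(["--ckpt_path_doc", "/path/to/cpm.ckpt", "--distribute", "False"])
print(args.ckpt_path_doc, args.distribute)
```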


@@ -69,7 +69,7 @@ do
     env > env.log
     python ../../test.py --dev_dataset $DEV_DATASET --dev_data_path $DEV_LABEL \
         --test_dataset $TEST_DATASET --test_data_path $TEST_LABEL \
-        --ckpt_path $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
+        --ckpt_path_doc $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
         --distribute True --has_train_strategy True> log_cpm.log 2>&1 &
     cd ${current_exec_path}


@@ -71,7 +71,7 @@ do
     env > env.log
     python ../../test.py --dev_dataset $DEV_DATASET --dev_data_path $DEV_LABEL \
         --test_dataset $TEST_DATASET --test_data_path $TEST_LABEL \
-        --ckpt_path $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
+        --ckpt_path_doc $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
         --ckpt_epoch $ckptepoch --result_path $result_path \
         --distribute False --has_train_strategy True> log_cpm.log 2>&1


@@ -58,7 +58,7 @@ do
     export DEVICE_ID=$i
     echo "start eval for rank $RANK_ID, device $DEVICE_ID"
     env > env.log
-    python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path $MODEL_CKPT --distribute True --has_train_strategy False> log_cpm.log 2>&1 &
+    python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path_doc $MODEL_CKPT --distribute True --has_train_strategy False> log_cpm.log 2>&1 &
     cd ${current_exec_path}
 done
 cd ${current_exec_path}


@@ -54,6 +54,6 @@ cp -r ../scripts/*.sh ./eval
 cd ./eval || exit
 echo "start training for device $DEVICE_ID"
 env > env.log
-python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path $MODEL_CKPT --has_train_strategy False > log_cpm.log 2>&1 &
+python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path_doc $MODEL_CKPT --has_train_strategy False > log_cpm.log 2>&1 &
 cd ..


@@ -18,7 +18,7 @@ import argparse
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Accuracy sort.")
-    parser.add_argument("--result_path", type=str, default="scripts/result.txt",
+    parser.add_argument("--result_path", type=str, default="/home/result.txt",
                         help='Text save address.')
     args_eval = parser.parse_args()


@@ -262,8 +262,8 @@ if __name__ == '__main__':
     args = parser.parse_args()
     if args.multi_machine:
-        print("Training on multiple machines")
+        print("Training on multiple machines.")
         train_paralle(args.dataset, args.pretrain_ckpt_path, config_train_multi_machine)
     else:
-        print("Training on single machine and using 8 cards.")
+        print("Training on single machine.")
         train_paralle(args.dataset, args.pretrain_ckpt_path, config_train_single_machine)


@@ -271,6 +271,9 @@ if __name__ == '__main__':
                         help='Whether the loaded checkpoints have distributed training strategy.')
     parser.add_argument('--ckpt_partition', type=int, default=1, help="Number of checkpoint partition.")
     args_parse = parser.parse_args()
+    if args_parse.distribute:
+        set_parallel_env()
     ckpt_file_list_test = None
     if args_parse.has_train_strategy:
         # Get the checkpoint with train strategy.
@@ -281,7 +284,6 @@ if __name__ == '__main__':
         ckpt_file_list_test = create_ckpt_file_list(args_parse)
         print("Get checkpoint file lists++++", ckpt_file_list_test, flush=True)
     if args_parse.distribute:
-        set_parallel_env()
         print("Staring evaluating on 2 devices with model parallel.")
         run_eval(args_parse, config_zero_shot_distrubute, ckpt_file_list_test)
     else: