!18868 CleanCode for cpm networks.

Merge pull request !18868 from casgj/master_0625
i-robot 2021-06-26 02:09:06 +00:00 committed by Gitee
commit a52b732a8b
12 changed files with 16 additions and 13 deletions


@@ -413,7 +413,7 @@ The finetune performance and accuracy of 4 machines and 32 cards are as follows:
 | Optimizer | Adam |
 | Accuracy | 81.4% |
 | Speed | 2740ms/step (32pcs) |
-| Loss | 0.008 |
+| Loss | 0.03 |
 | Params (M) | 2597.1 |
 | Checkpoint for inference | 57G .ckpt file |
 | Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/cpm> |


@@ -415,7 +415,7 @@ Zero-shot single-machine, two-card inference performance and accuracy are as follows:
 | Optimizer | Adam |
 | Accuracy | 81.4% |
 | Speed | 2740ms/step (32pcs) |
-| Loss | 0.08 |
+| Loss | 0.03 |
 | Params (M) | 2597.1 |
 | Checkpoint for inference | 57G .ckpt file |
 | Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/cpm> |


@@ -273,6 +273,8 @@ if __name__ == '__main__':
     parser.add_argument("--has_train_strategy", type=ast.literal_eval, default=True,
                         help='Model has distributed training strategy.')
     args_eval = parser.parse_args()
+    if args_eval.distribute:
+        set_parallel_env()
     ckpt_file_list_test = None
     if args_eval.has_train_strategy:
@@ -286,7 +288,6 @@ if __name__ == '__main__':
     result_accuracy = 0.0
     if args_eval.distribute:
-        set_parallel_env()
         print("Start validation on 2 devices with model parallel.")
         result_accuracy = run_eval(args_eval, finetune_test_distrubute, ckpt_file_list_test)
     else:
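The net effect of the two eval.py hunks above is a reordering: `set_parallel_env()` now runs immediately after argument parsing (when `--distribute` is set) instead of just before the distributed evaluation branch, so the checkpoint file list is already assembled under the parallel context. Below is a minimal, self-contained sketch of the resulting control flow; the two helper functions are placeholders standing in for the repo's own `set_parallel_env` and `create_ckpt_file_list`, not their real implementations.

```python
import argparse
import ast


def set_parallel_env():
    """Placeholder for the repo's helper that initializes the model-parallel context."""
    print("parallel environment initialized")


def create_ckpt_file_list(args):
    """Placeholder for the repo's helper that lists the sliced checkpoint files."""
    return ["{}/part_{}.ckpt".format(args.ckpt_path_doc, i) for i in range(args.ckpt_partition)]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="CPM evaluation control flow (sketch).")
    parser.add_argument("--ckpt_path_doc", type=str, default="ckpt/")
    parser.add_argument("--ckpt_partition", type=int, default=1)
    parser.add_argument("--distribute", type=ast.literal_eval, default=True)
    parser.add_argument("--has_train_strategy", type=ast.literal_eval, default=True)
    args_eval = parser.parse_args()

    # New order: the parallel context is set up right after argument parsing,
    # so everything below (including checkpoint listing) runs under it.
    if args_eval.distribute:
        set_parallel_env()

    ckpt_file_list_test = None
    if args_eval.has_train_strategy:
        ckpt_file_list_test = create_ckpt_file_list(args_eval)

    if args_eval.distribute:
        # set_parallel_env() is no longer called here; it already ran above.
        print("Start validation on 2 devices with model parallel.")
```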


@@ -57,7 +57,7 @@ if __name__ == '__main__':
     cpm_model = CPM_LAYER(config_eval)
     if not args.has_train_strategy:
-        weights = load_checkpoint(args.ckpt_path)
+        weights = load_checkpoint(args.ckpt_path_doc)
         can_be_loaded = {}
         print("+++++++loading weights+++++")
         for name, _ in weights.items():
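`load_checkpoint` in the hunk above is MindSpore's standard checkpoint loader; the change only switches which parsed argument (`ckpt_path_doc` instead of `ckpt_path`) supplies the file path. A rough sketch of the surrounding load-and-filter pattern follows, with a stand-in network and a simplified version of the `can_be_loaded` filtering; the real script builds `CPM_LAYER(config_eval)` and applies its own name matching.

```python
from mindspore import nn
from mindspore.train.serialization import load_checkpoint, load_param_into_net


class TinyNet(nn.Cell):
    """Stand-in network; the real script builds CPM_LAYER(config_eval)."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Dense(8, 2)

    def construct(self, x):
        return self.fc(x)


def load_weights(net, ckpt_path_doc):
    # load_checkpoint returns a dict mapping parameter names to Parameter objects.
    weights = load_checkpoint(ckpt_path_doc)
    # Keep only parameters that exist in the target network, mirroring the
    # "can_be_loaded" filtering done in the original script (simplified here).
    net_param_names = {p.name for p in net.get_parameters()}
    can_be_loaded = {name: value for name, value in weights.items()
                     if name in net_param_names}
    load_param_into_net(net, can_be_loaded)
    return net
```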


@@ -59,7 +59,7 @@ do
     export DEVICE_ID=$i
     echo "start eval for rank $RANK_ID, device $DEVICE_ID"
     env > env.log
-    python ../../eval.py --dataset $DATASET --data_path $LABEL --ckpt_path $MODEL_CKPT --ckpt_partition $CKPT_NUMBER --distribute True --has_train_strategy True> log_cpm.log 2>&1 &
+    python ../../eval.py --dataset $DATASET --data_path $LABEL --ckpt_path_doc $MODEL_CKPT --ckpt_partition $CKPT_NUMBER --distribute True --has_train_strategy True> log_cpm.log 2>&1 &
     cd ${current_exec_path}
 done
 cd ${current_exec_path}
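This and the remaining script hunks only rename the flag passed to the Python entry points from `--ckpt_path` to `--ckpt_path_doc`; the rename only works if those entry points declare a matching argument (the Python hunks above show it being consumed via `args.ckpt_path_doc`). A small argparse sketch of that contract is shown below, using the flags visible in these hunks; the help strings, defaults, and example path are illustrative, not taken from the repo.

```python
import argparse
import ast

parser = argparse.ArgumentParser(description="CPM evaluation entry point (argument sketch).")
# The shell scripts now pass --ckpt_path_doc $MODEL_CKPT, so the entry point
# must expose an argument with exactly that name.
parser.add_argument("--ckpt_path_doc", type=str, default="",
                    help="Path of the checkpoint (or checkpoint directory) to evaluate.")
parser.add_argument("--ckpt_partition", type=int, default=1,
                    help="Number of checkpoint partitions.")
parser.add_argument("--distribute", type=ast.literal_eval, default=True,
                    help="Run distributed evaluation.")
parser.add_argument("--has_train_strategy", type=ast.literal_eval, default=True,
                    help="Whether the checkpoints carry a distributed training strategy.")

# Example: the same style of invocation the script performs, parsed explicitly here.
args = parser.parse_args(["--ckpt_path_doc", "/path/to/cpm.ckpt", "--distribute", "False"])
print(args.ckpt_path_doc, args.distribute)
```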


@@ -69,7 +69,7 @@ do
     env > env.log
     python ../../test.py --dev_dataset $DEV_DATASET --dev_data_path $DEV_LABEL \
         --test_dataset $TEST_DATASET --test_data_path $TEST_LABEL \
-        --ckpt_path $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
+        --ckpt_path_doc $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
         --distribute True --has_train_strategy True> log_cpm.log 2>&1 &
     cd ${current_exec_path}


@@ -71,7 +71,7 @@ do
     env > env.log
     python ../../test.py --dev_dataset $DEV_DATASET --dev_data_path $DEV_LABEL \
         --test_dataset $TEST_DATASET --test_data_path $TEST_LABEL \
-        --ckpt_path $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
+        --ckpt_path_doc $MODEL_CKPT --ckpt_partition $CKPT_NUMBER \
         --ckpt_epoch $ckptepoch --result_path $result_path \
         --distribute False --has_train_strategy True> log_cpm.log 2>&1


@@ -58,7 +58,7 @@ do
     export DEVICE_ID=$i
     echo "start eval for rank $RANK_ID, device $DEVICE_ID"
     env > env.log
-    python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path $MODEL_CKPT --distribute True --has_train_strategy False> log_cpm.log 2>&1 &
+    python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path_doc $MODEL_CKPT --distribute True --has_train_strategy False> log_cpm.log 2>&1 &
     cd ${current_exec_path}
 done
 cd ${current_exec_path}


@@ -54,6 +54,6 @@ cp -r ../scripts/*.sh ./eval
 cd ./eval || exit
 echo "start training for device $DEVICE_ID"
 env > env.log
-python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path $MODEL_CKPT --has_train_strategy False > log_cpm.log 2>&1 &
+python ../../zero-shot.py --dataset $DATASET --truth_labels_path $LABEL --ckpt_path_doc $MODEL_CKPT --has_train_strategy False > log_cpm.log 2>&1 &
 cd ..


@@ -18,7 +18,7 @@ import argparse
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Accuracy sort.")
-    parser.add_argument("--result_path", type=str, default="scripts/result.txt",
+    parser.add_argument("--result_path", type=str, default="/home/result.txt",
                         help='Text save address.')
     args_eval = parser.parse_args()


@@ -262,8 +262,8 @@ if __name__ == '__main__':
     args = parser.parse_args()
     if args.multi_machine:
-        print("Training on multiple machines")
+        print("Training on multiple machines.")
         train_paralle(args.dataset, args.pretrain_ckpt_path, config_train_multi_machine)
     else:
-        print("Training on single machine and using 8 cards.")
+        print("Training on single machine.")
         train_paralle(args.dataset, args.pretrain_ckpt_path, config_train_single_machine)


@@ -271,6 +271,9 @@ if __name__ == '__main__':
                         help='Whether the loaded checkpoints have distributed training strategy.')
     parser.add_argument('--ckpt_partition', type=int, default=1, help="Number of checkpoint partition.")
     args_parse = parser.parse_args()
+    if args_parse.distribute:
+        set_parallel_env()
     ckpt_file_list_test = None
     if args_parse.has_train_strategy:
         # Get the checkpoint with train strategy.
@@ -281,7 +284,6 @@ if __name__ == '__main__':
         ckpt_file_list_test = create_ckpt_file_list(args_parse)
         print("Get checkpoint file lists++++", ckpt_file_list_test, flush=True)
     if args_parse.distribute:
-        set_parallel_env()
         print("Staring evaluating on 2 devices with model parallel.")
         run_eval(args_parse, config_zero_shot_distrubute, ckpt_file_list_test)
     else: