!20956 fix network issue

Merge pull request !20956 from JichenZhao/master
i-robot 2021-07-29 12:31:23 +00:00 committed by Gitee
commit cb0792d32c
15 changed files with 63 additions and 220 deletions

View File

@@ -55,7 +55,7 @@
```python
# Distributed training example
-sh scripts/run_distribute_train.sh rank_size /path/dataset
+sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
# Standalone training example
sh scripts/run_standalone_train.sh /path/dataset
@@ -108,7 +108,7 @@ The main parameters in train.py and val.py are as follows:
### Distributed Training
```shell
-sh scripts/run_distribute_train.sh rank_size /path/dataset
+sh scripts/run_distribute_train.sh /path/dataset /path/rank_table
```
The above shell script runs distributed training in the background. You can check the results in the `device[X]/train.log` file.
@@ -154,7 +154,7 @@ epoch time: 1104929.793 ms, per step time: 97.162 ms
Also, please make sure the evaluation dataset path passed in is "IJB_release/IJBB/" or "IJB_release/IJBC/".
```bash
-sh scripts/run_eval_ijbc.sh /path/evalset /path/ckpt
+sh scripts/run_eval_ijbc.sh /path/evalset /path/ckpt target_name
```
The above command runs in the background; you can check the results in the eval.log file. Accuracy on the test dataset is as follows:

View File

@@ -1,63 +0,0 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run.sh RANK_SIZE DATA_PATH"
echo "For example: bash run.sh 8 path/dataset"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
RANK_SIZE=$1
DATA_PATH=$2
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
test_dist_8pcs()
{
  export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
  export RANK_SIZE=8
}
test_dist_2pcs()
{
  export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
  echo "$RANK_TABLE_FILE"
  export RANK_SIZE=2
}
test_dist_${RANK_SIZE}pcs
for((i=0;i<RANK_SIZE;i++))
do
  rm -rf device$i
  mkdir device$i
  cp -r ./src/ ./device$i
  cp train.py ./device$i
  cd ./device$i
  export DEVICE_ID=$i
  export RANK_ID=$i
  echo "start training for device $i"
  env > env$i.log
  python train.py \
    --data_url $DATA_PATH \
    --device_num RANK_SIZE \
    > train.log$i 2>&1 &
  cd ../
done
echo "finish"
cd ../

View File

@@ -1,32 +0,0 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run.sh EVAL_PATH CKPT_PATH"
echo "For example: bash run.sh path/evalset path/ckpt"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
EVAL_PATH=$1
CKPT_PATH=$2
python val.py \
--ckpt_url "$CKPT_PATH" \
--device_id 1 \
--eval_url "$EVAL_PATH" \
--target lfw,cfp_fp,agedb_30,calfw,cplfw \
> eval.log 2>&1 &

View File

@@ -1,34 +0,0 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run.sh EVAL_PATH CKPT_PATH"
echo "For example: bash run.sh path/evalset path/ckpt"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
EVAL_PATH=$1
CKPT_PATH=$2
python eval_ijbc.py \
--model-prefix "$CKPT_PATH" \
--image-path "$EVAL_PATH" \
--result-dir ms1mv2_arcface_r100 \
--batch-size 128 \
--job ms1mv2_arcface_r100 \
--target IJBC \
--network iresnet100 \
> eval.log 2>&1 &

View File

@@ -1,30 +0,0 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run.sh DATA_PATH"
echo "For example: bash run.sh path/MS1M"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
# shellcheck disable=SC2034
DATA_PATH=$1
python train.py \
--data_url DATA_PATH \
--device_num 1 \
> train.log 2>&1 &

View File

@@ -16,31 +16,24 @@
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run.sh RANK_SIZE DATA_PATH"
echo "For example: bash run.sh 8 path/dataset"
echo "bash run.sh DATA_PATH RANK_TABLE"
echo "For example: bash run.sh /path/dataset /path/rank_table"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
-RANK_SIZE=$1
-DATA_PATH=$2
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+RANK_SIZE=8
+DATA_PATH=$(get_real_path $1)
+RANK_TABLE=$(get_real_path $2)
EXEC_PATH=$(pwd)
echo "$EXEC_PATH"
-test_dist_8pcs()
-{
-  export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
-  export RANK_SIZE=8
-}
-test_dist_2pcs()
-{
-  export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
-  echo "$RANK_TABLE_FILE"
-  export RANK_SIZE=2
-}
-test_dist_${RANK_SIZE}pcs
+export RANK_TABLE_FILE=$RANK_TABLE
for((i=0;i<RANK_SIZE;i++))
do

View File

@@ -15,8 +15,8 @@
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run.sh EVAL_PATH CKPT_PATH"
echo "For example: bash run.sh path/evalset path/ckpt"
echo "bash run.sh EVAL_PATH CKPT_PATH TARGET"
echo "For example: bash run.sh path/evalset path/ckpt IJBC"
echo "It is better to use the absolute path."
echo "=============================================================================================================="
@@ -29,6 +29,6 @@ python eval_ijbc.py \
--result-dir ms1mv2_arcface_r100 \
--batch-size 128 \
--job ms1mv2_arcface_r100 \
---target IJBC \
+--target $3 \
--network iresnet100 \
> eval.log 2>&1 &

View File

@@ -25,6 +25,6 @@ echo "=============================================================================================================="
DATA_PATH=$1
python train.py \
---data_url DATA_PATH \
+--data_url $DATA_PATH \
--device_num 1 \
> train.log 2>&1 &

View File

@@ -61,8 +61,8 @@ if __name__ == '__main__':
    if args.distribute:
        if target == "Ascend":
            init()
-            device_id = int(os.getenv('DEVICE_ID'))
-            context.set_auto_parallel_context(device_id=device_id,
+            device_num = int(os.getenv('RANK_SIZE'))
+            context.set_auto_parallel_context(device_num=device_num,
                                               parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True)
        if target == "GPU":

View File

@@ -18,9 +18,9 @@ train
from __future__ import division
import os
+import ast
import argparse
import numpy as np
from mindspore import context, Tensor
from mindspore.context import ParallelMode
from mindspore.communication.management import init
@@ -66,7 +66,7 @@ def parse_args():
    parser.add_argument('--train_url', required=False, default=None, help='Location of training outputs.')
    parser.add_argument('--device_id', required=False, default=None, type=int, help='Location of training outputs.')
    parser.add_argument('--run_distribute', required=False, default=False, help='Location of training outputs.')
-    parser.add_argument('--is_model_arts', required=False, default=False, help='Location of training outputs.')
+    parser.add_argument('--is_model_arts', type=ast.literal_eval, default=False, help='Location of training outputs.')
    args = parser.parse_args()
    return args
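
A note on why the switch to `type=ast.literal_eval` matters: argparse with `type=bool` converts any non-empty string, including "False", to `True`, so a flag passed as `--is_model_arts False` would silently stay truthy. A small self-contained sketch of the pattern:

```python
import argparse
import ast

parser = argparse.ArgumentParser()
# bool("False") is True, so type=bool cannot parse "--is_model_arts False";
# ast.literal_eval turns the literal strings "True"/"False" into real bools.
parser.add_argument('--is_model_arts', type=ast.literal_eval, default=False)

args = parser.parse_args(['--is_model_arts', 'False'])
assert args.is_model_arts is False
```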

View File

@@ -54,15 +54,21 @@ bash wmt14_en_fr.sh
```python
# Run the training example
-python train.py > train.log 2>&1 &
+cd ./scripts
+bash run_standalone_train_ascend.sh [TRAIN_DATASET]
# Run the distributed training example
-sh scripts/run_train.sh rank_table.json
+cd ./scripts
+bash run_distributed_train_ascend.sh [RANK_TABLE] [TRAIN_DATASET]
# Run the evaluation example
-python eval.py > eval.log 2>&1 &
-sh run_eval.sh
+cd ./scripts
+bash run_standalone_eval_ascend.sh \
+  seq2seq/dataset_menu/newstest2014.en.mindrecord \
+  seq2seq/scripts/device0/text_translation/ckpt_0/seq2seq-8_3437.ckpt \
+  seq2seq/dataset_menu/vocab.bpe.32000 \
+  seq2seq/dataset_menu/bpe.32000 \
+  seq2seq/dataset_menu/newstest2014.fr
```
For distributed training, an HCCL configuration file in JSON format needs to be created in advance.
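
As a rough illustration of what such a rank table contains (the exact schema depends on the MindSpore/CANN version, and every address below is a placeholder to replace with your machine's actual values), a minimal two-device file could be generated like this:

```python
import json

# Hypothetical values: substitute server_id and each device_ip with the
# addresses reported by your own Ascend environment.
rank_table = {
    "version": "1.0",
    "server_count": "1",
    "server_list": [{
        "server_id": "10.0.0.1",
        "device": [
            {"device_id": "0", "device_ip": "192.1.27.6", "rank_id": "0"},
            {"device_id": "1", "device_ip": "192.2.27.6", "rank_id": "1"},
        ],
        "host_nic_ip": "reserve",
    }],
    "status": "completed",
}

with open("rank_table_2pcs.json", "w") as f:
    json.dump(rank_table, f, indent=4)
```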
@@ -159,7 +165,8 @@ bash wmt14_en_fr.sh
- Running in the Ascend processor environment
```bash
-bash scripts/run_standalone_train_ascend.sh
+cd ./scripts
+bash run_standalone_train_ascend.sh [TRAIN_DATASET]
```
The above command runs in the background; you can check the results in the scripts/train/log_seq2seq_network.log file. Loss values are saved in scripts/train/loss.log.
@@ -171,7 +178,8 @@ bash wmt14_en_fr.sh
- Running in the Ascend processor environment
```bash
-bash scripts/run_distributed_train_ascend rank_table.json
+cd ./scripts
+bash run_distributed_train_ascend.sh [RANK_TABLE] [TRAIN_DATASET]
```
The above shell script runs distributed training in the background. You can check the results in the scripts/device[X]/log_seq2seq_network.log file. Loss values are saved in scripts/device[X]/loss.log.
@@ -185,7 +193,8 @@ bash wmt14_en_fr.sh
- When running in the Ascend environment, an example evaluation script is as follows:
```bash
-sh run_standalone_eval_ascend.sh \
+cd ./scripts
+bash run_standalone_eval_ascend.sh \
seq2seq/dataset_menu/newstest2014.en.mindrecord \
seq2seq/scripts/device0/text_translation/ckpt_0/seq2seq-8_3437.ckpt \
seq2seq/dataset_menu/vocab.bpe.32000 \
@@ -208,7 +217,7 @@ bash wmt14_en_fr.sh
| Parameter | Ascend |
| ------------- | ------------------------------------------------------------ |
-| Model version | Inception V1 |
+| Model version | Seq2Seq |
| Resource | Ascend 910; CPU 2.60 GHz, 56 cores; 314 GB memory |
| Upload date | 2021-3-29 |
| MindSpore version | 1.1.1 |
@@ -227,7 +236,7 @@ bash wmt14_en_fr.sh
| Parameter | Ascend |
| ------------- | -------------- |
-| Model version | Inception V1 |
+| Model version | Seq2Seq |
| Resource | Ascend 910 |
| Upload date | 2021-03-29 |
| MindSpore version | 1.1.1 |

View File

@@ -48,6 +48,7 @@ do
export RANK_ID=$i
export DEVICE_ID=$i
python ../../train.py \
+--is_modelarts=False \
--config=${current_exec_path}/device${i}/config/config.json \
--pre_train_dataset=$PRE_TRAIN_DATASET > log_seq2seq_network${i}.log 2>&1 &
cd ${current_exec_path} || exit

View File

@@ -41,6 +41,7 @@ cd ./train || exit
echo "start for training"
env > env.log
python train.py \
+--is_modelarts=False \
--config=${current_exec_path}/train/config/config.json \
--pre_train_dataset=$PRE_TRAIN_DATASET > log_seq2seq_network.log 2>&1 &
cd ..

View File

@@ -81,7 +81,7 @@ class PredLogProbs(nn.Cell):
        Tensor, log softmax output.
    """
-    def __init__(self, config):
+    def __init__(self):
        super(PredLogProbs, self).__init__()
        self.reshape = P.Reshape()
        self.log_softmax = nn.LogSoftmax(axis=-1)
@@ -172,7 +172,7 @@ class Seq2seqTraining(nn.Cell):
    def __init__(self, config, is_training, use_one_hot_embeddings):
        super(Seq2seqTraining, self).__init__()
        self.seq2seq = Seq2seqModel(config, is_training, use_one_hot_embeddings)
-        self.projection = PredLogProbs(config)
+        self.projection = PredLogProbs()

    def construct(self, source_ids, source_mask, target_ids):
        """
@@ -285,7 +285,6 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
-        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
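
A note for readers tracking the API: `P.ControlDepend` was deprecated in MindSpore 1.1 and later removed, which is presumably why this unused attribute is dropped here. Where an explicit execution-order dependency is still needed, newer MindSpore versions provide `ops.Depend`; a small illustrative sketch (the tensors are placeholders):

```python
import mindspore.ops as ops
import mindspore.common.dtype as mstype
from mindspore import Tensor

depend = ops.Depend()

x = Tensor(1.0, mstype.float32)
y = Tensor(2.0, mstype.float32)
# depend(value, expr) returns value unchanged but forces expr to execute
# first, pinning an order that the dataflow alone would not guarantee.
z = depend(x, y)
```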

View File

@@ -14,9 +14,9 @@
# ============================================================================
"""Train api."""
import os
+import ast
import argparse
import numpy as np
-import moxing as mox
import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
@@ -25,7 +25,7 @@ from mindspore.nn.optim import Lamb
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
-from mindspore.train.callback import LossMonitor, SummaryCollector
+from mindspore.train.callback import LossMonitor, SummaryCollector
from mindspore import context, Parameter
from mindspore.context import ParallelMode
from mindspore.communication import management as MultiAscend
@@ -42,28 +42,28 @@ from src.utils.optimizer import Adam
parser = argparse.ArgumentParser(description='Seq2seq train entry point.')
-is_modelarts = False
-if is_modelarts:
-    parser.add_argument("--config", type=str, required=True, help="model config json file path.")
-    parser.add_argument("--data_url", type=str, required=True, help="pre-train dataset address.")
-    parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
+parser.add_argument("--is_modelarts", type=ast.literal_eval, default=False, help="model config json file path.")
+parser.add_argument("--data_url", type=str, default=None, help="pre-train dataset address.")
+parser.add_argument('--train_url', required=True, default=None, help='Location of training outputs.')
+parser.add_argument("--config", type=str, required=True, help="model config json file path.")
parser.add_argument("--pre_train_dataset", type=str, required=True, help="pre-train dataset address.")
args = parser.parse_args()
+if args.is_modelarts:
+    import moxing as mox
context.set_context(
    mode=context.GRAPH_MODE,
    save_graphs=True,
    device_target="Ascend",
    reserve_class_name_in_scope=True)

def get_config(config):
    config = Seq2seqConfig.from_json_file(config)
    config.compute_type = mstype.float16
    config.dtype = mstype.float32
    return config

def _train(model, config: Seq2seqConfig,
           pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None,
           callbacks: list = None):
@@ -333,17 +333,16 @@ def _check_args(config):
    if not isinstance(config, str):
        raise ValueError("`config` must be type of str.")

if __name__ == '__main__':
    _rank_size = os.getenv('RANK_SIZE')
    args, _ = parser.parse_known_args()
-    if is_modelarts:
-        mox.file.copy_parallel(src_url=args.data_url, dst_url='/cache/dataset_menu/')
-        _config.pre_train_dataset = '/cache/dataset_menu/train.tok.clean.bpe.32000.en.mindrecord'
-        _config.ckpt_path = '/cache/train_output/'
    _check_args(args.config)
    _config = get_config(args.config)
+    if args.is_modelarts:
+        mox.file.copy_parallel(src_url=args.data_url, dst_url='/cache/dataset_menu/')
+        _config.pre_train_dataset = '/cache/dataset_menu/train.tok.clean.bpe.32000.en.mindrecord'
+        _config.ckpt_path = '/cache/train_output/'
    _config.pre_train_dataset = args.pre_train_dataset

    set_seed(_config.random_seed)
@@ -353,5 +352,5 @@ if __name__ == '__main__':
    else:
        train_single(_config)

-    if is_modelarts:
+    if args.is_modelarts:
        mox.file.copy_parallel(src_url='/cache/train_output/', dst_url=args.train_url)