!16308 fix googlenet, deeplabv3, ncf and FaceRecognition network bugs

From: @zhanghuiyao
Reviewed-by: @c_34,@oacjiewen
Signed-off-by: @c_34
mindspore-ci-bot 2021-05-14 14:20:49 +08:00 committed by Gitee
commit 7ccb14330f
30 changed files with 147 additions and 45 deletions

View File

@ -26,9 +26,9 @@ from mindspore import context
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.nets import net_factory
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False,
device_id=get_device_id())

View File

@ -16,8 +16,9 @@
import argparse
import numpy as np
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export
from eval import BuildEvalNetwork
from src.nets import net_factory
parser = argparse.ArgumentParser(description='checkpoint export')
@ -40,6 +41,21 @@ context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
    context.set_context(device_id=args.device_id)
class BuildEvalNetwork(nn.Cell):
    def __init__(self, net, input_format="NCHW"):
        super(BuildEvalNetwork, self).__init__()
        self.network = net
        self.softmax = nn.Softmax(axis=1)
        self.transpose = ops.Transpose()
        self.format = input_format

    def construct(self, x):
        if self.format == "NHWC":
            x = self.transpose(x, (0, 3, 1, 2))
        output = self.network(x)
        output = self.softmax(output)
        return output
if __name__ == '__main__':
    if args.model == 'deeplab_v3_s16':
        network = net_factory.nets_map['deeplab_v3_s16']('eval', args.num_classes, 16, True)
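For reference, a minimal usage sketch of the wrapper class added above (hedged: the 513x513 input size and MINDIR format are illustrative assumptions; `net_factory`, `args` and `export` come from this file's own imports):

```python
# Hedged sketch: wrap the backbone so NHWC inputs are transposed to NCHW and
# softmax is applied inside the exported graph. The input shape is only an example.
import numpy as np
import mindspore as ms
from mindspore import Tensor, export

backbone = net_factory.nets_map['deeplab_v3_s16']('eval', args.num_classes, 16, True)
eval_net = BuildEvalNetwork(backbone, input_format="NHWC")
eval_net.set_train(False)

dummy_input = Tensor(np.ones([1, 513, 513, 3]), ms.float32)  # NHWC layout
export(eval_net, dummy_input, file_name="deeplab_v3_s16", file_format="MINDIR")
```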

View File

@ -30,9 +30,9 @@ from src.data import dataset as data_generator
from src.loss import loss
from src.nets import net_factory
from src.utils import learning_rates
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
set_seed(1)

View File

@ -167,6 +167,51 @@ We use CIFAR-10 dataset by default. You can also pass `$dataset_type` to the sc
# (8) Create your job.
```
- Train cifar10 8p on ModelArts (a config-reading sketch follows this block)
```python
# (1) Add "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on cifar10_config.yaml file.
# Set "dataset_name='cifar10'" on cifar10_config.yaml file.
# Set "train_data_path='/cache/data/'" on cifar10_config.yaml file.
# Set other parameters on cifar10_config.yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "dataset_name=cifar10" on the website UI interface.
# Add "train_data_path=/cache/data/" on the website UI interface.
# Add other parameters on the website UI interface.
# (3) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (4) Set the code directory to "/path/googlenet" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
```
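As referenced above, a hedged sketch of how the training-side fields are read once model_utils.config has merged cifar10_config.yaml with any UI or command-line overrides (only the attribute names listed in the steps are assumed):

```python
# Illustrative only: the merged configuration object exposes the yaml/UI fields
# from the steps above as plain attributes.
from model_utils.config import config

if config.enable_modelarts:
    print("dataset:", config.dataset_name)        # "cifar10"
    print("train data:", config.train_data_path)  # "/cache/data/" after the OBS copy
```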
- Eval cifar10 on ModelArts (a checkpoint-loading sketch follows this block)
```python
# (1) Add "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on cifar10_config.yaml file.
# Set "dataset_name='cifar10'" on cifar10_config.yaml file.
# Set "val_data_path='/cache/data/'" on cifar10_config.yaml file.
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" on cifar10_config.yaml file.
# Set "checkpoint_path='/cache/checkpoint_path/model.ckpt'" on cifar10_config.yaml file.
# Set other parameters on cifar10_config.yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "dataset_name=cifar10" on the website UI interface.
# Add "val_data_path=/cache/data/" on the website UI interface.
# Add "checkpoint_url='s3://dir_to_trained_ckpt/'" on the website UI interface.
# Add "checkpoint_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add other parameters on the website UI interface.
# (3) Upload or copy your pretrained model to S3 bucket.
# (4) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (5) Set the code directory to "/path/googlenet" on the website UI interface.
# (6) Set the startup file to "eval.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
```
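And a hedged sketch of the evaluation side: `checkpoint_url` names the trained weights in OBS and `checkpoint_path` is the local copy that eval.py loads (`GoogleNet` and the field names are taken from this repository's scripts; the rest is illustrative):

```python
# Illustrative only: restore the checkpoint that ModelArts copied to checkpoint_path.
from mindspore import load_checkpoint, load_param_into_net
from model_utils.config import config
from src.googlenet import GoogleNet

net = GoogleNet(num_classes=config.num_classes)
param_dict = load_checkpoint(config.checkpoint_path)  # /cache/checkpoint_path/model.ckpt
load_param_into_net(net, param_dict)
```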
# [Script Description](#contents)
## [Script and Sample Code](#contents)

View File

@ -174,6 +174,51 @@ GoogleNet can go deeper by chaining multiple inception modules in series. The dimensionality-reduced
# (8) Create your training job.
```
- Train cifar10 with 8 cards on ModelArts
```python
# (1) Set "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" in the cifar10_config.yaml file.
# Set "dataset_name='cifar10'" in the cifar10_config.yaml file.
# Set "train_data_path='/cache/data/'" in the cifar10_config.yaml file.
# Set other parameters you need in the cifar10_config.yaml file.
# b. Set "enable_modelarts=True" on the website UI interface.
# Set "dataset_name=cifar10" on the website UI interface.
# Set "train_data_path=/cache/data/" on the website UI interface.
# Set other parameters on the website UI interface.
# (3) Upload a zip dataset to the S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (4) Set the code directory to "/path/googlenet" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set "Dataset path", "Output file path" and "Job log path" on the website UI interface.
# (7) Create your training job.
```
- Evaluate cifar10 with a single card on ModelArts
```python
# (1) Set "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" in the cifar10_config.yaml file.
# Set "dataset_name='cifar10'" in the cifar10_config.yaml file.
# Set "val_data_path='/cache/data/'" in the cifar10_config.yaml file.
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" in the cifar10_config.yaml file.
# Set "checkpoint_path='/cache/checkpoint_path/model.ckpt'" in the cifar10_config.yaml file.
# Set other parameters you need in the cifar10_config.yaml file.
# b. Set "enable_modelarts=True" on the website UI interface.
# Set "dataset_name=cifar10" on the website UI interface.
# Set "val_data_path=/cache/data/" on the website UI interface.
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" on the website UI interface.
# Set "checkpoint_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Set other parameters on the website UI interface.
# (3) Upload your pretrained model to the S3 bucket.
# (4) Upload a zip dataset to the S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (5) Set the code directory to "/path/googlenet" on the website UI interface.
# (6) Set the startup file to "eval.py" on the website UI interface.
# (7) Set "Dataset path", "Output file path" and "Job log path" on the website UI interface.
# (8) Create your job.
```
# Script Description
## Script and Sample Code

View File

@ -21,7 +21,6 @@ import numpy as np
import mindspore as ms
from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
from src.config import cifar_cfg, imagenet_cfg
from src.googlenet import GoogleNet
from model_utils.config import config
@ -32,18 +31,11 @@ if config.device_target == "Ascend":
    context.set_context(device_id=get_device_id())
if __name__ == '__main__':
    if config.dataset_name == 'cifar10':
        cfg = cifar_cfg
    elif config.dataset_name == 'imagenet':
        cfg = imagenet_cfg
    else:
        raise ValueError("dataset is not support.")
    net = GoogleNet(num_classes=config.num_classes)
    net = GoogleNet(num_classes=cfg.num_classes)
    assert cfg.checkpoint_path is not None, "cfg.checkpoint_path is None."
    assert config.checkpoint_path is not None, "config.checkpoint_path is None."
    param_dict = load_checkpoint(config.ckpt_file)
    load_param_into_net(net, param_dict)
    input_arr = Tensor(np.ones([config.batch_size, 3, cfg.image_height, cfg.image_width]), ms.float32)
    input_arr = Tensor(np.ones([config.batch_size, 3, config.image_height, config.image_width]), ms.float32)
    export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
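One plausible reading of the hunk above (hedged, since the rendered diff does not mark which lines are added and which removed): the dataset-specific `cfg` dispatch is dropped and export.py reads every field from the merged `config`, roughly:

```python
# Sketch of the export flow after this change; assumes the yaml config now carries
# num_classes, image_height, image_width, batch_size, ckpt_file, file_name, file_format.
import numpy as np
import mindspore as ms
from mindspore import Tensor, load_checkpoint, load_param_into_net, export
from src.googlenet import GoogleNet
from model_utils.config import config

if __name__ == '__main__':
    net = GoogleNet(num_classes=config.num_classes)
    param_dict = load_checkpoint(config.ckpt_file)
    load_param_into_net(net, param_dict)
    input_arr = Tensor(np.ones([config.batch_size, 3, config.image_height, config.image_width]), ms.float32)
    export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
```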

View File

@ -30,7 +30,7 @@ then
fi
dataset_type=$1
fi
config_path="./${dataset_type}_config.yaml"
config_path="${BASEPATH}/../${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
python ${BASEPATH}/../eval.py --config_path=$config_path --dataset_name=$dataset_type > ./eval.log 2>&1 &

View File

@ -39,8 +39,6 @@ then
fi
dataset_type=$2
fi
config_path="./${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
@ -53,4 +51,7 @@ fi
mkdir ../eval
cd ../eval || exit
config_path="${BASEPATH}/../${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
python3 ${BASEPATH}/../eval.py --config_path=$config_path --checkpoint_path=$1 --dataset_name=$dataset_type > ./eval.log 2>&1 &

View File

@ -37,8 +37,6 @@ then
fi
dataset_type=$2
fi
config_path="./${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
ulimit -u unlimited
export DEVICE_NUM=8
@ -47,6 +45,10 @@ PATH1=$(realpath $1)
export RANK_TABLE_FILE=$PATH1
echo "RANK_TABLE_FILE=${PATH1}"
EXECUTE_PATH=$(pwd)
config_path="${EXECUTE_PATH}/${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
export SERVER_ID=0
rank_start=$((DEVICE_NUM * SERVER_ID))
for((i=0; i<${DEVICE_NUM}; i++))

View File

@ -53,7 +53,8 @@ then
fi
dataset_type=$3
fi
config_path="./${dataset_type}_config.yaml"
config_path="${BASEPATH}/../${dataset_type}_config.yaml"
echo "config path is : ${config_path}"

View File

@ -25,9 +25,9 @@ from src.dataset import create_dataset
from src.metrics import NCFMetric
from src.ncf import NCFModel, NetWithLossClass, TrainStepWrap, PredictWithSigmoid
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id
logging.set_verbosity(logging.INFO)

View File

@ -18,7 +18,7 @@ import numpy as np
from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export
import src.constants as rconst
from utils.config import config
from model_utils.config import config
from ncf import NCFModel, PredictWithSigmoid
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)

View File

@ -15,12 +15,12 @@
"""Device adapter for ModelArts"""
from utils.config import config
from model_utils.config import config
if config.enable_modelarts:
    from utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"

View File

@ -19,7 +19,7 @@ import os
import functools
from mindspore import context
from mindspore.profiler import Profiler
from utils.config import config
from model_utils.config import config
_global_sync_count = 0

View File

@ -25,9 +25,9 @@ from mindspore.common import set_seed
from src.dataset import create_dataset
from src.ncf import NCFModel, NetWithLossClass, TrainStepWrap
from utils.moxing_adapter import moxing_wrapper
from utils.config import config
from utils.device_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.config import config
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
set_seed(1)
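The train scripts in this change wrap their entry point with `moxing_wrapper`; a hedged sketch of that pattern, following the usual model_utils convention (the `pre_process` hook and its body are illustrative assumptions):

```python
# Illustrative only: when config.enable_modelarts is set, moxing_wrapper syncs data
# from OBS before training and results back afterwards; locally it just calls through.
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper

def modelarts_pre_process():
    # e.g. unpack the dataset copied into the local cache directory (assumption)
    pass

@moxing_wrapper(pre_process=modelarts_pre_process)
def run_train():
    print("start NCF training, modelarts:", config.enable_modelarts)

if __name__ == '__main__':
    run_train()
```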

View File

@ -29,9 +29,9 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.backbone.resnet import get_backbone
from src.my_logging import get_logger
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())

View File

@ -15,12 +15,12 @@
"""Device adapter for ModelArts"""
from utils.config import config
from model_utils.config import config
if config.enable_modelarts:
    from utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"

View File

@ -18,7 +18,7 @@
import os
import functools
from mindspore import context
from utils.config import config
from model_utils.config import config
_global_sync_count = 0

View File

@ -36,9 +36,9 @@ from src.loss_factory import get_loss
from src.lrsche_factory import warmup_step_list, list_to_gen
from src.callback_factory import ProgressMonitor
from utils.moxing_adapter import moxing_wrapper
from utils.config import config
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.config import config
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
mindspore.common.seed.set_seed(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False,