!16308 fix googlenet, deeplabv3, ncf and FaceRecognition network bugs

From: @zhanghuiyao
Reviewed-by: @c_34,@oacjiewen
Signed-off-by: @c_34
mindspore-ci-bot 2021-05-14 14:20:49 +08:00 committed by Gitee
commit 7ccb14330f
30 changed files with 147 additions and 45 deletions

View File

@ -26,9 +26,9 @@ from mindspore import context
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.nets import net_factory
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False,
device_id=get_device_id())

View File

@ -16,8 +16,9 @@
import argparse
import numpy as np
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export
from eval import BuildEvalNetwork
from src.nets import net_factory
parser = argparse.ArgumentParser(description='checkpoint export')
@ -40,6 +41,21 @@ context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
if args.device_target == "Ascend":
    context.set_context(device_id=args.device_id)
class BuildEvalNetwork(nn.Cell):
    def __init__(self, net, input_format="NCHW"):
        super(BuildEvalNetwork, self).__init__()
        self.network = net
        self.softmax = nn.Softmax(axis=1)
        self.transpose = ops.Transpose()
        self.format = input_format

    def construct(self, x):
        if self.format == "NHWC":
            x = self.transpose(x, (0, 3, 1, 2))
        output = self.network(x)
        output = self.softmax(output)
        return output
if __name__ == '__main__':
    if args.model == 'deeplab_v3_s16':
        network = net_factory.nets_map['deeplab_v3_s16']('eval', args.num_classes, 16, True)
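For reference, a minimal usage sketch of the wrapper class added above (hedged: the 513x513 input size and MINDIR format are illustrative assumptions; `net_factory`, `args` and `export` come from this file's own imports):

```python
# Hedged sketch: wrap the backbone so NHWC inputs are transposed to NCHW and
# softmax is applied inside the exported graph. The input shape is only an example.
import numpy as np
import mindspore as ms
from mindspore import Tensor, export

backbone = net_factory.nets_map['deeplab_v3_s16']('eval', args.num_classes, 16, True)
eval_net = BuildEvalNetwork(backbone, input_format="NHWC")
eval_net.set_train(False)

dummy_input = Tensor(np.ones([1, 513, 513, 3]), ms.float32)  # NHWC layout
export(eval_net, dummy_input, file_name="deeplab_v3_s16", file_format="MINDIR")
```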

View File

@ -30,9 +30,9 @@ from src.data import dataset as data_generator
from src.loss import loss
from src.nets import net_factory
from src.utils import learning_rates
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
set_seed(1)

View File

@ -167,6 +167,51 @@ We use CIFAR-10 dataset by default. You can also pass `$dataset_type` to the sc
# (8) Create your job.
```
- Train cifar10 8p on ModelArts (a config-reading sketch follows this block)
```python
# (1) Add "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on cifar10_config.yaml file.
# Set "dataset_name='cifar10'" on cifar10_config.yaml file.
# Set "train_data_path='/cache/data/'" on cifar10_config.yaml file.
# Set other parameters on cifar10_config.yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "dataset_name=cifar10" on the website UI interface.
# Add "train_data_path=/cache/data/" on the website UI interface.
# Add other parameters on the website UI interface.
# (3) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (4) Set the code directory to "/path/googlenet" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
```
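As referenced above, a hedged sketch of how the training-side fields are read once model_utils.config has merged cifar10_config.yaml with any UI or command-line overrides (only the attribute names listed in the steps are assumed):

```python
# Illustrative only: the merged configuration object exposes the yaml/UI fields
# from the steps above as plain attributes.
from model_utils.config import config

if config.enable_modelarts:
    print("dataset:", config.dataset_name)        # "cifar10"
    print("train data:", config.train_data_path)  # "/cache/data/" after the OBS copy
```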
- Eval cifar10 on ModelArts (a checkpoint-loading sketch follows this block)
```python
# (1) Add "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on cifar10_config.yaml file.
# Set "dataset_name='cifar10'" on cifar10_config.yaml file.
# Set "val_data_path='/cache/data/'" on cifar10_config.yaml file.
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" on cifar10_config.yaml file.
# Set "checkpoint_path='/cache/checkpoint_path/model.ckpt'" on cifar10_config.yaml file.
# Set other parameters on cifar10_config.yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "dataset_name=cifar10" on the website UI interface.
# Add "val_data_path=/cache/data/" on the website UI interface.
# Add "checkpoint_url='s3://dir_to_trained_ckpt/'" on the website UI interface.
# Add "checkpoint_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add other parameters on the website UI interface.
# (3) Upload or copy your pretrained model to S3 bucket.
# (4) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (5) Set the code directory to "/path/googlenet" on the website UI interface.
# (6) Set the startup file to "eval.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
```
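And a hedged sketch of the evaluation side: `checkpoint_url` names the trained weights in OBS and `checkpoint_path` is the local copy that eval.py loads (`GoogleNet` and the field names are taken from this repository's scripts; the rest is illustrative):

```python
# Illustrative only: restore the checkpoint that ModelArts copied to checkpoint_path.
from mindspore import load_checkpoint, load_param_into_net
from model_utils.config import config
from src.googlenet import GoogleNet

net = GoogleNet(num_classes=config.num_classes)
param_dict = load_checkpoint(config.checkpoint_path)  # /cache/checkpoint_path/model.ckpt
load_param_into_net(net, param_dict)
```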
# [Script Description](#contents)
## [Script and Sample Code](#contents)

View File

@ -174,6 +174,51 @@ GoogleNet can go deeper by chaining multiple inception modules in series. The dimensionality-reduced
# (8) Create your training job.
```
- Train cifar10 with 8 cards on ModelArts
```python
# (1) Set "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" in the cifar10_config.yaml file.
# Set "dataset_name='cifar10'" in the cifar10_config.yaml file.
# Set "train_data_path='/cache/data/'" in the cifar10_config.yaml file.
# Set other parameters you need in the cifar10_config.yaml file.
# b. Set "enable_modelarts=True" on the website UI interface.
# Set "dataset_name=cifar10" on the website UI interface.
# Set "train_data_path=/cache/data/" on the website UI interface.
# Set other parameters on the website UI interface.
# (3) Upload a zip dataset to the S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (4) Set the code directory to "/path/googlenet" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set "Dataset path", "Output file path" and "Job log path" on the website UI interface.
# (7) Create your training job.
```
- Evaluate cifar10 with a single card on ModelArts
```python
# (1) Set "config_path='/path_to_code/cifar10_config.yaml'" on the website UI interface.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" in the cifar10_config.yaml file.
# Set "dataset_name='cifar10'" in the cifar10_config.yaml file.
# Set "val_data_path='/cache/data/'" in the cifar10_config.yaml file.
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" in the cifar10_config.yaml file.
# Set "checkpoint_path='/cache/checkpoint_path/model.ckpt'" in the cifar10_config.yaml file.
# Set other parameters you need in the cifar10_config.yaml file.
# b. Set "enable_modelarts=True" on the website UI interface.
# Set "dataset_name=cifar10" on the website UI interface.
# Set "val_data_path=/cache/data/" on the website UI interface.
# Set "checkpoint_url='s3://dir_to_trained_ckpt/'" on the website UI interface.
# Set "checkpoint_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Set other parameters on the website UI interface.
# (3) Upload your pretrained model to the S3 bucket.
# (4) Upload a zip dataset to the S3 bucket. (you could also upload the original dataset, but it can be very slow.)
# (5) Set the code directory to "/path/googlenet" on the website UI interface.
# (6) Set the startup file to "eval.py" on the website UI interface.
# (7) Set "Dataset path", "Output file path" and "Job log path" on the website UI interface.
# (8) Create your job.
```
# Script Description
## Script and Sample Code

View File

@ -21,7 +21,6 @@ import numpy as np
import mindspore as ms
from mindspore import Tensor, load_checkpoint, load_param_into_net, export, context
from src.config import cifar_cfg, imagenet_cfg
from src.googlenet import GoogleNet
from model_utils.config import config
@ -32,18 +31,11 @@ if config.device_target == "Ascend":
    context.set_context(device_id=get_device_id())
if __name__ == '__main__':
    if config.dataset_name == 'cifar10':
        cfg = cifar_cfg
    elif config.dataset_name == 'imagenet':
        cfg = imagenet_cfg
    else:
        raise ValueError("dataset is not support.")
    net = GoogleNet(num_classes=config.num_classes)
    net = GoogleNet(num_classes=cfg.num_classes)
    assert cfg.checkpoint_path is not None, "cfg.checkpoint_path is None."
    assert config.checkpoint_path is not None, "config.checkpoint_path is None."
    param_dict = load_checkpoint(config.ckpt_file)
    load_param_into_net(net, param_dict)
    input_arr = Tensor(np.ones([config.batch_size, 3, cfg.image_height, cfg.image_width]), ms.float32)
    input_arr = Tensor(np.ones([config.batch_size, 3, config.image_height, config.image_width]), ms.float32)
    export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
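One plausible reading of the hunk above (hedged, since the rendered diff does not mark which lines are added and which removed): the dataset-specific `cfg` dispatch is dropped and export.py reads every field from the merged `config`, roughly:

```python
# Sketch of the export flow after this change; assumes the yaml config now carries
# num_classes, image_height, image_width, batch_size, ckpt_file, file_name, file_format.
import numpy as np
import mindspore as ms
from mindspore import Tensor, load_checkpoint, load_param_into_net, export
from src.googlenet import GoogleNet
from model_utils.config import config

if __name__ == '__main__':
    net = GoogleNet(num_classes=config.num_classes)
    param_dict = load_checkpoint(config.ckpt_file)
    load_param_into_net(net, param_dict)
    input_arr = Tensor(np.ones([config.batch_size, 3, config.image_height, config.image_width]), ms.float32)
    export(net, input_arr, file_name=config.file_name, file_format=config.file_format)
```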

View File

@ -30,7 +30,7 @@ then
fi
dataset_type=$1
fi
config_path="./${dataset_type}_config.yaml"
config_path="${BASEPATH}/../${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
python ${BASEPATH}/../eval.py --config_path=$config_path --dataset_name=$dataset_type > ./eval.log 2>&1 &

View File

@ -39,8 +39,6 @@ then
fi
dataset_type=$2
fi
config_path="./${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
@ -53,4 +51,7 @@ fi
mkdir ../eval
cd ../eval || exit
config_path="${BASEPATH}/../${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
python3 ${BASEPATH}/../eval.py --config_path=$config_path --checkpoint_path=$1 --dataset_name=$dataset_type > ./eval.log 2>&1 &

View File

@ -37,8 +37,6 @@ then
fi
dataset_type=$2
fi
config_path="./${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
ulimit -u unlimited
export DEVICE_NUM=8
@ -47,6 +45,10 @@ PATH1=$(realpath $1)
export RANK_TABLE_FILE=$PATH1
echo "RANK_TABLE_FILE=${PATH1}"
EXECUTE_PATH=$(pwd)
config_path="${EXECUTE_PATH}/${dataset_type}_config.yaml"
echo "config path is : ${config_path}"
export SERVER_ID=0
rank_start=$((DEVICE_NUM * SERVER_ID))
for((i=0; i<${DEVICE_NUM}; i++))

View File

@ -53,7 +53,8 @@ then
fi
dataset_type=$3
fi
config_path="./${dataset_type}_config.yaml"
config_path="${BASEPATH}/../${dataset_type}_config.yaml"
echo "config path is : ${config_path}"

View File

@ -25,9 +25,9 @@ from src.dataset import create_dataset
from src.metrics import NCFMetric
from src.ncf import NCFModel, NetWithLossClass, TrainStepWrap, PredictWithSigmoid
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id
logging.set_verbosity(logging.INFO)

View File

@ -18,7 +18,7 @@ import numpy as np
from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export
import src.constants as rconst
from utils.config import config
from model_utils.config import config
from ncf import NCFModel, PredictWithSigmoid
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)

View File

@ -15,12 +15,12 @@
"""Device adapter for ModelArts"""
from utils.config import config
from model_utils.config import config
if config.enable_modelarts:
    from utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"

View File

@ -19,7 +19,7 @@ import os
import functools
from mindspore import context
from mindspore.profiler import Profiler
from utils.config import config
from model_utils.config import config
_global_sync_count = 0

View File

@ -25,9 +25,9 @@ from mindspore.common import set_seed
from src.dataset import create_dataset
from src.ncf import NCFModel, NetWithLossClass, TrainStepWrap
from utils.moxing_adapter import moxing_wrapper
from utils.config import config
from utils.device_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.config import config
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
set_seed(1)
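The train scripts in this change wrap their entry point with `moxing_wrapper`; a hedged sketch of that pattern, following the usual model_utils convention (the `pre_process` hook and its body are illustrative assumptions):

```python
# Illustrative only: when config.enable_modelarts is set, moxing_wrapper syncs data
# from OBS before training and results back afterwards; locally it just calls through.
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper

def modelarts_pre_process():
    # e.g. unpack the dataset copied into the local cache directory (assumption)
    pass

@moxing_wrapper(pre_process=modelarts_pre_process)
def run_train():
    print("start NCF training, modelarts:", config.enable_modelarts)

if __name__ == '__main__':
    run_train()
```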

View File

@ -29,9 +29,9 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.backbone.resnet import get_backbone
from src.my_logging import get_logger
from utils.config import config
from utils.moxing_adapter import moxing_wrapper
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())

View File

@ -15,12 +15,12 @@
"""Device adapter for ModelArts"""
from utils.config import config
from model_utils.config import config
if config.enable_modelarts:
    from utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
    from model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"

View File

@ -18,7 +18,7 @@
import os
import functools
from mindspore import context
from utils.config import config
from model_utils.config import config
_global_sync_count = 0

View File

@ -36,9 +36,9 @@ from src.loss_factory import get_loss
from src.lrsche_factory import warmup_step_list, list_to_gen
from src.callback_factory import ProgressMonitor
from utils.moxing_adapter import moxing_wrapper
from utils.config import config
from utils.device_adapter import get_device_id, get_device_num, get_rank_id
from model_utils.moxing_adapter import moxing_wrapper
from model_utils.config import config
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
mindspore.common.seed.set_seed(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False,