modify model_zoo ssd for cloud

lilei 2021-05-29 16:39:31 +08:00
parent a2dc98f972
commit 44139b80da
60 changed files with 1947 additions and 889 deletions

View File

@@ -83,25 +83,58 @@ After installing MindSpore via the official website, you can start training and
```bash
# distributed training
Usage: sh scripts/run_distribute_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training
Usage: sh scripts/run_standalone_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)

# run evaluation example
Usage: sh scripts/run_eval.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [CHECKPOINT_PATH]
```

- running on CPU

```bash
# standalone training
Usage: bash scripts/run_train_cpu.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)

# run evaluation example
Usage: bash scripts/run_eval.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DATA_PATH] [CHECKPOINT_PATH]
```
If you want to run on ModelArts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/); then you can start training and evaluation as follows:
```python
# run distributed training on modelarts example
# (1) First, perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set other parameters on yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Set the config directory to "config_path=/The path of config in S3/"
# (3) Set the Dataset directory in config file.
# (4) Set the code directory to "/path/squeezenet" on the website UI interface.
# (5) Set the startup file to "train.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
# run evaluation on modelarts example
# (1) Copy or upload your trained model to S3 bucket.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
# Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
# (3) Set the config directory to "config_path=/The path of config in S3/"
# (4) Set the Dataset directory in config file.
# (5) Set the code directory to "/path/squeezenet" on the website UI interface.
# (6) Set the startup file to "eval.py" on the website UI interface.
# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (8) Create your job.
```
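Under the hood, the `enable_modelarts`, `data_url` and `checkpoint_url` values set above are consumed by the new `model_utils` helpers added in this commit. A minimal sketch of the flow, assuming the files shown later in this commit (the entry-point name `main` is illustrative):

```python
# Sketch only: on ModelArts, moxing_wrapper (model_utils/moxing_adapter.py below)
# copies config.data_url into the local config.data_path before the wrapped
# entry point runs, so train.py/eval.py always read a local cache directory.
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper

@moxing_wrapper()
def main():
    print("dataset is available under", config.data_path)   # e.g. /cache/data

if __name__ == '__main__':
    main()
```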
# [Script Description](#contents)

## [Script and Sample Code](#contents)
@@ -117,14 +150,22 @@ After installing MindSpore via the official website, you can start training and
  ├── run_eval.sh                               # launch ascend evaluation
  ├── run_infer_310.sh                          # shell script for 310 infer
├── src
│   ├── dataset.py                              # data preprocessing
│   ├── CrossEntropySmooth.py                   # loss definition for ImageNet dataset
│   ├── lr_generator.py                         # generate learning rate for each step
│   └── squeezenet.py                           # squeezenet architecture, including squeezenet and squeezenet_residual
├── model_utils
│   ├── device_adapter.py                       # device adapter
│   ├── local_adapter.py                        # local adapter
│   ├── moxing_adapter.py                       # moxing adapter
│   └── config.py                               # parameter analysis
├── squeezenet_cifar10_config.yaml              # parameter configuration
├── squeezenet_imagenet_config.yaml             # parameter configuration
├── squeezenet_residual_cifar10_config.yaml     # parameter configuration
├── squeezenet_residual_imagenet_config.yaml    # parameter configuration
├── train.py                                    # train net
├── eval.py                                     # eval net
├── export.py                                   # export checkpoint files into geir/onnx
├── postprocess.py                              # postprocess script
├── preprocess.py                               # preprocess script
```
@@ -231,10 +272,10 @@ For more configuration details, please refer to the script `config.py`.
```shell
# distributed training
Usage: sh scripts/run_distribute_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)

# standalone training
Usage: sh scripts/run_standalone_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)
```

For distributed training, an HCCL configuration file in JSON format needs to be created in advance.
@@ -301,7 +342,7 @@ epoch: 5 step 5004, loss is 4.888848304748535
```shell
# evaluation
Usage: sh scripts/run_eval.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [CHECKPOINT_PATH]
```

```shell
@@ -344,7 +385,7 @@ result: {'top_1_accuracy': 0.6094950384122919, 'top_5_accuracy': 0.8263244238156
### Export MindIR

```shell
python export.py --checkpoint_file_path [CKPT_PATH] --batch_size [BATCH_SIZE] --net_name [NET] --dataset [DATASET] --file_format [EXPORT_FORMAT]
```

The checkpoint_file_path parameter is required,
@@ -604,7 +645,7 @@ If you need to use the trained model to perform inference on multiple hardware platforms
                            device_id=device_id)

    # Load unseen dataset for inference
    dataset = create_dataset(dataset_path=config.data_path,
                             do_train=False,
                             batch_size=config.batch_size,
                             target='Ascend')
@@ -617,7 +658,7 @@ If you need to use the trained model to perform inference on multiple hardware platforms
                  metrics={'top_1_accuracy', 'top_5_accuracy'})

    # Load pre-trained model
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)
@@ -632,7 +673,7 @@ If you need to use the trained model to perform inference on multiple hardware platforms
```py
# Load dataset
dataset = create_dataset(dataset_path=config.data_path,
                         do_train=True,
                         repeat_num=1,
                         batch_size=config.batch_size,
@@ -643,8 +684,8 @@ If you need to use the trained model to perform inference on multiple hardware platforms
net = squeezenet(num_classes=config.class_num)

# load checkpoint
if config.pre_trained:
    param_dict = load_checkpoint(config.pre_trained)
    load_param_into_net(net, param_dict)

# init lr
@@ -679,7 +720,7 @@ If you need to use the trained model to perform inference on multiple hardware platforms
    save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
    keep_checkpoint_max=config.keep_checkpoint_max)
time_cb = TimeMonitor(data_size=step_size)
ckpt_cb = ModelCheckpoint(prefix=config.net_name + '_' + config.dataset,
                          directory=ckpt_save_dir,
                          config=config_ck)
loss_cb = LossMonitor()

View File

@@ -14,44 +14,34 @@
# ============================================================================
"""eval squeezenet."""
import os

from mindspore import context
from mindspore.common import set_seed
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from src.CrossEntropySmooth import CrossEntropySmooth

set_seed(1)

if config.net_name == "squeezenet":
    from src.squeezenet import SqueezeNet as squeezenet
    if config.dataset == "cifar10":
        from src.dataset import create_dataset_cifar as create_dataset
    else:
        from src.dataset import create_dataset_imagenet as create_dataset
else:
    from src.squeezenet import SqueezeNet_Residual as squeezenet
    if config.dataset == "cifar10":
        from src.dataset import create_dataset_cifar as create_dataset
    else:
        from src.dataset import create_dataset_imagenet as create_dataset
@moxing_wrapper()
def eval_net():
    """eval net."""
    target = config.device_target

    # init context
    device_id = os.getenv('DEVICE_ID')
@@ -61,22 +51,21 @@ if __name__ == '__main__':
                            device_id=device_id)

    # create dataset
    dataset = create_dataset(dataset_path=config.data_path,
                             do_train=False,
                             batch_size=config.batch_size,
                             target=target)

    # define net
    net = squeezenet(num_classes=config.class_num)

    # load checkpoint
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)

    # define loss
    if config.dataset == "imagenet":
        if not config.use_label_smooth:
            config.label_smooth_factor = 0.0
        loss = CrossEntropySmooth(sparse=True,
@@ -93,4 +82,7 @@ if __name__ == '__main__':
    # eval model
    res = model.eval(dataset)
    print("result:", res, "ckpt=", config.checkpoint_file_path)


if __name__ == '__main__':
    eval_net()

View File

@@ -17,43 +17,28 @@
python export.py --net_name squeezenet --dataset cifar10 --checkpoint_file_path squeezenet_cifar10-120_1562.ckpt
"""

import numpy as np

from model_utils.config import config
from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export

if config.net_name == "squeezenet":
    from src.squeezenet import SqueezeNet as squeezenet
else:
    from src.squeezenet import SqueezeNet_Residual as squeezenet

if config.dataset == "cifar10":
    num_classes = 10
else:
    num_classes = 1000

context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=config.device_id)

if __name__ == '__main__':
    net = squeezenet(num_classes=num_classes)

    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(net, param_dict)

    input_data = Tensor(np.zeros([config.batch_size, 3, config.height, config.width], np.float32))
    export(net, input_data, file_name=config.file_name, file_format=config.file_format)

View File

@@ -0,0 +1,124 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pformat
import yaml
_config_path = "./squeezenet_cifar10_config.yaml"
class Config:
    """
    Configuration namespace. Convert dictionary to members.
    """
    def __init__(self, cfg_dict):
        for k, v in cfg_dict.items():
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Config(v) if isinstance(v, dict) else v)

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()


def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="squeezenet_cifar10_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        choices: Choices for each configuration item.
        cfg_path: Path to the default yaml config.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for item in cfg:
        if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
            help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
            choice = choices[item] if item in choices else None
            if isinstance(cfg[item], bool):
                parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
                                    help=help_description)
            else:
                parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
                                    help=help_description)
    args = parser.parse_args()
    return args


def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    Args:
        yaml_path: Path to the yaml config.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
            cfgs = [x for x in cfgs]
            if len(cfgs) == 1:
                cfg_helper = {}
                cfg = cfgs[0]
            elif len(cfgs) == 2:
                cfg, cfg_helper = cfgs
            else:
                raise ValueError("At most 2 docs (config and help description) are supported in the config yaml")
            print(cfg_helper)
        except yaml.YAMLError:
            # Only catch yaml parsing errors; a bare `except:` would also
            # swallow the ValueError raised above.
            raise ValueError("Failed to parse yaml")
    return cfg, cfg_helper


def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Args:
        args: Command line arguments.
        cfg: Base configuration.
    """
    args_var = vars(args)
    for item in args_var:
        cfg[item] = args_var[item]
    return cfg


def get_config():
    """
    Get Config according to the yaml file and cli arguments.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str,
                        default=os.path.join(current_dir, "../squeezenet_cifar10_config.yaml"),
                        help="Config file path")
    path_args, _ = parser.parse_known_args()
    default, helper = parse_yaml(path_args.config_path)
    # pass cfg_path by keyword so it does not bind to the `choices` parameter
    args = parse_cli_to_yaml(parser, default, helper, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)


config = get_config()
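The net effect is yaml-first defaults with CLI override: every scalar yaml key becomes a generated flag. A hypothetical usage sketch, assuming the default cifar10 yaml sits one directory above this module:

```python
# hypothetical invocation: python some_script.py --batch_size=64
from model_utils.config import config

print(config.net_name)    # "squeezenet", taken from the yaml
print(config.batch_size)  # 64 here: the generated --batch_size flag overrides the yaml's 32
```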

View File

@@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from model_utils.config import config
if config.enable_modelarts:
    from model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
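A small usage sketch: callers import the getters from this one module and stay agnostic of whether the IDs come from ModelArts (moxing) or from the environment variables exported by the launch scripts:

```python
from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id

print("device {} / rank {} of {}".format(get_device_id(), get_rank_id(), get_device_num()))
```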

View File

@@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    return "Local Job"

View File

@@ -0,0 +1,115 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from model_utils.config import config
_global_sync_count = 0
def get_device_id():
    device_id = os.getenv('DEVICE_ID', '0')
    return int(device_id)


def get_device_num():
    device_num = os.getenv('RANK_SIZE', '1')
    return int(device_num)


def get_rank_id():
    global_rank_id = os.getenv('RANK_ID', '0')
    return int(global_rank_id)


def get_job_id():
    job_id = os.getenv('JOB_ID', '')
    job_id = job_id if job_id != "" else "default"
    return job_id


def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path.
    Upload data from local directory to remote obs in contrast.
    """
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains at most 8 devices; only one device per server copies.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")

    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))


def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
        return wrapped_func
    return wrapper
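Besides the download/upload handling, the wrapper exposes the optional `pre_process`/`post_process` hooks defined above; a hypothetical use (the `unpack_dataset` helper is made up for illustration):

```python
import os
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper

def unpack_dataset():
    # hypothetical pre-step: runs after sync_data has filled config.data_path
    print("unpacking", os.listdir(config.data_path))

@moxing_wrapper(pre_process=unpack_dataset)
def train_net():
    pass  # training body goes here
```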

View File

@@ -16,7 +16,7 @@
if [ $# != 4 ] && [ $# != 5 ]
then
    echo "Usage: sh scripts/run_distribute_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [RANK_TABLE_FILE] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi
@@ -74,6 +74,22 @@ export RANK_TABLE_FILE=$PATH1
export SERVER_ID=0
rank_start=$((DEVICE_NUM * SERVER_ID))
BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
if [ $1 == "squeezenet" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
elif [ $1 == "squeezenet" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_imagenet_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_cifar10_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_imagenet_config.yaml"
else
    echo "error: network must be squeezenet or squeezenet_residual, and dataset must be cifar10 or imagenet"
    exit 1
fi
for((i=0; i<${DEVICE_NUM}; i++))
do
    export DEVICE_ID=${i}
@@ -82,17 +98,21 @@ do
    mkdir ./train_parallel$i
    cp ./train.py ./train_parallel$i
    cp -r ./src ./train_parallel$i
    cp -r ./model_utils ./train_parallel$i
    cp -r ./*.yaml ./train_parallel$i
    cd ./train_parallel$i || exit
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    if [ $# == 4 ]
    then
        python train.py --net_name=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \
        --config_path=$CONFIG_FILE --output_path './output' &> log &
    fi

    if [ $# == 5 ]
    then
        python train.py --net_name=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --data_path=$PATH2 \
        --pre_trained=$PATH3 --config_path=$CONFIG_FILE --output_path './output' &> log &
    fi

    cd ..

View File

@@ -16,7 +16,7 @@
if [ $# != 5 ]
then
    echo "Usage: sh scripts/run_eval.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [CHECKPOINT_PATH]"
    exit 1
fi
@@ -62,6 +62,22 @@ export DEVICE_ID=$3
export RANK_SIZE=$DEVICE_NUM
export RANK_ID=0
BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
if [ $1 == "squeezenet" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
elif [ $1 == "squeezenet" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_imagenet_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_cifar10_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_imagenet_config.yaml"
else
    echo "error: network must be squeezenet or squeezenet_residual, and dataset must be cifar10 or imagenet"
    exit 1
fi
if [ -d "eval" ];
then
    rm -rf ./eval
@@ -69,8 +85,11 @@ fi
mkdir ./eval
cp ./eval.py ./eval
cp -r ./src ./eval
cp -r ./model_utils ./eval
cp -r ./*.yaml ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
python eval.py --net_name=$1 --dataset=$2 --data_path=$PATH1 --checkpoint_file_path=$PATH2 \
    --config_path=$CONFIG_FILE --output_path './output' &> log &
cd ..

View File

@@ -16,7 +16,7 @@
if [ $# != 4 ]
then
    echo "Usage: bash scripts/run_eval.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DATA_PATH] [CHECKPOINT_PATH]"
    exit 1
fi
@@ -56,6 +56,22 @@ then
    exit 1
fi
BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
if [ $1 == "squeezenet" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
elif [ $1 == "squeezenet" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_imagenet_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_cifar10_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_imagenet_config.yaml"
else
    echo "error: network must be squeezenet or squeezenet_residual, and dataset must be cifar10 or imagenet"
    exit 1
fi
if [ -d "eval" ];
then
    rm -rf ./eval
@@ -63,8 +79,11 @@ fi
mkdir ./eval
cp ./eval.py ./eval
cp -r ./src ./eval
cp -r ./model_utils ./eval
cp -r ./*.yaml ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device CPU"
python eval.py --net_name=$1 --dataset=$2 --device_target=CPU --data_path=$PATH1 --checkpoint_file_path=$PATH2 \
    --config_path=$CONFIG_FILE --output_path './output' &> log &
cd ..

View File

@@ -16,7 +16,7 @@
if [ $# != 4 ] && [ $# != 5 ]
then
    echo "Usage: sh scripts/run_standalone_train.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DEVICE_ID] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi
@@ -65,6 +65,22 @@ export DEVICE_ID=$3
export RANK_ID=0
export RANK_SIZE=1
BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
if [ $1 == "squeezenet" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
elif [ $1 == "squeezenet" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_imagenet_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_cifar10_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_imagenet_config.yaml"
else
    echo "error: network must be squeezenet or squeezenet_residual, and dataset must be cifar10 or imagenet"
    exit 1
fi
if [ -d "train" ];
then
    rm -rf ./train
@@ -72,16 +88,18 @@ fi
mkdir ./train
cp ./train.py ./train
cp -r ./src ./train
cp -r ./model_utils ./train
cp -r ./*.yaml ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
if [ $# == 4 ]
then
    python train.py --net_name=$1 --dataset=$2 --data_path=$PATH1 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 5 ]
then
    python train.py --net_name=$1 --dataset=$2 --data_path=$PATH1 --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi

cd ..

View File

@@ -16,7 +16,7 @@
if [ $# != 3 ] && [ $# != 4 ]
then
    echo "Usage: bash scripts/run_train_cpu.sh [squeezenet|squeezenet_residual] [cifar10|imagenet] [DATA_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi
@@ -59,6 +59,22 @@ then
    exit 1
fi
BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
if [ $1 == "squeezenet" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_cifar10_config.yaml"
elif [ $1 == "squeezenet" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_imagenet_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "cifar10" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_cifar10_config.yaml"
elif [ $1 == "squeezenet_residual" ] && [ $2 == "imagenet" ]; then
    CONFIG_FILE="${BASE_PATH}/squeezenet_residual_imagenet_config.yaml"
else
    echo "error: network must be squeezenet or squeezenet_residual, and dataset must be cifar10 or imagenet"
    exit 1
fi
if [ -d "train" ];
then
    rm -rf ./train
@@ -66,16 +82,18 @@ fi
mkdir ./train
cp ./train.py ./train
cp -r ./src ./train
cp -r ./model_utils ./train
cp -r ./*.yaml ./train
cd ./train || exit
echo "start training for device CPU"
env > env.log
if [ $# == 3 ]
then
    python train.py --net_name=$1 --dataset=$2 --device_target=CPU --data_path=$PATH1 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi

if [ $# == 4 ]
then
    python train.py --net_name=$1 --dataset=$2 --device_target=CPU --data_path=$PATH1 --pre_trained=$PATH2 --config_path=$CONFIG_FILE --output_path './output' &> log &
fi

cd ..

View File

@@ -0,0 +1,62 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_num: 1
device_id: 0
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'squeezenet_cifar10-120_195.ckpt'
# ==============================================================================
# Training options
net_name: "squeezenet"
dataset: "cifar10"
class_num: 10
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 120
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 10
warmup_epochs: 5
lr_decay_mode: "poly"
lr_init: 0
lr_end: 0
lr_max: 0.01
pre_trained: ""
# export
width: 227
height: 227
file_name: "squeezenet"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
class_num: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@@ -0,0 +1,64 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_num: 1
device_id: 0
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'squeezenet_imagenet-200_5004.ckpt'
# ==============================================================================
# Training options
net_name: "squeezenet"
dataset: "imagenet"
class_num: 1000
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.00007
epoch_size: 200
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "poly"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_end: 0
lr_max: 0.01
pre_trained: ""
# export
width: 227
height: 227
file_name: "squeezenet"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
class_num: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@@ -0,0 +1,61 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_num: 1
device_id: 0
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'squeezenet_residual_cifar10-150_195.ckpt'
# ==============================================================================
# Training options
net_name: "squeezenet_residual"
dataset: "cifar10"
class_num: 10
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
epoch_size: 150
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 10
warmup_epochs: 5
lr_decay_mode: "linear"
lr_init: 0
lr_end: 0
lr_max: 0.01
pre_trained: ""
# export
width: 227
height: 227
file_name: "squeezenet"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
class_num: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@@ -0,0 +1,64 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_num: 1
device_id: 0
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'squeezenet_residual_imagenet-300_5004.ckpt'
# ==============================================================================
# Training options
net_name: "squeezenet_residual"
dataset: "imagenet"
class_num: 1000
batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.00007
epoch_size: 300
pretrain_epoch_size: 0
save_checkpoint: True
save_checkpoint_epochs: 1
keep_checkpoint_max: 10
warmup_epochs: 0
lr_decay_mode: "cosine"
use_label_smooth: True
label_smooth_factor: 0.1
lr_init: 0
lr_end: 0
lr_max: 0.01
pre_trained: ""
# export
width: 227
height: 227
file_name: "squeezenet"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
class_num: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."

View File

@@ -1,102 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py and eval.py
"""
from easydict import EasyDict as ed
# config for squeezenet, cifar10
config1 = ed({
    "class_num": 10,
    "batch_size": 32,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "epoch_size": 120,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 1,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 5,
    "lr_decay_mode": "poly",
    "lr_init": 0,
    "lr_end": 0,
    "lr_max": 0.01
})

# config for squeezenet, imagenet
config2 = ed({
    "class_num": 1000,
    "batch_size": 32,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 7e-5,
    "epoch_size": 200,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 1,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 0,
    "lr_decay_mode": "poly",
    "use_label_smooth": True,
    "label_smooth_factor": 0.1,
    "lr_init": 0,
    "lr_end": 0,
    "lr_max": 0.01
})

# config for squeezenet_residual, cifar10
config3 = ed({
    "class_num": 10,
    "batch_size": 32,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "epoch_size": 150,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 1,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 5,
    "lr_decay_mode": "linear",
    "lr_init": 0,
    "lr_end": 0,
    "lr_max": 0.01
})

# config for squeezenet_residual, imagenet
config4 = ed({
    "class_num": 1000,
    "batch_size": 32,
    "loss_scale": 1024,
    "momentum": 0.9,
    "weight_decay": 7e-5,
    "epoch_size": 300,
    "pretrain_epoch_size": 0,
    "save_checkpoint": True,
    "save_checkpoint_epochs": 1,
    "keep_checkpoint_max": 10,
    "save_checkpoint_path": "./",
    "warmup_epochs": 0,
    "lr_decay_mode": "cosine",
    "use_label_smooth": True,
    "label_smooth_factor": 0.1,
    "lr_init": 0,
    "lr_end": 0,
    "lr_max": 0.01
})

View File

@@ -14,7 +14,6 @@
# ============================================================================
"""train squeezenet."""
import os

from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim.momentum import Momentum
@@ -26,53 +25,42 @@ from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.common import set_seed
from model_utils.config import config
from model_utils.moxing_adapter import moxing_wrapper
from src.lr_generator import get_lr
from src.CrossEntropySmooth import CrossEntropySmooth
set_seed(1)

if config.net_name == "squeezenet":
    from src.squeezenet import SqueezeNet as squeezenet
    if config.dataset == "cifar10":
        from src.dataset import create_dataset_cifar as create_dataset
    else:
        from src.dataset import create_dataset_imagenet as create_dataset
else:
    from src.squeezenet import SqueezeNet_Residual as squeezenet
    if config.dataset == "cifar10":
        from src.dataset import create_dataset_cifar as create_dataset
    else:
        from src.dataset import create_dataset_imagenet as create_dataset
@moxing_wrapper()
def train_net():
    """train net"""
    target = config.device_target
    ckpt_save_dir = config.output_path

    # init context
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=target)
    if config.run_distribute:
        if target == "Ascend":
            device_id = int(os.getenv('DEVICE_ID'))
            context.set_context(device_id=device_id,
                                enable_auto_mixed_precision=True)
            context.set_auto_parallel_context(
                device_num=config.device_num,
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True)
            init()
@@ -85,11 +73,11 @@ if __name__ == '__main__':
                device_num=get_group_size(),
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True)
            ckpt_save_dir = ckpt_save_dir + "/ckpt_" + str(get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=config.data_path,
                             do_train=True,
                             repeat_num=1,
                             batch_size=config.batch_size,
@@ -100,8 +88,8 @@ if __name__ == '__main__':
    net = squeezenet(num_classes=config.class_num)

    # load checkpoint
    if config.pre_trained:
        param_dict = load_checkpoint(config.pre_trained)
        load_param_into_net(net, param_dict)

    # init lr
@@ -116,7 +104,7 @@ if __name__ == '__main__':
    lr = Tensor(lr)

    # define loss
    if config.dataset == "imagenet":
        if not config.use_label_smooth:
            config.label_smooth_factor = 0.0
        loss = CrossEntropySmooth(sparse=True,
@@ -163,7 +151,7 @@ if __name__ == '__main__':
    config_ck = CheckpointConfig(
        save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix=config.net_name + '_' + config.dataset,
                              directory=ckpt_save_dir,
                              config=config_ck)
    cb += [ckpt_cb]
@@ -172,3 +160,6 @@ if __name__ == '__main__':
    model.train(config.epoch_size - config.pretrain_epoch_size,
                dataset,
                callbacks=cb)


if __name__ == '__main__':
    train_net()

View File

@@ -78,7 +78,7 @@ Dataset used: [COCO2017](<http://images.cocodataset.org/>)
1. If the COCO dataset is used, **select dataset to coco when running the script.**

    Change the `coco_root` and other settings you need in `model_utils/ssd_xxx.yaml`. The directory structure is as follows:

    ```shell
    .
@@ -91,7 +91,7 @@ Dataset used: [COCO2017](<http://images.cocodataset.org/>)
    ```

2. If the VOC dataset is used, **select dataset to voc when running the script.**

    Change `classes`, `num_classes`, `voc_json` and `voc_root` in `model_utils/ssd_xxx.yaml`. `voc_json` is the path of the json file in coco format for evaluation, and `voc_root` is the path of the VOC dataset. The directory structure is as follows:

    ```shell
    .
@@ -117,15 +117,15 @@ Dataset used: [COCO2017](<http://images.cocodataset.org/>)
    train2017/0000001.jpg 0,259,401,459,7 35,28,324,201,2 0,30,59,80,2
    ```

    Each row is an image annotation split by spaces; the first column is the relative path of the image, and the others are boxes and class information in the format [xmin,ymin,xmax,ymax,class]. We read images from a path joined from `image_dir` (the dataset directory) and the relative path in `anno_path` (the TXT file path); `image_dir` and `anno_path` are set in `model_utils/ssd_xxx.yaml`.
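A minimal sketch of parsing one such annotation row (field layout as described above; `image_dir` and the sample line are placeholders):

```python
import os

image_dir = "/path/to/dataset"                                   # placeholder
line = "train2017/0000001.jpg 0,259,401,459,7 35,28,324,201,2"   # one annotation row
parts = line.split(" ")
image_path = os.path.join(image_dir, parts[0])
# each remaining field is xmin,ymin,xmax,ymax,class
boxes = [tuple(int(v) for v in box.split(",")) for box in parts[1:]]
print(image_path, boxes)
```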
## [Quick Start](#contents) ## [Quick Start](#contents)
### Prepare the model ### Prepare the model
1. Chose the model by changing the `using_model` in `src/config.py`. The optional models are: `ssd300`, `ssd_mobilenet_v1_fpn`, `ssd_vgg16`, `ssd_resnet50_fpn`. 1. Chose the model by changing the `using_model` in `model_utils/ssd_xxx.yaml`. The optional models are: `ssd300`, `ssd_mobilenet_v1_fpn`, `ssd_vgg16`, `ssd_resnet50_fpn`.
2. Change the dataset config in the corresponding config. `src/config_xxx.py`, `xxx` is the corresponding backbone network name 2. Change the dataset config in the corresponding config. `model_utils/ssd_xxx.yaml`, `xxx` is the corresponding backbone network name
3. If you are running with `ssd_mobilenet_v1_fpn` or `ssd_resnet50_fpn`, you need a pretrained model for `mobilenet_v1` or `resnet50`. Set the checkpoint path to `feature_extractor_base_param` in `src/config_xxx.py`. For more detail about training pre-trained model, please refer to the corresponding backbone network. 3. If you are running with `ssd_mobilenet_v1_fpn` or `ssd_resnet50_fpn`, you need a pretrained model for `mobilenet_v1` or `resnet50`. Set the checkpoint path to `feature_extractor_base_param` in `model_utils/ssd_xxx.yaml`. For more detail about training pre-trained model, please refer to the corresponding backbone network.
### Run the scripts
@ -135,23 +135,23 @@ After installing MindSpore via the official website, you can start training and
```shell
# distributed training on Ascend
bash run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [RANK_TABLE_FILE] [CONFIG_PATH]

# run eval on Ascend
bash run_eval.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]

# run inference on Ascend310, MINDIR_PATH is the mindir model which you can export from checkpoint using export.py
bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID] [CONFIG_PATH]
```
- running on GPU
```shell
# distributed training on GPU
bash run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [CONFIG_PATH]

# run eval on GPU
bash run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]
```
- running on CPU (supports Windows and Ubuntu)
@ -160,10 +160,10 @@ bash run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID]
```shell
# training on CPU
python train.py --device_target=CPU --lr=[LR] --dataset=[DATASET] --epoch_size=[EPOCH_SIZE] --batch_size=[BATCH_SIZE] --config_path=[CONFIG_PATH] --pre_trained=[PRETRAINED_CKPT] --filter_weight=True --save_checkpoint_epochs=1

# run eval on CPU
python eval.py --device_target=CPU --dataset=[DATASET] --checkpoint_file_path=[PRETRAINED_CKPT] --config_path=[CONFIG_PATH]
```
- Run on docker
@ -182,6 +182,40 @@ Create a container layer over the created image and start it
bash scripts/docker_start.sh ssd:20.1.0 [DATA_DIR] [MODEL_DIR]
```
If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows:
```python
# run distributed training on modelarts example
# (1) First, perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set other parameters on yaml file you need.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add other parameters on the website UI interface.
# (2) Set the config directory to "config_path=/The path of config in S3/"
# (3) Set the code directory to "/path/ssd" on the website UI interface.
# (4) Set the startup file to "train.py" on the website UI interface.
# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (6) Create your job.
# run evaluation on modelarts example
# (1) Copy or upload your trained model to S3 bucket.
# (2) Perform a or b.
# a. Set "enable_modelarts=True" on yaml file.
# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
# Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file.
# b. Add "enable_modelarts=True" on the website UI interface.
# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
# Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
# (3) Set the config directory to "config_path=/The path of config in S3/"
# (4) Set the code directory to "/path/ssd" on the website UI interface.
# (5) Set the startup file to "eval.py" on the website UI interface.
# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
# (7) Create your job.
```
Then you can run everything just like on Ascend.
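For reference, a rough sketch of how the ModelArts-related keys from step (2) of the evaluation flow might look in the yaml file (values are placeholders):

```yaml
# Placeholder values; adjust to your own S3 bucket layout.
enable_modelarts: True
checkpoint_url: "/The path of checkpoint in S3/"
checkpoint_file_path: "/cache/checkpoint_path/model.ckpt"
```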
## [Script Description](#contents)
@ -220,6 +254,15 @@ Then you can run everything just like on ascend.
├─ resnet.py ## network definition for resnet
├─ ssd.py ## ssd architecture
└─ vgg16.py ## network definition for vgg16
├── model_utils
│ ├── config.py ## parameter configuration
│ ├── device_adapter.py ## device adapter
│ ├── local_adapter.py ## local adapter
│ ├── moxing_adapter.py ## moxing adapter
├─ ssd_mobilenet_v1_fpn_config.yaml ## parameter configuration
├─ ssd_resnet50_fpn_config.yaml ## parameter configuration
├─ ssd_vgg16_config.yaml ## parameter configuration
├─ ssd300_config.yaml ## parameter configuration
├─ Dockerfile ## docker file
├─ eval.py ## eval scripts
├─ export.py ## export mindir script
@ -269,7 +312,7 @@ To train the model, run `train.py`. If the `mindrecord_dir` is empty, it will ge
- Distribute mode
```shell
bash run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [RANK_TABLE_FILE] [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
```

We need six or eight parameters for this script.
@ -279,6 +322,7 @@ We need five or seven parameters for this scripts.
- `LR`: learning rate init value for distributed train.
- `DATASET`: the dataset mode for distributed train.
- `RANK_TABLE_FILE`: the path of [rank_table.json](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools), it is better to use absolute path.
- `CONFIG_PATH`: the path of the parameter configuration file.
- `PRE_TRAINED`: the path of pretrained checkpoint file, it is better to use absolute path.
- `PRE_TRAINED_EPOCH_SIZE`: the epoch num of the pretrained checkpoint.
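For example, using the sample values from the script's own usage echo (the config path is a placeholder):

```shell
bash run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /path/to/ssd300_config.yaml
```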
@ -306,7 +350,7 @@ epoch time: 39064.8467540741, per step time: 85.29442522723602
- Distribute mode

```shell
bash run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
```

We need five or seven parameters for this script.
@ -315,6 +359,7 @@ We need four or six parameters for this scripts.
- `EPOCH_NUM`: epoch num for distributed train.
- `LR`: learning rate init value for distributed train.
- `DATASET`: the dataset mode for distributed train.
- `CONFIG_PATH`: the path of the parameter configuration file.
- `PRE_TRAINED`: the path of pretrained checkpoint file, it is better to use absolute path.
- `PRE_TRAINED_EPOCH_SIZE`: the epoch num of the pretrained checkpoint.
@ -349,7 +394,7 @@ You can train your own model based on either pretrained classification model or
#### Evaluation on Ascend

```shell
bash run_eval.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]
```

We need four parameters for this script.
@ -357,6 +402,7 @@ We need two parameters for this scripts.
- `DATASET`: the dataset mode of evaluation dataset.
- `CHECKPOINT_PATH`: the absolute path for checkpoint file.
- `DEVICE_ID`: the device id for eval.
- `CONFIG_PATH`: the path of the parameter configuration file.
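For example (both paths are placeholders):

```shell
bash run_eval.sh coco /path/to/ssd.ckpt 0 /path/to/ssd300_config.yaml
```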
> The checkpoint can be produced during the training process.
@ -384,7 +430,7 @@ mAP: 0.23808886505483504
#### Evaluation on GPU

```shell
bash run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]
```

We need four parameters for this script.
@ -392,6 +438,7 @@ We need two parameters for this scripts.
- `DATASET`: the dataset mode of evaluation dataset.
- `CHECKPOINT_PATH`: the absolute path for checkpoint file.
- `DEVICE_ID`: the device id for eval.
- `CONFIG_PATH`: the path of the parameter configuration file.

> The checkpoint can be produced during the training process.
@ -421,7 +468,7 @@ mAP: 0.2244936111705981
### [Export MindIR](#contents)

```shell
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH]
```

The `checkpoint_file_path` parameter is required.
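For example (placeholder paths; `--file_format` accepts "AIR" or "MINDIR"):

```shell
python export.py --checkpoint_file_path /path/to/ssd.ckpt --file_name ssd --file_format MINDIR --config_path /path/to/ssd300_config.yaml
```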
View File
@ -97,7 +97,7 @@ The SSD approach is based on a feed-forward convolutional network that produces a fixed-size set of bounding boxes
```
Each row is an image annotation split by spaces: the first column is the relative path of an image, and the rest are boxes and class information in the format [xmin,ymin,xmax,ymax,class]. We read images from the path obtained by joining `IMAGE_DIR` (the dataset directory) and the relative path in `ANNO_PATH` (the TXT file path). Set `IMAGE_DIR` and `ANNO_PATH` in the `*.yaml` config file.

# Quick Start
@ -107,24 +107,58 @@ The SSD approach is based on a feed-forward convolutional network that produces a fixed-size set of bounding boxes

```shell script
# distributed training on Ascend
sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [RANK_TABLE_FILE] [CONFIG_PATH]
```

```shell script
# run eval on Ascend
sh run_eval.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]
```
- running on GPU

```shell script
# distributed training on GPU
sh run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [CONFIG_PATH]
```

```shell script
# run eval on GPU
sh run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]
```
If you want to run the model on ModelArts, please refer to the official ModelArts documentation (https://support.huaweicloud.com/modelarts/) and start training and inference as follows:

```python
# example of running distributed training on ModelArts
# (1) Choose either a or b.
#       a. Set "enable_modelarts=True" in the yaml file.
#          Set the other parameters the network needs in the yaml file.
#       b. Add "enable_modelarts=True" on the ModelArts UI.
#          Set the other parameters the network needs on the ModelArts UI.
# (2) Set the path of the network config file: "config_path=/The path of config in S3/"
# (3) Set the code directory to "/path/ssd" on the ModelArts UI.
# (4) Set the startup file to "train.py" on the ModelArts UI.
# (5) Set the "Dataset path", "Output file path" and "Job log path" on the ModelArts UI.
# (6) Create the training job.

# example of running inference on ModelArts
# (1) Upload your trained model to the corresponding location in the S3 bucket.
# (2) Choose either a or b.
#       a. Set "enable_modelarts=True" in the yaml file.
#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
#          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
#       b. Add "enable_modelarts=True" on the ModelArts UI.
#          Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the ModelArts UI.
#          Add "checkpoint_url=/The path of checkpoint in S3/" on the ModelArts UI.
# (3) Set the path of the network config file: "config_path=/The path of config in S3/"
# (4) Set the code directory to "/path/ssd" on the ModelArts UI.
# (5) Set the startup file to "eval.py" on the ModelArts UI.
# (6) Set the "Dataset path", "Output file path" and "Job log path" on the ModelArts UI.
# (7) Create the inference job.
```
# Script Description
@ -163,6 +197,15 @@ sh run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID]
├─ resnet.py ## network definition for resnet
├─ ssd.py ## ssd architecture
└─ vgg16.py ## network definition for vgg16
├── model_utils
│   ├── config.py ## parameter configuration
│   ├── device_adapter.py ## device adapter
│   ├── local_adapter.py ## local adapter
│   ├── moxing_adapter.py ## moxing adapter for ModelArts
├─ ssd_mobilenet_v1_fpn_config.yaml ## parameter configuration
├─ ssd_resnet50_fpn_config.yaml ## parameter configuration
├─ ssd_vgg16_config.yaml ## parameter configuration
├─ ssd300_config.yaml ## parameter configuration
├─ Dockerfile ## docker file
├─ eval.py ## eval script
├─ export.py ## script to export AIR/MINDIR models
@ -205,7 +248,7 @@ sh run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID]
- Distributed training

```shell script
sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [RANK_TABLE_FILE] [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
```

This script requires six or eight parameters.
@ -215,6 +258,7 @@ sh run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID]
- `LR`: initial learning rate for distributed training.
- `DATASET`: dataset mode for distributed training.
- `RANK_TABLE_FILE`: path of [rank_table.json](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). It is better to use an absolute path.
- `CONFIG_PATH`: path of the parameter configuration file.
- `PRE_TRAINED`: path of the pretrained checkpoint file. It is better to use an absolute path.
- `PRE_TRAINED_EPOCH_SIZE`: epoch num of the pretrained checkpoint.
@ -242,7 +286,7 @@ epoch time: 39064.8467540741, per step time: 85.29442522723602
- Distributed training

```shell script
sh run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)
```

This script requires five or seven parameters.
@ -251,6 +295,7 @@ epoch time: 39064.8467540741, per step time: 85.29442522723602
- `EPOCH_NUM`: epoch num for distributed training.
- `LR`: initial learning rate for distributed training.
- `DATASET`: dataset mode for distributed training.
- `CONFIG_PATH`: path of the parameter configuration file.
- `PRE_TRAINED`: path of the pretrained checkpoint file. It is better to use an absolute path.
- `PRE_TRAINED_EPOCH_SIZE`: epoch num of the pretrained checkpoint.
@ -272,7 +317,7 @@ epoch time: 150753.701, per step time: 329.157
### Evaluation on Ascend

```shell script
sh run_eval.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]
```

This script requires four parameters.
@ -280,6 +325,7 @@ sh run_eval.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID]
- `DATASET`: dataset mode of the evaluation dataset.
- `CHECKPOINT_PATH`: absolute path of the checkpoint file.
- `DEVICE_ID`: device id for evaluation.
- `CONFIG_PATH`: path of the parameter configuration file.

> The checkpoint can be generated during training.
@ -307,7 +353,7 @@ mAP: 0.23808886505483504
### Evaluation on GPU

```shell script
sh run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]
```

This script requires four parameters.
@ -315,6 +361,7 @@ sh run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID]
- `DATASET`: dataset mode of the evaluation dataset.
- `CHECKPOINT_PATH`: absolute path of the checkpoint file.
- `DEVICE_ID`: device id for evaluation.
- `CONFIG_PATH`: path of the parameter configuration file.

> The checkpoint can be generated during training.
@ -344,7 +391,7 @@ mAP: 0.2244936111705981
### [Export MindIR](#contents)

```shell
python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --config_path [CONFIG_PATH]
```

The `checkpoint_file_path` parameter is required.
@ -357,7 +404,7 @@ python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [
```shell
# Ascend310 inference
bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DVPP] [DEVICE_ID] [CONFIG_PATH]
```

- `DVPP` is required and must be chosen from ["DVPP", "CPU"] (case-insensitive). Note that ssd_vgg16 runs inference on images of size [300, 300]; because the DVPP hardware requires the width to be divisible by 16 and the height by 2, this network has to preprocess the images with CPU operators instead.
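For example (placeholder paths):

```shell
bash run_infer_310.sh /path/to/ssd.mindir /path/to/dataset DVPP 0 /path/to/ssd300_config.yaml
```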
View File
@ -16,30 +16,30 @@
"""Evaluation for SSD""" """Evaluation for SSD"""
import os import os
import argparse
from mindspore import context, Tensor from mindspore import context, Tensor
from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn, ssd_resnet50_fpn, ssd_vgg16 from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn, ssd_resnet50_fpn, ssd_vgg16
from src.dataset import create_ssd_dataset, create_mindrecord from src.dataset import create_ssd_dataset, create_mindrecord
from src.config import config
from src.eval_utils import apply_eval from src.eval_utils import apply_eval
from src.box_utils import default_boxes from src.box_utils import default_boxes
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
def ssd_eval(dataset_path, ckpt_path, anno_json): def ssd_eval(dataset_path, ckpt_path, anno_json):
"""SSD evaluation.""" """SSD evaluation."""
batch_size = 1 batch_size = 1
ds = create_ssd_dataset(dataset_path, batch_size=batch_size, repeat_num=1, ds = create_ssd_dataset(dataset_path, batch_size=batch_size, repeat_num=1,
is_training=False, use_multiprocessing=False) is_training=False, use_multiprocessing=False)
if config.model == "ssd300": if config.model_name == "ssd300":
net = SSD300(ssd_mobilenet_v2(), config, is_training=False) net = SSD300(ssd_mobilenet_v2(), config, is_training=False)
elif config.model == "ssd_vgg16": elif config.model_name == "ssd_vgg16":
net = ssd_vgg16(config=config) net = ssd_vgg16(config=config)
elif config.model == "ssd_mobilenet_v1_fpn": elif config.model_name == "ssd_mobilenet_v1_fpn":
net = ssd_mobilenet_v1_fpn(config=config) net = ssd_mobilenet_v1_fpn(config=config)
elif config.model == "ssd_resnet50_fpn": elif config.model_name == "ssd_resnet50_fpn":
net = ssd_resnet50_fpn(config=config) net = ssd_resnet50_fpn(config=config)
else: else:
raise ValueError(f'config.model: {config.model} is not supported') raise ValueError(f'config.model: {config.model_name} is not supported')
net = SsdInferWithDecoder(net, Tensor(default_boxes), config) net = SsdInferWithDecoder(net, Tensor(default_boxes), config)
print("Load Checkpoint!") print("Load Checkpoint!")
@ -57,27 +57,30 @@ def ssd_eval(dataset_path, ckpt_path, anno_json):
print("\n========================================\n") print("\n========================================\n")
print(f"mAP: {mAP}") print(f"mAP: {mAP}")
def get_eval_args(): @moxing_wrapper()
parser = argparse.ArgumentParser(description='SSD evaluation') def eval_net():
parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") if hasattr(config, 'num_ssd_boxes') and config.num_ssd_boxes == -1:
parser.add_argument("--dataset", type=str, default="coco", help="Dataset, default is coco.") num = 0
parser.add_argument("--checkpoint_path", type=str, required=True, help="Checkpoint file path.") h, w = config.img_shape
parser.add_argument("--run_platform", type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"), for i in range(len(config.steps)):
help="run platform, support Ascend ,GPU and CPU.") num += (h // config.steps[i]) * (w // config.steps[i]) * config.num_default[i]
return parser.parse_args() config.num_ssd_boxes = num
if __name__ == '__main__': if config.dataset == "coco":
args_opt = get_eval_args() coco_root = os.path.join(config.data_path, config.coco_root)
if args_opt.dataset == "coco": json_path = os.path.join(coco_root, config.instances_set.format(config.val_data_type))
json_path = os.path.join(config.coco_root, config.instances_set.format(config.val_data_type)) elif config.dataset == "voc":
elif args_opt.dataset == "voc": voc_root = os.path.join(config.data_path, config.voc_root)
json_path = os.path.join(config.voc_root, config.voc_json) json_path = os.path.join(voc_root, config.voc_json)
else: else:
raise ValueError('SSD eval only support dataset mode is coco and voc!') raise ValueError('SSD eval only support dataset mode is coco and voc!')
context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.run_platform, device_id=args_opt.device_id) context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id)
mindrecord_file = create_mindrecord(args_opt.dataset, "ssd_eval.mindrecord", False) mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False)
print("Start Eval!") print("Start Eval!")
ssd_eval(mindrecord_file, args_opt.checkpoint_path, json_path) ssd_eval(mindrecord_file, config.checkpoint_file_path, json_path)
if __name__ == '__main__':
eval_net()
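With argparse removed in favor of the yaml-backed config, a direct run of `eval.py` passes everything as config overrides; a sketch with placeholder paths:

```shell
python eval.py --config_path=/path/to/ssd300_config.yaml --dataset=coco \
    --checkpoint_file_path=/path/to/ssd.ckpt --device_target=Ascend --device_id=0
```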
View File
@ -13,48 +13,37 @@
# limitations under the License.
# ============================================================================

import numpy as np
import mindspore
from mindspore import context, Tensor
from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn, ssd_resnet50_fpn, ssd_vgg16
from src.model_utils.config import config
from src.box_utils import default_boxes

context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=config.device_id)

if __name__ == '__main__':
    if config.model_name == "ssd300":
        net = SSD300(ssd_mobilenet_v2(), config, is_training=False)
    elif config.model_name == "ssd_vgg16":
        net = ssd_vgg16(config=config)
    elif config.model_name == "ssd_mobilenet_v1_fpn":
        net = ssd_mobilenet_v1_fpn(config=config)
    elif config.model_name == "ssd_resnet50_fpn":
        net = ssd_resnet50_fpn(config=config)
    else:
        raise ValueError(f'config.model_name: {config.model_name} is not supported')
    net = SsdInferWithDecoder(net, Tensor(default_boxes), config)

    param_dict = load_checkpoint(config.checkpoint_file_path)
    net.init_parameters_data()
    load_param_into_net(net, param_dict)
    net.set_train(False)

    input_shp = [config.batch_size, 3] + config.img_shape
    input_array = Tensor(np.random.uniform(-1.0, 1.0, size=input_shp), mindspore.float32)
    export(net, input_array, file_name=config.file_name, file_format=config.file_format)
View File
@ -21,10 +21,10 @@ echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /op
echo "It is better to use absolute path." echo "It is better to use absolute path."
echo "=================================================================================================================" echo "================================================================================================================="
if [ $# != 5 ] && [ $# != 7 ] if [ $# != 6 ] && [ $# != 8 ]
then then
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \ echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)" [RANK_TABLE_FILE] [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
exit 1 exit 1
fi fi
@ -39,8 +39,9 @@ export RANK_SIZE=$1
EPOCH_SIZE=$2
LR=$3
DATASET=$4
CONFIG_PATH=$6
PRE_TRAINED=$7
PRE_TRAINED_EPOCH_SIZE=$8
export RANK_TABLE_FILE=$5

for((i=0;i<RANK_SIZE;i++))
@ -49,33 +50,38 @@ do
rm -rf LOG$i
mkdir ./LOG$i
cp ./*.py ./LOG$i
cp ./*.yaml ./LOG$i
cp -r ./src ./LOG$i
cd ./LOG$i || exit
export RANK_ID=$i
echo "start training for rank $i, device $DEVICE_ID"
env > env.log
if [ $# == 6 ]
then
    python train.py \
    --run_distribute=True \
    --lr=$LR \
    --dataset=$DATASET \
    --device_num=$RANK_SIZE \
    --device_id=$DEVICE_ID \
    --epoch_size=$EPOCH_SIZE \
    --config_path=$CONFIG_PATH \
    --output_path './output' > log.txt 2>&1 &
fi

if [ $# == 8 ]
then
    python train.py \
    --run_distribute=True \
    --lr=$LR \
    --dataset=$DATASET \
    --device_num=$RANK_SIZE \
    --device_id=$DEVICE_ID \
    --pre_trained=$PRE_TRAINED \
    --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
    --epoch_size=$EPOCH_SIZE \
    --config_path=$CONFIG_PATH \
    --output_path './output' > log.txt 2>&1 &
fi
cd ../
View File
@ -21,17 +21,17 @@ echo "for example: sh run_distribute_train_gpu.sh 8 500 0.2 coco /opt/ssd-300.ck
echo "It is better to use absolute path." echo "It is better to use absolute path."
echo "=================================================================================================================" echo "================================================================================================================="
if [ $# != 4 ] && [ $# != 6 ] if [ $# != 5 ] && [ $# != 7 ]
then then
echo "Usage: sh run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \ echo "Usage: sh run_distribute_train_gpu.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)" [CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
exit 1 exit 1
fi fi
# Before start distribute train, first create mindrecord files.
BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
cd $BASE_PATH/../ || exit
python train.py --only_create_dataset=True --device_target="GPU" --dataset=$4

echo "After running the script, the network runs in the background. The log will be generated in LOG/log.txt"
@ -39,39 +39,45 @@ export RANK_SIZE=$1
EPOCH_SIZE=$2
LR=$3
DATASET=$4
CONFIG_PATH=$5
PRE_TRAINED=$6
PRE_TRAINED_EPOCH_SIZE=$7

rm -rf LOG
mkdir ./LOG
cp ./*.py ./LOG
cp ./*.yaml ./LOG
cp -r ./src ./LOG
cd ./LOG || exit

if [ $# == 5 ]
then
    mpirun -allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
    python train.py \
    --run_distribute=True \
    --lr=$LR \
    --dataset=$DATASET \
    --device_num=$RANK_SIZE \
    --loss_scale=1 \
    --device_target="GPU" \
    --epoch_size=$EPOCH_SIZE \
    --config_path=$CONFIG_PATH \
    --output_path './output' > log.txt 2>&1 &
fi

if [ $# == 7 ]
then
    mpirun -allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
    python train.py \
    --run_distribute=True \
    --lr=$LR \
    --dataset=$DATASET \
    --device_num=$RANK_SIZE \
    --pre_trained=$PRE_TRAINED \
    --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
    --loss_scale=1 \
    --device_target="GPU" \
    --epoch_size=$EPOCH_SIZE \
    --config_path=$CONFIG_PATH \
    --output_path './output' > log.txt 2>&1 &
fi
View File
@ -14,9 +14,9 @@
# limitations under the License.
# ============================================================================

if [ $# != 4 ]
then
    echo "Usage: sh run_eval.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]"
    exit 1
fi
@ -30,7 +30,9 @@ get_real_path(){
DATASET=$1
CHECKPOINT_PATH=$(get_real_path $2)
CONFIG_PATH=$(get_real_path $4)
echo $DATASET
echo $CONFIG_PATH
echo $CHECKPOINT_PATH

if [ ! -f $CHECKPOINT_PATH ]
@ -54,12 +56,14 @@ fi
mkdir ./eval$3
cp ./*.py ./eval$3
cp ./*.yaml ./eval$3
cp -r ./src ./eval$3
cd ./eval$3 || exit
env > env.log
echo "start inferring for device $DEVICE_ID"
python eval.py \
    --dataset=$DATASET \
    --checkpoint_file_path=$CHECKPOINT_PATH \
    --device_id=$3 \
    --config_path=$CONFIG_PATH > log.txt 2>&1 &
cd ..
View File
@ -14,9 +14,9 @@
# limitations under the License.
# ============================================================================

if [ $# != 4 ]
then
    echo "Usage: sh run_eval_gpu.sh [DATASET] [CHECKPOINT_PATH] [DEVICE_ID] [CONFIG_PATH]"
    exit 1
fi
@ -30,8 +30,10 @@ get_real_path(){
DATASET=$1
CHECKPOINT_PATH=$(get_real_path $2)
CONFIG_PATH=$(get_real_path $4)
echo $DATASET
echo $CHECKPOINT_PATH
echo $CONFIG_PATH

if [ ! -f $CHECKPOINT_PATH ]
then
@ -54,13 +56,15 @@ fi
mkdir ./eval$3
cp ./*.py ./eval$3
cp ./*.yaml ./eval$3
cp -r ./src ./eval$3
cd ./eval$3 || exit
env > env.log
echo "start inferring for device $DEVICE_ID"
python eval.py \
    --dataset=$DATASET \
    --checkpoint_file_path=$CHECKPOINT_PATH \
    --device_target="GPU" \
    --device_id=$3 \
    --config_path=$CONFIG_PATH > log.txt 2>&1 &
cd ..
View File
@ -18,7 +18,7 @@
import math
import itertools as it
import numpy as np
from src.model_utils.config import config
from .anchor_generator import GridAnchorGenerator
@ -62,7 +62,7 @@ class GeneratDefaultBoxes():
        self.default_boxes_tlbr = np.array(tuple(to_tlbr(*i) for i in self.default_boxes), dtype='float32')
        self.default_boxes = np.array(self.default_boxes, dtype='float32')

if hasattr(config, 'use_anchor_generator') and config.use_anchor_generator:
    generator = GridAnchorGenerator(config.img_shape, 4, 2, [1.0, 2.0, 0.5])
    default_boxes, default_boxes_tlbr = generator.generate_multi_levels(config.steps)
else:
View File
@ -1,39 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Config parameters for SSD models."""
from .config_ssd300 import config as config_ssd300
from .config_ssd_mobilenet_v1_fpn import config as config_ssd_mobilenet_v1_fpn
from .config_ssd_resnet50_fpn import config as config_ssd_resnet50_fpn
from .config_ssd_vgg16 import config as config_ssd_vgg16
using_model = "ssd300"
config_map = {
"ssd300": config_ssd300,
"ssd_vgg16": config_ssd_vgg16,
"ssd_mobilenet_v1_fpn": config_ssd_mobilenet_v1_fpn,
"ssd_resnet50_fpn": config_ssd_resnet50_fpn
}
config = config_map[using_model]
if config.num_ssd_boxes == -1:
num = 0
h, w = config.img_shape
for i in range(len(config.steps)):
num += (h // config.steps[i]) * (w // config.steps[i]) * config.num_default[i]
config.num_ssd_boxes = num
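The loop above derives the default-box count whenever a config sets `num_ssd_boxes` to -1. A quick check of the arithmetic with the `ssd_mobilenet_v1_fpn` values from the configs below (`img_shape` [640, 640], `steps` (8, 16, 32, 64, 128), `num_default` [6, 6, 6, 6, 6]):

```python
# Worked example of the num_ssd_boxes formula above.
h, w = 640, 640
steps = (8, 16, 32, 64, 128)
num_default = [6, 6, 6, 6, 6]
num = sum((h // s) * (w // s) * n for s, n in zip(steps, num_default))
print(num)  # 38400 + 9600 + 2400 + 600 + 150 = 51150 default boxes
```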
View File
@ -1,82 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#" ============================================================================
"""Config parameters for SSD models."""
from easydict import EasyDict as ed
config = ed({
"model": "ssd300",
"img_shape": [300, 300],
"num_ssd_boxes": 1917,
"match_threshold": 0.5,
"nms_threshold": 0.6,
"min_score": 0.1,
"max_boxes": 100,
# learning rate settings
"lr_init": 0.001,
"lr_end_rate": 0.001,
"warmup_epochs": 2,
"momentum": 0.9,
"weight_decay": 1.5e-4,
# network
"num_default": [3, 6, 6, 6, 6, 6],
"extras_in_channels": [256, 576, 1280, 512, 256, 256],
"extras_out_channels": [576, 1280, 512, 256, 256, 128],
"extras_strides": [1, 1, 2, 2, 2, 2],
"extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25],
"feature_size": [19, 10, 5, 3, 2, 1],
"min_scale": 0.2,
"max_scale": 0.95,
"aspect_ratios": [(), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3)],
"steps": (16, 32, 64, 100, 150, 300),
"prior_scaling": (0.1, 0.2),
"gamma": 2.0,
"alpha": 0.75,
# `mindrecord_dir` and `coco_root` are better to use absolute path.
"feature_extractor_base_param": "",
"checkpoint_filter_list": ['multi_loc_layers', 'multi_cls_layers'],
"mindrecord_dir": "/data/MindRecord_COCO",
"coco_root": "/data/coco2017",
"train_data_type": "train2017",
"val_data_type": "val2017",
"instances_set": "annotations/instances_{}.json",
"classes": ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush'),
"num_classes": 81,
# The annotation.json position of voc validation dataset.
"voc_json": "annotations/voc_instances_val.json",
# voc original dataset.
"voc_root": "/data/voc_dataset",
# if coco or voc used, `image_dir` and `anno_path` are useless.
"image_dir": "",
"anno_path": ""
})
View File
@ -1,87 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#" ============================================================================
"""Config parameters for SSD models."""
from easydict import EasyDict as ed
config = ed({
"model": "ssd_mobilenet_v1_fpn",
"img_shape": [640, 640],
"num_ssd_boxes": -1,
"match_threshold": 0.5,
"nms_threshold": 0.6,
"min_score": 0.1,
"max_boxes": 100,
# learning rate settings
"global_step": 0,
"lr_init": 0.01333,
"lr_end_rate": 0.0,
"warmup_epochs": 2,
"weight_decay": 4e-5,
"momentum": 0.9,
# network
"num_default": [6, 6, 6, 6, 6],
"extras_in_channels": [256, 512, 1024, 256, 256],
"extras_out_channels": [256, 256, 256, 256, 256],
"extras_strides": [1, 1, 2, 2, 2, 2],
"extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25],
"feature_size": [80, 40, 20, 10, 5],
"min_scale": 0.2,
"max_scale": 0.95,
"aspect_ratios": [(2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3)],
"steps": (8, 16, 32, 64, 128),
"prior_scaling": (0.1, 0.2),
"gamma": 2.0,
"alpha": 0.25,
"num_addition_layers": 4,
"use_anchor_generator": True,
"use_global_norm": True,
# `mindrecord_dir` and `coco_root` are better to use absolute path.
"feature_extractor_base_param": "/ckpt/mobilenet_v1.ckpt",
"checkpoint_filter_list": ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',
'network.multi_box.loc_layers.0.weight', 'network.multi_box.loc_layers.0.bias'],
"mindrecord_dir": "/data/MindRecord_COCO",
"coco_root": "/data/coco2017",
"train_data_type": "train2017",
"val_data_type": "val2017",
"instances_set": "annotations/instances_{}.json",
"classes": ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush'),
"num_classes": 81,
# The annotation.json position of voc validation dataset.
"voc_json": "annotations/voc_instances_val.json",
# voc original dataset.
"voc_root": "/data/voc_dataset",
# if coco or voc used, `image_dir` and `anno_path` are useless.
"image_dir": "",
"anno_path": ""
})
View File
@ -1,88 +0,0 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#" ============================================================================
"""Config parameters for SSD models."""
from easydict import EasyDict as ed
config = ed({
"model": "ssd_resnet50_fpn",
"img_shape": [640, 640],
"num_ssd_boxes": -1,
"match_threshold": 0.5,
"nms_threshold": 0.6,
"min_score": 0.1,
"max_boxes": 100,
# learning rate settings
"global_step": 0,
"lr_init": 0.01333,
"lr_end_rate": 0.0,
"warmup_epochs": 2,
"weight_decay": 4e-4,
"momentum": 0.9,
# network
"num_default": [6, 6, 6, 6, 6],
"extras_in_channels": [256, 512, 1024, 256, 256],
"extras_out_channels": [256, 256, 256, 256, 256],
"extras_strides": [1, 1, 2, 2, 2, 2],
"extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25],
"feature_size": [80, 40, 20, 10, 5],
"min_scale": 0.2,
"max_scale": 0.95,
"aspect_ratios": [(2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3)],
"steps": (8, 16, 32, 64, 128),
"prior_scaling": (0.1, 0.2),
"gamma": 2.0,
"alpha": 0.25,
"num_addition_layers": 4,
"use_anchor_generator": True,
"use_global_norm": True,
"use_float16": True,
# `mindrecord_dir` and `coco_root` are better to use absolute path.
"feature_extractor_base_param": "/ckpt/resnet50.ckpt",
"checkpoint_filter_list": ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',
'network.multi_box.loc_layers.0.weight', 'network.multi_box.loc_layers.0.bias'],
"mindrecord_dir": "/data/MindRecord_COCO",
"coco_root": "/data/coco2017",
"train_data_type": "train2017",
"val_data_type": "val2017",
"instances_set": "annotations/instances_{}.json",
"classes": ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush'),
"num_classes": 81,
# The annotation.json position of voc validation dataset.
"voc_json": "annotations/voc_instances_val.json",
# voc original dataset.
"voc_root": "/data/voc_dataset",
# if coco or voc used, `image_dir` and `anno_path` are useless.
"image_dir": "",
"anno_path": ""
})
View File
@ -1,84 +0,0 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Config parameters for SSD models."""
from easydict import EasyDict as ed
config = ed({
"model": "ssd_vgg16",
"img_shape": [300, 300],
"num_ssd_boxes": 7308,
"match_threshold": 0.5,
"nms_threshold": 0.6,
"min_score": 0.1,
"max_boxes": 100,
"ssd_vgg_bn": False,
# learning rate settings
"lr_init": 0.001,
"lr_end_rate": 0.001,
"warmup_epochs": 2,
"momentum": 0.9,
"weight_decay": 1.5e-4,
# network
"num_default": [3, 6, 6, 6, 6, 6],
"extras_in_channels": [256, 512, 1024, 512, 256, 256],
"extras_out_channels": [512, 1024, 512, 256, 256, 256],
"extras_strides": [1, 1, 2, 2, 2, 2],
"extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25],
"feature_size": [38, 19, 10, 5, 3, 1],
"min_scale": 0.2,
"max_scale": 0.95,
"aspect_ratios": [(), (2, 3), (2, 3), (2, 3), (2, 3), (2, 3)],
"steps": (8, 16, 32, 64, 100, 300),
"prior_scaling": (0.1, 0.2),
"gamma": 2.0,
"alpha": 0.75,
# `mindrecord_dir` and `coco_root` are better to use absolute path.
"feature_extractor_base_param": "",
"pretrain_vgg_bn": False,
"checkpoint_filter_list": ['multi_loc_layers', 'multi_cls_layers'],
"mindrecord_dir": "/data/MindRecord_COCO",
"coco_root": "/data/coco2017",
"train_data_type": "train2017",
"val_data_type": "val2017",
"instances_set": "annotations/instances_{}.json",
"classes": ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush'),
"num_classes": 81,
# The annotation.json position of voc validation dataset.
"voc_json": "annotations/voc_instances_val.json",
# voc original dataset.
"voc_root": "/data/voc_dataset",
# if coco or voc used, `image_dir` and `anno_path` are useless.
"image_dir": "",
"anno_path": ""
})
View File
@ -26,7 +26,7 @@ import cv2
import mindspore.dataset as de
import mindspore.dataset.vision.c_transforms as C
from mindspore.mindrecord import FileWriter
from src.model_utils.config import config
from .box_utils import jaccard_numpy, ssd_bboxes_encode
@ -253,7 +253,7 @@ def create_coco_label(is_training):
"""Get image path and annotation from COCO.""" """Get image path and annotation from COCO."""
from pycocotools.coco import COCO from pycocotools.coco import COCO
coco_root = config.coco_root coco_root = os.path.join(config.data_path, config.coco_root)
data_type = config.val_data_type data_type = config.val_data_type
if is_training: if is_training:
data_type = config.train_data_type data_type = config.train_data_type
@ -425,13 +425,14 @@ def create_mindrecord(dataset="coco", prefix="ssd.mindrecord", is_training=True)
    # It will generate mindrecord file in config.mindrecord_dir,
    # and the file name is ssd.mindrecord0, 1, ... file_num.
    mindrecord_dir = os.path.join(config.data_path, config.mindrecord_dir)
    mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if dataset == "coco":
            coco_root = os.path.join(config.data_path, config.coco_root)
            if os.path.isdir(coco_root):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("coco", is_training, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
View File
@ -17,7 +17,7 @@
import json
import numpy as np
from mindspore import Tensor
from src.model_utils.config import config

def apply_eval(eval_param_dict):
    net = eval_param_dict["net"]
View File
@ -0,0 +1,124 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pformat
import yaml
_config_path = "./ssd300_config.yaml"
class Config:
"""
Configuration namespace. Convert dictionary to members.
"""
def __init__(self, cfg_dict):
for k, v in cfg_dict.items():
if isinstance(v, (list, tuple)):
setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
else:
setattr(self, k, Config(v) if isinstance(v, dict) else v)
def __str__(self):
return pformat(self.__dict__)
def __repr__(self):
return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="ssd300_config.yaml"):
"""
Parse command line arguments to the configuration according to the default yaml.
Args:
parser: Parent parser.
cfg: Base configuration.
helper: Helper description.
cfg_path: Path to the default yaml config.
"""
parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
parents=[parser])
helper = {} if helper is None else helper
choices = {} if choices is None else choices
for item in cfg:
if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path)
choice = choices[item] if item in choices else None
if isinstance(cfg[item], bool):
parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
help=help_description)
else:
parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
help=help_description)
args = parser.parse_args()
return args
def parse_yaml(yaml_path):
"""
Parse the yaml config file.
Args:
yaml_path: Path to the yaml config.
"""
with open(yaml_path, 'r') as fin:
try:
cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader)
cfgs = [x for x in cfgs]
if len(cfgs) == 1:
cfg_helper = {}
cfg = cfgs[0]
elif len(cfgs) == 2:
cfg, cfg_helper = cfgs
else:
raise ValueError("At most 2 docs (config and help description for help) are supported in config yaml")
print(cfg_helper)
except yaml.YAMLError as err:
    raise ValueError("Failed to parse yaml") from err
return cfg, cfg_helper
def merge(args, cfg):
"""
Merge the base config from yaml file and command line arguments.
Args:
args: Command line arguments.
cfg: Base configuration.
"""
args_var = vars(args)
for item in args_var:
cfg[item] = args_var[item]
return cfg
def get_config():
"""
Get Config according to the yaml file and cli arguments.
"""
parser = argparse.ArgumentParser(description="default name", add_help=False)
current_dir = os.path.dirname(os.path.abspath(__file__))
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, \
"../../ssd300_config.yaml"), help="Config file path")
path_args, _ = parser.parse_known_args()
default, helper = parse_yaml(path_args.config_path)
args = parse_cli_to_yaml(parser, default, helper, path_args.config_path)
final_config = merge(args, default)
return Config(final_config)
config = get_config()
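A minimal sketch of how the resulting `config` object behaves (illustrative values only; nested dicts become attribute-accessible `Config` instances):

```python
cfg = Config({"lr_init": 0.001, "img_shape": [300, 300], "optim": {"momentum": 0.9}})
print(cfg.lr_init)         # 0.001
print(cfg.optim.momentum)  # 0.9, since the nested dict is wrapped as a Config
print(cfg)                 # pretty-printed via pformat(self.__dict__)
```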
View File
@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from src.model_utils.config import config
if config.enable_modelarts:
from src.model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
__all__ = [
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]

View File

@@ -0,0 +1,36 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
return "Local Job"

View File

@@ -0,0 +1,115 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from src.model_utils.config import config
_global_sync_count = 0
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
return int(global_rank_id)
def get_job_id():
    job_id = os.getenv('JOB_ID')
    # os.getenv returns None when JOB_ID is unset, so test truthiness rather than != "".
    return job_id if job_id else "default"
def sync_data(from_path, to_path):
    """
    Download data from a remote OBS url to a local directory when the first argument
    is remote and the second is local, or upload a local directory to remote OBS
    when the arguments are the other way around.
    """
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1
    # Each server contains at most 8 devices; only the first device on each server
    # performs the copy, then creates a lock file as a "done" flag.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")
    # The other devices block here until the copying device has written the flag.
    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)
    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
"""
Moxing wrapper to download dataset and upload outputs.
"""
def wrapper(run_func):
@functools.wraps(run_func)
def wrapped_func(*args, **kwargs):
# Download data from data_url
if config.enable_modelarts:
if config.data_url:
sync_data(config.data_url, config.data_path)
print("Dataset downloaded: ", os.listdir(config.data_path))
if config.checkpoint_url:
sync_data(config.checkpoint_url, config.load_path)
print("Preload downloaded: ", os.listdir(config.load_path))
if config.train_url:
sync_data(config.train_url, config.output_path)
print("Workspace downloaded: ", os.listdir(config.output_path))
context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
config.device_num = get_device_num()
config.device_id = get_device_id()
if not os.path.exists(config.output_path):
os.makedirs(config.output_path)
if pre_process:
pre_process()
run_func(*args, **kwargs)
# Upload data to train_url
if config.enable_modelarts:
if post_process:
post_process()
if config.train_url:
print("Start to copy output directory")
sync_data(config.output_path, config.train_url)
return wrapped_func
return wrapper
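A hedged sketch of how a training script is expected to consume this decorator (the hook name `download_preprocess` is made up for illustration): the wrapped function only runs after the OBS downloads finish, and anything written under `config.output_path` is copied back to `config.train_url` when it returns.

```python
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper

def download_preprocess():
    # hypothetical hook: unzip/convert the dataset that sync_data just copied
    print("preparing", config.data_path)

@moxing_wrapper(pre_process=download_preprocess)
def train_net():
    ...  # regular MindSpore training; checkpoints go under config.output_path

if __name__ == '__main__':
    train_net()
```

The per-server lock file in `sync_data` means only one process per node touches OBS; the remaining ranks simply wait for the flag, which keeps parallel starts from downloading the same data eight times.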

View File

@@ -25,7 +25,6 @@ from mindspore.communication.management import get_group_size
 from mindspore.ops import operations as P
 from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from .fpn import mobilenet_v1_fpn, resnet50_fpn
 from .vgg16 import vgg16

View File

@@ -16,7 +16,7 @@
 """VGG16 backbone for SSD"""
 from mindspore import nn
-from .config_ssd_vgg16 import config
+from src.model_utils.config import config
 pretrain_vgg_bn = config.pretrain_vgg_bn
 ssd_vgg_bn = config.ssd_vgg_bn

View File

@@ -0,0 +1,121 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'ssd-500_458.ckpt'
# ==============================================================================
# Training options
model_name: "ssd300"
img_shape: [300, 300]
num_ssd_boxes: 1917
match_threshold: 0.5
nms_threshold: 0.6
min_score: 0.1
max_boxes: 100
# learning rate settings
lr_init: 0.001
lr_end_rate: 0.001
warmup_epochs: 2
momentum: 0.9
weight_decay: 0.00015
ssd_vgg_bn: False
pretrain_vgg_bn: False
# network
num_default: [3, 6, 6, 6, 6, 6]
extras_in_channels: [256, 576, 1280, 512, 256, 256]
extras_out_channels: [576, 1280, 512, 256, 256, 128]
extras_strides: [1, 1, 2, 2, 2, 2]
extras_ratio: [0.2, 0.2, 0.2, 0.25, 0.5, 0.25]
feature_size: [19, 10, 5, 3, 2, 1]
min_scale: 0.2
max_scale: 0.95
aspect_ratios: [[], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]]
steps: [16, 32, 64, 100, 150, 300]
prior_scaling: [0.1, 0.2]
gamma: 2.0
alpha: 0.75
dataset: "coco"
lr: 0.05
mode_sink: "sink"
device_id: 0
device_num: 1
epoch_size: 500
batch_size: 32
loss_scale: 1024
pre_trained: ""
pre_trained_epoch_size: 0
save_checkpoint_epochs: 10
only_create_dataset: False
eval_start_epoch: 40
eval_interval: 1
run_eval: False
filter_weight: False
freeze_layer: None
save_best_ckpt: True
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: ""
checkpoint_filter_list: ['multi_loc_layers', 'multi_cls_layers']
mindrecord_dir: "MindRecord_COCO"
coco_root: "coco_ori"
train_data_type: "train2017"
val_data_type: "val2017"
instances_set: "annotations/instances_{}.json"
classes: ['background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
num_classes: 81
# The annotation.json path of the VOC validation dataset.
voc_json: "annotations/voc_instances_val.json"
# Root directory of the original VOC dataset.
voc_root: "/data/voc_dataset"
# If coco or voc is used, `image_dir` and `anno_path` are ignored.
image_dir: ""
anno_path: ""
file_name: "ssd"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The local cache location of the checkpoint'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling while training, default: False'
num_classes: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "Keep the last keep_checkpoint_max checkpoints"
checkpoint_path: "The directory where checkpoints are saved."
checkpoint_file_path: "The path of the checkpoint file."

View File

@@ -0,0 +1,125 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'mobilenet_v1.ckpt'
# ==============================================================================
# Training options
model_name: "ssd_mobilenet_v1_fpn"
img_shape: [640, 640]
num_ssd_boxes: -1
match_threshold: 0.5
nms_threshold: 0.6
min_score: 0.1
max_boxes: 100
# learning rate settings
global_step: 0
lr_init: 0.01333
lr_end_rate: 0.0
warmup_epochs: 2
weight_decay: 0.00004
momentum: 0.9
ssd_vgg_bn: False
pretrain_vgg_bn: False
# network
num_default: [6, 6, 6, 6, 6]
extras_in_channels: [256, 512, 1024, 256, 256]
extras_out_channels: [256, 256, 256, 256, 256]
extras_strides: [1, 1, 2, 2, 2, 2]
extras_ratio: [0.2, 0.2, 0.2, 0.25, 0.5, 0.25]
feature_size: [80, 40, 20, 10, 5]
min_scale: 0.2
max_scale: 0.95
aspect_ratios: [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]]
steps: [8, 16, 32, 64, 128]
prior_scaling: [0.1, 0.2]
gamma: 2.0
alpha: 0.25
num_addition_layers: 4
use_anchor_generator: True
use_global_norm: True
dataset: "coco"
lr: 0.05
mode_sink: "sink"
device_id: 0
device_num: 1
epoch_size: 500
batch_size: 32
loss_scale: 1024
pre_trained: ""
pre_trained_epoch_size: 0
save_checkpoint_epochs: 10
only_create_dataset: False
eval_start_epoch: 40
eval_interval: 1
run_eval: False
filter_weight: False
freeze_layer: None
save_best_ckpt: True
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: "/ckpt/mobilenet_v1.ckpt"
checkpoint_filter_list: ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',
'network.multi_box.loc_layers.0.weight', 'network.multi_box.loc_layers.0.bias']
mindrecord_dir: "MindRecord_COCO"
coco_root: "coco_ori"
train_data_type: "train2017"
val_data_type: "val2017"
instances_set: "annotations/instances_{}.json"
classes: ['background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
num_classes: 81
# The annotation.json path of the VOC validation dataset.
voc_json: "annotations/voc_instances_val.json"
# Root directory of the original VOC dataset.
voc_root: "/data/voc_dataset"
# If coco or voc is used, `image_dir` and `anno_path` are ignored.
image_dir: ""
anno_path: ""
file_name: "ssd"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The local cache location of the checkpoint'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling while training, default: False'
num_classes: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "Keep the last keep_checkpoint_max checkpoints"
checkpoint_path: "The directory where checkpoints are saved."
checkpoint_file_path: "The path of the checkpoint file."

View File

@@ -0,0 +1,126 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'resnet50.ckpt'
# ==============================================================================
# Training options
model_name: "ssd_resnet50_fpn"
img_shape: [640, 640]
num_ssd_boxes: -1
match_threshold: 0.5
nms_threshold: 0.6
min_score: 0.1
max_boxes: 100
# learning rate settings
global_step: 0
lr_init: 0.01333
lr_end_rate: 0.0
warmup_epochs: 2
weight_decay: 0.0004
momentum: 0.9
ssd_vgg_bn: False
pretrain_vgg_bn: False
# network
num_default: [6, 6, 6, 6, 6]
extras_in_channels: [256, 512, 1024, 256, 256]
extras_out_channels: [256, 256, 256, 256, 256]
extras_strides: [1, 1, 2, 2, 2, 2]
extras_ratio: [0.2, 0.2, 0.2, 0.25, 0.5, 0.25]
feature_size: [80, 40, 20, 10, 5]
min_scale: 0.2
max_scale: 0.95
aspect_ratios: [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]]
steps: [8, 16, 32, 64, 128]
prior_scaling: [0.1, 0.2]
gamma: 2.0
alpha: 0.25
num_addition_layers: 4
use_anchor_generator: True
use_global_norm: True
use_float16: True
dataset: "coco"
lr: 0.05
mode_sink: "sink"
device_id: 0
device_num: 1
epoch_size: 500
batch_size: 32
loss_scale: 1024
pre_trained: ""
pre_trained_epoch_size: 0
save_checkpoint_epochs: 10
only_create_dataset: False
eval_start_epoch: 40
eval_interval: 1
run_eval: False
filter_weight: False
freeze_layer: None
save_best_ckpt: True
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: "/ckpt/resnet50.ckpt"
checkpoint_filter_list: ['network.multi_box.cls_layers.0.weight', 'network.multi_box.cls_layers.0.bias',
'network.multi_box.loc_layers.0.weight', 'network.multi_box.loc_layers.0.bias']
mindrecord_dir: "MindRecord_COCO"
coco_root: "coco_ori"
train_data_type: "train2017"
val_data_type: "val2017"
instances_set: "annotations/instances_{}.json"
classes: ['background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
num_classes: 81
# The annotation.json path of the VOC validation dataset.
voc_json: "annotations/voc_instances_val.json"
# Root directory of the original VOC dataset.
voc_root: "/data/voc_dataset"
# If coco or voc is used, `image_dir` and `anno_path` are ignored.
image_dir: ""
anno_path: ""
file_name: "ssd"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The local cache location of the checkpoint'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling while training, default: False'
num_classes: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "Keep the last keep_checkpoint_max checkpoints"
checkpoint_path: "The directory where checkpoints are saved."
checkpoint_file_path: "The path of the checkpoint file."

View File

@@ -0,0 +1,120 @@
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path/"
device_target: 'Ascend'
checkpoint_path: './checkpoint/'
checkpoint_file_path: 'ssd-500_458.ckpt'
# ==============================================================================
# Training options
model_name: "ssd_vgg16"
img_shape: [300, 300]
num_ssd_boxes: 7308
match_threshold: 0.5
nms_threshold: 0.6
min_score: 0.1
max_boxes: 100
# learning rate settings
lr_init: 0.001
lr_end_rate: 0.001
warmup_epochs: 2
momentum: 0.9
weight_decay: 0.00015
ssd_vgg_bn: False
pretrain_vgg_bn: False
# network
num_default: [3, 6, 6, 6, 6, 6]
extras_in_channels: [256, 512, 1024, 512, 256, 256]
extras_out_channels: [512, 1024, 512, 256, 256, 256]
extras_strides: [1, 1, 2, 2, 2, 2]
extras_ratio: [0.2, 0.2, 0.2, 0.25, 0.5, 0.25]
feature_size: [38, 19, 10, 5, 3, 1]
min_scale: 0.2
max_scale: 0.95
aspect_ratios: [[], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]]
steps: [8, 16, 32, 64, 100, 300]
prior_scaling: [0.1, 0.2]
gamma: 2.0
alpha: 0.75
dataset: "coco"
lr: 0.05
mode_sink: "sink"
device_id: 0
device_num: 1
epoch_size: 500
batch_size: 32
loss_scale: 1024
pre_trained: ""
pre_trained_epoch_size: 0
save_checkpoint_epochs: 10
only_create_dataset: False
eval_start_epoch: 40
eval_interval: 1
run_eval: False
filter_weight: False
freeze_layer: None
save_best_ckpt: True
# It is better to use absolute paths for `mindrecord_dir` and `coco_root`.
feature_extractor_base_param: ""
checkpoint_filter_list: ['multi_loc_layers', 'multi_cls_layers']
mindrecord_dir: "MindRecord_COCO"
coco_root: "coco_ori"
train_data_type: "train2017"
val_data_type: "val2017"
instances_set: "annotations/instances_{}.json"
classes: ['background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush']
num_classes: 81
# The annotation.json path of the VOC validation dataset.
voc_json: "annotations/voc_instances_val.json"
# Root directory of the original VOC dataset.
voc_root: "/data/voc_dataset"
# If coco or voc is used, `image_dir` and `anno_path` are ignored.
image_dir: ""
anno_path: ""
file_name: "ssd"
file_format: "AIR"
---
# Help description for each configuration
enable_modelarts: 'Whether training on modelarts, default: False'
data_url: 'Dataset url for obs'
train_url: 'Training output url for obs'
checkpoint_url: 'The location of checkpoint for obs'
data_path: 'Dataset path for local'
output_path: 'Training output path for local'
load_path: 'The local cache location of the checkpoint'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether to enable profiling while training, default: False'
num_classes: 'Number of classes for the dataset'
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
keep_checkpoint_max: "Keep the last keep_checkpoint_max checkpoints"
checkpoint_path: "The directory where checkpoints are saved."
checkpoint_file_path: "The path of the checkpoint file."

View File

@@ -16,8 +16,6 @@
 """Train SSD and get checkpoint files."""
 import os
-import argparse
-import ast
 import mindspore.nn as nn
 from mindspore import context, Tensor
 from mindspore.communication.management import init, get_rank
@@ -28,60 +26,26 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed, dtype
 from src.ssd import SSD300, SsdInferWithDecoder, SSDWithLossCell, TrainingWrapper, ssd_mobilenet_v2,\
     ssd_mobilenet_v1_fpn, ssd_resnet50_fpn, ssd_vgg16
-from src.config import config
 from src.dataset import create_ssd_dataset, create_mindrecord
 from src.lr_schedule import get_lr
 from src.init_params import init_net_param, filter_checkpoint_parameter_by_list
 from src.eval_callback import EvalCallBack
 from src.eval_utils import apply_eval
 from src.box_utils import default_boxes
+from src.model_utils.config import config
+from src.model_utils.moxing_adapter import moxing_wrapper
 set_seed(1)
-def get_args():
-    parser = argparse.ArgumentParser(description="SSD training")
-    parser.add_argument("--run_platform", type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"),
-                        help="run platform, support Ascend, GPU and CPU.")
-    parser.add_argument("--only_create_dataset", type=ast.literal_eval, default=False,
-                        help="If set it true, only create Mindrecord, default is False.")
-    parser.add_argument("--distribute", type=ast.literal_eval, default=False,
-                        help="Run distribute, default is False.")
-    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
-    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
-    parser.add_argument("--lr", type=float, default=0.05, help="Learning rate, default is 0.05.")
-    parser.add_argument("--mode", type=str, default="sink", help="Run sink mode or not, default is sink.")
-    parser.add_argument("--dataset", type=str, default="coco", help="Dataset, default is coco.")
-    parser.add_argument("--epoch_size", type=int, default=500, help="Epoch size, default is 500.")
-    parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
-    parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained Checkpoint file path.")
-    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.")
-    parser.add_argument("--save_checkpoint_epochs", type=int, default=10, help="Save checkpoint epochs, default is 10.")
-    parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
-    parser.add_argument("--filter_weight", type=ast.literal_eval, default=False,
-                        help="Filter head weight parameters, default is False.")
-    parser.add_argument('--freeze_layer', type=str, default="none", choices=["none", "backbone"],
-                        help="freeze the weights of network, support freeze the backbone's weights, "
-                             "default is not freezing.")
-    parser.add_argument("--run_eval", type=ast.literal_eval, default=False,
-                        help="Run evaluation when training, default is False.")
-    parser.add_argument("--save_best_ckpt", type=ast.literal_eval, default=True,
-                        help="Save best checkpoint when run_eval is True, default is True.")
-    parser.add_argument("--eval_start_epoch", type=int, default=40,
-                        help="Evaluation start epoch when run_eval is True, default is 40.")
-    parser.add_argument("--eval_interval", type=int, default=1,
-                        help="Evaluation interval when run_eval is True, default is 1.")
-    args_opt = parser.parse_args()
-    return args_opt
-def ssd_model_build(args_opt):
-    if config.model == "ssd300":
+def ssd_model_build():
+    if config.model_name == "ssd300":
         backbone = ssd_mobilenet_v2()
         ssd = SSD300(backbone=backbone, config=config)
         init_net_param(ssd)
-        if args_opt.freeze_layer == "backbone":
+        if config.freeze_layer == "backbone":
             for param in backbone.feature_1.trainable_params():
                 param.requires_grad = False
-    elif config.model == "ssd_mobilenet_v1_fpn":
+    elif config.model_name == "ssd_mobilenet_v1_fpn":
         ssd = ssd_mobilenet_v1_fpn(config=config)
         init_net_param(ssd)
         if config.feature_extractor_base_param != "":
@@ -90,7 +54,7 @@ def ssd_model_build(args_opt):
             param_dict["network.feature_extractor.mobilenet_v1." + x] = param_dict[x]
             del param_dict[x]
         load_param_into_net(ssd.feature_extractor.mobilenet_v1.network, param_dict)
-    elif config.model == "ssd_resnet50_fpn":
+    elif config.model_name == "ssd_resnet50_fpn":
         ssd = ssd_resnet50_fpn(config=config)
         init_net_param(ssd)
         if config.feature_extractor_base_param != "":
@@ -99,7 +63,7 @@ def ssd_model_build(args_opt):
             param_dict["network.feature_extractor.resnet." + x] = param_dict[x]
             del param_dict[x]
         load_param_into_net(ssd.feature_extractor.resnet, param_dict)
-    elif config.model == "ssd_vgg16":
+    elif config.model_name == "ssd_vgg16":
         ssd = ssd_vgg16(config=config)
         init_net_param(ssd)
         if config.feature_extractor_base_param != "":
@@ -111,76 +75,86 @@ def ssd_model_build(args_opt):
             del param_dict[k + ".weight"]
         load_param_into_net(ssd.backbone, param_dict)
     else:
-        raise ValueError(f'config.model: {config.model} is not supported')
+        raise ValueError(f'config.model: {config.model_name} is not supported')
     return ssd
-def set_graph_kernel_context(run_platform, model):
-    if run_platform == "GPU" and model == "ssd300":
+def set_graph_kernel_context(device_target, model):
+    if device_target == "GPU" and model == "ssd300":
         # Enable graph kernel for default model ssd300 on GPU back-end.
         context.set_context(enable_graph_kernel=True, graph_kernel_flags="--enable_parallel_fusion")
-def main():
-    args_opt = get_args()
+def set_parameter(model_name):
+    if model_name == "ssd_resnet50_fpn":
+        context.set_auto_parallel_context(all_reduce_fusion_config=[90, 183, 279])
+    if model_name == "ssd_vgg16":
+        context.set_auto_parallel_context(all_reduce_fusion_config=[20, 41, 62])
+    else:
+        context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 89])
+@moxing_wrapper()
+def train_net():
+    if hasattr(config, 'num_ssd_boxes') and config.num_ssd_boxes == -1:
+        num = 0
+        h, w = config.img_shape
+        for i in range(len(config.steps)):
+            num += (h // config.steps[i]) * (w // config.steps[i]) * config.num_default[i]
+        config.num_ssd_boxes = num
     rank = 0
     device_num = 1
-    if args_opt.run_platform == "CPU":
+    if config.device_target == "CPU":
         context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
     else:
-        context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.run_platform, device_id=args_opt.device_id)
-        set_graph_kernel_context(args_opt.run_platform, config.model)
-        if args_opt.distribute:
-            device_num = args_opt.device_num
+        context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=config.device_id)
+        set_graph_kernel_context(config.device_target, config.model_name)
+        if config.run_distribute:
+            device_num = config.device_num
             context.reset_auto_parallel_context()
             context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                               device_num=device_num)
             init()
-            if config.model == "ssd_resnet50_fpn":
-                context.set_auto_parallel_context(all_reduce_fusion_config=[90, 183, 279])
-            if config.model == "ssd_vgg16":
-                context.set_auto_parallel_context(all_reduce_fusion_config=[20, 41, 62])
-            else:
-                context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 89])
+            set_parameter(model_name=config.model_name)
             rank = get_rank()
-    mindrecord_file = create_mindrecord(args_opt.dataset, "ssd.mindrecord", True)
-    if args_opt.only_create_dataset:
+    mindrecord_file = create_mindrecord(config.dataset, "ssd.mindrecord", True)
+    if config.only_create_dataset:
         return
-    loss_scale = float(args_opt.loss_scale)
-    if args_opt.run_platform == "CPU":
+    loss_scale = float(config.loss_scale)
+    if config.device_target == "CPU":
         loss_scale = 1.0
     # When creating MindDataset, use the first mindrecord file, such as ssd.mindrecord0.
-    use_multiprocessing = (args_opt.run_platform != "CPU")
-    dataset = create_ssd_dataset(mindrecord_file, repeat_num=1, batch_size=args_opt.batch_size,
+    use_multiprocessing = (config.device_target != "CPU")
+    dataset = create_ssd_dataset(mindrecord_file, repeat_num=1, batch_size=config.batch_size,
                                  device_num=device_num, rank=rank, use_multiprocessing=use_multiprocessing)
     dataset_size = dataset.get_dataset_size()
     print(f"Create dataset done! dataset size is {dataset_size}")
-    ssd = ssd_model_build(args_opt)
-    if ("use_float16" in config and config.use_float16) or args_opt.run_platform == "GPU":
+    ssd = ssd_model_build()
+    if (hasattr(config, 'use_float16') and config.use_float16) or config.device_target == "GPU":
         ssd.to_float(dtype.float16)
     net = SSDWithLossCell(ssd, config)
     # checkpoint
-    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
-    save_ckpt_path = './ckpt_' + str(rank) + '/'
-    ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=save_ckpt_path, config=ckpt_config)
-    if args_opt.pre_trained:
-        param_dict = load_checkpoint(args_opt.pre_trained)
-        if args_opt.filter_weight:
+    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * config.save_checkpoint_epochs)
+    ckpt_save_dir = config.output_path + '/ckpt_{}/'.format(rank)
+    ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=ckpt_save_dir, config=ckpt_config)
+    if config.pre_trained:
+        param_dict = load_checkpoint(config.pre_trained)
+        if config.filter_weight:
             filter_checkpoint_parameter_by_list(param_dict, config.checkpoint_filter_list)
         load_param_into_net(net, param_dict, True)
-    lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
-                       lr_init=config.lr_init, lr_end=config.lr_end_rate * args_opt.lr, lr_max=args_opt.lr,
+    lr = Tensor(get_lr(global_step=config.pre_trained_epoch_size * dataset_size,
+                       lr_init=config.lr_init, lr_end=config.lr_end_rate * config.lr, lr_max=config.lr,
                        warmup_epochs=config.warmup_epochs,
-                       total_epochs=args_opt.epoch_size,
+                       total_epochs=config.epoch_size,
                        steps_per_epoch=dataset_size))
-    if "use_global_norm" in config and config.use_global_norm:
+    if hasattr(config, 'use_global_norm') and config.use_global_norm:
         opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                           config.momentum, config.weight_decay, 1.0)
         net = TrainingWrapper(net, opt, loss_scale, True)
@@ -190,31 +164,31 @@ def main():
         net = TrainingWrapper(net, opt, loss_scale)
     callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
-    if args_opt.run_eval:
+    if config.run_eval:
         eval_net = SsdInferWithDecoder(ssd, Tensor(default_boxes), config)
         eval_net.set_train(False)
-        mindrecord_file = create_mindrecord(args_opt.dataset, "ssd_eval.mindrecord", False)
-        eval_dataset = create_ssd_dataset(mindrecord_file, batch_size=args_opt.batch_size, repeat_num=1,
+        mindrecord_file = create_mindrecord(config.dataset, "ssd_eval.mindrecord", False)
+        eval_dataset = create_ssd_dataset(mindrecord_file, batch_size=config.batch_size, repeat_num=1,
                                           is_training=False, use_multiprocessing=False)
-        if args_opt.dataset == "coco":
+        if config.dataset == "coco":
            anno_json = os.path.join(config.coco_root, config.instances_set.format(config.val_data_type))
-        elif args_opt.dataset == "voc":
+        elif config.dataset == "voc":
            anno_json = os.path.join(config.voc_root, config.voc_json)
         else:
            raise ValueError('SSD eval only support dataset mode is coco and voc!')
         eval_param_dict = {"net": eval_net, "dataset": eval_dataset, "anno_json": anno_json}
-        eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=args_opt.eval_interval,
-                               eval_start_epoch=args_opt.eval_start_epoch, save_best_ckpt=True,
-                               ckpt_directory=save_ckpt_path, besk_ckpt_name="best_map.ckpt",
+        eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=config.eval_interval,
+                               eval_start_epoch=config.eval_start_epoch, save_best_ckpt=True,
+                               ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_map.ckpt",
                                metrics_name="mAP")
         callback.append(eval_cb)
     model = Model(net)
     dataset_sink_mode = False
-    if args_opt.mode == "sink" and args_opt.run_platform != "CPU":
+    if config.mode_sink == "sink" and config.device_target != "CPU":
         print("In sink mode, one epoch return a loss.")
         dataset_sink_mode = True
     print("Start train SSD, the first epoch will be slower because of the graph compilation.")
-    model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
+    model.train(config.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
 if __name__ == '__main__':
-    main()
+    train_net()

View File

@@ -98,7 +98,7 @@ If set `split`=1.0, you should split train dataset and val dataset by directorie
 We provide a script to convert COCO and the Cell_Nuclei dataset used in the [Unet++ original paper](https://arxiv.org/abs/1912.05074) to multi-class dataset format.
-1. Change `cfg_unet` in `src/config.py`, you can refer to `cfg_unet_nested_cell` and `cfg_unet_simple_coco` in `src/config.py` for detail.
+1. Select `*.yaml` in `unet`.
 2. Run the script to convert to multi-class dataset format:
@@ -122,24 +122,24 @@ After installing MindSpore via the official website, you can start training and
 - Select the network and dataset to use
-1. Select `cfg_unet` in `src/config.py`. We support unet and unet++, and we provide some parameter configurations for quick start.
-2. If you want other parameters, please refer to `src/config.py`. You can set `'model'` to `'unet_nested'` or `'unet_simple'` to select which net to use. We support `ISBI` and `Cell_nuclei` two dataset, you can set `'dataset'` to `'Cell_nuclei'` to use `Cell_nuclei` dataset, default is `ISBI`.
+1. Select `*.yaml` in `unet/`. We support unet and unet++, and we provide some parameter configurations for quick start.
+2. If you want other parameters, please refer to `unet/*.yaml`. You can set `'model'` to `'unet_nested'` or `'unet_simple'` to select which net to use. We support the `ISBI` and `Cell_nuclei` datasets; you can set `'dataset'` to `'Cell_nuclei'` to use the `Cell_nuclei` dataset, default is `ISBI`.
 - Run on Ascend
 ```python
 # run training example
-python train.py --data_url=/path/to/data/ > train.log 2>&1 &
+python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
 OR
-bash scripts/run_standalone_train.sh [DATASET]
+bash scripts/run_standalone_train.sh [DATASET] [CONFIG_PATH]
 # run distributed training example
-bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATASET]
+bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATASET] [CONFIG_PATH]
 # run evaluation example
-python eval.py --data_url=/path/to/data/ --ckpt_path=/path/to/checkpoint/ > eval.log 2>&1 &
+python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
 OR
-bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT]
+bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
 ```
 - Run on docker
@@ -178,9 +178,11 @@ If you want to run in modelarts, please check the official documentation of [mod
 # run evaluation on modelarts example
 # (1) Copy or upload your trained model to S3 bucket.
 # (2) Perform a or b.
-#       a. Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
+#       a. Set "enable_modelarts=True" on yaml file.
+#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
 #          Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file.
-#       b. Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
+#       b. Add "enable_modelarts=True" on the website UI interface.
+#          Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
 #          Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
 # (3) Set the config directory to "config_path=/The path of config in S3/"
 # (4) Set the code directory to "/path/unet" on the website UI interface.
@@ -309,9 +311,9 @@ Parameters for both training and evaluation can be set in config.py
 #### running on Ascend
 ```shell
-python train.py --data_url=/path/to/data/ > train.log 2>&1 &
+python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
 OR
-bash scripts/run_standalone_train.sh [DATASET]
+bash scripts/run_standalone_train.sh [DATASET] [CONFIG_PATH]
 ```
 The python command above will run in the background, you can view the results through the file `train.log`.
@@ -338,7 +340,7 @@ The model checkpoint will be saved in the current directory.
 #### Distributed Training
 ```shell
-bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATASET]
+bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATASET] [CONFIG_PATH]
 ```
 The above shell script will run distributed training in the background. You can view the results through the file `logs/device[X]/log.log`. The loss value will be achieved as follows:
@@ -365,9 +367,9 @@ You can add `run_eval` to start shell and set it True, if you want evaluation wh
 Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "username/unet/ckpt_unet_medical_adam-48_600.ckpt".
 ```shell
-python eval.py --data_url=/path/to/data/ --ckpt_path=/path/to/unet.ckpt > eval.log 2>&1 &
+python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
 OR
-bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT]
+bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
 ```
 The above python command will run in the background. You can view the results through the file "eval.log". The accuracy of the test dataset will be as follows:
@@ -412,10 +414,10 @@ If you need to use the trained model to perform inference on multiple hardware p
 Export MindIR
 ```shell
-python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
 ```
-The ckpt_file parameter is required,
+The checkpoint_file_path parameter is required,
 `EXPORT_FORMAT` should be in ["AIR", "MINDIR"]
 Before performing inference, the MINDIR file must be exported by export script on the 910 environment.
@@ -436,11 +438,11 @@ Cross valid dice coeff is: 0.9054352151297033
 #### Continue Training on the Pretrained Model
-Set options `resume` to True in `config.py`, and set `resume_ckpt` to the path of your checkpoint. e.g.
+Set options `resume` to True in `*.yaml`, and set `resume_ckpt` to the path of your checkpoint. e.g.
 ```python
 'resume': True,
-'resume_ckpt': 'ckpt_0/ckpt_unet_sample_adam_1-1_600.ckpt',
+'resume_ckpt': 'ckpt_unet_sample_adam_1-1_600.ckpt',
 'transfer_training': False,
 'filter_weight': ["final.weight"]
 ```
@@ -451,7 +453,7 @@ Do the same thing as resuming training above. In addition, set `transfer_training`
 ```python
 'resume': True,
-'resume_ckpt': 'ckpt_0/ckpt_unet_sample_adam_1-1_600.ckpt',
+'resume_ckpt': 'ckpt_unet_sample_adam_1-1_600.ckpt',
 'transfer_training': True,
 'filter_weight': ["final.weight"]
 ```

View File

@@ -102,7 +102,7 @@ UNet++ is an enhanced version of U-Net, using new skip connections and deep supervision
 We provide a script to convert the COCO and Cell_Nuclei datasets (used in the [Unet++ original paper](https://arxiv.org/abs/1912.05074)) to multi-class format.
-1. Modify `cfg_unet` in `src/config.py`; for details, refer to `cfg_unet_nested_cell` and `cfg_unet_simple_coco` in `src/config.py`.
+1. Select the corresponding yaml file under `src/model_utils/`.
 2. Run the conversion script:
@@ -126,24 +126,24 @@ python preprocess_dataset.py -d /data/save_data_path
 - Select the model and dataset
-1. In `src/config.py`, assign the desired configuration to `cfg_unet`. unet and unet++ are supported, and we provide some network and dataset presets in `src/config.py` for a quick start.
-2. For other parameters, refer to `src/config.py`. Set `'model'` to `'unet_nested'` or `'unet_simple'` to choose the network. The `ISBI` and `Cell_nuclei` datasets are supported; `ISBI` is the default, and setting `'dataset'` to `'Cell_nuclei'` selects the `Cell_nuclei` dataset.
+1. Select the corresponding configuration in `unet/`. unet and unet++ are supported, and we provide some network and dataset presets in `unet/` for a quick start.
+2. For other parameters, refer to the yaml files under `unet/`. Set `'model'` to `'unet_nested'` or `'unet_simple'` to choose the network. The `ISBI` and `Cell_nuclei` datasets are supported; `ISBI` is the default, and setting `'dataset'` to `'Cell_nuclei'` selects the `Cell_nuclei` dataset.
 - Running on Ascend
 ```python
 # training example
-python train.py --data_url=/path/to/data/ > train.log 2>&1 &
+python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
 OR
-bash scripts/run_standalone_train.sh [DATASET]
+bash scripts/run_standalone_train.sh [DATASET] [CONFIG_PATH]
 # distributed training example
-bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATASET]
+bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATASET] [CONFIG_PATH]
 # evaluation example
-python eval.py --data_url=/path/to/data/ --ckpt_path=/path/to/checkpoint/ > eval.log 2>&1 &
+python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
 OR
-bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT]
+bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
 ```
 - Running in Docker
@@ -184,9 +184,11 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
 # example of running model inference on modelarts
 # (1) Upload the trained model to the corresponding location in the S3 bucket.
 # (2) Choose either a or b.
-#       a. Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
+#       a. Set "enable_modelarts=True"
+#          Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" in the yaml file.
 #          Set "checkpoint_url=/The path of checkpoint in S3/" in the yaml file.
-#       b. Add the "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" parameter on the modelarts UI.
+#       b. Add the "enable_modelarts=True" parameter on the modelarts UI.
+#          Add the "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" parameter on the modelarts UI.
 #          Add the "checkpoint_url=/The path of checkpoint in S3/" parameter on the modelarts UI.
 # (3) Set the path of the network configuration file "config_path=/The path of config in S3/"
 # (4) Set the code directory to "/path/unet" on the modelarts UI.
@@ -305,9 +307,9 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
 - Running on Ascend
 ```shell
-python train.py --data_url=/path/to/data/ > train.log 2>&1 &
+python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
 OR
-bash scripts/run_standalone_train.sh [DATASET]
+bash scripts/run_standalone_train.sh [DATASET] [CONFIG_PATH]
 ```
 The python command above runs in the background; you can view the results in the `train.log` file.
@@ -361,9 +363,9 @@ step: 300, loss is 0.18949677, fps is 57.63118508760329
 Before running the command below, check the checkpoint path used for evaluation. Set the checkpoint path to an absolute path, e.g. "username/unet/ckpt_unet_medical_adam-48_600.ckpt".
 ```shell
-python eval.py --data_url=/path/to/data/ --ckpt_path=/path/to/unet.ckpt/ > eval.log 2>&1 &
+python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
 OR
-bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT]
+bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
 ```
 The python command above runs in the background; you can view the results in the "eval.log" file. The accuracy of the test dataset will be as follows:
@@ -408,10 +410,10 @@ step: 300, loss is 0.18949677, fps is 57.63118508760329
 Export the MindIR model
 ```shell
-python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
 ```
-The `ckpt_file` parameter is required, and `EXPORT_FORMAT` must be chosen from ["AIR", "MINDIR"].
+The `checkpoint_file_path` parameter is required, and `EXPORT_FORMAT` must be chosen from ["AIR", "MINDIR"].
 Before running inference, the MINDIR file must be exported via export.py on the Ascend 910 environment.
 Currently, only a batch_size of 1 can be processed.
@@ -435,7 +437,7 @@ Cross valid dice coeff is: 0.9054352151297033
 ```python
 'resume': True,
-'resume_ckpt': 'ckpt_0/ckpt_unet_medical_adam_1-1_600.ckpt',
+'resume_ckpt': 'ckpt_unet_medical_adam_1-1_600.ckpt',
 'transfer_training': False,
 'filter_weight': ["final.weight"]
 ```
@@ -446,7 +448,7 @@ Cross valid dice coeff is: 0.9054352151297033
 ```python
 'resume': True,
-'resume_ckpt': 'ckpt_0/ckpt_unet_medical_adam_1-1_600.ckpt',
+'resume_ckpt': 'ckpt_unet_medical_adam_1-1_600.ckpt',
 'transfer_training': True,
 'filter_weight': ["final.weight"]
 ```

View File

@@ -87,7 +87,7 @@ class CellNucleiDataset:
 if __name__ == '__main__':
-    if config.dataset == "Cell_nuclei":
+    if hasattr(config, 'dataset') and config.dataset == "Cell_nuclei":
         cell_dataset = CellNucleiDataset(config.data_path, 1, config.result_path, False, 0.8)
     else:
         preprocess_dataset(data_dir=config.data_path, cross_valid_ind=config.cross_valid_ind,

View File

@@ -41,12 +41,13 @@ eval_resize: False
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'ckpt_unet_medical_adam-4-75.ckpt'
 rst_path: './result_Files/'
+result_path: ""
 # Export options
 width: 572
 height: 572
-file_name: unet
-file_format: AIR
+file_name: "unet"
+file_format: "AIR"
 ---
 # Help description for each configuration

View File

@@ -45,12 +45,13 @@ eval_resize: False
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'ckpt_unet_nested_adam-4-75.ckpt'
 rst_path: './result_Files/'
+result_path: ""
 # Export options
 width: 572
 height: 572
-file_name: unet
-file_format: AIR
+file_name: "unet"
+file_format: "AIR"
 ---
 # Help description for each configuration

View File

@@ -44,12 +44,13 @@ eval_resize: False
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'ckpt_unet_nested_adam-4-75.ckpt'
 rst_path: './result_Files/'
+result_path: ""
 # Export options
 width: 572
 height: 572
-file_name: unet
-file_format: AIR
+file_name: "unet"
+file_format: "AIR"
 ---
 # Help description for each configuration

View File

@@ -65,12 +65,13 @@ eval_resize: False
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'ckpt_unet_simple_adam-4-75.ckpt'
 rst_path: './result_Files/'
+result_path: ""
 # Export options
 width: 572
 height: 572
-file_name: unet
-file_format: AIR
+file_name: "unet"
+file_format: "AIR"
 ---
 # Help description for each configuration

View File

@@ -41,12 +41,13 @@ eval_resize: False
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'ckpt_unet_simple_adam-4-75.ckpt'
 rst_path: './result_Files/'
+result_path: ""
 # Export options
 width: 572
 height: 572
-file_name: unet
-file_format: AIR
+file_name: "unet"
+file_format: "AIR"
 ---
 # Help description for each configuration

View File

@@ -65,20 +65,20 @@ python ./src/convert_nifti.py --input_path=/path/to/input_image/ --output_path=/
 ```
-Refer to `src/config.py`. We support some parameter configurations for quick start.
+Refer to `default_config.yaml`. We support some parameter configurations for quick start.
 - Run on Ascend
 ```python
 # run training example
-python train.py --data_url=/path/to/data/ --seg_url=/path/to/segment/ > train.log 2>&1 &
+python train.py --data_path=/path/to/data/ > train.log 2>&1 &
 # run distributed training example
-bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [IMAGE_PATH] [SEG_PATH]
+bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
 # run evaluation example
-python eval.py --data_url=/path/to/data/ --seg_url=/path/to/segment/ --ckpt_path=/path/to/checkpoint/ > eval.log 2>&1 &
+python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ > eval.log 2>&1 &
 ```
@ -92,22 +92,22 @@ If you want to run in modelarts, please check the official documentation of [mod
 # b. Add "enable_modelarts=True" on the website UI interface.
 # Add other parameters on the website UI interface.
 # (2) Download nibabel and set pip-requirements.txt to code directory
-# (3) Set the config directory to "config_path=/The path of config in S3/"
-# (4) Set the code directory to "/path/unet" on the website UI interface.
-# (5) Set the startup file to "train.py" on the website UI interface.
-# (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
-# (7) Create your job.
+# (3) Set the code directory to "/path/unet3d" on the website UI interface.
+# (4) Set the startup file to "train.py" on the website UI interface.
+# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
+# (6) Create your job.
 # run evaluation on modelarts example
 # (1) Copy or upload your trained model to S3 bucket.
 # (2) Perform a or b.
-# a. Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
+# a. Set "enable_modelarts=True" on yaml file.
+# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
 # Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file.
-# b. Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
+# b. Add "enable_modelarts=True" on the website UI interface.
+# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
 # Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
 # (3) Download nibabel and set pip-requirements.txt to code directory
-# (4) Set the config directory to "config_path=/The path of config in S3/"
-# (5) Set the code directory to "/path/unet" on the website UI interface.
+# (5) Set the code directory to "/path/unet3d" on the website UI interface.
 # (6) Set the startup file to "eval.py" on the website UI interface.
 # (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
 # (8) Create your job.
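The `/cache/...` paths above follow ModelArts' convention of staging S3/OBS objects on job-local disk before the startup file runs. A sketch of that staging step, assuming the `moxing` SDK preinstalled on ModelArts (the bucket paths are placeholders):

```python
# Sketch: what enable_modelarts=True typically triggers before training.
# `moxing` is only available inside ModelArts jobs.
import moxing as mox

def sync_from_obs(data_url, ckpt_url=None):
    mox.file.copy_parallel(data_url, "/cache/data")                 # dataset
    if ckpt_url:
        mox.file.copy_parallel(ckpt_url, "/cache/checkpoint_path")  # checkpoint

sync_from_obs("obs://your-bucket/unet3d/data", "obs://your-bucket/unet3d/ckpt")
```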
@@ -128,7 +128,7 @@ If you want to run in modelarts, please check the official documentation of [mod
 │ ├──run_standalone_eval.sh // shell script for evaluation on Ascend
 ├── src
 │ ├──dataset.py // creating dataset
-  ├──lr_schedule.py // learning rate scheduler
+│ ├──lr_schedule.py // learning rate scheduler
 │ ├──transform.py // handle dataset
 │ ├──convert_nifti.py // convert dataset
 │ ├──loss.py // loss
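`lr_schedule.py`, whose tree entry is re-aligned above, builds the per-step learning-rate list that MindSpore optimizers accept. The real unet3d schedule may differ; this is a generic warmup-plus-cosine sketch:

```python
# Sketch: per-step learning rates of the kind lr_schedule.py generates.
import math

def get_lr(base_lr, total_epochs, steps_per_epoch, warmup_epochs=0):
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for step in range(total_steps):
        if step < warmup_steps:
            lr = base_lr * (step + 1) / max(warmup_steps, 1)      # linear warmup
        else:
            p = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
            lr = base_lr * 0.5 * (1.0 + math.cos(math.pi * p))    # cosine decay
        lr_each_step.append(lr)
    return lr_each_step

lrs = get_lr(base_lr=5e-4, total_epochs=10, steps_per_epoch=110)
```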
@@ -180,8 +180,7 @@ Parameters for both training and evaluation can be set in config.py
 #### running on Ascend
 ```shell
-python train.py --data_url=/path/to/data/ -seg_url=/path/to/segment/ > train.log 2>&1 &
+python train.py --data_path=/path/to/data/ > train.log 2>&1 &
 ```
@@ -205,7 +204,7 @@ epoch time: 1180467.795 ms, per step time: 1380.664 ms
 #### Distributed Training
 > Notes:
 > RANK_TABLE_FILE can refer to [Link](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/distributed_training_ascend.html), and the device_ip can be got as [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). For large models like InceptionV4, it's better to export an external environment variable `export HCCL_CONNECT_TIMEOUT=600` to extend hccl connection checking time from the default 120 seconds to 600 seconds. Otherwise, the connection could time out, since compiling time increases with the growth of model size.
 >
 ```shell
@@ -241,8 +240,7 @@ epoch time: 140476.520 ms, per step time: 1312.865 ms
 Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "username/unet3d/Unet3d-10_110.ckpt".
 ```shell
-python eval.py --data_url=/path/to/data/ --seg_url=/path/to/segment/ --ckpt_path=/path/to/checkpoint/ > eval.log 2>&1 &
+python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ > eval.log 2>&1 &
 ```
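The renamed `--checkpoint_file_path` flag feeds MindSpore's usual restore-then-evaluate sequence. A sketch under assumed module names (`src.unet3d_model` and `create_dataset` are placeholders for whatever `src/` actually exports):

```python
# Sketch: restoring a checkpoint for evaluation.
from mindspore import load_checkpoint, load_param_into_net
from src.unet3d_model import UNet3d        # assumed module/class name
from src.dataset import create_dataset     # assumed helper

net = UNet3d()
param_dict = load_checkpoint("/path/to/checkpoint/Unet3d-10_110.ckpt")
load_param_into_net(net, param_dict)       # copy weights into the live net
net.set_train(False)

eval_ds = create_dataset("/path/to/data/", is_training=False)
for batch in eval_ds.create_dict_iterator():
    logits = net(batch["image"])           # Dice etc. computed downstream
```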

View File

@@ -57,14 +57,16 @@ After installing MindSpore via the official website, you can start training and
 ```python
 # run training example
+# need to set config_path in config.py file and set data_path in yaml file
 python train.py > train.log 2>&1 &
 OR
-sh scripts/run_train.sh
+sh scripts/run_train.sh dataset
 # run evaluation example
+# need to set config_path in config.py file and set data_path, checkpoint_file_path in yaml file
 python eval.py > eval.log 2>&1 &
 OR
-sh scripts/run_eval.sh ckpt_path
+sh scripts/run_eval.sh checkpoint_file_path dataset
 ```
 If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows:
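The `# need to set ...` comments added above describe a two-layer setup: yaml supplies defaults, the command line can override them. A plausible sketch of that precedence — the helper and file name are illustrative, not textcnn's actual `config.py`:

```python
# Sketch: yaml defaults overridden by CLI flags.
import argparse
import yaml

def parse_args(config_path="textcnn_config.yaml"):   # assumed file name
    with open(config_path, "r") as f:
        defaults = yaml.safe_load(f)
    parser = argparse.ArgumentParser(description="textcnn")
    parser.add_argument("--data_path", default=defaults.get("data_path"))
    parser.add_argument("--checkpoint_file_path",
                        default=defaults.get("checkpoint_file_path"))
    return parser.parse_args()

args = parse_args()   # a CLI value wins when given, the yaml value otherwise
```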
@@ -84,9 +86,11 @@ If you want to run in modelarts, please check the official documentation of [mod
 # run evaluation on modelarts example
 # (1) Copy or upload your trained model to S3 bucket.
 # (2) Perform a or b.
-# a. Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
+# a. Set "enable_modelarts=True" on yaml file.
+# Set "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on yaml file.
 # Set "checkpoint_url=/The path of checkpoint in S3/" on yaml file.
-# b. Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
+# b. Add "enable_modelarts=True" on the website UI interface.
+# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
 # Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
 # (3) Set the code directory to "/path/textcnn" on the website UI interface.
 # (4) Set the startup file to "eval.py" on the website UI interface.
@@ -144,16 +148,17 @@ Parameters for both training and evaluation can be set in config.py
 'base_lr': 1e-3                  # The base learning rate
 ```
-For more configuration details, please refer to the script `config.py`.
+For more configuration details, please refer to the script `*.yaml`.
 ## [Training Process](#contents)
 - running on Ascend
 ```python
+# need to set config_path in config.py file and set data_path in yaml file
 python train.py > train.log 2>&1 &
 OR
-sh scripts/run_train.sh
+sh scripts/run_train.sh dataset
 ```
 The python command above will run in the background; you can view the results through the file `train.log`.
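`train.py` ultimately hands control to `Model.train`; `dataset_sink_mode` and `sink_size`, which the system test at the end of this diff patches, are arguments of that call. A generic sketch with placeholder network and dataset constructors:

```python
# Sketch: the Model.train call wrapped by the train.py scripts in this diff.
from mindspore import Model, nn
from mindspore.train.callback import LossMonitor
from src.textcnn import TextCNN            # assumed module/class name
from src.dataset import create_dataset     # assumed helper

net = TextCNN()                            # constructor args elided
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
opt = nn.Adam(net.trainable_params(), learning_rate=1e-3)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={"acc"})

train_ds = create_dataset("/cache/data", is_training=True)
# sink_size caps the steps pushed to device per epoch when sinking is on
model.train(5, train_ds, callbacks=[LossMonitor()],
            dataset_sink_mode=True, sink_size=20)
```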
@@ -176,9 +181,10 @@ For more configuration details, please refer the script `config.py`.
 Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "username/textcnn/ckpt/train_textcnn.ckpt".
 ```python
-python eval.py --checkpoint_path=ckpt_path > eval.log 2>&1 &
+# need to set config_path in config.py file and set data_path, checkpoint_file_path in yaml file
+python eval.py > eval.log 2>&1 &
 OR
-sh scripts/run_eval.sh ckpt_path
+sh scripts/run_eval.sh checkpoint_file_path dataset
 ```
 The above python command will run in the background. You can view the results through the file "eval.log". The accuracy of the test dataset will be as follows:
@@ -191,7 +197,7 @@ For more configuration details, please refer the script `config.py`.
 ## [Export MindIR](#contents)
 ```shell
-python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --checkpoint_file_path [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
 ```
 The ckpt_file parameter is required,
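With the flag renamed to `--checkpoint_file_path`, an `export.py` of roughly this shape is implied. A sketch only — the real script also reads the yaml config, and the dummy input shape is an assumption:

```python
# Sketch: the export.py invocation above, end to end.
import argparse
import numpy as np
from mindspore import Tensor, export, load_checkpoint, load_param_into_net
from src.textcnn import TextCNN            # assumed module/class name

parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_file_path", required=True)
parser.add_argument("--file_name", default="textcnn")
parser.add_argument("--file_format", default="MINDIR",
                    choices=["AIR", "MINDIR", "ONNX"])
args = parser.parse_args()

net = TextCNN()                            # constructor args elided
load_param_into_net(net, load_checkpoint(args.checkpoint_file_path))
dummy = Tensor(np.ones([1, 51], np.int32)) # assumed batch=1, 51 token ids
export(net, dummy, file_name=args.file_name, file_format=args.file_format)
```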

View File

@@ -11,6 +11,7 @@ data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path/"
 device_num: 1
+device_id: 0
 device_target: 'Ascend'
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'suqeezenet_cifar10-120_195.ckpt'
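The new `device_id: 0` key pairs with `device_target` when the runtime context is initialized; single-card runs pick the NPU explicitly, while the distributed launch scripts override it per process. A sketch of how such keys are typically consumed (`config` is the loaded yaml namespace):

```python
# Sketch: consuming device_target / device_id from the yaml above.
from mindspore import context

def set_runtime_context(config):
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=config.device_target)
    if config.device_target == "Ascend":
        context.set_context(device_id=config.device_id)
```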

View File

@@ -11,6 +11,7 @@ data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path/"
 device_num: 1
+device_id: 0
 device_target: 'Ascend'
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'suqeezenet_imagenet-200_5004.ckpt'

View File

@@ -11,6 +11,7 @@ data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path/"
 device_num: 1
+device_id: 0
 device_target: 'Ascend'
 checkpoint_path: './checkpoint/'
 checkpoint_file_path: 'suqeezenet_residual_imagenet-300_5004.ckpt'

View File

@@ -80,7 +80,7 @@ python train.py
 sh run_distribute_train_ghostnet.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] [RANK_TABLE_FILE]
 # run eval on Ascend
-python eval.py --device_id 0 --dataset coco --checkpoint_path LOG4/ssd-500_458.ckpt
+python eval.py --device_id 0 --dataset coco --checkpoint_file_path LOG4/ssd-500_458.ckpt
 ```
 If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows:

View File

@@ -30,22 +30,16 @@ def test_SSD_mobilenet_v1_fpn_coco2017():
     utils.copy_files(model_path, cur_path, model_name)
     cur_model_path = os.path.join(cur_path, model_name)
-    old_list = ["/data/MindRecord_COCO",
-                "/ckpt/mobilenet_v1.ckpt",
-                "/data/coco2017"]
-    new_list = [os.path.join(utils.data_root, "coco/coco2017/mindrecord_train/ssd_mindrecord"),
-                os.path.join(utils.ckpt_root, "ssd_mobilenet_v1/mobilenet-v1.ckpt"),
-                os.path.join(utils.data_root, "coco/coco2017")]
-    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/config_ssd_mobilenet_v1_fpn.py"))
-    old_list = ["ssd300"]
-    new_list = ["ssd_mobilenet_v1_fpn"]
-    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "src/config.py"))
-    old_list = ["args_opt.epoch_size", "dataset_sink_mode=dataset_sink_mode"]
+    old_list = ["/cache/data", "MindRecord_COCO", "coco_ori", "/ckpt/mobilenet_v1.ckpt"]
+    new_list = [os.path.join(utils.data_root, "coco/coco2017"), "mindrecord_train/ssd_mindrecord", ".",
+                os.path.join(utils.ckpt_root, "ssd_mobilenet_v1/mobilenet-v1.ckpt")]
+    utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "ssd_mobilenet_v1_fpn_config.yaml"))
+    old_list = ["config.epoch_size", "dataset_sink_mode=dataset_sink_mode"]
     new_list = ["5", "dataset_sink_mode=dataset_sink_mode, sink_size=20"]
     utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
-    exec_network_shell = "cd {0}; sh -x scripts/run_distribute_train.sh 8 {1} 0.2 coco {2}"\
-        .format(model_name, 60, utils.rank_table_path)
+    exec_network_shell = "cd {0}; sh -x scripts/run_distribute_train.sh 8 {1} 0.2 coco \
+        {2} ssd_mobilenet_v1_fpn_config.yaml".format(model_name, 60, utils.rank_table_path)
     os.system(exec_network_shell)
     cmd = "ps -ef | grep train.py | grep coco | grep device_num | grep device_id | grep -v grep"
     ret = utils.process_check(120, cmd)
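Everything in this test is driven through `utils.exec_sed_command`, a sed-style in-place substitution whose implementation is not part of this diff. A faithful-enough sketch of what such a helper does:

```python
# Sketch: a plausible exec_sed_command — positional old->new replacement,
# applied in place. The real utils module may differ.
def exec_sed_command(old_list, new_list, file_path):
    assert len(old_list) == len(new_list)
    with open(file_path, "r") as f:
        content = f.read()
    for old, new in zip(old_list, new_list):
        content = content.replace(old, new)
    with open(file_path, "w") as f:
        f.write(content)

# e.g. point the yaml at the CI dataset root before launching training
exec_sed_command(["/cache/data"], ["/ci/data/coco2017"],
                 "ssd_mobilenet_v1_fpn_config.yaml")
```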