Fix resnext and resnet bugs in model_zoo

This commit is contained in:
lilei 2021-07-07 09:32:58 +08:00
parent 00149771ae
commit cc3138cd81
8 changed files with 30 additions and 16 deletions

View File

@ -380,13 +380,13 @@ Please follow the instructions in the link [GPU-Multi-Host](https://www.mindspor
- Parameter server training Ascend example
```bash
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
```
- Parameter server training GPU example
```bash
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Evaluation while training

View File

@ -16,7 +16,7 @@ checkpoint_file_path: ""
# ==============================================================================
# Training options
optimizer: "Momentum"
optimizer: "Thor"
infer_label: ""
class_num: 1001
batch_size: 32

View File

@ -24,7 +24,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init
from mindspore.communication.management import init, get_rank
from mindspore.common import set_seed
from mindspore.parallel import set_algo_parameters
import mindspore.nn as nn
@ -177,6 +177,18 @@ def run_eval(target, model, ckpt_save_dir, cb):
metrics_name="acc")
cb += [eval_cb]
def set_save_ckpt_dir():
    """Return the directory in which checkpoints are saved.

    The base directory is ``config.output_path`` joined with
    ``config.checkpoint_path``. For distributed runs a per-rank
    ``ckpt_<rank>/`` suffix is appended so each rank gets its own directory;
    the rank comes from ``get_rank_id()`` when ModelArts is enabled and from
    ``get_rank()`` otherwise. Non-distributed runs use the base directory
    unchanged.

    Returns:
        str: the checkpoint save directory.
    """
    base_dir = os.path.join(config.output_path, config.checkpoint_path)
    if not config.run_distribute:
        return base_dir

    rank = get_rank_id() if config.enable_modelarts else get_rank()
    # The suffix is appended by plain string concatenation (not os.path.join),
    # so whether it becomes a sub-directory depends on base_dir ending with a
    # path separator.
    return base_dir + "ckpt_" + str(rank) + "/"
@moxing_wrapper()
def train_net():
"""train net"""
@ -234,8 +246,7 @@ def train_net():
time_cb = TimeMonitor(data_size=step_size)
loss_cb = LossMonitor()
cb = [time_cb, loss_cb]
ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank_id()) + "/"
ckpt_save_dir = set_save_ckpt_dir()
if config.save_checkpoint:
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
keep_checkpoint_max=config.keep_checkpoint_max)

View File

@ -221,7 +221,7 @@ or shell script:
```script
# Evaluation
sh run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH DEVICE_TARGET
sh scripts/run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH DEVICE_TARGET
```
DEVICE_TARGET is Ascend or GPU, default is Ascend.

View File

@ -229,7 +229,7 @@ python eval.py --data_path ~/imagenet/val/ --device_target Ascend --checkpoint_f
```shell
# 评估
sh run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH PLATFORM
sh scripts/run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH PLATFORM
```
PLATFORM is Ascend or GPU, default is Ascend.

View File

@ -127,8 +127,9 @@ def set_graph_kernel_context(device_target):
if device_target == "GPU":
context.set_context(enable_graph_kernel=True)
@moxing_wrapper()
def test(cloud_args=None):
def test():
"""test"""
set_parameters()
context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
@ -150,7 +151,7 @@ def test(cloud_args=None):
if os.path.isdir(config.checkpoint_file_path):
models = list(glob.glob(os.path.join(config.checkpoint_file_path, '*.ckpt')))
print(models)
if config.graph_ckpt:
if config.checkpoint_file_path:
f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
else:
f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])

View File

@ -17,13 +17,11 @@
export DEVICE_ID=$1
DATA_DIR=$2
PATH_CHECKPOINT=$3
PLATFORM=Ascend
if [ $# == 4 ]
then
PLATFORM=$4
fi
PLATFORM=$4
python eval.py \
--checkpoint_file_path=$PATH_CHECKPOINT \
--device_target=$PLATFORM \
--data_path=$DATA_DIR > log.txt 2>&1 &
--data_path=$DATA_DIR \
--device_target=$PLATFORM > log.txt 2>&1 &

View File

@ -65,12 +65,16 @@ def test_resnet50_cifar10_gpu():
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml")
os.system("nvidia-smi")
exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh {} {}" \
.format(dataset_path, config_path)
logger.warning("cmd [{}] is running...".format(exec_network_shell))
os.system(exec_network_shell)
cmd = "ps -ef | grep python | grep train.py | grep -v grep"
ret = utils.process_check(100, cmd)
if not ret:
cmd = "{} | awk -F' ' '{{print $2}}' | xargs kill -9".format(cmd)
os.system(cmd)
assert ret
log_file = os.path.join(cur_model_path, "scripts/train_parallel/log")
pattern = r"per step time: ([\d\.]+) ms"