Fix bugs in the model_zoo ResNeXt and ResNet models
This commit is contained in:
parent
00149771ae
commit
cc3138cd81
|
@ -380,13 +380,13 @@ Please follow the instructions in the link [GPU-Multi-Host](https://www.mindspor
|
|||
- Parameter server training Ascend example
|
||||
|
||||
```bash
|
||||
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
|
||||
bash run_parameter_server_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
|
||||
```
|
||||
|
||||
- Parameter server training GPU example
|
||||
|
||||
```bash
|
||||
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
|
||||
bash run_parameter_server_train_gpu.sh [DATASET_PATH] [CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)
|
||||
```
|
||||
|
||||
#### Evaluation while training
|
||||
|
|
|
@ -16,7 +16,7 @@ checkpoint_file_path: ""
|
|||
|
||||
# ==============================================================================
|
||||
# Training options
|
||||
optimizer: "Momentum"
|
||||
optimizer: "Thor"
|
||||
infer_label: ""
|
||||
class_num: 1001
|
||||
batch_size: 32
|
|
@ -24,7 +24,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
|
|||
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
|
||||
from mindspore.train.loss_scale_manager import FixedLossScaleManager
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
from mindspore.communication.management import init
|
||||
from mindspore.communication.management import init, get_rank
|
||||
from mindspore.common import set_seed
|
||||
from mindspore.parallel import set_algo_parameters
|
||||
import mindspore.nn as nn
|
||||
|
@ -177,6 +177,18 @@ def run_eval(target, model, ckpt_save_dir, cb):
|
|||
metrics_name="acc")
|
||||
cb += [eval_cb]
|
||||
|
||||
|
||||
def set_save_ckpt_dir():
    """Build the directory that checkpoints are saved to.

    The base directory is ``config.output_path`` joined with
    ``config.checkpoint_path``. When ``config.run_distribute`` is set, a
    per-rank ``ckpt_<rank>/`` sub-directory is appended; the rank is taken
    from ``get_rank_id()`` if ``config.enable_modelarts`` is set and from
    ``get_rank()`` otherwise.

    Returns:
        str: the checkpoint directory. For distributed runs the path ends
        with a trailing separator; for single-device runs the base
        directory is returned unchanged.
    """
    ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
    if not config.run_distribute:
        return ckpt_save_dir

    rank = get_rank_id() if config.enable_modelarts else get_rank()
    # Join rather than concatenate: plain "+" only yields a real
    # sub-directory when checkpoint_path already ends with a separator
    # (otherwise ".../checkpoint" + "ckpt_0/" becomes ".../checkpointckpt_0/").
    # The empty last component keeps the trailing separator callers got before.
    return os.path.join(ckpt_save_dir, "ckpt_" + str(rank), "")
|
||||
|
||||
|
||||
@moxing_wrapper()
|
||||
def train_net():
|
||||
"""train net"""
|
||||
|
@ -234,8 +246,7 @@ def train_net():
|
|||
time_cb = TimeMonitor(data_size=step_size)
|
||||
loss_cb = LossMonitor()
|
||||
cb = [time_cb, loss_cb]
|
||||
ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path)
|
||||
ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank_id()) + "/"
|
||||
ckpt_save_dir = set_save_ckpt_dir()
|
||||
if config.save_checkpoint:
|
||||
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
|
||||
keep_checkpoint_max=config.keep_checkpoint_max)
|
||||
|
|
|
@ -221,7 +221,7 @@ or shell script:
|
|||
|
||||
```script
|
||||
# Evaluation
|
||||
sh run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH DEVICE_TARGET
|
||||
sh scripts/run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH DEVICE_TARGET
|
||||
```
|
||||
|
||||
DEVICE_TARGET is Ascend or GPU, default is Ascend.
|
||||
|
|
|
@ -229,7 +229,7 @@ python eval.py --data_path ~/imagenet/val/ --device_target Ascend --checkpoint_f
|
|||
|
||||
```shell
|
||||
# 评估
|
||||
sh run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH PLATFORM
|
||||
sh scripts/run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH PLATFORM
|
||||
```
|
||||
|
||||
PLATFORM is Ascend or GPU, default is Ascend.
|
||||
|
|
|
@ -127,8 +127,9 @@ def set_graph_kernel_context(device_target):
|
|||
if device_target == "GPU":
|
||||
context.set_context(enable_graph_kernel=True)
|
||||
|
||||
|
||||
@moxing_wrapper()
|
||||
def test(cloud_args=None):
|
||||
def test():
|
||||
"""test"""
|
||||
set_parameters()
|
||||
context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
|
||||
|
@ -150,7 +151,7 @@ def test(cloud_args=None):
|
|||
if os.path.isdir(config.checkpoint_file_path):
|
||||
models = list(glob.glob(os.path.join(config.checkpoint_file_path, '*.ckpt')))
|
||||
print(models)
|
||||
if config.graph_ckpt:
|
||||
if config.checkpoint_file_path:
|
||||
f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
|
||||
else:
|
||||
f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
|
||||
|
|
|
@ -17,13 +17,11 @@
|
|||
export DEVICE_ID=$1
|
||||
DATA_DIR=$2
|
||||
PATH_CHECKPOINT=$3
|
||||
PLATFORM=Ascend
|
||||
if [ $# == 4 ]
|
||||
then
|
||||
PLATFORM=$4
|
||||
fi
|
||||
PLATFORM=$4
|
||||
|
||||
|
||||
python eval.py \
|
||||
--checkpoint_file_path=$PATH_CHECKPOINT \
|
||||
--device_target=$PLATFORM \
|
||||
--data_path=$DATA_DIR > log.txt 2>&1 &
|
||||
--data_path=$DATA_DIR \
|
||||
--device_target=$PLATFORM > log.txt 2>&1 &
|
||||
|
|
|
@ -65,12 +65,16 @@ def test_resnet50_cifar10_gpu():
|
|||
utils.exec_sed_command(old_list, new_list, os.path.join(cur_model_path, "train.py"))
|
||||
dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
|
||||
config_path = os.path.join(cur_model_path, "resnet50_cifar10_config.yaml")
|
||||
os.system("nvidia-smi")
|
||||
exec_network_shell = "cd resnet/scripts; sh run_distribute_train_gpu.sh {} {}" \
|
||||
.format(dataset_path, config_path)
|
||||
logger.warning("cmd [{}] is running...".format(exec_network_shell))
|
||||
os.system(exec_network_shell)
|
||||
cmd = "ps -ef | grep python | grep train.py | grep -v grep"
|
||||
ret = utils.process_check(100, cmd)
|
||||
if not ret:
|
||||
cmd = "{} | awk -F' ' '{{print $2}}' | xargs kill -9".format(cmd)
|
||||
os.system(cmd)
|
||||
assert ret
|
||||
log_file = os.path.join(cur_model_path, "scripts/train_parallel/log")
|
||||
pattern = r"per step time: ([\d\.]+) ms"
|
||||
|
|
Loading…
Reference in New Issue