remove unused code in quant train

chenzupeng 2020-06-13 19:23:30 +08:00
parent e21a0aad69
commit 52a90f2587
10 changed files with 14 additions and 266 deletions

View File

@@ -67,7 +67,7 @@ Dataset used: imagenet
```
# training example
-Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/
+Ascend: sh run_train.sh Ascend 4 192.168.0.1 0,1,2,3 ~/imagenet/train/ ~/mobilenet.ckpt
```
### Result
@@ -104,156 +104,6 @@ Inference result will be stored in the example path, you can find result like th
result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt
```
# Model description
## Performance
### Training Performance
<table>
<thead>
<tr>
<th>Parameters</th>
<th>MobilenetV2</th>
<th>MobilenetV2 Quant</th>
</tr>
</thead>
<tbody>
<tr>
<td>Resource</td>
<td>Ascend 910 <br />
cpu:2.60GHz 56cores <br />
memory:314G</td>
<td>Ascend 910 <br />
cpu:2.60GHz 56cores <br />
memory:314G</td>
</tr>
<tr>
<td>uploaded Date</td>
<td>05/06/2020</td>
<td>06/12/2020</td>
</tr>
<tr>
<td>MindSpore Version</td>
<td>0.3.0</td>
<td>0.3.0</td>
</tr>
<tr>
<td>Dataset</td>
<td>ImageNet</td>
<td>ImageNet</td>
</tr>
<tr>
<td>Training Parameters</td>
<td>src/config.py</td>
<td>src/config.py</td>
</tr>
<tr>
<td>Optimizer</td>
<td>Momentum</td>
<td>Momentum</td>
</tr>
<tr>
<td>Loss Function</td>
<td>CrossEntropyWithLabelSmooth</td>
<td>CrossEntropyWithLabelSmooth</td>
</tr>
<tr>
<td>Loss</td>
<td>200 epoch:1.913</td>
<td>50 epoch:1.912</td>
</tr>
<tr>
<td>Train Accuracy</td>
<td>ACC1[77.09%] ACC5[92.57%]</td>
<td>ACC1[77.09%] ACC5[92.57%]</td>
</tr>
<tr>
<td>Eval Accuracy</td>
<td>ACC1[77.09%] ACC5[92.57%]</td>
<td>ACC1[77.09%] ACC5[92.57%]</td>
</tr>
<tr>
<td>Total time</td>
<td>48h</td>
<td>12h</td>
</tr>
<tr>
<td>Checkpoint</td>
<td>/</td>
<td>mobilenetv2.ckpt</td>
</tr>
</tbody>
</table>
#### Inference Performance
<table>
<thead>
<tr>
<th>Parameters</th>
<th>Ascend 910</th>
<th>Ascend 310</th>
<th>Nvidia V100</th>
</tr>
</thead>
<tbody>
<tr>
<td>uploaded Date</td>
<td>06/12/2020</td>
<td></td>
<td></td>
</tr>
<tr>
<td>MindSpore Version</td>
<td>0.3.0</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Dataset</td>
<td>ImageNet, 1.2W</td>
<td></td>
<td></td>
</tr>
<tr>
<td>batch_size</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>outputs</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Accuracy</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Speed</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Total time</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Model for inference</td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>
# ModelZoo Homepage
[Link](https://gitee.com/mindspore/mindspore/tree/master/mindspore/model_zoo)

View File

@@ -35,20 +35,19 @@ fi
# set environment
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-export PYTHONPATH=${BASEPATH}:$PYTHONPATH
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1
-if [ -d "eval" ];
+if [ -d "../eval" ];
then
rm -rf ../eval
fi
mkdir ../eval
cd ../eval || exit
-# luanch
+# launch
python ${BASEPATH}/../eval.py \
--platform=$1 \
--dataset_path=$2 \
--checkpoint_path=$3 \
-&> ../infer.log & # dataset val folder path
+&> infer.log & # dataset val folder path

View File

@@ -30,7 +30,7 @@ run_ascend()
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-if [ -d "train" ];
+if [ -d "../train" ];
then
rm -rf ../train
fi
@@ -43,39 +43,7 @@ run_ascend()
--training_script=${BASEPATH}/../train.py \
--dataset_path=$5 \
--pre_trained=$6 \
---platform=$1 &> ../train.log & # dataset train folder
+--platform=$1 &> train.log & # dataset train folder
-}
-run_gpu()
-{
-if [ $2 -lt 1 ] && [ $2 -gt 8 ]
-then
-echo "error: DEVICE_NUM=$2 is not in (1-8)"
-exit 1
-fi
-if [ ! -d $4 ]
-then
-echo "error: DATASET_PATH=$4 is not a directory"
-exit 1
-fi
-BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-if [ -d "train" ];
-then
-rm -rf ../train
-fi
-mkdir ../train
-cd ../train || exit
-export CUDA_VISIBLE_DEVICES="$3"
-mpirun -n $2 --allow-run-as-root \
-python ${BASEPATH}/../train.py \
---dataset_path=$4 \
---platform=$1 \
---pre_trained=$5 \
-&> ../train.log & # dataset train folder
}
if [ $# -gt 6 ] || [ $# -lt 4 ]

View File

@@ -35,21 +35,3 @@ config_ascend = ed({
"keep_checkpoint_max": 200,
"save_checkpoint_path": "./checkpoint",
})
config_gpu = ed({
"num_classes": 1000,
"image_height": 224,
"image_width": 224,
"batch_size": 64,
"epoch_size": 200,
"warmup_epochs": 4,
"lr": 0.5,
"momentum": 0.9,
"weight_decay": 4e-5,
"label_smooth": 0.1,
"loss_scale": 1024,
"save_checkpoint": True,
"save_checkpoint_epochs": 1,
"keep_checkpoint_max": 200,
"save_checkpoint_path": "./checkpoint",
})

View File

@@ -41,17 +41,10 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
if rank_size == 1:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
else:
-ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
+ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=False,
num_shards=rank_size, shard_id=rank_id)
else:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=False)
-elif platform == "GPU":
-if do_train:
-from mindspore.communication.management import get_rank, get_group_size
-ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
-num_shards=get_group_size(), shard_id=get_rank())
-else:
-ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=False)
else:
raise ValueError("Unsupport platform.")
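Note: after this change only the Ascend path remains, and the distributed-training case shards the dataset with `shuffle=False`. A minimal standalone sketch of the surviving branch logic is below; the helper name `make_imagenet_ds` and its default arguments are illustrative only (the real function is the `create_dataset` shown above, whose full signature is truncated in the hunk header), and the `de` alias is assumed to be the `mindspore.dataset.engine` import not visible in this hunk.

```python
import mindspore.dataset.engine as de  # assumed import; not shown in the hunk

def make_imagenet_ds(dataset_path, do_train, rank_size=1, rank_id=0):
    """Sketch of the Ascend-only branch kept by this commit."""
    if do_train:
        if rank_size == 1:
            # single-device training: shuffle the whole dataset
            return de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
        # distributed training: shard per rank, with shuffling disabled as in the diff
        return de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=False,
                                       num_shards=rank_size, shard_id=rank_id)
    # evaluation: keep sample order stable
    return de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=False)
```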

View File

@@ -18,6 +18,7 @@ import sys
import json
import subprocess
import shutil
+import platform
from argparse import ArgumentParser
@@ -80,7 +81,8 @@ def main():
device_ips[device_id] = device_ip
print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
hccn_table = {}
-hccn_table['board_id'] = '0x0020'
+arch = platform.processor()
+hccn_table['board_id'] = {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch]
hccn_table['chip_info'] = '910'
hccn_table['deploy_mode'] = 'lab'
hccn_table['group_count'] = '1'
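The replacement `board_id` line keys the value off the host CPU architecture reported by Python's standard `platform` module. A minimal standalone sketch of that lookup, using only the values visible in the diff (the surrounding fields are copied from the context lines; everything else here is illustrative):

```python
import platform

# platform.processor() commonly reports 'aarch64' on ARM hosts and
# 'x86_64' on x86 hosts; any other value would raise a KeyError in
# this simple lookup.
arch = platform.processor()
board_id = {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch]

hccn_table = {
    'board_id': board_id,   # architecture-dependent, per the diff
    'chip_info': '910',
    'deploy_mode': 'lab',
    'group_count': '1',
}
print(hccn_table['board_id'])
```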

View File

@@ -21,7 +21,6 @@ import numpy as np
from mindspore import context
from mindspore import Tensor
from mindspore import nn
-from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.nn.loss.loss import _Loss
@@ -57,9 +56,6 @@ if args_opt.platform == "Ascend":
context.set_context(mode=context.GRAPH_MODE,
device_target="Ascend",
device_id=device_id, save_graphs=False)
-elif args_opt.platform == "GPU":
-context.set_context(mode=context.GRAPH_MODE,
-device_target="GPU", save_graphs=False)
else:
raise ValueError("Unsupport platform.")
@@ -191,7 +187,6 @@ if __name__ == '__main__':
if run_distribute:
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL,
parameter_broadcast=True, mirror_mean=True)
-auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()
epoch_size = config_ascend.epoch_size

View File

@@ -15,8 +15,7 @@
# ============================================================================
if [ $# != 3 ]
then
-echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH] \
-GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]"
+echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]"
exit 1
fi

View File

@@ -46,51 +46,16 @@ run_ascend()
--device_target=$1 &> train.log & # dataset train folder
}
-run_gpu()
-{
-if [ $2 -lt 1 ] && [ $2 -gt 8 ]
-then
-echo "error: DEVICE_NUM=$2 is not in (1-8)"
-exit 1
-fi
-if [ ! -d $4 ]
-then
-echo "error: DATASET_PATH=$4 is not a directory"
-exit 1
-fi
-BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-if [ -d "../train" ];
-then
-rm -rf ../train
-fi
-mkdir ../train
-cd ../train || exit
-export CUDA_VISIBLE_DEVICES="$3"
-mpirun -n $2 --allow-run-as-root \
-python ${BASEPATH}/../train.py \
---dataset_path=$4 \
---platform=$1 \
---pre_trained=$5 \
-&> train.log & # dataset train folder
-}
if [ $# -gt 6 ] || [ $# -lt 4 ]
then
echo "Usage:\n \
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
-GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
"
exit 1
fi
if [ $1 = "Ascend" ] ; then
run_ascend "$@"
-elif [ $1 = "GPU" ] ; then
-run_gpu "$@"
else
echo "not support platform"
fi;

View File

@@ -23,7 +23,7 @@ from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint
-from mindspore.communication.management import init, get_rank, get_group_size
+from mindspore.communication.management import init
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
from models.resnet_quant import resnet50_quant
@@ -57,13 +57,8 @@ if __name__ == '__main__':
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
ckpt_save_dir = config.save_checkpoint_path
-elif target == "GPU":
-context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
-init("nccl")
-context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
-mirror_mean=True)
-ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
+else:
+raise ValueError("Unsupport platform.")
epoch_size = config.epoch_size
net = resnet50_quant(class_num=config.class_num)
net.set_train(True)