forked from mindspore-Ecosystem/mindspore
support xception lr slice and fix usability problems
This commit is contained in:
parent
b0794bb5f6
commit
806d4d304c
|
@ -81,9 +81,10 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
|
||||||
├─config.py # parameter configuration
|
├─config.py # parameter configuration
|
||||||
├─dataset.py # data preprocessing
|
├─dataset.py # data preprocessing
|
||||||
├─Xception.py # network definition
|
├─Xception.py # network definition
|
||||||
├─CrossEntropySmooth.py # Customized CrossEntropy loss function
|
├─loss.py # Customized CrossEntropy loss function
|
||||||
└─lr_generator.py # learning rate generator
|
└─lr_generator.py # learning rate generator
|
||||||
├─train.py # train net
|
├─train.py # train net
|
||||||
|
├─export.py # export net
|
||||||
└─eval.py # eval net
|
└─eval.py # eval net
|
||||||
|
|
||||||
```
|
```
|
||||||
|
@ -110,7 +111,6 @@ Major parameters in train.py and config.py are:
|
||||||
'lr_init': 0.00004 # initial learning rate
|
'lr_init': 0.00004 # initial learning rate
|
||||||
'lr_max': 0.4 # max bound of learning rate
|
'lr_max': 0.4 # max bound of learning rate
|
||||||
'lr_end': 0.00004 # min bound of learning rate
|
'lr_end': 0.00004 # min bound of learning rate
|
||||||
"weight_init": 'xavier_uniform' # Weight initialization mode
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## [Training process](#contents)
|
## [Training process](#contents)
|
||||||
|
@ -149,13 +149,13 @@ sh scripts/run_standalone_train.sh DEVICE_ID DATA_PATH
|
||||||
|
|
||||||
### Result
|
### Result
|
||||||
|
|
||||||
Training result will be stored in the example path. Checkpoints will be stored at `. /model_0` by default, and training log will be redirected to `log.txt` like followings.
|
Training results will be stored in the example path. Checkpoints will be stored at `./ckpt_0` by default, and the training log will be redirected to `log.txt` as follows.
|
||||||
|
|
||||||
``` shell
|
``` shell
|
||||||
epoch: [ 0/250], step:[ 1250/ 1251], loss:[4.761/5.613], time:[529.305], lr:[0.400]
|
epoch: 1 step: 1251, loss is 4.8427444
|
||||||
epoch time: 1128662.862, per step time: 902.209, avg loss: 5.609
|
epoch time: 701242.350 ms, per step time: 560.545 ms
|
||||||
epoch: [ 1/250], step:[ 1250/ 1251], loss:[4.164/4.318], time:[503.708], lr:[0.398]
|
epoch: 2 step: 1251, loss is 4.0637593
|
||||||
epoch time: 889163.081, per step time: 710.762, avg loss: 4.312
|
epoch time: 598591.422 ms, per step time: 478.490 ms
|
||||||
```
|
```
|
||||||
|
|
||||||
## [Eval process](#contents)
|
## [Eval process](#contents)
|
||||||
|
@ -199,18 +199,19 @@ result: {'Loss': 1.7797744848789312, 'Top_1_Acc': 0.7985777243589743, 'Top_5_Acc
|
||||||
| -------------------------- | ---------------------------------------------- |
|
| -------------------------- | ---------------------------------------------- |
|
||||||
| Model Version | Xception |
|
| Model Version | Xception |
|
||||||
| Resource | HUAWEI CLOUD Modelarts |
|
| Resource | HUAWEI CLOUD Modelarts |
|
||||||
| uploaded Date | 11/15/2020 |
|
| Uploaded Date | 12/10/2020 |
|
||||||
| MindSpore Version | 1.0.0 |
|
| MindSpore Version | 1.1.0 |
|
||||||
| Dataset | 1200k images |
|
| Dataset | 1200k images |
|
||||||
| Batch_size | 128 |
|
| Batch_size | 128 |
|
||||||
| Training Parameters | src/config.py |
|
| Training Parameters | src/config.py |
|
||||||
| Optimizer | Momentum |
|
| Optimizer | Momentum |
|
||||||
| Loss Function | CrossEntropySmooth |
|
| Loss Function | CrossEntropySmooth |
|
||||||
| Loss | 1.78 |
|
| Loss | 1.78 |
|
||||||
| Accuracy (8p) | Top1[79.9%] Top5[94.9%] |
|
| Accuracy (8p) | Top1[79.8%] Top5[94.8%] |
|
||||||
| Total time (8p) | 63h |
|
| Per step time (8p) | 479 ms/step |
|
||||||
|
| Total time (8p) | 42h |
|
||||||
| Params (M) | 180M |
|
| Params (M) | 180M |
|
||||||
| Scripts | [Xception script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/Xception) |
|
| Scripts | [Xception script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/xception) |
|
||||||
|
|
||||||
#### Inference Performance
|
#### Inference Performance
|
||||||
|
|
||||||
|
|
|
@ -27,13 +27,10 @@ echo "avg_core_per_rank" $avg_core_per_rank
|
||||||
echo "core_gap" $core_gap
|
echo "core_gap" $core_gap
|
||||||
for((i=0;i<RANK_SIZE;i++))
|
for((i=0;i<RANK_SIZE;i++))
|
||||||
do
|
do
|
||||||
start=`expr $i \* $avg_core_per_rank`
|
|
||||||
export DEVICE_ID=$i
|
export DEVICE_ID=$i
|
||||||
export RANK_ID=$i
|
export RANK_ID=$i
|
||||||
export DEPLOY_MODE=0
|
export DEPLOY_MODE=0
|
||||||
export GE_USE_STATIC_MEMORY=1
|
export GE_USE_STATIC_MEMORY=1
|
||||||
end=`expr $start \+ $core_gap`
|
|
||||||
cmdopt=$start"-"$end
|
|
||||||
|
|
||||||
rm -rf train_parallel$i
|
rm -rf train_parallel$i
|
||||||
mkdir ./train_parallel$i
|
mkdir ./train_parallel$i
|
||||||
|
@ -42,7 +39,7 @@ do
|
||||||
echo "start training for rank $i, device $DEVICE_ID"
|
echo "start training for rank $i, device $DEVICE_ID"
|
||||||
|
|
||||||
env > env.log
|
env > env.log
|
||||||
taskset -c $cmdopt python ../train.py \
|
python ../train.py \
|
||||||
--is_distributed \
|
--is_distributed \
|
||||||
--device_target=Ascend \
|
--device_target=Ascend \
|
||||||
--dataset_path=$DATA_DIR > log.txt 2>&1 &
|
--dataset_path=$DATA_DIR > log.txt 2>&1 &
|
||||||
|
|
|
@ -18,8 +18,14 @@ export DEVICE_ID=$1
|
||||||
DATA_DIR=$2
|
DATA_DIR=$2
|
||||||
PATH_CHECKPOINT=$3
|
PATH_CHECKPOINT=$3
|
||||||
|
|
||||||
python ./eval.py \
|
rm -rf eval_output
|
||||||
|
mkdir ./eval_output
|
||||||
|
cd ./eval_output || exit
|
||||||
|
echo "start evaluating model..."
|
||||||
|
|
||||||
|
python ../eval.py \
|
||||||
--device_target=Ascend \
|
--device_target=Ascend \
|
||||||
--device_id=$DEVICE_ID \
|
--device_id=$DEVICE_ID \
|
||||||
--checkpoint_path=$PATH_CHECKPOINT \
|
--checkpoint_path=$PATH_CHECKPOINT \
|
||||||
--dataset_path=$DATA_DIR > eval.log 2>&1 &
|
--dataset_path=$DATA_DIR > eval.log 2>&1 &
|
||||||
|
cd ../
|
||||||
|
|
|
@ -16,7 +16,13 @@
|
||||||
|
|
||||||
export DEVICE_ID=$1
|
export DEVICE_ID=$1
|
||||||
DATA_DIR=$2
|
DATA_DIR=$2
|
||||||
python ./train.py \
|
|
||||||
|
rm -rf train_standalone
|
||||||
|
mkdir ./train_standalone
|
||||||
|
cd ./train_standalone || exit
|
||||||
|
echo "start training standalone on device $DEVICE_ID"
|
||||||
|
|
||||||
|
python ../train.py \
|
||||||
--device_target=Ascend \
|
--device_target=Ascend \
|
||||||
--dataset_path=$DATA_DIR > log.txt 2>&1 &
|
--dataset_path=$DATA_DIR > log.txt 2>&1 &
|
||||||
|
cd ../
|
||||||
|
|
|
@ -15,15 +15,14 @@
|
||||||
"""Xception."""
|
"""Xception."""
|
||||||
import mindspore.nn as nn
|
import mindspore.nn as nn
|
||||||
import mindspore.ops.operations as P
|
import mindspore.ops.operations as P
|
||||||
from src.config import config
|
|
||||||
|
|
||||||
class SeparableConv2d(nn.Cell):
|
class SeparableConv2d(nn.Cell):
|
||||||
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
|
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
|
||||||
super(SeparableConv2d, self).__init__()
|
super(SeparableConv2d, self).__init__()
|
||||||
self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, stride, group=in_channels, pad_mode='pad',
|
self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, stride, group=in_channels, pad_mode='pad',
|
||||||
padding=padding, weight_init=config.weight_init)
|
padding=padding, weight_init='xavier_uniform')
|
||||||
self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, pad_mode='valid',
|
self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, pad_mode='valid',
|
||||||
weight_init=config.weight_init)
|
weight_init='xavier_uniform')
|
||||||
|
|
||||||
def construct(self, x):
|
def construct(self, x):
|
||||||
x = self.conv1(x)
|
x = self.conv1(x)
|
||||||
|
@ -37,7 +36,7 @@ class Block(nn.Cell):
|
||||||
|
|
||||||
if out_filters != in_filters or strides != 1:
|
if out_filters != in_filters or strides != 1:
|
||||||
self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, pad_mode='valid', has_bias=False,
|
self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, pad_mode='valid', has_bias=False,
|
||||||
weight_init=config.weight_init)
|
weight_init='xavier_uniform')
|
||||||
self.skipbn = nn.BatchNorm2d(out_filters, momentum=0.9)
|
self.skipbn = nn.BatchNorm2d(out_filters, momentum=0.9)
|
||||||
else:
|
else:
|
||||||
self.skip = None
|
self.skip = None
|
||||||
|
@ -96,10 +95,10 @@ class Xception(nn.Cell):
|
||||||
"""
|
"""
|
||||||
super(Xception, self).__init__()
|
super(Xception, self).__init__()
|
||||||
self.num_classes = num_classes
|
self.num_classes = num_classes
|
||||||
self.conv1 = nn.Conv2d(3, 32, 3, 2, pad_mode='valid', weight_init=config.weight_init)
|
self.conv1 = nn.Conv2d(3, 32, 3, 2, pad_mode='valid', weight_init='xavier_uniform')
|
||||||
self.bn1 = nn.BatchNorm2d(32, momentum=0.9)
|
self.bn1 = nn.BatchNorm2d(32, momentum=0.9)
|
||||||
self.relu = nn.ReLU()
|
self.relu = nn.ReLU()
|
||||||
self.conv2 = nn.Conv2d(32, 64, 3, pad_mode='valid', weight_init=config.weight_init)
|
self.conv2 = nn.Conv2d(32, 64, 3, pad_mode='valid', weight_init='xavier_uniform')
|
||||||
self.bn2 = nn.BatchNorm2d(64, momentum=0.9)
|
self.bn2 = nn.BatchNorm2d(64, momentum=0.9)
|
||||||
|
|
||||||
# Entry flow
|
# Entry flow
|
||||||
|
|
|
@ -36,6 +36,5 @@ config = ed({
|
||||||
"label_smooth_factor": 0.1,
|
"label_smooth_factor": 0.1,
|
||||||
"lr_init": 0.00004,
|
"lr_init": 0.00004,
|
||||||
"lr_max": 0.4,
|
"lr_max": 0.4,
|
||||||
"lr_end": 0.00004,
|
"lr_end": 0.00004
|
||||||
"weight_init": 'xavier_uniform'
|
|
||||||
})
|
})
|
||||||
|
|
|
@ -17,7 +17,7 @@ import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
|
def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode, global_step=0):
|
||||||
"""
|
"""
|
||||||
generate learning rate array
|
generate learning rate array
|
||||||
|
|
||||||
|
@ -82,6 +82,6 @@ def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch
|
||||||
lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
|
lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
|
||||||
lr_each_step.append(lr)
|
lr_each_step.append(lr)
|
||||||
|
|
||||||
lr_each_step = np.array(lr_each_step).astype(np.float32)
|
lr_each_step = np.array(lr_each_step[global_step:]).astype(np.float32)
|
||||||
|
|
||||||
return lr_each_step
|
return lr_each_step
|
||||||
|
|
|
@ -59,7 +59,8 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
rank = 0
|
rank = 0
|
||||||
group_size = 1
|
group_size = 1
|
||||||
context.set_context(device_id=0)
|
if os.getenv('DEVICE_ID', "not_set").isdigit():
|
||||||
|
context.set_context(device_id=int(os.getenv('DEVICE_ID')))
|
||||||
|
|
||||||
# define network
|
# define network
|
||||||
net = xception(class_num=config.class_num)
|
net = xception(class_num=config.class_num)
|
||||||
|
@ -88,7 +89,8 @@ if __name__ == '__main__':
|
||||||
warmup_epochs=config.warmup_epochs,
|
warmup_epochs=config.warmup_epochs,
|
||||||
total_epochs=config.epoch_size,
|
total_epochs=config.epoch_size,
|
||||||
steps_per_epoch=step_size,
|
steps_per_epoch=step_size,
|
||||||
lr_decay_mode=config.lr_decay_mode))
|
lr_decay_mode=config.lr_decay_mode,
|
||||||
|
global_step=config.finish_epoch * step_size))
|
||||||
|
|
||||||
# define optimization
|
# define optimization
|
||||||
opt = Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay, config.loss_scale)
|
opt = Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay, config.loss_scale)
|
||||||
|
@ -100,7 +102,7 @@ if __name__ == '__main__':
|
||||||
# define callbacks
|
# define callbacks
|
||||||
cb = [TimeMonitor(), LossMonitor()]
|
cb = [TimeMonitor(), LossMonitor()]
|
||||||
if config.save_checkpoint:
|
if config.save_checkpoint:
|
||||||
save_ckpt_path = os.path.join(config.save_checkpoint_path, 'model_' + str(rank) + '/')
|
save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
|
||||||
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
|
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
|
||||||
keep_checkpoint_max=config.keep_checkpoint_max)
|
keep_checkpoint_max=config.keep_checkpoint_max)
|
||||||
ckpt_cb = ModelCheckpoint(f"Xception-rank{rank}", directory=save_ckpt_path, config=config_ck)
|
ckpt_cb = ModelCheckpoint(f"Xception-rank{rank}", directory=save_ckpt_path, config=config_ck)
|
||||||
|
|
Loading…
Reference in New Issue