diff --git a/model_zoo/official/cv/xception/README.md b/model_zoo/official/cv/xception/README.md
index aaa81a3bfbe..0ab75ae016f 100644
--- a/model_zoo/official/cv/xception/README.md
+++ b/model_zoo/official/cv/xception/README.md
@@ -81,9 +81,10 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
     ├─config.py                 # parameter configuration
     ├─dataset.py                # data preprocessing
     ├─Xception.py               # network definition
-    ├─CrossEntropySmooth.py     # Customized CrossEntropy loss function
+    ├─loss.py                   # Customized CrossEntropy loss function
     └─lr_generator.py           # learning rate generator
   ├─train.py                    # train net
+  ├─export.py                   # export net
   └─eval.py                     # eval net
 ```
@@ -110,7 +111,6 @@ Major parameters in train.py and config.py are:
 'lr_init': 0.00004              # initiate learning rate
 'lr_max': 0.4                   # max bound of learning rate
 'lr_end': 0.00004               # min bound of learning rate
-"weight_init": 'xavier_uniform' # Weight initialization mode
 ```
 
 ## [Training process](#contents)
@@ -149,13 +149,13 @@ sh scripts/run_standalone_train.sh DEVICE_ID DATA_PATH
 
 ### Result
 
-Training result will be stored in the example path. Checkpoints will be stored at `. /model_0` by default, and training log will be redirected to `log.txt` like followings.
+Training results will be stored in the example path. Checkpoints will be stored at `./ckpt_0` by default, and the training log will be redirected to `log.txt` as follows.
 
 ``` shell
-epoch: [ 0/250], step:[ 1250/ 1251], loss:[4.761/5.613], time:[529.305], lr:[0.400]
-epoch time: 1128662.862, per step time: 902.209, avg loss: 5.609
-epoch: [ 1/250], step:[ 1250/ 1251], loss:[4.164/4.318], time:[503.708], lr:[0.398]
-epoch time: 889163.081, per step time: 710.762, avg loss: 4.312
+epoch: 1 step: 1251, loss is 4.8427444
+epoch time: 701242.350 ms, per step time: 560.545 ms
+epoch: 2 step: 1251, loss is 4.0637593
+epoch time: 598591.422 ms, per step time: 478.490 ms
 ```
 
 ## [Eval process](#contents)
@@ -199,18 +199,19 @@ result: {'Loss': 1.7797744848789312, 'Top_1_Acc': 0.7985777243589743, 'Top_5_Acc
 | -------------------------- | ---------------------------------------------- |
 | Model Version              | Xception                                        |
 | Resource                   | HUAWEI CLOUD Modelarts                          |
-| uploaded Date              | 11/15/2020                                      |
-| MindSpore Version          | 1.0.0                                           |
+| uploaded Date              | 12/10/2020                                      |
+| MindSpore Version          | 1.1.0                                           |
 | Dataset                    | 1200k images                                    |
 | Batch_size                 | 128                                             |
 | Training Parameters        | src/config.py                                   |
 | Optimizer                  | Momentum                                        |
 | Loss Function              | CrossEntropySmooth                              |
 | Loss                       | 1.78                                            |
-| Accuracy (8p)              | Top1[79.9%] Top5[94.9%]                         |
-| Total time (8p)            | 63h                                             |
+| Accuracy (8p)              | Top1[79.8%] Top5[94.8%]                         |
+| Per step time (8p)         | 479 ms/step                                     |
+| Total time (8p)            | 42h                                             |
 | Params (M)                 | 180M                                            |
-| Scripts                    | [Xception script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/Xception) |
+| Scripts                    | [Xception script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/xception) |
 
 #### Inference Performance
 
@@ -231,4 +232,4 @@ In `dataset.py`, we set the seed inside `create_dataset` function. We also use r
 
 # [ModelZoo Homepage](#contents)
 
-Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
\ No newline at end of file
+Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
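The README hunk above adds an `export.py` entry to the file tree, but the script itself is not part of this patch. Below is a minimal sketch of what a MindSpore export script for this model typically looks like; the checkpoint path, output file name, `MINDIR` format, and the `src.Xception` / `src.config` import paths are assumptions, not taken from the patch.

```python
# Hypothetical export.py sketch -- names and paths are illustrative, not from this patch.
import numpy as np
from mindspore import Tensor, context
from mindspore.train.serialization import export, load_checkpoint, load_param_into_net
from src.Xception import xception
from src.config import config

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

# Build the network and load a trained checkpoint (the file name is an example only).
net = xception(class_num=config.class_num)
param_dict = load_checkpoint("Xception-250_1251.ckpt")
load_param_into_net(net, param_dict)
net.set_train(False)

# Xception takes 299x299 RGB inputs; batch size 1 is chosen arbitrarily here.
input_data = Tensor(np.zeros([1, 3, 299, 299], np.float32))
export(net, input_data, file_name="xception", file_format="MINDIR")
```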
diff --git a/model_zoo/official/cv/xception/scripts/run_distribute_train.sh b/model_zoo/official/cv/xception/scripts/run_distribute_train.sh
index 4e0b65055e8..ae19cf20212 100644
--- a/model_zoo/official/cv/xception/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/xception/scripts/run_distribute_train.sh
@@ -27,13 +27,10 @@ echo "avg_core_per_rank" $avg_core_per_rank
 echo "core_gap" $core_gap
 for((i=0;i<RANK_SIZE;i++))
 do
     ...
     env > env.log
-    taskset -c $cmdopt python ../train.py \
+    python ../train.py \
     --is_distributed \
     --device_target=Ascend \
     --dataset_path=$DATA_DIR > log.txt 2>&1 &
diff --git a/model_zoo/official/cv/xception/scripts/run_eval.sh b/model_zoo/official/cv/xception/scripts/run_eval.sh
index 7b407d6f54f..f7e8f8277a4 100644
--- a/model_zoo/official/cv/xception/scripts/run_eval.sh
+++ b/model_zoo/official/cv/xception/scripts/run_eval.sh
@@ -18,8 +18,14 @@ export DEVICE_ID=$1
 DATA_DIR=$2
 PATH_CHECKPOINT=$3
 
-python ./eval.py \
+rm -rf eval_output
+mkdir ./eval_output
+cd ./eval_output || exit
+echo "start evaluating model..."
+
+python ../eval.py \
     --device_target=Ascend \
     --device_id=$DEVICE_ID \
     --checkpoint_path=$PATH_CHECKPOINT \
     --dataset_path=$DATA_DIR > eval.log 2>&1 &
+cd ../
diff --git a/model_zoo/official/cv/xception/scripts/run_standalone_train.sh b/model_zoo/official/cv/xception/scripts/run_standalone_train.sh
index 6d896b8e914..d6f99d189fe 100644
--- a/model_zoo/official/cv/xception/scripts/run_standalone_train.sh
+++ b/model_zoo/official/cv/xception/scripts/run_standalone_train.sh
@@ -16,7 +16,13 @@ export DEVICE_ID=$1
 DATA_DIR=$2
-python ./train.py \
+
+rm -rf train_standalone
+mkdir ./train_standalone
+cd ./train_standalone || exit
+echo "start training standalone on device $DEVICE_ID"
+
+python ../train.py \
     --device_target=Ascend \
     --dataset_path=$DATA_DIR > log.txt 2>&1 &
-
+cd ../
diff --git a/model_zoo/official/cv/xception/src/Xception.py b/model_zoo/official/cv/xception/src/Xception.py
index b2deffb8e15..f9d26a07678 100644
--- a/model_zoo/official/cv/xception/src/Xception.py
+++ b/model_zoo/official/cv/xception/src/Xception.py
@@ -15,15 +15,14 @@
 """Xception."""
 import mindspore.nn as nn
 import mindspore.ops.operations as P
-from src.config import config
 
 class SeparableConv2d(nn.Cell):
     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
         super(SeparableConv2d, self).__init__()
         self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, stride, group=in_channels, pad_mode='pad',
-                               padding=padding, weight_init=config.weight_init)
+                               padding=padding, weight_init='xavier_uniform')
         self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, pad_mode='valid',
-                                   weight_init=config.weight_init)
+                                   weight_init='xavier_uniform')
 
     def construct(self, x):
         x = self.conv1(x)
@@ -37,7 +36,7 @@ class Block(nn.Cell):
         if out_filters != in_filters or strides != 1:
             self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, pad_mode='valid', has_bias=False,
-                                  weight_init=config.weight_init)
+                                  weight_init='xavier_uniform')
             self.skipbn = nn.BatchNorm2d(out_filters, momentum=0.9)
         else:
             self.skip = None
@@ -96,10 +95,10 @@ class Xception(nn.Cell):
         """
         super(Xception, self).__init__()
         self.num_classes = num_classes
-        self.conv1 = nn.Conv2d(3, 32, 3, 2, pad_mode='valid', weight_init=config.weight_init)
+        self.conv1 = nn.Conv2d(3, 32, 3, 2, pad_mode='valid', weight_init='xavier_uniform')
         self.bn1 = nn.BatchNorm2d(32, momentum=0.9)
         self.relu = nn.ReLU()
-        self.conv2 = nn.Conv2d(32, 64, 3, pad_mode='valid', weight_init=config.weight_init)
+        self.conv2 = nn.Conv2d(32, 64, 3, pad_mode='valid', weight_init='xavier_uniform')
         self.bn2 = nn.BatchNorm2d(64, momentum=0.9)
 
         # Entry flow
diff --git a/model_zoo/official/cv/xception/src/config.py b/model_zoo/official/cv/xception/src/config.py
index 2a30b82f173..2b824941047 100644
--- a/model_zoo/official/cv/xception/src/config.py
+++ b/model_zoo/official/cv/xception/src/config.py
@@ -36,6 +36,5 @@ config = ed({
     "label_smooth_factor": 0.1,
     "lr_init": 0.00004,
     "lr_max": 0.4,
-    "lr_end": 0.00004,
-    "weight_init": 'xavier_uniform'
+    "lr_end": 0.00004
 })
diff --git a/model_zoo/official/cv/xception/src/lr_generator.py b/model_zoo/official/cv/xception/src/lr_generator.py
index df640027258..c270538df5f 100644
--- a/model_zoo/official/cv/xception/src/lr_generator.py
+++ b/model_zoo/official/cv/xception/src/lr_generator.py
@@ -17,7 +17,7 @@ import math
 import numpy as np
 
 
-def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
+def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode, global_step=0):
     """
     generate learning rate array
 
@@ -82,6 +82,6 @@ def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch
             lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
         lr_each_step.append(lr)
 
-    lr_each_step = np.array(lr_each_step).astype(np.float32)
+    lr_each_step = np.array(lr_each_step[global_step:]).astype(np.float32)
 
     return lr_each_step
diff --git a/model_zoo/official/cv/xception/train.py b/model_zoo/official/cv/xception/train.py
index dd4ebb6352e..d8e212e5e87 100644
--- a/model_zoo/official/cv/xception/train.py
+++ b/model_zoo/official/cv/xception/train.py
@@ -59,7 +59,8 @@ if __name__ == '__main__':
     else:
         rank = 0
         group_size = 1
-        context.set_context(device_id=0)
+        if os.getenv('DEVICE_ID', "not_set").isdigit():
+            context.set_context(device_id=int(os.getenv('DEVICE_ID')))
 
     # define network
     net = xception(class_num=config.class_num)
@@ -88,7 +89,8 @@ if __name__ == '__main__':
                         warmup_epochs=config.warmup_epochs,
                         total_epochs=config.epoch_size,
                         steps_per_epoch=step_size,
-                        lr_decay_mode=config.lr_decay_mode))
+                        lr_decay_mode=config.lr_decay_mode,
+                        global_step=config.finish_epoch * step_size))
 
     # define optimization
     opt = Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay, config.loss_scale)
@@ -100,7 +102,7 @@ if __name__ == '__main__':
     # define callbacks
     cb = [TimeMonitor(), LossMonitor()]
     if config.save_checkpoint:
-        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'model_' + str(rank) + '/')
+        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
         config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                      keep_checkpoint_max=config.keep_checkpoint_max)
         ckpt_cb = ModelCheckpoint(f"Xception-rank{rank}", directory=save_ckpt_path, config=config_ck)
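A note on the `get_lr` change above: the new `global_step` argument slices the steps that have already been consumed off the front of the learning-rate schedule, which is what `train.py` now relies on when resuming from `config.finish_epoch`. A minimal usage sketch follows, assuming the repository's `src/` package layout; the `steps_per_epoch` value is illustrative.

```python
# Sketch (not part of the patch): how global_step trims the LR schedule when
# training resumes after config.finish_epoch completed epochs.
from src.config import config
from src.lr_generator import get_lr

steps_per_epoch = 1251  # illustrative; matches the sample training log in the README

lr_full = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
                 warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
                 steps_per_epoch=steps_per_epoch, lr_decay_mode=config.lr_decay_mode)

lr_resumed = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
                    warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
                    steps_per_epoch=steps_per_epoch, lr_decay_mode=config.lr_decay_mode,
                    global_step=config.finish_epoch * steps_per_epoch)

# The resumed schedule is simply the tail of the full schedule.
assert len(lr_resumed) == len(lr_full) - config.finish_epoch * steps_per_epoch
assert lr_resumed[0] == lr_full[config.finish_epoch * steps_per_epoch]
```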