diff --git a/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train_ascend.sh b/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train_ascend.sh
index 22e08219fcf..1cb0898bc20 100755
--- a/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train_ascend.sh
+++ b/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train_ascend.sh
@@ -46,6 +46,7 @@ exit 1
 fi
 
 ulimit -u unlimited
+export HCCL_CONNECT_TIMEOUT=600
 export DEVICE_NUM=8
 export RANK_SIZE=8
 export RANK_TABLE_FILE=$PATH1
diff --git a/model_zoo/official/cv/inceptionv4/README.md b/model_zoo/official/cv/inceptionv4/README.md
index 6b51515b1f2..2522f842f73 100644
--- a/model_zoo/official/cv/inceptionv4/README.md
+++ b/model_zoo/official/cv/inceptionv4/README.md
@@ -142,12 +142,12 @@ sh scripts/run_standalone_train_ascend.sh DEVICE_ID DATA_DIR
 Training result will be stored in the example path. Checkpoints will be stored at `ckpt_path` by default, and training log will be redirected to `./log.txt` like followings.
 
 ```python
-epoch: 1 step: 1251, loss is 5.861846
-Epoch time: 701416.649, per step time: 560.685
-epoch: 2 step: 1251, loss is 4.295785
-Epoch time: 472524.154, per step time: 377.717
-epoch: 3 step: 1251, loss is 3.691987
-Epoch time: 472505.767, per step time: 377.702
+epoch: 1 step: 1251, loss is 5.4833196
+Epoch time: 520274.060, per step time: 415.887
+epoch: 2 step: 1251, loss is 4.093194
+Epoch time: 288520.628, per step time: 230.632
+epoch: 3 step: 1251, loss is 3.6242008
+Epoch time: 288507.506, per step time: 230.622
 ```
 
 ## [Eval process](#contents)
@@ -201,7 +201,7 @@ metric: {'Loss': 0.9849, 'Top1-Acc':0.7985, 'Top5-Acc':0.9460}
 | Outputs | probability |
 | Loss | 0.98486 |
 | Accuracy (8p) | ACC1[79.85%] ACC5[94.60%] |
-| Total time (8p) | 33h |
+| Total time (8p) | 20h |
 | Params (M) | 153M |
 | Checkpoint for Fine tuning | 2135M |
 | Scripts | [inceptionv4 script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/inceptionv4) |
@@ -225,11 +225,11 @@ metric: {'Loss': 0.9849, 'Top1-Acc':0.7985, 'Top5-Acc':0.9460}
 
 | **Ascend** | train performance |
 | :--------: | :---------------: |
-| 1p | 345 img/s |
+| 1p | 556 img/s |
 
 | **Ascend** | train performance |
 | :--------: | :---------------: |
-| 8p | 2708img/s |
+| 8p | 4430 img/s |
 
 # [Description of Random Situation](#contents)
 
diff --git a/model_zoo/official/cv/inceptionv4/scripts/run_distribute_train_ascend.sh b/model_zoo/official/cv/inceptionv4/scripts/run_distribute_train_ascend.sh
index e21f9517a59..49768b95a64 100644
--- a/model_zoo/official/cv/inceptionv4/scripts/run_distribute_train_ascend.sh
+++ b/model_zoo/official/cv/inceptionv4/scripts/run_distribute_train_ascend.sh
@@ -14,6 +14,7 @@
 # limitations under the License.
 # ============================================================================
+export HCCL_CONNECT_TIMEOUT=600
 export RANK_TABLE_FILE=$1
 DATA_DIR=$2
 export RANK_SIZE=8
diff --git a/model_zoo/official/cv/inceptionv4/src/config.py b/model_zoo/official/cv/inceptionv4/src/config.py
index 699cc94f592..9ece7a578ea 100644
--- a/model_zoo/official/cv/inceptionv4/src/config.py
+++ b/model_zoo/official/cv/inceptionv4/src/config.py
@@ -41,7 +41,4 @@ config_ascend = edict({
     'lr_max': 0.4,
     'warmup_epochs': 1,
     'start_epoch': 1,
-
-    'onnx_filename': 'inceptionv4.onnx',
-    'air_filename': 'inceptionv4.air'
 })
diff --git a/model_zoo/official/cv/inceptionv4/src/inceptionv4.py b/model_zoo/official/cv/inceptionv4/src/inceptionv4.py
index 596a14c29af..15e5d0bee81 100644
--- a/model_zoo/official/cv/inceptionv4/src/inceptionv4.py
+++ b/model_zoo/official/cv/inceptionv4/src/inceptionv4.py
@@ -15,40 +15,15 @@
 """InceptionV4"""
 import mindspore.nn as nn
 from mindspore.ops import operations as P
-from mindspore.common.initializer import Initializer
-
-
-
-class Avginitializer(Initializer):
-    """
-    Initialize the weight to 1/m*n, (m, n) is the shape of kernel.
-    """
-
-
-    def _initialize(self, arr):
-        arr[:] = 0
-        for i in range(arr.shape[0]):
-            for j in range(arr.shape[2]):
-                for k in range(arr.shape[3]):
-                    arr[i][i][j][k] = 1/(arr.shape[2]*arr.shape[3])
-
 
 class Avgpool(nn.Cell):
-    """
-    Average pooling for temporal data.
-
-    Using a custom initializer to turn conv2d into avgpool2d. The weights won't be trained.
-
-    """
-    def __init__(self, channel, kernel_size, stride=1, pad_mode='same'):
+    """Avgpool"""
+    def __init__(self, kernel_size, stride=1, pad_mode='same'):
         super(Avgpool, self).__init__()
-        self.init = Avginitializer()
-        self.conv = nn.Conv2d(channel, channel, kernel_size,
-                              stride=stride, pad_mode=pad_mode, weight_init=self.init)
-        self.conv.set_train(False)
+        self.avg_pool = nn.AvgPool2d(kernel_size=kernel_size, stride=stride, pad_mode=pad_mode)
 
     def construct(self, x):
-        x = self.conv(x)
+        x = self.avg_pool(x)
         return x
 
 
@@ -141,7 +116,7 @@ class InceptionA(nn.Cell):
             Conv2d(96, 96, 3, stride=1, pad_mode='pad', padding=1, has_bias=False)])
         self.branch_3 = nn.SequentialCell([
-            Avgpool(384, kernel_size=3, stride=1, pad_mode='same'),
+            Avgpool(kernel_size=3, stride=1, pad_mode='same'),
             Conv2d(384, 96, 1, stride=1, padding=0, has_bias=False)])
         self.concat = P.Concat(1)
 
@@ -178,7 +153,7 @@ class InceptionB(nn.Cell):
             Conv2d(224, 256, (1, 7), pad_mode='same', stride=1, has_bias=False)
         ])
         self.branch_3 = nn.SequentialCell([
-            Avgpool(in_channels, kernel_size=3, stride=1, pad_mode='same'),
+            Avgpool(kernel_size=3, stride=1, pad_mode='same'),
             Conv2d(in_channels, 128, 1, stride=1, padding=0, has_bias=False)
         ])
         self.concat = P.Concat(1)
@@ -265,7 +240,7 @@ class InceptionC(nn.Cell):
             512, 256, (3, 1), pad_mode='same', stride=1, has_bias=False)
         self.branch_3 = nn.SequentialCell([
-            Avgpool(in_channels, kernel_size=3, stride=1, pad_mode='same'),
+            Avgpool(kernel_size=3, stride=1, pad_mode='same'),
             Conv2d(in_channels, 256, 1, stride=1, padding=0, has_bias=False)
         ])
         self.concat0 = P.Concat(1)
diff --git a/model_zoo/official/cv/unet/scripts/run_distribute_train.sh b/model_zoo/official/cv/unet/scripts/run_distribute_train.sh
index 6ca780fc367..807490db215 100644
--- a/model_zoo/official/cv/unet/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/unet/scripts/run_distribute_train.sh
@@ -26,6 +26,7 @@ then
 exit 1
 fi
 
+export HCCL_CONNECT_TIMEOUT=600
 export RANK_SIZE=8
 for((i=0;i