1. remove unused variable in resnet.py

2. add relative path support in resnet50 example 3. optimize allreduce split strategy
2020-04-29 11:41:10 +08:00 · 2020-04-29 11:41:10 +08:00 · 615b60d590
parent 3dd369cefa
commit 615b60d590
4 changed files with 34 additions and 13 deletions
--- a/example/resnet50_cifar10/run_distribute_train.sh
+++ b/example/resnet50_cifar10/run_distribute_train.sh
@ -20,22 +20,33 @@ then
 exit 1
 fi

-if [ ! -f $1 ]
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m "$PWD/$1")"
+  fi
+}
+
+PATH1=$(get_real_path "$1")
+PATH2=$(get_real_path "$2")
+
+if [ ! -f "$PATH1" ]
 then 
-    echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
 exit 1
 fi 

-if [ ! -d $2 ]
+if [ ! -d "$PATH2" ]
 then 
-    echo "error: DATASET_PATH=$2 is not a directory"
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
 exit 1
 fi 

 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export MINDSPORE_HCCL_CONFIG_PATH=$PATH1

 for((i=0; i<${DEVICE_NUM}; i++))
 do
@ -48,6 +59,6 @@ do
    cd ./train_parallel$i || exit
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
-    python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$2 &> log &
+    python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
    cd ..
 done
--- a/example/resnet50_cifar10/run_standalone_train.sh
+++ b/example/resnet50_cifar10/run_standalone_train.sh
@ -20,9 +20,19 @@ then
 exit 1
 fi

-if [ ! -d $1 ]
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m "$PWD/$1")"
+  fi
+}
+
+PATH1=$(get_real_path "$1")
+
+if [ ! -d "$PATH1" ]
 then 
-    echo "error: DATASET_PATH=$1 is not a directory"
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
 exit 1
 fi 

@ -41,5 +51,5 @@ cp *.sh ./train
 cd ./train || exit
 echo "start training for device $DEVICE_ID"
 env > env.log
-python train.py --do_train=True --dataset_path=$1 &> log &
+python train.py --do_train=True --dataset_path=$PATH1 &> log &
 cd ..
--- a/example/resnet50_cifar10/train.py
+++ b/example/resnet50_cifar10/train.py
@ -57,12 +57,12 @@ if __name__ == '__main__':
    if not args_opt.do_eval and args_opt.run_distribute:
        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
-        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
+        auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
        init()

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)
-    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
+    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')


    if args_opt.do_train:
--- a/mindspore/model_zoo/resnet.py
+++ b/mindspore/model_zoo/resnet.py
@ -168,7 +168,7 @@ class ResNet(nn.Cell):
        self.conv1 = _conv7x7(3, 64, stride=2)
        self.bn1 = _bn(64)
        self.relu = P.ReLU()
-        self.maxpool = P.MaxPoolWithArgmax(padding="same", ksize=3, strides=2)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")

        self.layer1 = self._make_layer(block,
                                       layer_nums[0],
@ -227,7 +227,7 @@ class ResNet(nn.Cell):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
-        c1, argmax = self.maxpool(x)
+        c1 = self.maxpool(x)

        c2 = self.layer1(c1)
        c3 = self.layer2(c2)