From 615b60d59098bb3713875fe23c381cd7e88b422a Mon Sep 17 00:00:00 2001 From: gengdongjie Date: Wed, 29 Apr 2020 11:41:10 +0800 Subject: [PATCH] 1. remove unused variable in resnet.py 2. add relative path support in resnet50 example 3. optimize allreduce split strategy --- .../resnet50_cifar10/run_distribute_train.sh | 23 ++++++++++++++----- .../resnet50_cifar10/run_standalone_train.sh | 16 ++++++++++--- example/resnet50_cifar10/train.py | 4 ++-- mindspore/model_zoo/resnet.py | 4 ++-- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/example/resnet50_cifar10/run_distribute_train.sh b/example/resnet50_cifar10/run_distribute_train.sh index e78e2bf1045..e4bdd775b35 100755 --- a/example/resnet50_cifar10/run_distribute_train.sh +++ b/example/resnet50_cifar10/run_distribute_train.sh @@ -20,22 +20,33 @@ then exit 1 fi -if [ ! -f $1 ] +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) + +if [ ! -f "$PATH1" ] then - echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file" + echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file" exit 1 fi -if [ ! -d $2 ] +if [ ! -d "$PATH2" ] then - echo "error: DATASET_PATH=$2 is not a directory" + echo "error: DATASET_PATH=$PATH2 is not a directory" exit 1 fi ulimit -u unlimited export DEVICE_NUM=8 export RANK_SIZE=8 -export MINDSPORE_HCCL_CONFIG_PATH=$1 +export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 for((i=0; i<${DEVICE_NUM}; i++)) do @@ -48,6 +59,6 @@ do cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" env > env.log - python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$2 &> log & + python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log & cd .. 
done diff --git a/example/resnet50_cifar10/run_standalone_train.sh b/example/resnet50_cifar10/run_standalone_train.sh index 90423630aa5..cb08cde6c94 100755 --- a/example/resnet50_cifar10/run_standalone_train.sh +++ b/example/resnet50_cifar10/run_standalone_train.sh @@ -20,9 +20,19 @@ then exit 1 fi -if [ ! -d $1 ] +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) + +if [ ! -d "$PATH1" ] then - echo "error: DATASET_PATH=$1 is not a directory" + echo "error: DATASET_PATH=$PATH1 is not a directory" exit 1 fi @@ -41,5 +51,5 @@ cp *.sh ./train cd ./train || exit echo "start training for device $DEVICE_ID" env > env.log -python train.py --do_train=True --dataset_path=$1 &> log & +python train.py --do_train=True --dataset_path=$PATH1 &> log & cd .. diff --git a/example/resnet50_cifar10/train.py b/example/resnet50_cifar10/train.py index 0a3ad9dc5a7..c39d1bcf88d 100755 --- a/example/resnet50_cifar10/train.py +++ b/example/resnet50_cifar10/train.py @@ -57,12 +57,12 @@ if __name__ == '__main__': if not args_opt.do_eval and args_opt.run_distribute: context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) - auto_parallel_context().set_all_reduce_fusion_split_indices([140]) + auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160]) init() epoch_size = config.epoch_size net = resnet50(class_num=config.class_num) - loss = SoftmaxCrossEntropyWithLogits(sparse=True) + loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') if args_opt.do_train: diff --git a/mindspore/model_zoo/resnet.py b/mindspore/model_zoo/resnet.py index 3055026718a..001e1db0cf3 100755 --- a/mindspore/model_zoo/resnet.py +++ b/mindspore/model_zoo/resnet.py @@ -168,7 +168,7 @@ class ResNet(nn.Cell): self.conv1 = _conv7x7(3, 64, stride=2) self.bn1 = _bn(64) self.relu = P.ReLU() - self.maxpool = P.MaxPoolWithArgmax(padding="same", 
ksize=3, strides=2) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same") self.layer1 = self._make_layer(block, layer_nums[0], @@ -227,7 +227,7 @@ class ResNet(nn.Cell): x = self.conv1(x) x = self.bn1(x) x = self.relu(x) - c1, argmax = self.maxpool(x) + c1 = self.maxpool(x) c2 = self.layer1(c1) c3 = self.layer2(c2)