forked from mindspore-Ecosystem/mindspore
1. remove unused variable in resnet.py
2. add relative path support in resnet50 example
3. optimize allreduce split strategy
This commit is contained in:
parent
3dd369cefa
commit
615b60d590
|
@ -20,22 +20,33 @@ then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f $1 ]
|
get_real_path(){
|
||||||
|
if [ "${1:0:1}" == "/" ]; then
|
||||||
|
echo "$1"
|
||||||
|
else
|
||||||
|
echo "$(realpath -m $PWD/$1)"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
PATH1=$(get_real_path $1)
|
||||||
|
PATH2=$(get_real_path $2)
|
||||||
|
|
||||||
|
if [ ! -f "$PATH1" ]
|
||||||
then
|
then
|
||||||
echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
|
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -d $2 ]
|
if [ ! -d "$PATH2" ]
|
||||||
then
|
then
|
||||||
echo "error: DATASET_PATH=$2 is not a directory"
|
echo "error: DATASET_PATH=$PATH2 is not a directory"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ulimit -u unlimited
|
ulimit -u unlimited
|
||||||
export DEVICE_NUM=8
|
export DEVICE_NUM=8
|
||||||
export RANK_SIZE=8
|
export RANK_SIZE=8
|
||||||
export MINDSPORE_HCCL_CONFIG_PATH=$1
|
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
|
||||||
|
|
||||||
for((i=0; i<${DEVICE_NUM}; i++))
|
for((i=0; i<${DEVICE_NUM}; i++))
|
||||||
do
|
do
|
||||||
|
@ -48,6 +59,6 @@ do
|
||||||
cd ./train_parallel$i || exit
|
cd ./train_parallel$i || exit
|
||||||
echo "start training for rank $RANK_ID, device $DEVICE_ID"
|
echo "start training for rank $RANK_ID, device $DEVICE_ID"
|
||||||
env > env.log
|
env > env.log
|
||||||
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$2 &> log &
|
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
|
||||||
cd ..
|
cd ..
|
||||||
done
|
done
|
||||||
|
|
|
@ -20,9 +20,19 @@ then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -d $1 ]
|
get_real_path(){
|
||||||
|
if [ "${1:0:1}" == "/" ]; then
|
||||||
|
echo "$1"
|
||||||
|
else
|
||||||
|
echo "$(realpath -m $PWD/$1)"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
PATH1=$(get_real_path $1)
|
||||||
|
|
||||||
|
if [ ! -d "$PATH1" ]
|
||||||
then
|
then
|
||||||
echo "error: DATASET_PATH=$1 is not a directory"
|
echo "error: DATASET_PATH=$PATH1 is not a directory"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -41,5 +51,5 @@ cp *.sh ./train
|
||||||
cd ./train || exit
|
cd ./train || exit
|
||||||
echo "start training for device $DEVICE_ID"
|
echo "start training for device $DEVICE_ID"
|
||||||
env > env.log
|
env > env.log
|
||||||
python train.py --do_train=True --dataset_path=$1 &> log &
|
python train.py --do_train=True --dataset_path=$PATH1 &> log &
|
||||||
cd ..
|
cd ..
|
||||||
|
|
|
@ -57,12 +57,12 @@ if __name__ == '__main__':
|
||||||
if not args_opt.do_eval and args_opt.run_distribute:
|
if not args_opt.do_eval and args_opt.run_distribute:
|
||||||
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
|
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||||
mirror_mean=True)
|
mirror_mean=True)
|
||||||
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
|
auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
|
||||||
init()
|
init()
|
||||||
|
|
||||||
epoch_size = config.epoch_size
|
epoch_size = config.epoch_size
|
||||||
net = resnet50(class_num=config.class_num)
|
net = resnet50(class_num=config.class_num)
|
||||||
loss = SoftmaxCrossEntropyWithLogits(sparse=True)
|
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
|
||||||
|
|
||||||
|
|
||||||
if args_opt.do_train:
|
if args_opt.do_train:
|
||||||
|
|
|
@ -168,7 +168,7 @@ class ResNet(nn.Cell):
|
||||||
self.conv1 = _conv7x7(3, 64, stride=2)
|
self.conv1 = _conv7x7(3, 64, stride=2)
|
||||||
self.bn1 = _bn(64)
|
self.bn1 = _bn(64)
|
||||||
self.relu = P.ReLU()
|
self.relu = P.ReLU()
|
||||||
self.maxpool = P.MaxPoolWithArgmax(padding="same", ksize=3, strides=2)
|
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
|
||||||
|
|
||||||
self.layer1 = self._make_layer(block,
|
self.layer1 = self._make_layer(block,
|
||||||
layer_nums[0],
|
layer_nums[0],
|
||||||
|
@ -227,7 +227,7 @@ class ResNet(nn.Cell):
|
||||||
x = self.conv1(x)
|
x = self.conv1(x)
|
||||||
x = self.bn1(x)
|
x = self.bn1(x)
|
||||||
x = self.relu(x)
|
x = self.relu(x)
|
||||||
c1, argmax = self.maxpool(x)
|
c1 = self.maxpool(x)
|
||||||
|
|
||||||
c2 = self.layer1(c1)
|
c2 = self.layer1(c1)
|
||||||
c3 = self.layer2(c2)
|
c3 = self.layer2(c2)
|
||||||
|
|
Loading…
Reference in New Issue