forked from mindspore-Ecosystem/mindspore
!1248 support load pretrain ckpt and modify weight initializer
Merge pull request !1248 from meixiaowei/master
This commit is contained in:
commit
1c4b8b14dd
|
@ -46,6 +46,7 @@ Parameters for both training and evaluating can be set in config.py.
|
||||||
"momentum": 0.9, # momentum optimizer
|
"momentum": 0.9, # momentum optimizer
|
||||||
"weight_decay": 1e-4, # weight decay
|
"weight_decay": 1e-4, # weight decay
|
||||||
"epoch_size": 120, # epoch sizes for training
|
"epoch_size": 120, # epoch sizes for training
|
||||||
|
"pretrain_epoch_size": 0, # epoch size of pretrain checkpoint
|
||||||
"buffer_size": 1000, # number of queue size in data preprocessing
|
"buffer_size": 1000, # number of queue size in data preprocessing
|
||||||
"image_height": 224, # image height
|
"image_height": 224, # image height
|
||||||
"image_width": 224, # image width
|
"image_width": 224, # image width
|
||||||
|
@ -68,10 +69,10 @@ Parameters for both training and evaluating can be set in config.py.
|
||||||
|
|
||||||
```
|
```
|
||||||
# distributed training
|
# distributed training
|
||||||
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
|
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)
|
||||||
|
|
||||||
# standalone training
|
# standalone training
|
||||||
sh run_standalone_train.sh [DATASET_PATH]
|
sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Launch
|
#### Launch
|
||||||
|
@ -79,9 +80,15 @@ sh run_standalone_train.sh [DATASET_PATH]
|
||||||
```bash
|
```bash
|
||||||
# distributed training example(8p)
|
# distributed training example(8p)
|
||||||
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
|
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
|
||||||
|
|
||||||
|
# If you want to load a pretrained ckpt file,
|
||||||
|
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./ckpt/pretrained.ckpt
|
||||||
|
|
||||||
# standalone training example(1p)
|
# standalone training example(1p)
|
||||||
sh run_standalone_train.sh dataset/ilsvrc
|
sh run_standalone_train.sh dataset/ilsvrc
|
||||||
|
|
||||||
|
# If you want to load a pretrained ckpt file,
|
||||||
|
sh run_standalone_train.sh dataset/ilsvrc ./ckpt/pretrained.ckpt
|
||||||
```
|
```
|
||||||
|
|
||||||
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
|
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
|
||||||
|
|
|
@ -24,6 +24,7 @@ config = ed({
|
||||||
"momentum": 0.9,
|
"momentum": 0.9,
|
||||||
"weight_decay": 1e-4,
|
"weight_decay": 1e-4,
|
||||||
"epoch_size": 120,
|
"epoch_size": 120,
|
||||||
|
"pretrain_epoch_size": 0,
|
||||||
"buffer_size": 1000,
|
"buffer_size": 1000,
|
||||||
"image_height": 224,
|
"image_height": 224,
|
||||||
"image_width": 224,
|
"image_width": 224,
|
||||||
|
|
|
@ -21,7 +21,7 @@ def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
|
||||||
lr = float(init_lr) + lr_inc * current_step
|
lr = float(init_lr) + lr_inc * current_step
|
||||||
return lr
|
return lr
|
||||||
|
|
||||||
def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
|
def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0):
|
||||||
"""
|
"""
|
||||||
generate learning rate array with cosine
|
generate learning rate array with cosine
|
||||||
|
|
||||||
|
@ -30,6 +30,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
|
||||||
steps_per_epoch(int): steps size of one epoch
|
steps_per_epoch(int): steps size of one epoch
|
||||||
warmup_epochs(int): number of warmup epochs
|
warmup_epochs(int): number of warmup epochs
|
||||||
max_epoch(int): total epochs of training
|
max_epoch(int): total epochs of training
|
||||||
|
global_step(int): the current start index of lr array
|
||||||
Returns:
|
Returns:
|
||||||
np.array, learning rate array
|
np.array, learning rate array
|
||||||
"""
|
"""
|
||||||
|
@ -49,4 +50,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
|
||||||
decayed = linear_decay * cosine_decay + 0.00001
|
decayed = linear_decay * cosine_decay + 0.00001
|
||||||
lr = base_lr * decayed
|
lr = base_lr * decayed
|
||||||
lr_each_step.append(lr)
|
lr_each_step.append(lr)
|
||||||
return np.array(lr_each_step).astype(np.float32)
|
|
||||||
|
lr_each_step = np.array(lr_each_step).astype(np.float32)
|
||||||
|
learning_rate = lr_each_step[global_step:]
|
||||||
|
return learning_rate
|
||||||
|
|
|
@ -14,9 +14,9 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
if [ $# != 2 ]
|
if [ $# != 2 ] && [ $# != 3 ]
|
||||||
then
|
then
|
||||||
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
|
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -31,6 +31,11 @@ PATH1=$(get_real_path $1)
|
||||||
PATH2=$(get_real_path $2)
|
PATH2=$(get_real_path $2)
|
||||||
echo $PATH1
|
echo $PATH1
|
||||||
echo $PATH2
|
echo $PATH2
|
||||||
|
if [ $# == 3 ]
|
||||||
|
then
|
||||||
|
PATH3=$(get_real_path $3)
|
||||||
|
echo $PATH3
|
||||||
|
fi
|
||||||
|
|
||||||
if [ ! -f $PATH1 ]
|
if [ ! -f $PATH1 ]
|
||||||
then
|
then
|
||||||
|
@ -44,6 +49,12 @@ then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ $# == 3 ] && [ ! -f $PATH3 ]
|
||||||
|
then
|
||||||
|
echo "error: PRETRAINED_PATH=$PATH3 is not a file"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
ulimit -u unlimited
|
ulimit -u unlimited
|
||||||
export DEVICE_NUM=8
|
export DEVICE_NUM=8
|
||||||
export RANK_SIZE=8
|
export RANK_SIZE=8
|
||||||
|
@ -61,6 +72,15 @@ do
|
||||||
cd ./train_parallel$i || exit
|
cd ./train_parallel$i || exit
|
||||||
echo "start training for rank $RANK_ID, device $DEVICE_ID"
|
echo "start training for rank $RANK_ID, device $DEVICE_ID"
|
||||||
env > env.log
|
env > env.log
|
||||||
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
|
if [ $# == 2 ]
|
||||||
|
then
|
||||||
|
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $# == 3 ]
|
||||||
|
then
|
||||||
|
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
|
||||||
|
fi
|
||||||
|
|
||||||
cd ..
|
cd ..
|
||||||
done
|
done
|
||||||
|
|
|
@ -14,9 +14,9 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
if [ $# != 1 ]
|
if [ $# != 1 ] && [ $# != 2 ]
|
||||||
then
|
then
|
||||||
echo "Usage: sh run_standalone_train.sh [DATASET_PATH]"
|
echo "Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -29,12 +29,23 @@ get_real_path(){
|
||||||
}
|
}
|
||||||
PATH1=$(get_real_path $1)
|
PATH1=$(get_real_path $1)
|
||||||
echo $PATH1
|
echo $PATH1
|
||||||
|
if [ $# == 2 ]
|
||||||
|
then
|
||||||
|
PATH2=$(get_real_path $2)
|
||||||
|
echo $PATH2
|
||||||
|
fi
|
||||||
|
|
||||||
if [ ! -d $PATH1 ]
|
if [ ! -d $PATH1 ]
|
||||||
then
|
then
|
||||||
echo "error: DATASET_PATH=$PATH1 is not a directory"
|
echo "error: DATASET_PATH=$PATH1 is not a directory"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ $# == 2 ] && [ ! -f $PATH2 ]
|
||||||
|
then
|
||||||
|
echo "error: PRETRAINED_PATH=$PATH2 is not a file"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
ulimit -u unlimited
|
ulimit -u unlimited
|
||||||
export DEVICE_NUM=1
|
export DEVICE_NUM=1
|
||||||
|
@ -52,5 +63,13 @@ cp *.sh ./train
|
||||||
cd ./train || exit
|
cd ./train || exit
|
||||||
echo "start training for device $DEVICE_ID"
|
echo "start training for device $DEVICE_ID"
|
||||||
env > env.log
|
env > env.log
|
||||||
python train.py --do_train=True --dataset_path=$PATH1 &> log &
|
if [ $# == 1 ]
|
||||||
|
then
|
||||||
|
python train.py --do_train=True --dataset_path=$PATH1 &> log &
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $# == 2 ]
|
||||||
|
then
|
||||||
|
python train.py --do_train=True --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
|
||||||
|
fi
|
||||||
cd ..
|
cd ..
|
||||||
|
|
|
@ -44,6 +44,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.')
|
||||||
parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
|
parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
|
||||||
parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
|
parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
|
||||||
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
|
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
|
||||||
|
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
|
||||||
args_opt = parser.parse_args()
|
args_opt = parser.parse_args()
|
||||||
|
|
||||||
device_id = int(os.getenv('DEVICE_ID'))
|
device_id = int(os.getenv('DEVICE_ID'))
|
||||||
|
@ -77,9 +78,13 @@ if __name__ == '__main__':
|
||||||
repeat_num=epoch_size, batch_size=config.batch_size)
|
repeat_num=epoch_size, batch_size=config.batch_size)
|
||||||
step_size = dataset.get_dataset_size()
|
step_size = dataset.get_dataset_size()
|
||||||
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
|
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
|
||||||
|
if args_opt.pre_trained:
|
||||||
|
param_dict = load_checkpoint(args_opt.pre_trained)
|
||||||
|
load_param_into_net(net, param_dict)
|
||||||
|
|
||||||
# learning rate strategy with cosine
|
# learning rate strategy with cosine
|
||||||
lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size))
|
lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, 120,
|
||||||
|
config.pretrain_epoch_size*step_size))
|
||||||
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
|
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
|
||||||
config.weight_decay, config.loss_scale)
|
config.weight_decay, config.loss_scale)
|
||||||
model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False,
|
model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False,
|
||||||
|
|
Loading…
Reference in New Issue