forked from mindspore-Ecosystem/mindspore
Add training and evaluation of Transformer on GPU
Add gradient clipping to NASNet training and adjust hyper-parameters
parent fa5c9c1528
commit f27f047f14
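The NASNet change below replaces Model(net, loss_fn, optimizer) with a dedicated train cell, NASNetAMobileTrainOneStepWithClipGradient, whose implementation is not part of this diff. As a rough, hypothetical sketch of such a cell (the clip_norm value and the use of clip_by_global_norm are assumptions, not the repository's code):

import mindspore.nn as nn
import mindspore.ops as ops
from mindspore.ops import composite as C

class TrainOneStepWithClipGradient(nn.TrainOneStepCell):
    """Sketch: one training step that clips gradients before the optimizer update."""

    def __init__(self, network, optimizer, sens=1.0, clip_norm=10.0):
        super().__init__(network, optimizer, sens)
        self.clip_norm = clip_norm                      # illustrative threshold, not from the repo
        self.grad = C.GradOperation(get_by_list=True)   # gradients w.r.t. trainable weights

    def construct(self, *inputs):
        loss = self.network(*inputs)
        grads = self.grad(self.network, self.weights)(*inputs)
        grads = ops.clip_by_global_norm(grads, self.clip_norm)  # clip by global norm
        grads = self.grad_reducer(grads)                # no-op unless running distributed
        self.optimizer(grads)
        return loss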
@@ -23,7 +23,7 @@ nasnet_a_mobile_config_gpu = edict({
     'rank': 0,
     'group_size': 1,
     'work_nums': 8,
-    'epoch_size': 500,
+    'epoch_size': 600,
     'keep_checkpoint_max': 100,
     'ckpt_path': './checkpoint/',
     'is_save_on_master': 0,
@@ -39,7 +39,7 @@ nasnet_a_mobile_config_gpu = edict({

     ### Learning Rate Config
     # 'lr_decay_method': 'exponential',
-    'lr_init': 0.04,
+    'lr_init': 0.04*8,
     'lr_decay_rate': 0.97,
     'num_epoch_per_decay': 2.4,

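The learning-rate keys above describe a per-step exponential decay: starting from lr_init (now scaled by 8, presumably one factor per device in the 8-GPU data-parallel run), the rate is multiplied by lr_decay_rate every num_epoch_per_decay epochs. A minimal sketch consistent with those keys, assuming get_lr builds a per-step array (the actual signature in src/lr_generator.py may differ):

import numpy as np

def get_lr(lr_init, lr_decay_rate, num_epoch_per_decay, total_epochs, steps_per_epoch):
    """Sketch: per-step exponentially decayed learning-rate array."""
    total_steps = int(steps_per_epoch * total_epochs)
    decay_steps = steps_per_epoch * num_epoch_per_decay
    lr_each_step = [lr_init * lr_decay_rate ** (step / decay_steps) for step in range(total_steps)]
    return np.array(lr_each_step, dtype=np.float32)

# With the config above: lr = get_lr(0.04 * 8, 0.97, 2.4, 600, steps_per_epoch)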
@@ -29,7 +29,7 @@ from mindspore.common import dtype as mstype

 from src.config import nasnet_a_mobile_config_gpu as cfg
 from src.dataset import create_dataset
-from src.nasnet_a_mobile import NASNetAMobile, CrossEntropy
+from src.nasnet_a_mobile import NASNetAMobileWithLoss, NASNetAMobileTrainOneStepWithClipGradient
 from src.lr_generator import get_lr

@@ -69,13 +69,10 @@ if __name__ == '__main__':
     batches_per_epoch = dataset.get_dataset_size()

     # network
-    net = NASNetAMobile(cfg.num_classes)
+    net_with_loss = NASNetAMobileWithLoss(cfg)
     if args_opt.resume:
         ckpt = load_checkpoint(args_opt.resume)
-        load_param_into_net(net, ckpt)
+        load_param_into_net(net_with_loss, ckpt)

-    #loss
-    loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes, factor=cfg.aux_factor)
-
     # learning rate schedule
     lr = get_lr(lr_init=cfg.lr_init, lr_decay_rate=cfg.lr_decay_rate,
@@ -88,26 +85,28 @@ if __name__ == '__main__':
         resume = split_result[-2].split("-")
         resume_epoch = int(resume[-1])
         step_num_in_epoch = int(split_result[-1])
-        assert step_num_in_epoch == ds_train.get_dataset_size()\
+        assert step_num_in_epoch == dataset.get_dataset_size()\
             , "This script only supports resuming at the end of epoch"
-        lr = lr[(ds_train.get_dataset_size() * (resume_epoch - 1) + step_num_in_epoch):]
+        lr = lr[(dataset.get_dataset_size() * (resume_epoch - 1) + step_num_in_epoch):]
     lr = Tensor(lr, mstype.float32)

     # optimizer
     decayed_params = []
     no_decayed_params = []
-    for param in net.trainable_params():
+    for param in net_with_loss.trainable_params():
         if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
             decayed_params.append(param)
         else:
             no_decayed_params.append(param)
     group_params = [{'params': decayed_params, 'weight_decay': cfg.weight_decay},
                     {'params': no_decayed_params},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': net_with_loss.trainable_params()}]
     optimizer = RMSProp(group_params, lr, decay=cfg.rmsprop_decay, weight_decay=cfg.weight_decay,
                         momentum=cfg.momentum, epsilon=cfg.opt_eps, loss_scale=cfg.loss_scale)

-    model = Model(net, loss_fn=loss, optimizer=optimizer)
+    net_with_grads = NASNetAMobileTrainOneStepWithClipGradient(net_with_loss, optimizer)
+    net_with_grads.set_train()
+    model = Model(net_with_grads)

     print("============== Starting Training ==============")
     loss_cb = LossMonitor(per_print_times=batches_per_epoch)
@@ -14,7 +14,7 @@
 # ============================================================================
 """Transformer evaluation script."""

-import os
+import argparse
 import numpy as np

 import mindspore.nn as nn
@@ -97,9 +97,14 @@ def run_transformer_eval():
     """
     Transformer evaluation.
     """
-    device_id = int(os.getenv('DEVICE_ID'))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False,
-                        device_id=device_id)
+    parser = argparse.ArgumentParser(description='tranformer')
+    parser.add_argument("--device_target", type=str, default="Ascend",
+                        help="device where the code will be implemented, default is Ascend")
+    parser.add_argument('--device_id', type=int, default=None, help='device id of GPU or Ascend. (Default: None)')
+    args = parser.parse_args()
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, reserve_class_name_in_scope=False,
+                        device_id=args.device_id)

     dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
     tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the scipt as: "
+echo "sh run_distribute_pretrain_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH"
+echo "for example: sh run_distribute_pretrain.sh 8 55 /path/ende-l128-mindrecord00"
+echo "It is better to use absolute path."
+echo "=============================================================================================================="
+
+rm -rf run_distribute_train
+mkdir run_distribute_train
+cp -rf ./src/ train.py ./run_distribute_train
+cd run_distribute_train || exit
+
+export RANK_SIZE=$1
+EPOCH_SIZE=$2
+DATA_PATH=$3
+echo $RANK_SIZE
+
+mpirun -n $RANK_SIZE \
+    python train.py \
+    --distribute="true" \
+    --device_target="GPU" \
+    --epoch_size=$EPOCH_SIZE \
+    --device_num=$RANK_SIZE \
+    --enable_save_ckpt="true" \
+    --enable_lossscale="true" \
+    --do_shuffle="true" \
+    --checkpoint_path="" \
+    --save_checkpoint_steps=2500 \
+    --save_checkpoint_num=30 \
+    --data_path=$DATA_PATH \
+    --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+echo "=============================================================================================================="
+echo "Please run the scipt as: "
+echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID"
+echo "for example: sh run_eval.sh Ascend 0"
+echo "Note: set the checkpoint and dataset path in src/eval_config.py"
+echo "=============================================================================================================="
+
+export DEVICE_TARGET=$1
+DEVICE_ID=$2
+
+python eval.py \
+    --device_target=$DEVICE_TARGET \
+    --device_id=$DEVICE_ID \
@@ -16,8 +16,8 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE DATA_PATH"
-echo "for example: sh run_standalone_train.sh 0 52 /path/ende-l128-mindrecord00"
+echo "sh run_standalone_train.sh DEVICE_TARGET DEVICE_ID EPOCH_SIZE DATA_PATH"
+echo "for example: sh run_standalone_train.sh Ascend 0 52 /path/ende-l128-mindrecord00"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="

@@ -26,13 +26,15 @@ mkdir run_standalone_train
 cp -rf ./src/ train.py ./run_standalone_train
 cd run_standalone_train || exit

-export DEVICE_ID=$1
-EPOCH_SIZE=$2
-DATA_PATH=$3
+export DEVICE_TARGET=$1
+DEVICE_ID=$2
+EPOCH_SIZE=$3
+DATA_PATH=$4

 python train.py \
     --distribute="false" \
     --epoch_size=$EPOCH_SIZE \
+    --device_target=$DEVICE_TARGET \
     --device_id=$DEVICE_ID \
     --enable_save_ckpt="true" \
     --enable_lossscale="true" \
@@ -42,4 +44,4 @@ python train.py \
     --save_checkpoint_num=30 \
     --data_path=$DATA_PATH \
     --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
 cd ..
@@ -23,6 +23,7 @@ cfg = edict({
     'scale_factor': 2,
     'scale_window': 2000,
     'optimizer': 'Adam',
+    'optimizer_adam_beta2': 0.997,
     'lr_schedule': edict({
         'learning_rate': 2.0,
         'warmup_steps': 8000,
@@ -51,6 +52,23 @@ if cfg.transformer_network == 'large':
         input_mask_from_dataset=True,
         dtype=mstype.float32,
         compute_type=mstype.float16)
+    transformer_net_cfg_gpu = TransformerConfig(
+        batch_size=32,
+        seq_length=128,
+        vocab_size=36560,
+        hidden_size=1024,
+        num_hidden_layers=6,
+        num_attention_heads=16,
+        intermediate_size=4096,
+        hidden_act="relu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=128,
+        initializer_range=0.02,
+        label_smoothing=0.1,
+        input_mask_from_dataset=True,
+        dtype=mstype.float32,
+        compute_type=mstype.float16)
 if cfg.transformer_network == 'base':
     transformer_net_cfg = TransformerConfig(
         batch_size=96,
@@ -166,7 +166,7 @@ class TransformerTrainOneStepCell(nn.Cell):
         self.reducer_flag = False
         self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode not in ParallelMode.MODE_LIST:
-            raise ValueError("Parallel mode does not support: ", parallel_mode)
+            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = None
@@ -228,6 +228,12 @@ reciprocal = P.Reciprocal()
 def tensor_grad_scale(scale, grad):
     return grad * F.cast(reciprocal(scale), F.dtype(grad))

+_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
+grad_overflow = P.FloatStatus()
+
+@_grad_overflow.register("Tensor")
+def _tensor_grad_overflow(grad):
+    return grad_overflow(grad)

 class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
     """
@@ -255,7 +261,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):

         self.parallel_mode = _get_parallel_mode()
         if self.parallel_mode not in ParallelMode.MODE_LIST:
-            raise ValueError("Parallel mode does not support: ", parallel_mode)
+            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
         self.grad_reducer = None
@@ -266,9 +272,16 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
         self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
         self.clip_gradients = ClipGradients()
         self.cast = P.Cast()
-        self.alloc_status = P.NPUAllocFloatStatus()
-        self.get_status = P.NPUGetFloatStatus()
-        self.clear_before_grad = P.NPUClearFloatStatus()
+        if context.get_context("device_target") == "GPU":
+            self.gpu_target = True
+            self.float_status = P.FloatStatus()
+            self.addn = P.AddN()
+            self.reshape = P.Reshape()
+        else:
+            self.gpu_target = False
+            self.alloc_status = P.NPUAllocFloatStatus()
+            self.get_status = P.NPUGetFloatStatus()
+            self.clear_before_grad = P.NPUClearFloatStatus()
         self.reduce_sum = P.ReduceSum(keep_dims=False)
         self.depend_parameter_use = P.ControlDepend(depend_mode=1)
         self.base = Tensor(1, mstype.float32)
@@ -305,10 +318,12 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
                             target_mask,
                             label_ids,
                             label_weights)
-        # alloc status
-        init = self.alloc_status()
-        # clear overflow buffer
-        self.clear_before_grad(init)
+        init = False
+        if not self.gpu_target:
+            # alloc status
+            init = self.alloc_status()
+            # clear overflow buffer
+            self.clear_before_grad(init)
         if sens is None:
             scaling_sens = self.loss_scale
         else:
@@ -327,8 +342,16 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        self.get_status(init)
-        flag_sum = self.reduce_sum(init, (0,))
+
+        if not self.gpu_target:
+            self.get_status(init)
+            # sum overflow buffer elements, 0: not overflow, >0: overflow
+            flag_sum = self.reduce_sum(init, (0,))
+        else:
+            flag_sum = self.hyper_map(F.partial(_grad_overflow), grads)
+            flag_sum = self.addn(flag_sum)
+            # convert flag_sum to scalar
+            flag_sum = self.reshape(flag_sum, (()))

         if self.is_distributed:
             # sum overflow flag over devices
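The GPU branch above cannot rely on the Ascend NPU float-status registers, so it derives the overflow flag from the gradients themselves: FloatStatus flags each gradient tensor that contains inf/NaN, AddN sums the per-tensor flags, and a non-zero sum means overflow. A condensed, standalone sketch of that check (a hypothetical helper, not code from the repository):

import mindspore.ops as ops
from mindspore.ops import composite as C, functional as F

_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
float_status = ops.FloatStatus()  # 1-element tensor, > 0 if the input contains inf/NaN

@_grad_overflow.register("Tensor")
def _tensor_grad_overflow(grad):
    return float_status(grad)

def gpu_overflow_flag(grads):
    """Sketch: returns a scalar tensor, non-zero if any gradient overflowed."""
    per_grad_flags = C.HyperMap()(F.partial(_grad_overflow), grads)
    flag_sum = ops.AddN()(per_grad_flags)
    return ops.Reshape()(flag_sum, (()))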
@@ -35,7 +35,7 @@ from mindspore.common import set_seed

 from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \
     TransformerTrainOneStepWithLossScaleCell
-from src.config import cfg, transformer_net_cfg
+from src.config import cfg, transformer_net_cfg, transformer_net_cfg_gpu
 from src.dataset import create_transformer_dataset
 from src.lr_schedule import create_dynamic_lr

@@ -73,13 +73,17 @@ class LossCallBack(Callback):
         time_stamp_current = get_ms_timestamp()
         cb_params = run_context.original_args()
         print("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
-                                                                     cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                                     cb_params.cur_epoch_num,
+                                                                     cb_params.cur_step_num,
                                                                      str(cb_params.net_outputs)))
         with open("./loss_{}.log".format(self.rank_id), "a+") as f:
-            f.write("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
-                                                                           cb_params.cur_epoch_num,
-                                                                           cb_params.cur_step_num,
-                                                                           str(cb_params.net_outputs)))
+            f.write("time: {}, epoch: {}, step: {}, loss: {}, overflow: {}, loss_scale: {}".format(
+                time_stamp_current - time_stamp_first,
+                cb_params.cur_epoch_num,
+                cb_params.cur_step_num,
+                str(cb_params.net_outputs[0].asnumpy()),
+                str(cb_params.net_outputs[1].asnumpy()),
+                str(cb_params.net_outputs[2].asnumpy())))
             f.write('\n')

@@ -91,6 +95,8 @@ def argparse_init():
    parser.add_argument("--distribute", type=str, default="false", choices=['true', 'false'],
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default=52, help="Epoch size, default is 52.")
+    parser.add_argument("--device_target", type=str, default="Ascend",
+                        help="device where the code will be implemented, default is Ascend")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_lossscale", type=str, default="true", choices=['true', 'false'],
@@ -116,15 +122,21 @@ def run_transformer_train():
     """
     parser = argparse_init()
     args, _ = parser.parse_known_args()
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
     context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)

     if args.distribute == "true":
-        device_num = args.device_num
+        if args.device_target == "Ascend":
+            device_num = args.device_num
+            D.init('hccl')
+        else:
+            D.init('nccl')
+            device_num = D.get_group_size()
+            rank = get_rank()
+            args.device_id = rank
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                           device_num=device_num)
-        D.init()
         rank_id = args.device_id % device_num
         save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
     else:
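For GPU distribution the hunk above switches from HCCL to NCCL: mpirun launches one process per device, D.init('nccl') joins the group, and the group size and rank replace the values HCCL derives from its rank table. Pulled out of the diff, the GPU path amounts to roughly the following sketch (import locations follow common MindSpore usage and may differ from the repository's):

from mindspore import context
from mindspore.context import ParallelMode
from mindspore.communication import management as D
from mindspore.communication.management import get_rank

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
D.init('nccl')                    # process group is created by mpirun
device_num = D.get_group_size()   # equals mpirun -n <N>
rank = get_rank()                 # this process's index within the group
context.reset_auto_parallel_context()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                  gradients_mean=True, device_num=device_num)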
|
@ -135,27 +147,39 @@ def run_transformer_train():
|
||||||
rank_id=rank_id, do_shuffle=args.do_shuffle,
|
rank_id=rank_id, do_shuffle=args.do_shuffle,
|
||||||
dataset_path=args.data_path,
|
dataset_path=args.data_path,
|
||||||
bucket_boundaries=args.bucket_boundaries)
|
bucket_boundaries=args.bucket_boundaries)
|
||||||
|
if args.device_target == "Ascend":
|
||||||
netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
|
netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
|
||||||
|
else:
|
||||||
|
netwithloss = TransformerNetworkWithLoss(transformer_net_cfg_gpu, True)
|
||||||
|
|
||||||
if args.checkpoint_path:
|
if args.checkpoint_path:
|
||||||
parameter_dict = load_checkpoint(args.checkpoint_path)
|
parameter_dict = load_checkpoint(args.checkpoint_path)
|
||||||
load_param_into_net(netwithloss, parameter_dict)
|
load_param_into_net(netwithloss, parameter_dict)
|
||||||
|
|
||||||
|
hidden_size = transformer_net_cfg.hidden_size if args.device_target == "Ascend" \
|
||||||
|
else transformer_net_cfg_gpu.hidden_size
|
||||||
lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
|
lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
|
||||||
training_steps=dataset.get_dataset_size()*args.epoch_size,
|
training_steps=dataset.get_dataset_size()*args.epoch_size,
|
||||||
learning_rate=cfg.lr_schedule.learning_rate,
|
learning_rate=cfg.lr_schedule.learning_rate,
|
||||||
warmup_steps=cfg.lr_schedule.warmup_steps,
|
warmup_steps=cfg.lr_schedule.warmup_steps,
|
||||||
hidden_size=transformer_net_cfg.hidden_size,
|
hidden_size=hidden_size,
|
||||||
start_decay_step=cfg.lr_schedule.start_decay_step,
|
start_decay_step=cfg.lr_schedule.start_decay_step,
|
||||||
min_lr=cfg.lr_schedule.min_lr), mstype.float32)
|
min_lr=cfg.lr_schedule.min_lr), mstype.float32)
|
||||||
optimizer = Adam(netwithloss.trainable_params(), lr)
|
|
||||||
|
if args.device_target == "GPU" and cfg.transformer_network == "large":
|
||||||
|
optimizer = Adam(netwithloss.trainable_params(), lr, beta2=cfg.optimizer_adam_beta2)
|
||||||
|
else:
|
||||||
|
optimizer = Adam(netwithloss.trainable_params(), lr)
|
||||||
|
|
||||||
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
|
callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
|
||||||
if args.enable_save_ckpt == "true":
|
if args.enable_save_ckpt == "true":
|
||||||
if device_num == 1 or (device_num > 1 and rank_id == 0):
|
if device_num == 1 or (device_num > 1 and rank_id == 0):
|
||||||
ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
|
if args.device_target == "Ascend":
|
||||||
keep_checkpoint_max=args.save_checkpoint_num)
|
ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
|
||||||
|
keep_checkpoint_max=args.save_checkpoint_num)
|
||||||
|
else:
|
||||||
|
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset.get_dataset_size(),
|
||||||
|
keep_checkpoint_max=args.save_checkpoint_num)
|
||||||
ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config)
|
ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config)
|
||||||
callbacks.append(ckpoint_cb)
|
callbacks.append(ckpoint_cb)
|
||||||
|
|
||||||
|
|