improve convergence of loss in bert

chenhaozhe 2021-03-18 10:37:29 +08:00
parent 4e1e16c335
commit 15d37e5db9
4 changed files with 8 additions and 6 deletions

View File

@@ -317,8 +317,9 @@ You can train your own model based on either pretrained classification model or
 1. Convert your own dataset to COCO or VOC style. Otherwise you have to add your own data preprocessing code.
 2. Change config.py according to your own dataset, especially the `num_classes`.
-3. Set the argument `filter_weight` to `True` when calling `train.py`; this filters the final detection box weights out of the pretrained model.
-4. Build your own bash scripts using the new config and arguments for further convenience.
+3. Prepare a pretrained checkpoint. You can load the pretrained checkpoint via the `pre_trained` argument. Transfer training means a new training job, so keep `pre_trained_epoch_size` at its default value `0`.
+4. Set the argument `filter_weight` to `True` when calling `train.py`; this filters the final detection box weights out of the pretrained model.
+5. Build your own bash scripts using the new config and arguments for further convenience.
 ### [Evaluation Process](#contents)
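
For reference, the `filter_weight` step above amounts to dropping the class-dependent detection-head parameters from the checkpoint before loading, so a model with a different `num_classes` can still reuse the backbone weights. A minimal sketch, assuming MindSpore's `load_checkpoint`/`load_param_into_net` API; the head-layer key names are illustrative, not the repo's actual parameter names:

```python
# Sketch only: emulate `filter_weight=True` by dropping detection-head weights
# before loading a pretrained checkpoint. The key names below are hypothetical.
from mindspore.train.serialization import load_checkpoint, load_param_into_net

def load_backbone_only(net, ckpt_path, head_keywords=('multi_loc_layers', 'multi_cls_layers')):
    param_dict = load_checkpoint(ckpt_path)
    filtered = {k: v for k, v in param_dict.items()
                if not any(word in k for word in head_keywords)}
    load_param_into_net(net, filtered)  # head layers keep their fresh initialization
    return net
```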

View File

@@ -599,7 +599,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
             scaling_sens = sens
         # alloc status and clear should be right before gradoperation
         init = self.alloc_status()
-        init = F.depend(loss, init)
+        init = F.depend(init, loss)
         clear_status = self.clear_status(init)
         scaling_sens = F.depend(scaling_sens, clear_status)
         # update accumulation parameters
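
For context, `F.depend(value, expr)` returns `value` unchanged but forces `expr` to be computed first. The old line returned `loss` into `init`, so `clear_status` no longer received the freshly allocated status buffer; the new order keeps `init` and simply pins the status clearing after the loss computation. A minimal sketch of the ordering pattern, not the repo's training cell:

```python
# Sketch only: ordering two values with F.depend, as in the hunk above.
import mindspore as ms
from mindspore import Tensor
from mindspore.ops import functional as F

status = Tensor([0.0], ms.float32)   # stands in for the overflow-status buffer
loss = Tensor(1.5, ms.float32)       # stands in for the computed loss

status = F.depend(status, loss)      # returns `status`, merely ordered after `loss`
# F.depend(loss, status) would instead return the loss value, silently replacing
# the status buffer that the subsequent clear_status() expects.
print(status)
```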

View File

@@ -804,7 +804,8 @@ class BertModel(nn.Cell):
         self.bert_embedding_lookup = nn.Embedding(
             vocab_size=config.vocab_size,
             embedding_size=self.embedding_size,
-            use_one_hot=use_one_hot_embeddings)
+            use_one_hot=use_one_hot_embeddings,
+            embedding_table=TruncatedNormal(config.initializer_range))
         self.bert_embedding_postprocessor = EmbeddingPostprocessor(
             embedding_size=self.embedding_size,
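
The added `embedding_table` argument initializes the token-embedding weights from a truncated normal distribution with sigma equal to `config.initializer_range`, as in the original BERT, instead of relying on `nn.Embedding`'s default initializer. A minimal standalone sketch; the vocab size, hidden size, and `initializer_range` value are assumed for illustration:

```python
# Sketch only: an embedding table initialized with TruncatedNormal, as in the hunk above.
from mindspore import nn
from mindspore.common.initializer import TruncatedNormal

initializer_range = 0.02   # assumed value of config.initializer_range
embedding = nn.Embedding(
    vocab_size=30522,      # illustrative vocab size
    embedding_size=768,    # illustrative hidden size
    use_one_hot=True,
    embedding_table=TruncatedNormal(initializer_range))
print(embedding.embedding_table.asnumpy().std())  # roughly on the order of 0.02
```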

View File

@@ -36,9 +36,9 @@ cfg = edict({
         'warmup_steps': 10000,
     }),
     'Lamb': edict({
-        'learning_rate': 3e-5,
+        'learning_rate': 3e-4,
         'end_learning_rate': 0.0,
-        'power': 5.0,
+        'power': 2.0,
         'warmup_steps': 10000,
         'weight_decay': 0.01,
         'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
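
These two Lamb changes raise the peak learning rate from 3e-5 to 3e-4 and soften the polynomial decay from power 5.0 to power 2.0, so the learning rate stays higher for longer after the 10000 warmup steps. A minimal sketch of a linear-warmup plus polynomial-decay schedule using the new values; this is the generic formula, not necessarily the repo's exact learning-rate code, and the total step count is illustrative:

```python
# Sketch only: linear warmup followed by polynomial decay with the new config values.
def lamb_lr(step, total_steps, lr=3e-4, end_lr=0.0, power=2.0, warmup_steps=10000):
    if step < warmup_steps:
        return lr * step / warmup_steps               # linear warmup to the peak LR
    decay_ratio = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return (lr - end_lr) * (1.0 - decay_ratio) ** power + end_lr

total = 400000                                        # illustrative total step count
for s in (0, 10000, 100000, 400000):
    print(s, f"{lamb_lr(s, total):.2e}")              # lower power => slower decay
```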