mindspore/model_zoo/official/nlp/bert/pretrain_config.yaml

# Built-in configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
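# Note: this file holds three YAML documents separated by '---': the default values below,
# a help string for each option, and the allowed choices for the enumerated options.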
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
device_target: "Ascend"
enable_profiling: False
# ==============================================================================
description: 'run_pretrain'
distribute: 'false'
epoch_size: 40
device_id: 0
device_num: 1
enable_save_ckpt: 'true'
enable_lossscale: 'true'
do_shuffle: 'true'
enable_data_sink: 'true'
data_sink_steps: 1
accumulation_steps: 1
allreduce_post_accumulation: 'true'
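# Note: with gradient accumulation and data parallelism, the effective global batch per
# optimizer step works out to batch_size * accumulation_steps * device_num
# (32 * 1 * 1 = 32 with the defaults in this file).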
save_checkpoint_path: ''
load_checkpoint_path: ''
save_checkpoint_steps: 10000
train_steps: -1
save_checkpoint_num: 1
data_dir: ''
schema_dir: ''
# ==============================================================================
# pretrain related
batch_size: 32
# Available: [base, nezha, large, large_boost]
bert_network: 'base'
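# Note: the training script is expected to resolve the chosen network to the matching
# <network>_batch_size and <network>_net_cfg blocks further down
# (e.g. 'base' -> base_batch_size / base_net_cfg); the exact wiring lives in the script, not here.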
loss_scale_value: 65536
scale_factor: 2
scale_window: 1000
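# Note: these three fields follow the usual dynamic loss-scale scheme: scaling starts at
# loss_scale_value, is divided by scale_factor when an overflow is detected, and is
# multiplied by scale_factor after scale_window consecutive overflow-free steps.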
optimizer: 'Lamb'
enable_global_norm: False
# pretrain_eval related
data_file: ""
schema_file: ""
finetune_ckpt: ""
# optimizer related
AdamWeightDecay:
    learning_rate: 0.00003 # 3e-5
    end_learning_rate: 0.0
    power: 5.0
    weight_decay: 0.00001 # 1e-5
    decay_filter: ['layernorm', 'bias']
    eps: 0.000001 # 1e-6
    warmup_steps: 10000
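# Note: these fields typically drive polynomial decay with linear warmup, i.e. after
# warmup_steps the rate roughly follows
#   lr(t) = (learning_rate - end_learning_rate) * (1 - t / total_steps)^power + end_learning_rate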
Lamb:
    learning_rate: 0.0003 # 3e-4
    end_learning_rate: 0.0
    power: 2.0
    warmup_steps: 10000
    weight_decay: 0.01
    decay_filter: ['layernorm', 'bias']
    eps: 0.00000001 # 1e-8
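# Note: decay_filter lists parameter-name fragments; parameters whose names match
# (LayerNorm weights and biases here) are typically excluded from weight decay,
# following common BERT practice.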
Momentum:
    learning_rate: 0.00002 # 2e-5
    momentum: 0.9
Thor:
    lr_max: 0.006464
    lr_min: 0.000001 # 1e-6
    lr_power: 2.0
    lr_total_steps: 30000
    damping_max: 0.007035
    damping_min: 0.000001 # 1e-6
    damping_power: 4.0
    damping_total_steps: 30000
    momentum: 0.9
    weight_decay: 0.00001 # 1e-5
    loss_scale: 1024.0
    frequency: 100
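# Note: for THOR, lr and damping usually decay polynomially from their *_max values
# toward *_min over *_total_steps (the *_power exponents shape the curve), and
# frequency sets how many steps pass between refreshes of the second-order statistics.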
# ==============================================================================
# base
base_batch_size: 256
base_net_cfg:
    seq_length: 128
    vocab_size: 21128
    hidden_size: 768
    num_hidden_layers: 12
    num_attention_heads: 12
    intermediate_size: 3072
    hidden_act: "gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: False
    dtype: mstype.float32
    compute_type: mstype.float16
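# Note: vocab_size 21128 matches the Chinese BERT WordPiece vocabulary, so the 'base'
# and 'nezha' configs here are sized for Chinese-language pretraining corpora.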
# nezha
nezha_batch_size: 96
nezha_net_cfg:
    seq_length: 128
    vocab_size: 21128
    hidden_size: 1024
    num_hidden_layers: 24
    num_attention_heads: 16
    intermediate_size: 4096
    hidden_act: "gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: True
    dtype: mstype.float32
    compute_type: mstype.float16
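# Note: beyond the larger dimensions, the NEZHA-specific switch here is
# use_relative_positions: True, i.e. relative instead of absolute position embeddings.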
# large
large_batch_size: 24
large_net_cfg:
    seq_length: 512
    vocab_size: 30522
    hidden_size: 1024
    num_hidden_layers: 24
    num_attention_heads: 16
    intermediate_size: 4096
    hidden_act: "gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: False
    dtype: mstype.float32
    compute_type: mstype.float16
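# Note: vocab_size 30522 matches the English uncased BERT WordPiece vocabulary, and
# 'large' trains at the full seq_length of 512 rather than the 128 used by base/nezha.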
# Accelerated large network; currently supported only on Ascend.
large_boost_batch_size: 24
large_boost_net_cfg:
    seq_length: 512
    vocab_size: 30522
    hidden_size: 1024
    num_hidden_layers: 24
    num_attention_heads: 16
    intermediate_size: 4096
    hidden_act: "fast_gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: False
    dtype: mstype.float32
    compute_type: mstype.float16
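# Note: relative to the 'large' config above, the only change is hidden_act: "fast_gelu",
# an approximated GELU that runs faster on Ascend.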
---
# Help description for each configuration
enable_modelarts: "Whether training on modelarts, default: False"
data_url: "Url for modelarts"
train_url: "Url for modelarts"
data_path: "The location of the input data."
output_path: "The location of the output file."
device_target: "Running platform, choose from Ascend or GPU; default is Ascend."
enable_profiling: 'Whether enable profiling while training, default: False'
distribute: "Run distribute, default is 'false'."
epoch_size: "Epoch size, default is 40."
enable_save_ckpt: "Enable save checkpoint, default is true."
enable_lossscale: "Use loss scale or not, default is true."
do_shuffle: "Enable shuffle for dataset, default is true."
enable_data_sink: "Enable data sink, default is true."
data_sink_steps: "Sink steps for each epoch, default is 1."
accumulation_steps: "Accumulating gradients N times before weight update, default is 1."
allreduce_post_accumulation: "Whether to allreduce after accumulation of N steps or after each step, default is true."
save_checkpoint_path: "Save checkpoint path"
load_checkpoint_path: "Load checkpoint file path"
save_checkpoint_steps: "Save checkpoint steps, default is 10000."
train_steps: "Training Steps, default is -1, meaning run all steps according to epoch number."
save_checkpoint_num: "Save checkpoint numbers, default is 1."
data_dir: "Data path, it is better to use absolute path"
schema_dir: "Schema path, it is better to use absolute path"
---
# choices
device_target: ['Ascend', 'GPU']
distribute: ["true", "false"]
enable_save_ckpt: ["true", "false"]
enable_lossscale: ["true", "false"]
do_shuffle: ["true", "false"]
enable_data_sink: ["true", "false"]
allreduce_post_accumulation: ["true", "false"]