diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md index 4d02c8abfa4..6ca9162ddc2 100644 --- a/model_zoo/official/nlp/bert/README.md +++ b/model_zoo/official/nlp/bert/README.md @@ -377,6 +377,12 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outpus are (Tensor(shape=[1], ... ``` +> **Attention** If you are running with a huge dataset, it's better to set an extra environment variable so that HCCL does not time out. +> ``` +> export HCCL_CONNECT_TIMEOUT=600 +> ``` +> This extends the HCCL connection timeout from the default 120 seconds to 600 seconds. + ### Distributed Training #### Running on Ascend ``` diff --git a/model_zoo/official/nlp/bert/run_pretrain.py b/model_zoo/official/nlp/bert/run_pretrain.py index 48802efa9ab..5db2cac5140 100644 --- a/model_zoo/official/nlp/bert/run_pretrain.py +++ b/model_zoo/official/nlp/bert/run_pretrain.py @@ -178,8 +178,7 @@ def run_pretrain(): if args_opt.accumulation_steps <= 1: net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer, - scale_update_cell=update_cell, - enable_global_norm=cfg.enable_global_norm) + scale_update_cell=update_cell) else: accumulation_steps = args_opt.accumulation_steps net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer, diff --git a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py index fae88f92a56..eb19a5c88fd 100644 --- a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py +++ b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py @@ -350,13 +350,12 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell): optimizer (Optimizer): Optimizer for updating the weights. scale_update_cell (Cell): Cell to do the loss scale. Default: None.
""" - def __init__(self, network, optimizer, scale_update_cell=None, enable_global_norm=False): + def __init__(self, network, optimizer, scale_update_cell=None): super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False) self.network = network self.network.set_grad() self.weights = optimizer.parameters self.optimizer = optimizer - self.enable_global_norm = enable_global_norm self.grad = C.GradOperation(get_by_list=True, sens_param=True) self.reducer_flag = False @@ -423,10 +422,7 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell): # apply grad reducer on grads grads = self.grad_reducer(grads) grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads) - if self.enable_global_norm: - grads = ClipByGlobalNorm()(grads) - else: - grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) self.get_status(init) flag_sum = self.reduce_sum(init, (0,)) if self.is_distributed: diff --git a/model_zoo/utils/ascend_distributed_launcher/README.md b/model_zoo/utils/ascend_distributed_launcher/README.md index c8692774f5d..18a6532fbf8 100644 --- a/model_zoo/utils/ascend_distributed_launcher/README.md +++ b/model_zoo/utils/ascend_distributed_launcher/README.md @@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set ## how to use -For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir: +For example, if we want to generate the launch command of the distributed training of Bert model on D chip, we can run the following command in `/bert/` dir: ``` -python model_zoo/utils/ascend_distributed_launcher/run_distributed.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json +python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json ``` output: @@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt 1. Note that `hccl_2p_56_x.x.x.x.json` can use [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate. 2. For hyper parameter, please note that you should customize the scripts `hyper_parameter_config.ini`. Please note that these two hyper parameters are not allowed to be configured here: - device_id - device_num + - device_id + - device_num + - data_dir 3. For Other Model, please note that you should customize the option `run_script` and Corresponding `hyper_parameter_config.ini`. 
diff --git a/model_zoo/utils/ascend_distributed_launcher/run_distributed.py b/model_zoo/utils/ascend_distributed_launcher/get_distribute_pretrain_cmd.py similarity index 71% rename from model_zoo/utils/ascend_distributed_launcher/run_distributed.py rename to model_zoo/utils/ascend_distributed_launcher/get_distribute_pretrain_cmd.py index b21accf7c25..e2a62ba95d6 100644 --- a/model_zoo/utils/ascend_distributed_launcher/run_distributed.py +++ b/model_zoo/utils/ascend_distributed_launcher/get_distribute_pretrain_cmd.py @@ -42,11 +42,21 @@ def parse_args(): help="Data path, it is better to use absolute path") parser.add_argument("--hccl_config_dir", type=str, default="", help="Hccl config path, it is better to use absolute path") + parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh", + help="Path of the generated cmd file.") args = parser.parse_args() return args +def append_cmd(cmd, s): + cmd += s + cmd += "\n" + return cmd + +def append_cmd_env(cmd, key, value): + return append_cmd(cmd, "export " + str(key) + "=" + str(value)) + def distribute_pretrain(): """ distribute pretrain scripts. The number of D chips can be automatically allocated @@ -92,6 +102,7 @@ def distribute_pretrain(): print("avg_core_per_rank:", avg_core_per_rank) count = 0 + cmd = "" for instance in this_server["device"]: device_id = instance["device_id"] rank_id = instance["rank_id"] @@ -104,38 +115,44 @@ def distribute_pretrain(): end = start + core_gap cmdopt = str(start) + "-" + str(end) - os.environ["DEVICE_ID"] = device_id - os.environ["RANK_ID"] = rank_id - os.environ["DEPLOY_MODE"] = "0" - os.environ["GE_USE_STATIC_MEMORY"] = "1" + cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id)) + cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id)) + cmd = append_cmd(cmd, "export DEPLOY_MODE=0") + cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1") - os.system("rm -rf LOG" + str(device_id)) - os.system("mkdir ./LOG" + str(device_id)) - os.system("cp *.py ./LOG" + str(device_id)) - os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log") - os.system("env > ./LOG" + str(device_id) + "/env.log") + cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id)) + cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id)) + cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id)) + cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log") + cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log") cur_dir = os.getcwd() - os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log" - os.environ["GLOG_logtostderr"] = "0" + cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log") + cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0") print("core_nums:", cmdopt) print("epoch_size:", str(cfg['epoch_size'])) print("data_dir:", data_dir) - print("log_file_dir: ./LOG" + str(device_id) + "/log.txt") + print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt") - cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " " + cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id)) + + run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " " opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," " 'device_num' or 'data_dir'! 
") - cmd += opt - cmd += " --data_dir=" + data_dir - cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ - + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &' + run_cmd += opt + run_cmd += " --data_dir=" + data_dir + run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + + str(rank_size) + ' >./pretraining_log.txt 2>&1 &' - os.system(cmd) + cmd = append_cmd(cmd, run_cmd) + cmd = append_cmd(cmd, "cd -") + cmd += "\n" + with open(args.cmd_file, "w") as f: + f.write(cmd) if __name__ == "__main__": distribute_pretrain() diff --git a/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini b/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini index 2298f83509b..1af5bdbae40 100644 --- a/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini +++ b/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini @@ -6,6 +6,7 @@ enable_lossscale=true do_shuffle=true enable_data_sink=true data_sink_steps=100 +accumulation_steps=1 save_checkpoint_path=./checkpoint/ save_checkpoint_steps=10000 -save_checkpoint_num=1 \ No newline at end of file +save_checkpoint_num=1 diff --git a/model_zoo/utils/hccl_tools/README.md b/model_zoo/utils/hccl_tools/README.md index 3bbe149a829..db97a60312b 100644 --- a/model_zoo/utils/hccl_tools/README.md +++ b/model_zoo/utils/hccl_tools/README.md @@ -5,7 +5,7 @@ mindspore distributed training launch helper utilty that will generate hccl conf # use ``` -python hccl_tools.py --device_num [0,8) +python hccl_tools.py --device_num "[0,8)" ``` output: