forked from mindspore-Ecosystem/mindspore

fix some doc error

parent a0e3fd6bf3 · commit 91c65a734a
@@ -377,6 +377,12 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outpus are (Tensor(shape=[1],
 ...
 ```
 
+> **Attention** If you are running with a huge dataset, it is better to set an extra environment variable so that HCCL does not time out.
+> ```
+> export HCCL_CONNECT_TIMEOUT=600
+> ```
+> This extends the HCCL timeout limit from the default 120 seconds to 600 seconds.
+
 ### Distributed Training
 
 #### Running on Ascend
 
 ```
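For context, the same setting can be made from a Python launcher before any distributed process is spawned; a minimal sketch (only the variable name and default come from the note above):

```
import os

# Extend the HCCL connect timeout from the default 120 s to 600 s.
# Set it before launching the ranks so every child process inherits it.
os.environ["HCCL_CONNECT_TIMEOUT"] = "600"
```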
@@ -178,8 +178,7 @@ def run_pretrain():
 
     if args_opt.accumulation_steps <= 1:
         net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
-                                                           scale_update_cell=update_cell,
-                                                           enable_global_norm=cfg.enable_global_norm)
+                                                           scale_update_cell=update_cell)
     else:
         accumulation_steps = args_opt.accumulation_steps
         net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer,
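The branch above selects between two wrapper cells: with `accumulation_steps <= 1` each step applies gradients immediately, otherwise gradients are accumulated over several micro-batches before one optimizer update. A sketch of the arithmetic only, not MindSpore code (`batch_size` is an assumed illustrative value):

```
# Effective batch size under gradient accumulation (illustrative numbers).
batch_size = 32            # per-step micro-batch, assumed
accumulation_steps = 4     # from args_opt.accumulation_steps

effective_batch = batch_size * accumulation_steps  # 128
# The optimizer sees one update per `accumulation_steps` passes, so
# learning-rate schedules should be set against the update count.
print(effective_batch)
```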
@@ -350,13 +350,12 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
         optimizer (Optimizer): Optimizer for updating the weights.
         scale_update_cell (Cell): Cell to do the loss scale. Default: None.
     """
-    def __init__(self, network, optimizer, scale_update_cell=None, enable_global_norm=False):
+    def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
         self.network = network
         self.network.set_grad()
         self.weights = optimizer.parameters
         self.optimizer = optimizer
-        self.enable_global_norm = enable_global_norm
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
         self.reducer_flag = False
@@ -423,10 +422,7 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
-        if self.enable_global_norm:
-            grads = ClipByGlobalNorm()(grads)
-        else:
-            grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         self.get_status(init)
         flag_sum = self.reduce_sum(init, (0,))
         if self.is_distributed:
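The removed branch clipped all gradients jointly by their global norm, while the retained line clips each gradient independently. A framework-free sketch of the two behaviors, using NumPy purely for illustration (`clip_norm` and `clip_value` are assumed names, not the model_zoo constants):

```
import numpy as np

def clip_by_global_norm(grads, clip_norm=1.0):
    # Scale *all* gradients by one shared factor so their joint L2 norm
    # does not exceed clip_norm (what ClipByGlobalNorm does conceptually).
    global_norm = np.sqrt(sum(float(np.sum(g * g)) for g in grads))
    scale = min(1.0, clip_norm / (global_norm + 1e-12))
    return [g * scale for g in grads]

def clip_by_value(grads, clip_value=1.0):
    # Clip each gradient tensor independently (the clip_grad path kept above).
    return [np.clip(g, -clip_value, clip_value) for g in grads]

grads = [np.array([3.0, -4.0]), np.array([1.0, 2.0])]
print(clip_by_global_norm(grads))  # jointly rescaled
print(clip_by_value(grads))        # element-wise clipped
```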
@@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set
 
 ## how to use
 
-For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir:
+For example, to generate the launch commands for distributed training of the Bert model on D chips, run the following command in the `/bert/` directory:
 ```
-python model_zoo/utils/ascend_distributed_launcher/run_distributed.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
 ```
 
 output:
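After this change the script only generates the launch commands; it does not start training itself. A hedged sketch of running the generated file from Python (the default file name `distributed_cmd.sh` comes from the `--cmd_file` option added later in this commit):

```
import subprocess

# The launcher writes per-device commands into a shell script; executing
# that script is what actually starts the distributed pretraining.
subprocess.run(["bash", "distributed_cmd.sh"], check=True)
```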
@@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt
 1. Note that `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
 
 2. For hyper parameters, please note that you should customize the script `hyper_parameter_config.ini`. The following hyper parameters are not allowed to be configured there:
-   device_id
-   device_num
+   - device_id
+   - device_num
+   - data_dir
 
 3. For other models, please note that you should customize the option `run_script` and the corresponding `hyper_parameter_config.ini`.
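A minimal sketch of how a launcher could validate these restrictions when reading `hyper_parameter_config.ini` (the single-section layout is an assumption; the real script may read the file differently):

```
import configparser

# Keys the launcher injects itself, so the ini file must not set them.
FORBIDDEN = {"device_id", "device_num", "data_dir"}

parser = configparser.ConfigParser()
parser.read("hyper_parameter_config.ini")
for section in parser.sections():
    bad = FORBIDDEN & set(parser[section])
    if bad:
        raise ValueError("hyper_parameter_config.ini must not set: "
                         + ", ".join(sorted(bad)))
```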
@@ -42,11 +42,21 @@ def parse_args():
                         help="Data path, it is better to use absolute path")
     parser.add_argument("--hccl_config_dir", type=str, default="",
                         help="Hccl config path, it is better to use absolute path")
+    parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
+                        help="Path of the generated cmd file.")
 
     args = parser.parse_args()
     return args
 
 
+def append_cmd(cmd, s):
+    cmd += s
+    cmd += "\n"
+    return cmd
+
+
+def append_cmd_env(cmd, key, value):
+    return append_cmd(cmd, "export " + str(key) + "=" + str(value))
+
+
 def distribute_pretrain():
     """
     distribute pretrain scripts. The number of D chips can be automatically allocated
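The two helpers simply accumulate newline-terminated shell lines into one string; a quick illustration of the text they produce:

```
# Mirroring the helpers above to show the generated script text.
def append_cmd(cmd, s):
    cmd += s
    cmd += "\n"
    return cmd

def append_cmd_env(cmd, key, value):
    return append_cmd(cmd, "export " + str(key) + "=" + str(value))

cmd = ""
cmd = append_cmd_env(cmd, "DEVICE_ID", 0)
cmd = append_cmd(cmd, "mkdir ./LOG0")
print(cmd)
# export DEVICE_ID=0
# mkdir ./LOG0
```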
@@ -92,6 +102,7 @@ def distribute_pretrain():
     print("avg_core_per_rank:", avg_core_per_rank)
 
     count = 0
+    cmd = ""
     for instance in this_server["device"]:
         device_id = instance["device_id"]
         rank_id = instance["rank_id"]
@@ -104,38 +115,44 @@ def distribute_pretrain():
         end = start + core_gap
         cmdopt = str(start) + "-" + str(end)
 
-        os.environ["DEVICE_ID"] = device_id
-        os.environ["RANK_ID"] = rank_id
-        os.environ["DEPLOY_MODE"] = "0"
-        os.environ["GE_USE_STATIC_MEMORY"] = "1"
+        cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
+        cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
+        cmd = append_cmd(cmd, "export DEPLOY_MODE=0")
+        cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1")
 
-        os.system("rm -rf LOG" + str(device_id))
-        os.system("mkdir ./LOG" + str(device_id))
-        os.system("cp *.py ./LOG" + str(device_id))
-        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
-        os.system("env > ./LOG" + str(device_id) + "/env.log")
+        cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id))
+        cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id))
+        cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id))
+        cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log")
+        cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log")
 
         cur_dir = os.getcwd()
-        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
-        os.environ["GLOG_logtostderr"] = "0"
+        cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log")
+        cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")
 
         print("core_nums:", cmdopt)
         print("epoch_size:", str(cfg['epoch_size']))
         print("data_dir:", data_dir)
-        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")
+        print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt")
 
-        cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
+        cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id))
+
+        run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
         opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
         if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
             raise ValueError("hyper_parameter_config.ini can not set 'device_id',"
                              " 'device_num' or 'data_dir'!")
-        cmd += opt
-        cmd += " --data_dir=" + data_dir
-        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
-               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
+        run_cmd += opt
+        run_cmd += " --data_dir=" + data_dir
+        run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
+                   + str(rank_size) + ' >./pretraining_log.txt 2>&1 &'
 
-        os.system(cmd)
+        cmd = append_cmd(cmd, run_cmd)
+        cmd = append_cmd(cmd, "cd -")
+        cmd += "\n"
+
+    with open(args.cmd_file, "w") as f:
+        f.write(cmd)
 
 
 if __name__ == "__main__":
     distribute_pretrain()
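Put together, the loop emits one block of shell lines per device. Roughly, the generated cmd file has the shape below, reconstructed from the `append_cmd` calls above (all paths and the core range are illustrative):

```
# Approximate shape of one per-device block in the generated cmd file.
expected_block = """\
export DEVICE_ID=0
export RANK_ID=0
export DEPLOY_MODE=0
export GE_USE_STATIC_MEMORY=1
rm -rf LOG0
mkdir ./LOG0
cp *.py ./LOG0
mkdir -p ./LOG0/ms_log
env > ./LOG0/env.log
export GLOG_log_dir=/path/to/bert/LOG0/ms_log
export GLOG_logtostderr=0
cd /path/to/bert/LOG0
taskset -c 0-11 nohup python ./run_pretrain.py --data_dir=/path/dataset/ --device_id=0 --device_num=2 >./pretraining_log.txt 2>&1 &
cd -
"""
print(expected_block)
```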
@@ -6,6 +6,7 @@ enable_lossscale=true
 do_shuffle=true
 enable_data_sink=true
 data_sink_steps=100
+accumulation_steps=1
 save_checkpoint_path=./checkpoint/
 save_checkpoint_steps=10000
 save_checkpoint_num=1
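Each key in this file becomes a `--key=value` flag on the generated command line, via the `opt = " ".join(...)` line in the launcher above; a tiny illustration:

```
# How the launcher flattens the ini keys into command-line options,
# mirroring: opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
cfg = {
    "do_shuffle": "true",
    "data_sink_steps": "100",
    "accumulation_steps": "1",
}
opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
print(opt)  # --do_shuffle=true --data_sink_steps=100 --accumulation_steps=1
```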
@@ -5,7 +5,7 @@ mindspore distributed training launch helper utility that will generate hccl conf
 
 # use
 
 ```
-python hccl_tools.py --device_num [0,8)
+python hccl_tools.py --device_num "[0,8)"
 ```
 
 output:
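The quotes matter because an unquoted `[0,8)` is mangled by the shell: the bare parenthesis is a syntax error and the brackets can be glob-expanded. A sketch of parsing such a half-open range string (a hypothetical helper, not necessarily how hccl_tools.py implements it):

```
import re

def parse_device_range(text):
    # Parse a half-open range like "[0,8)" into device ids 0..7
    # (hypothetical helper; hccl_tools.py may parse differently).
    match = re.fullmatch(r"\[(\d+),(\d+)\)", text.strip())
    if match is None:
        raise ValueError('expected a range like "[0,8)", got: ' + text)
    start, end = int(match.group(1)), int(match.group(2))
    return list(range(start, end))

print(parse_device_range("[0,8)"))  # [0, 1, 2, 3, 4, 5, 6, 7]
```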