forked from mindspore-Ecosystem/mindspore
Add hccl_time_out option to the BERT distributed launcher
This commit is contained in:
parent
438cd08016
commit
1464bf3dd8
|
@ -44,6 +44,9 @@ def parse_args():
|
|||
help="Hccl config path, it is better to use absolute path")
|
||||
parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
|
||||
help="Path of the generated cmd file.")
|
||||
parser.add_argument("--hccl_time_out", type=int, default=120,
|
||||
help="Seconds to determine the hccl time out,"
|
||||
"default: 120, which is the same as hccl default config")
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
@ -73,6 +76,8 @@ def distribute_pretrain():
|
|||
cfg = dict(cf.items("config"))
|
||||
|
||||
print("hccl_config_dir:", args.hccl_config_dir)
|
||||
print("hccl_time_out:", args.hccl_time_out)
|
||||
cmd = append_cmd_env(cmd, 'HCCL_CONNECTION_TIMEOUT', args.hccl_time_out)
|
||||
cmd = append_cmd_env(cmd, 'RANK_TABLE_FILE', args.hccl_config_dir)
|
||||
|
||||
cores = multiprocessing.cpu_count()
|
||||
|
|
|
@ -29,6 +29,7 @@ python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cm
|
|||
--hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
|
||||
--data_dir=$1 \
|
||||
--hccl_config_dir=$2 \
|
||||
--hccl_time_out=600 \
|
||||
--cmd_file=distributed_cmd.sh
|
||||
|
||||
bash distributed_cmd.sh
|
||||
|
|
Loading…
Reference in New Issue