!28551 [dataset][multiprocessing][modelarts] 修复集群下数据多进程shm计算出错引发runtimeerror

Merge pull request !28551 from xiefangqi/md_fix_shm_device_num_greater_problem
This commit is contained in:
i-robot 2022-01-05 01:26:34 +00:00 committed by Gitee
commit 4c3d1ece65
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
1 changed files with 6 additions and 1 deletions

View File

@@ -4996,7 +4996,12 @@ def _check_shm_usage(num_worker, queue_size, max_rowsize, num_queues=1):
"""
threshold_ratio = 0.8
if platform.system().lower() not in {"windows", "darwin"}:
shm_estimate_usage = _get_device_num() * num_worker * num_queues * \
device_num = _get_device_num()
# In a cluster, _get_device_num() returns the device count of the entire cluster, not of this server.
# A single Ascend server has at most 8 cards, so cap the count used for the shm estimate at 8.
if device_num > 1 and context.get_context("device_target") == "Ascend":
device_num = min(device_num, 8)
shm_estimate_usage = device_num * num_worker * num_queues * \
(queue_size + 2) * max_rowsize * 1024 * 1024
try:
shm_available = psutil.disk_usage('/dev/shm').free