!49939 Print the process stack when a process hangs

Merge pull request !49939 from gengdongjie/r1.9_py_spy
This commit is contained in:
i-robot 2023-03-08 07:40:00 +00:00 committed by Gitee
commit d289c017ce
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
2 changed files with 41 additions and 3 deletions

View File

@ -30,6 +30,7 @@ import multiprocessing
from multiprocessing.util import Finalize
import queue
from functools import partial
import subprocess
import threading
import weakref
import platform
@ -265,6 +266,14 @@ class SamplerFn:
"output data is too large. You can also set the timeout interval by "
"ds.config.set_multiprocessing_interval to adjust the output frequency of this "
"log.")
pid = self.workers[i % self.num_worker].pid
logger.warning("Generator subprocess ID {} is stuck.".format(pid))
install_status, _ = subprocess.getstatusoutput("py-spy --version")
if install_status == 0:
stack = subprocess.getoutput("py-spy dump -p {} -l -s".format(pid))
logger.warning("Generator subprocess stack:\n{}".format(stack))
else:
logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
result = self.workers[i % self.num_worker].get()
if isinstance(result, ExceptionHandler):

View File

@ -13,8 +13,10 @@
# limitations under the License.
# ==============================================================================
import copy
import numpy as np
import time
import pytest
import numpy as np
import mindspore
import mindspore.common.dtype as mstype
@ -2138,6 +2140,30 @@ def test_generator_with_single_numpy_with_yield():
assert count == 20
@pytest.mark.skip(reason="only for testing stuck scenario")
def test_generator_traceback():
    """
    Feature: GeneratorDataset
    Description: When the generator is too slow, the main process logs the stack of the stuck subprocess
    Expectation: The location where the subprocess is stuck can be logged
    """
    class SlowDataset:
        """Random-access source whose every tenth item blocks for a long time."""

        def __init__(self):
            self.data = np.random.randint(0, 255, (100, 28, 28, 3), dtype=np.uint8)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, index):
            # Indices divisible by 10 sleep for 600 seconds to simulate a stuck worker.
            if index % 10 == 0:
                time.sleep(600)
            return self.data[index]

    source = SlowDataset()
    dataset = ds.GeneratorDataset(source, column_names=["image"], num_parallel_workers=8)
    iterator = dataset.create_dict_iterator(num_epochs=1, output_numpy=True)
    # Drain the pipeline; the rows themselves are irrelevant to this scenario.
    for _ in iterator:
        pass
if __name__ == "__main__":
test_generator_0()
test_generator_1()
@ -2159,13 +2185,13 @@ if __name__ == "__main__":
test_generator_17()
test_generator_18()
test_generator_19()
test_generator_20()
test_generator_error_1()
test_generator_error_2()
test_generator_error_3()
test_generator_error_4()
test_generator_sequential_sampler()
test_generator_distributed_sampler()
test_generator_random_sampler()
test_generator_distributed_sampler()
test_generator_num_samples()
test_generator_num_samples_underflow()
test_generator_schema()
@ -2186,6 +2212,9 @@ if __name__ == "__main__":
test_generator_single_input_4()
test_generator_single_input_5()
test_generator_single_input_6()
test_generator_with_seed_5489_when_dist()
test_generator_with_set_seed_when_dist()
test_generator_with_single_numpy()
test_generator_with_single_numpy_with_next()
test_generator_with_single_numpy_with_yield()
test_generator_traceback()