# Patch summary: unified git diff touching two files under
# tests/st/cpu_data_parallel/test_all_reduce/.
#
# 1. build_allreduce_net_cluster.sh
#    - Adds `export GLOG_v=1` right after the MS_SCHED_PORT export
#      (presumably raises log verbosity so the log files carry more
#      detail -- TODO confirm against the framework's env-var docs).
#    - Scheduler stdout/stderr now go to scheduler.log instead of
#      scheduler.txt.
#    - Each worker's stdout/stderr now go to worker_$i.log instead of
#      worker_$i.txt.
#
# 2. test_allreduce.py, test_allreduce()
#    - When os.system() returns non-zero for the cluster script, the test
#      echoes a header and greps ./worker*.log and ./scheduler.log for
#      'ERROR|Error|error' with 15 lines of surrounding context (-C 15),
#      then falls through to the existing `assert return_code == 0`.
#    - The grep targets use the .log names introduced by the shell-script
#      hunks in this same patch.
#
# NOTE(review): the diff below appears to have been flattened onto a single
# line. The @@ hunk headers count more lines than the visible non-blank
# text, so blank context lines were lost; the original line breaks must be
# restored before this patch can be applied.
# NOTE(review): the added os.system() calls use f-strings that contain no
# placeholders; plain string literals would read more clearly.
# NOTE(review): $1 and $2 are expanded unquoted in the script lines shown;
# confirm callers never pass values containing whitespace or glob chars.
diff --git a/tests/st/cpu_data_parallel/test_all_reduce/build_allreduce_net_cluster.sh b/tests/st/cpu_data_parallel/test_all_reduce/build_allreduce_net_cluster.sh index e0570bc28ca..d46555ea3d5 100644 --- a/tests/st/cpu_data_parallel/test_all_reduce/build_allreduce_net_cluster.sh +++ b/tests/st/cpu_data_parallel/test_all_reduce/build_allreduce_net_cluster.sh @@ -17,10 +17,11 @@ export MS_WORKER_NUM=8 export MS_SCHED_HOST=127.0.0.1 export MS_SCHED_PORT=$2 +export GLOG_v=1 # Launch 1 scheduler. export MS_ROLE=MS_SCHED -python3 $1 >scheduler.txt 2>&1 & +python3 $1 >scheduler.log 2>&1 & sched_pid=${!} echo "scheduler start success!" @@ -29,7 +30,7 @@ export MS_ROLE=MS_WORKER process_pid=() for((i=0;i<8;i++)); do - python3 $1 >worker_$i.txt 2>&1 & + python3 $1 >worker_$i.log 2>&1 & echo "worker ${i} start success with pid ${!}" process_pid[${i}]=${!} done diff --git a/tests/st/cpu_data_parallel/test_all_reduce/test_allreduce.py b/tests/st/cpu_data_parallel/test_all_reduce/test_allreduce.py index 5743995656f..ca0f3703b18 100644 --- a/tests/st/cpu_data_parallel/test_all_reduce/test_allreduce.py +++ b/tests/st/cpu_data_parallel/test_all_reduce/test_allreduce.py @@ -33,6 +33,11 @@ def test_allreduce(): if sys.platform != 'linux': return return_code = os.system("bash build_allreduce_net_cluster.sh run_allreduce.py 8119") + if return_code != 0: + os.system(f"echo '\n**************** Worker Log ****************'") + os.system(f"grep -E 'ERROR|Error|error' -C 15 ./worker*.log") + os.system(f"echo '\n**************** Scheduler Log ****************'") + os.system(f"grep -E 'ERROR|Error|error' -C 15 ./scheduler.log") assert return_code == 0