mirror of https://github.com/vllm-project/vllm
[Frontend] Improve Startup Failure UX (#7716)
commit 970dfdc01d
parent 91f4522cbf

@@ -1,3 +1,5 @@
+import time
+
 import pytest
 
 from vllm.entrypoints.openai.api_server import build_async_engine_client
@@ -8,19 +10,20 @@ from vllm.utils import FlexibleArgumentParser
 @pytest.mark.asyncio
 async def test_mp_crash_detection():
 
-    with pytest.raises(RuntimeError) as excinfo:
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
-        parser = make_arg_parser(parser)
-        args = parser.parse_args([])
-        # use an invalid tensor_parallel_size to trigger the
-        # error in the server
-        args.tensor_parallel_size = 65536
+    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
+    parser = make_arg_parser(parser)
+    args = parser.parse_args([])
+    # use an invalid tensor_parallel_size to trigger the
+    # error in the server
+    args.tensor_parallel_size = 65536
 
-        async with build_async_engine_client(args):
-            pass
-    assert "The server process died before responding to the readiness probe"\
-        in str(excinfo.value)
+    start = time.perf_counter()
+    async with build_async_engine_client(args):
+        pass
+    end = time.perf_counter()
+
+    assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
+                              "if there is an error in the startup.")
 
 
 @pytest.mark.asyncio
@@ -8,7 +8,7 @@ import tempfile
 from argparse import Namespace
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import AsyncIterator, Set
+from typing import AsyncIterator, Optional, Set
 
 from fastapi import APIRouter, FastAPI, Request
 from fastapi.exceptions import RequestValidationError
@@ -60,6 +60,7 @@ openai_serving_embedding: OpenAIServingEmbedding
 openai_serving_tokenization: OpenAIServingTokenization
+prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
 logger = init_logger('vllm.entrypoints.openai.api_server')
 
 _running_tasks: Set[asyncio.Task] = set()
@@ -94,7 +95,15 @@ async def lifespan(app: FastAPI):
 
 @asynccontextmanager
 async def build_async_engine_client(
-        args: Namespace) -> AsyncIterator[AsyncEngineClient]:
+        args: Namespace) -> AsyncIterator[Optional[AsyncEngineClient]]:
+    """
+    Create AsyncEngineClient, either:
+        - in-process using the AsyncLLMEngine Directly
+        - multiprocess using AsyncLLMEngine RPC
+
+    Returns the Client or None if the creation failed.
+    """
+
     # Context manager to handle async_engine_client lifecycle
     # Ensures everything is shutdown and cleaned up on error/exit
     global engine_args
@@ -157,11 +166,13 @@ async def build_async_engine_client(
                 try:
                     await rpc_client.setup()
                     break
-                except TimeoutError as e:
+                except TimeoutError:
                     if not rpc_server_process.is_alive():
-                        raise RuntimeError(
-                            "The server process died before "
-                            "responding to the readiness probe") from e
+                        logger.error(
+                            "RPCServer process died before responding "
+                            "to readiness probe")
+                        yield None
+                        return
 
             yield async_engine_client
         finally:
@@ -410,6 +421,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     logger.info("args: %s", args)
 
     async with build_async_engine_client(args) as async_engine_client:
+        # If None, creation of the client failed and we exit.
+        if async_engine_client is None:
+            return
+
         app = await init_app(async_engine_client, args)
 
         shutdown_task = await serve_http(
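
The pattern the diff relies on is that build_async_engine_client now signals a failed startup by yielding None from the async context manager instead of raising, so its own cleanup still runs and the caller (run_server) can exit quietly after checking for None. A minimal, self-contained sketch of that contract, using a hypothetical stand-in client (build_client, startup_ok) rather than vLLM's real classes:

import asyncio
from contextlib import asynccontextmanager
from typing import AsyncIterator, Optional


@asynccontextmanager
async def build_client(startup_ok: bool) -> AsyncIterator[Optional[str]]:
    # Hypothetical stand-in for build_async_engine_client.
    try:
        if not startup_ok:
            # Yield None instead of raising: the caller sees the failure,
            # and the finally block below still runs for cleanup.
            yield None
            return
        yield "client"
    finally:
        # Stand-in for cleanup that must always run, whether or not
        # startup succeeded (e.g. terminating a spawned server process).
        print("cleanup ran")


async def run() -> None:
    async with build_client(startup_ok=False) as client:
        # Mirrors run_server: if None, creation failed and we exit.
        if client is None:
            return
        print("serving with", client)


asyncio.run(run())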