[Frontend] Publish Prometheus metrics in run_batch API (#7641)

This commit is contained in:
Pooya Davoodi 2024-08-23 23:04:22 -07:00 committed by GitHub
parent 6885fde317
commit 8da48e4d95
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 76 additions and 0 deletions

View File

@ -1,3 +1,7 @@
import subprocess
import sys
import tempfile
import time
from http import HTTPStatus
import openai
@ -177,3 +181,48 @@ async def test_metrics_exist(client: openai.AsyncOpenAI):
for metric in EXPECTED_METRICS:
assert metric in response.text
def test_metrics_exist_run_batch():
    """Check that run_batch serves a /metrics endpoint with --enable-metrics.

    Launches ``vllm.entrypoints.openai.run_batch`` as a subprocess with a
    single embedding request, polls the configured address until it answers
    with HTTP 200, and then asserts that ``/metrics`` returns HTTP 200.

    Raises:
        RuntimeError: if the subprocess exits before the endpoint is up.
        TimeoutError: if the endpoint is not reachable within the startup
            deadline.
    """
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "0.0.0.0"
    port = "8001"
    server_url = f"http://{base_url}:{port}"

    # Upper bounds so a broken subprocess fails the test instead of hanging it.
    startup_timeout_s = 240
    request_timeout_s = 5

    def is_server_up(url):
        """Return True once ``url`` answers an HTTP GET with status 200."""
        try:
            response = requests.get(url, timeout=request_timeout_s)
        except (requests.ConnectionError, requests.Timeout):
            return False
        return response.status_code == 200

    with tempfile.NamedTemporaryFile(
            "w") as input_file, tempfile.NamedTemporaryFile(
                "r") as output_file:
        input_file.write(input_batch)
        # Flush so the subprocess sees the request when it opens the file.
        input_file.flush()
        proc = subprocess.Popen([
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/e5-mistral-7b-instruct",
            "--enable-metrics",
            "--url",
            base_url,
            "--port",
            port,
        ], )

        try:
            deadline = time.monotonic() + startup_timeout_s
            while not is_server_up(server_url):
                # If the process already exited, the endpoint will never
                # come up; fail fast instead of polling forever.
                exit_code = proc.poll()
                if exit_code is not None:
                    raise RuntimeError(
                        "run_batch exited with code "
                        f"{exit_code} before the metrics server was reachable")
                if time.monotonic() > deadline:
                    raise TimeoutError(
                        f"metrics server at {server_url} not reachable "
                        f"within {startup_timeout_s} seconds")
                time.sleep(1)

            response = requests.get(server_url + "/metrics",
                                    timeout=request_timeout_s)
            assert response.status_code == HTTPStatus.OK

            # Let the batch job run to completion before the temporary
            # files are removed.
            proc.wait()
        finally:
            # On any failure path, do not leave the subprocess running.
            if proc.poll() is None:
                proc.kill()
                proc.wait()

View File

@ -3,6 +3,7 @@ from io import StringIO
from typing import Awaitable, Callable, List
import aiohttp
from prometheus_client import start_http_server
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.engine.async_llm_engine import AsyncLLMEngine
@ -56,6 +57,24 @@ def parse_args():
'ID numbers being printed in log.'
'\n\nDefault: Unlimited')
parser.add_argument("--enable-metrics",
action="store_true",
help="Enable Prometheus metrics")
parser.add_argument(
"--url",
type=str,
default="0.0.0.0",
help="URL to the Prometheus metrics server "
"(only needed if enable-metrics is set).",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Port number for the Prometheus metrics server "
"(only needed if enable-metrics is set).",
)
return parser.parse_args()
@ -184,4 +203,12 @@ if __name__ == "__main__":
logger.info("vLLM batch processing API version %s", VLLM_VERSION)
logger.info("args: %s", args)
# Start the Prometheus metrics server. LLMEngine uses the Prometheus client
# to publish metrics at the /metrics endpoint.
if args.enable_metrics:
logger.info("Prometheus metrics enabled")
start_http_server(port=args.port, addr=args.url)
else:
logger.info("Prometheus metrics disabled")
asyncio.run(main(args))