From afa691378a09209d738eea6bffa546e5c58185ab Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Sat, 22 Feb 2025 00:22:12 +0000 Subject: [PATCH] p Signed-off-by: <> --- a.j2 | 345 ------------------ hfs3.py | 208 ----------- .../test_basic_correctness.py | 4 +- tests/metrics/test_metrics.py | 6 +- .../e2e/test_multistep_correctness.py | 90 +++-- vllm/test_utils.py | 150 +++++--- 6 files changed, 161 insertions(+), 642 deletions(-) delete mode 100644 a.j2 delete mode 100644 hfs3.py diff --git a/a.j2 b/a.j2 deleted file mode 100644 index d2e88c398a..0000000000 --- a/a.j2 +++ /dev/null @@ -1,345 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% if branch == "main" %} -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %} -{% set docker_image_cu121 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu121" %} -{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %} -{% endif %} -{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} -{% set hf_home = "/root/.cache/huggingface" %} -{% set list_file_diff = list_file_diff | split("|") %} - -steps: - - label: ":docker: build image" - key: image-build - depends_on: ~ - agents: - {% if branch == "main" %} - queue: cpu_queue_postmerge - {% else %} - queue: cpu_queue_premerge - {% endif %} - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - | - #!/bin/bash - if [[ -z $(docker manifest inspect {{ docker_image }}) ]]; then - echo "Image not found, proceeding with build..." - else - echo "Image found" - exit 0 - fi - - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 2 - - exit_status: -10 # Agent was lost - limit: 2 - - - block: Build CUDA 12.1 image - key: block-build-cu121 - depends_on: ~ - - - label: ":docker: build image CUDA 12.1" - key: image-build-cu121 - depends_on: block-build-cu121 - agents: - {% if branch == "main" %} - queue: cpu_queue_postmerge - {% else %} - queue: cpu_queue_premerge - {% endif %} - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - | - #!/bin/bash - if [[ -z $(docker manifest inspect {{ docker_image_cu121 }}) ]]; then - echo "Image not found, proceeding with build..." - else - echo "Image found" - exit 0 - fi - - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag {{ docker_image_cu121 }} --target test --progress plain ." 
- - "docker push {{ docker_image_cu121 }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 2 - - exit_status: -10 # Agent was lost - limit: 2 - - - block: Build CUDA 11.8 image - key: block-build-cu118 - depends_on: ~ - - - label: ":docker: build image CUDA 11.8" - key: image-build-cu118 - depends_on: block-build-cu118 - agents: - {% if branch == "main" %} - queue: cpu_queue_postmerge - {% else %} - queue: cpu_queue_premerge - {% endif %} - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - | - #!/bin/bash - if [[ -z $(docker manifest inspect {{ docker_image_cu118 }}) ]]; then - echo "Image not found, proceeding with build..." - else - echo "Image found" - exit 0 - fi - - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag {{ docker_image_cu118 }} --target test --progress plain ." - - "docker push {{ docker_image_cu118 }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 2 - - exit_status: -10 # Agent was lost - limit: 2 - - {% for step in steps %} - {% if step.fast_check_only != true %} - - {% set ns = namespace(blocked=1) %} - - {% if run_all == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if nightly == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if step.source_file_dependencies %} - {% for source_file in step.source_file_dependencies %} - {% for file in list_file_diff %} - {% if source_file in file %} - {% set ns.blocked = 0 %} - {% endif %} - {% endfor %} - {% endfor %} - {% else %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - - block: "Run {{ step.label }}" - depends_on: image-build - key: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} - {% endif %} - - - label: "{{ step.label }}" - {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} - {% else %} - depends_on: image-build - {% endif %} - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue_premerge - {% elif step.no_gpu %} - queue: cpu_queue_premerge - {% elif step.gpu == "a100" %} - queue: a100_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - {% if step.num_nodes >= 2%} {# for multi-node test #} - commands: - - ./.buildkite/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ docker_image }} {% for command in step.commands %}"{{ (command | join(" && ")) | safe }}" {% endfor %} - {% endif %} - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 1 - - exit_status: -10 # Agent was lost - limit: 1 - {% if step.num_nodes < 2 %} - plugins: - {% if step.gpu != "a100" %} - - docker#v5.2.0: {# for GPU test #} - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: 
["bash", "-xc", "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label == "Core" %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_HOME={{ hf_home }} - - HF_TOKEN - {% if branch == "main" %} - - BUILDKITE_ANALYTICS_TOKEN - {% endif %} - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - - {{ hf_home }}:{{ hf_home }} - {% else %} {# A100 is managed on EKS #} - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: {{ docker_image }} - command: - - bash -c '(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label in ["Basic Correctness Test", "Basic Models Test", "Entrypoints Test", "Metrics, Tracing Test", "Async Engine, Inputs, Utils, Worker Test", "Samplers Test", "Engine Test"] %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}' - resources: - limits: - nvidia.com/gpu: {{ step.num_gpus or 1 }} - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: {{ hf_home }} - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: {{ hf_home }} - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: {{ hf_home }} - type: Directory - {% endif %} - {% endif %} - {% endif %} - {% endfor %} - - - group: "AMD Tests" - depends_on: ~ - steps: - - label: "AMD: :docker: build image" - depends_on: ~ - soft_fail: true - commands: - # Handle the introduction of test target in Dockerfile.rocm - - "grep -i 'from base as test' Dockerfile.rocm && docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain . || docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ." 
- - "docker push {{ docker_image_amd }}" - key: "amd-build" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 1 - - exit_status: -10 # Agent was lost - limit: 1 - - exit_status: 1 # Machine occasionally fail - limit: 1 - agents: - queue: amd-cpu - - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - depends_on: amd-build - agents: - {% if step.amd_gpu_type and step.amd_gpu_type=="mi300"%} - queue: amd_mi300 - {% else%} - queue: amd_gpu - {% endif%} - - command: bash .buildkite/run-amd-test.sh "(command rocm-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: true - - - block: "Run Intel CPU test" - depends_on: ~ - key: block-intel-cpu - - - label: "Intel CPU Test" - depends_on: block-intel-cpu - soft_fail: true - agents: - queue: intel-cpu - command: bash .buildkite/run-cpu-test.sh - - - label: "Intel HPU Test" - depends_on: ~ - soft_fail: true - agents: - queue: intel-hpu - command: bash .buildkite/run-hpu-test.sh - - - block: "Run Intel GPU test" - depends_on: ~ - key: block-intel-gpu - - - label: "Intel GPU Test" - soft_fail: true - depends_on: block-intel-gpu - agents: - queue: intel-gpu - command: bash .buildkite/run-xpu-test.sh - - - label: "IBM Power(ppc64le) CPU Test" - depends_on: ~ - soft_fail: true - agents: - queue: ibm-ppc64le - command: bash .buildkite/run-cpu-test-ppc64le.sh - - {% if nightly == "1" %} - - label: "GH200 Test" - depends_on: ~ - soft_fail: true - agents: - queue: gh200_queue - command: nvidia-smi && bash .buildkite/run-gh200-test.sh - {% endif %} - - - label: "TPU Test" - depends_on: ~ - soft_fail: True - agents: - queue: tpu_queue - commands: - - if [[ -f ".buildkite/run-tpu-test.sh" ]]; then bash .buildkite/run-tpu-test.sh; fi - - yes | docker system prune -a diff --git a/hfs3.py b/hfs3.py deleted file mode 100644 index d79543a407..0000000000 --- a/hfs3.py +++ /dev/null @@ -1,208 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import logging -import os -import shutil - -import boto3 -from huggingface_hub import HfApi, snapshot_download -from tqdm import tqdm - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -class ModelTransfer: - - def __init__(self, - model_id, - s3_bucket, - aws_access_key_id=None, - aws_secret_access_key=None, - aws_region=None): - """ - Initialize the ModelTransfer class. - - Args: - model_id (str): HuggingFace model ID - s3_bucket (str): Name of the S3 bucket - aws_access_key_id (str, optional) - aws_secret_access_key (str, optional) - aws_region (str, optional): AWS region. Defaults to None. - """ - self.model_id = model_id - self.s3_bucket = s3_bucket - self.model_name = model_id.split('/')[-1] - - # Initialize S3 client - self.s3_client = boto3.client( - 's3', - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - region_name=aws_region) - - # Initialize Hugging Face API - self.hf_api = HfApi() - - def download_model(self, local_dir): - """ - Download the model from HuggingFace. 
- - Args: - local_dir (str): Local directory to save the model - - Returns: - str: Path to the downloaded model directory - """ - logger.info("Downloading model %s...", self.model_id) - - try: - local_dir_with_model = os.path.join(local_dir, self.model_name) - snapshot_download(repo_id=self.model_id, - local_dir=local_dir_with_model, - local_dir_use_symlinks=False, - token=os.getenv("HF_TOKEN")) - logger.info("Model downloaded successfully to %s", - local_dir_with_model) - return local_dir_with_model - - except Exception as e: - logger.error("Error downloading model: %s", str(e)) - raise - - def upload_to_s3(self, local_dir): - """ - Upload the model directory to S3. - - Args: - local_dir (str): Local directory containing the model files - """ - logger.info("Uploading model to S3 bucket %s...", self.s3_bucket) - - try: - # Walk through all files in the directory - for root, _, files in os.walk(local_dir): - for filename in files: - # Get the full local path - local_path = os.path.join(root, filename) - - # Calculate S3 path (preserve directory structure) - relative_path = os.path.relpath(local_path, local_dir) - s3_path = f"{self.model_name}/{relative_path}" - - # Upload file with progress bar - file_size = os.path.getsize(local_path) - with tqdm(total=file_size, - unit='B', - unit_scale=True, - desc=f"Uploading {filename}") as pbar: - self.s3_client.upload_file( - local_path, - self.s3_bucket, - s3_path, - Callback=lambda bytes_transferred: pbar.update( - bytes_transferred)) - - logger.info("Uploaded %s to s3://%s/%s", filename, - self.s3_bucket, s3_path) - - logger.info("Model upload completed successfully!") - - except Exception as e: - logger.error("Error uploading to S3: %s", str(e)) - raise - - -# "ibm/PowerMoE-3b", "internlm/internlm-chat-7b", -# "internlm/internlm2-chat-7b", "OpenGVLab/Mono-InternVL-2B", -# "internlm/internlm3-8b-instruct", "inceptionai/jais-13b-chat", -# "ai21labs/AI21-Jamba-1.5-Mini", "meta-llama/Meta-Llama-3-8B", -# "decapoda-research/llama-7b-hf", "state-spaces/mamba-130m-hf", -# "tiiuae/falcon-mamba-7b-instruct", "openbmb/MiniCPM-2B-sft-bf16", -# "openbmb/MiniCPM3-4B", "mistralai/Mistral-7B-Instruct-v0.1", -# "mistralai/Mixtral-8x7B-Instruct-v0.1", -# "mistral-community/Mixtral-8x22B-v0.1-AWQ", "mpt", "mosaicml/mpt-7b", -# "nvidia/Minitron-8B-Base", "allenai/OLMo-1B-hf", -# "shanearora/OLMo-7B-1124-hf", "allenai/OLMoE-1B-7B-0924-Instruct", -# "facebook/opt-iml-max-1.3b", "OrionStarAI/Orion-14B-Chat", -# "adept/persimmon-8b-chat", "microsoft/phi-2", -# "microsoft/Phi-3-mini-4k-instruct", -# "microsoft/Phi-3-small-8k-instruct", "microsoft/Phi-3.5-MoE-instruct", -# "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat", -# "tiiuae/falcon-40b", "stabilityai/stablelm-zephyr-3b", -# "stabilityai/stablelm-3b-4e1t", "bigcode/starcoder2-3b", -# "upstage/solar-pro-preview-instruct", "Tele-AI/TeleChat2-3B", -# "xverse/XVERSE-7B-Chat", "facebook/bart-base", -# "facebook/bart-large-cnn", "microsoft/Florence-2-base", -# "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2", -# "parasail-ai/GritLM-7B-vllm", "internlm/internlm2-1_8b-reward", -# "ai21labs/Jamba-tiny-reward-dev", "llama", -# "intfloat/e5-mistral-7b-instruct", -# "ssmits/Qwen2-7B-Instruct-embed-base", "Qwen/Qwen2.5-Math-RM-72B", -# "Qwen/Qwen2.5-Math-PRM-7B", "jason9693/Qwen2.5-1.5B-apeach", -# "sentence-transformers/stsb-roberta-base-v2", -# "sentence-transformers/all-roberta-large-v1", -# "intfloat/multilingual-e5-large", "royokong/e5-v", -# "TIGER-Lab/VLM2Vec-Full", "MrLight/dse-qwen2-2b-mrl-v1", -# 
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", -# "cross-encoder/ms-marco-MiniLM-L-6-v2", -# "cross-encoder/quora-roberta-base", "BAAI/bge-reranker-v2-m3", -# "THUDM/glm-4v-9b", "chatglm2-6b", "deepseek-ai/deepseek-vl2-tiny", -# "adept/fuyu-8b", "h2oai/h2ovl-mississippi-800m", -# "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3", -# "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", -# "llava-hf/LLaVA-NeXT-Video-7B-hf", -# "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", -# "TIGER-Lab/Mantis-8B-siglip-llama3", "openbmb/MiniCPM-o-2_6", -# "openbmb/MiniCPM-V-2_6", "allenai/Molmo-7B-D-0924", -# "nvidia/NVLM-D-72B", "google/paligemma-3b-pt-224", -# "microsoft/Phi-3-vision-128k-instruct", "mistralai/Pixtral-12B-2409", -# "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-Audio-7B-Instruct", -# "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct", -# "fixie-ai/ultravox-v0_5-llama-3_2-1b", -# "meta-llama/Llama-3.2-11B-Vision-Instruct", "openai/whisper-large-v3", -# "JackFram/llama-68m", "JackFram/llama-68m", "JackFram/llama-160m", -# "ArthurZ/Ilama-3.2-1B" - - -def main(): - # Configuration - MODEL_ID = [ - "HuggingFaceH4/zephyr-7b-beta", - "llava-hf/llava-1.5-7b-hf", - "ArthurZ/Ilama-3.2-1B", - "meta-llama/Llama-2-7b-hf", - ] - S3_BUCKET = "vllm-ci-model-weights" - # Local directory to temporarily store the model - LOCAL_DIR = "/home/ec2-user/models" - - AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") - AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") - AWS_REGION = "us-west-2" - - # Create transfer object - for model_id in MODEL_ID: - transfer = ModelTransfer(model_id=model_id, - s3_bucket=S3_BUCKET, - aws_access_key_id=AWS_ACCESS_KEY_ID, - aws_secret_access_key=AWS_SECRET_ACCESS_KEY, - aws_region=AWS_REGION) - - try: - # Create local directory if it doesn't exist - os.makedirs(LOCAL_DIR, exist_ok=True) - - # Download model - model_dir = transfer.download_model(LOCAL_DIR) - - # Upload to S3 and cleanup - transfer.upload_to_s3(model_dir) - shutil.rmtree(model_dir) - - except Exception as e: - logger.error("Error in transfer process: %s", str(e)) - raise - - -if __name__ == "__main__": - main() diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index b8eea638bf..816c694e38 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -97,8 +97,8 @@ def test_models( "test_suite", [ ("distilbert/distilgpt2", "ray", "", "L4"), ("distilbert/distilgpt2", "mp", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), + ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"), + ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"), ("distilbert/distilgpt2", "ray", "", "A100"), ("distilbert/distilgpt2", "mp", "", "A100"), ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"), diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 7c126e1047..ee8307749f 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -13,7 +13,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams -from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET MODELS = [ "distilbert/distilgpt2", @@ -142,9 +141,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, metrics_tag_content = 
stat_logger.labels["model_name"] if served_model_name is None or served_model_name == []: - actual_model_name = model - assert metrics_tag_content == f"{MODEL_WEIGHTS_S3_BUCKET}/{actual_model_name}", ( # noqa: E501 - f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n" + assert metrics_tag_content == model, ( + f"Metrics tag model_name is wrong! expect: {model!r}\n" f"actual: {metrics_tag_content!r}") else: assert metrics_tag_content == served_model_name[0], ( diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index f26192d27e..d396e52a9d 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -52,7 +52,7 @@ from .conftest import (get_output_from_llm_generator, [{ # Use a small model for a fast test. # Note this is repeated in the test body; to initialize a tokenizer. - "model": "JackFram/llama-160m", + "model": "JackFram/llama-68m", # Skip cuda graph recording for fast test. "enforce_eager": True, @@ -61,14 +61,14 @@ from .conftest import (get_output_from_llm_generator, "per_test_common_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, }, { # Chunked prefill enabled with small value # to make sure we get mixed batches. - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -119,7 +119,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, for token_ids in batch_token_ids] == ([output_len] * batch_size) # Expect detokenized string to match. - tok = AutoTokenizer.from_pretrained("JackFram/llama-160m") + tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) print(f"{actual_token_ids=}") @@ -135,20 +135,27 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, # Print spec metrics. "disable_log_stats": False, }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-160m", - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. + { + "model_name": "JackFram/llama-68m", + }, + { + "model_name": "JackFram/llama-160m", + }, + ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, "disable_logprobs_during_spec_decoding": False }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -208,7 +215,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( # Try two different tiny base models. # Note that one is equal to the draft model, another isn't. 
{ - "model_name": "JackFram/llama-160m", + "model_name": "JackFram/llama-68m", }, { "model_name": "JackFram/llama-160m", @@ -217,12 +224,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -261,20 +268,27 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( # Skip cuda graph recording for fast test. "enforce_eager": True, }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-160m", - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. + { + "model_name": "JackFram/llama-68m", + }, + { + "model_name": "JackFram/llama-160m", + }, + ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -322,12 +336,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -377,12 +391,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -435,12 +449,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -500,12 +514,12 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": False, }, { 
- "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -553,7 +567,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, "test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, # Artificially limit the draft model max model len; this forces vLLM @@ -562,7 +576,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, "enable_chunked_prefill": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -613,13 +627,13 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "speculative_disable_by_batch_size": 2, "enable_chunked_prefill": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "speculative_disable_by_batch_size": 2, "enable_chunked_prefill": True, @@ -651,7 +665,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize( "common_llm_kwargs", [{ - "model_name": "JackFram/llama-160m", + "model_name": "JackFram/llama-68m", # Skip cuda graph recording for fast test. "enforce_eager": True, @@ -662,14 +676,14 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, "test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, "enable_chunked_prefill": False, } # Try a range of common k, as well as large speculation. for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] ] + [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, "enable_chunked_prefill": True, "max_num_batched_tokens": 4, @@ -715,7 +729,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, "test_llm_kwargs", [ { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, "spec_decoding_acceptance_method": "typical_acceptance_sampler", "enable_chunked_prefill": False @@ -723,7 +737,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, # Try a range of common k. 
for k in [1, 2, 3] ] + [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, "spec_decoding_acceptance_method": "typical_acceptance_sampler", "enable_chunked_prefill": True, diff --git a/vllm/test_utils.py b/vllm/test_utils.py index 920bc1be8d..dcff69f2b9 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -1,71 +1,131 @@ # SPDX-License-Identifier: Apache-2.0 MODELS_ON_S3 = [ + "adept/fuyu-8b", + "ai21labs/AI21-Jamba-1.5-Mini", + "ai21labs/Jamba-tiny-random", + "ai21labs/Jamba-tiny-reward-dev", + "allenai/Molmo-7B-D-0924", + "allenai/OLMo-1B-hf", + "allenai/OLMoE-1B-7B-0924-Instruct", + "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test", + "AMead10/Llama-3.2-1B-Instruct-AWQ", + "ArthurZ/Ilama-3.2-1B", + "BAAI/bge-base-en-v1.5", + "BAAI/bge-multilingual-gemma2", + "BAAI/bge-reranker-v2-m3", + "bigcode/starcoder2-3b", + "cross-encoder/ms-marco-MiniLM-L-6-v2", + "cross-encoder/quora-roberta-base", + "deepseek-ai/deepseek-vl2-tiny", "distilbert/distilgpt2", + "facebook/bart-base", + "facebook/bart-large-cnn", + "fixie-ai/ultravox-v0_5-llama-3_2-1b", + "google/gemma-1.1-2b-it", + "google/gemma-2-2b-it", + "google/paligemma-3b-pt-224", + "h2oai/h2ovl-mississippi-800m", + "HuggingFaceM4/Idefics3-8B-Llama3", + "internlm/internlm2-1_8b-reward", + "intfloat/e5-mistral-7b-instruct", + "intfloat/multilingual-e5-large", + "JackFram/llama-160m", + "jason9693/Qwen2.5-1.5B-apeach", + "llava-hf/llava-1.5-7b-hf", + "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + "llava-hf/LLaVA-NeXT-Video-7B-hf", "meta-llama/Llama-2-7b-hf", - "meta-llama/Meta-Llama-3-8B", + "meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.2-1B", "meta-llama/Llama-3.2-1B-Instruct", - "google/gemma-2-2b-it", - "google/gemma-1.1-2b-it", - "openai-community/gpt2", - "ArthurZ/Ilama-3.2-1B", - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-v1.6-mistral-7b-hf", - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "ai21labs/Jamba-tiny-random", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", - "nm-testing/Phi-3-mini-128k-instruct-FP8", - "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", - "microsoft/Phi-3.5-vision-instruct", + "meta-llama/Meta-Llama-3-8B", + "microsoft/phi-2", + "microsoft/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-small-8k-instruct", "microsoft/Phi-3-vision-128k-instruct", - "AMead10/Llama-3.2-1B-Instruct-AWQ", - "shuyuej/Llama-3.2-1B-Instruct-GPTQ", + "microsoft/Phi-3.5-MoE-instruct", + "microsoft/Phi-3.5-vision-instruct", + "mistralai/Mistral-7B-Instruct-v0.1", + "mistralai/Mixtral-8x7B-Instruct-v0.1", + "mistralai/Pixtral-12B-2409", + "mistral-community/Mixtral-8x22B-v0.1-AWQ", "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", - "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8", - "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test", - "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", - "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", - "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", 
"neuralmagic/Llama-3.2-1B-quantized.w8a8", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", - "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", - "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", - "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", - "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", - "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", - "nm-testing/tinyllama-oneshot-w4a16-channel-v2", - "nm-testing/tinyllama-oneshot-w4a16-group128-v2", - "nm-testing/tinyllama-oneshot-w8a16-per-channel", + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8", + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", + "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", + "nm-testing/llama2.c-stories42M-pruned2.4-compressed", "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t", "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test", - "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme", "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Phi-3-mini-128k-instruct-FP8", + "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", + "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", + "nm-testing/tinyllama-oneshot-w4a16-channel-v2", + "nm-testing/tinyllama-oneshot-w4a16-group128-v2", + "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", + "nm-testing/tinyllama-oneshot-w8a16-per-channel", + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", + "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", + "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", + "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor", "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM", "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", 
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor", - "nm-testing/llama2.c-stories42M-pruned2.4-compressed", - "Qwen/Qwen2.5-1.5B-Instruct", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", + "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme", + "nvidia/NVLM-D-72B", "openai-community/gpt2", + "openai/whisper-large-v3", + "openbmb/MiniCPM-o-2_6", + "openbmb/MiniCPM-V-2_6", + "OpenGVLab/InternVL2-1B", + "OrionStarAI/Orion-14B-Chat", + "parasail-ai/GritLM-7B-vllm", + "Qwen/Qwen1.5-MoE-A2.7B-Chat", + "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen2-Audio-7B-Instruct", + "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2.5-1.5B-Instruct", + "Qwen/Qwen2.5-Math-PRM-7B", + "Qwen/Qwen2.5-Math-RM-72B", + "Qwen/Qwen2.5-VL-3B-Instruct", + "royokong/e5-v", + "sentence-transformers/all-roberta-large-v1", + "sentence-transformers/stsb-roberta-base-v2", + "shanearora/OLMo-7B-1124-hf", + "shuyuej/Llama-3.2-1B-Instruct-GPTQ", + "ssmits/Qwen2-7B-Instruct-embed-base", + "stabilityai/stablelm-3b-4e1t", + "stabilityai/stablelm-zephyr-3b", + "state-spaces/mamba-130m-hf", + "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", + "THUDM/glm-4v-9b", + "TIGER-Lab/Mantis-8B-siglip-llama3", + "TIGER-Lab/VLM2Vec-Full", + "tiiuae/falcon-40b", + "tiiuae/falcon-mamba-7b-instruct", + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "upstage/solar-pro-preview-instruct", ] MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"