EC2 Default User 2025-02-22 00:22:12 +00:00
parent 8c506d7c76
commit afa691378a
6 changed files with 161 additions and 642 deletions

a.j2
View File

@@ -1,345 +0,0 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% if branch == "main" %}
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %}
{% set docker_image_cu121 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu121" %}
{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %}
{% endif %}
{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}
{% set list_file_diff = list_file_diff | split("|") %}
steps:
- label: ":docker: build image"
key: image-build
depends_on: ~
agents:
{% if branch == "main" %}
queue: cpu_queue_postmerge
{% else %}
queue: cpu_queue_premerge
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image }}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
- "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- block: Build CUDA 12.1 image
key: block-build-cu121
depends_on: ~
- label: ":docker: build image CUDA 12.1"
key: image-build-cu121
depends_on: block-build-cu121
agents:
{% if branch == "main" %}
queue: cpu_queue_postmerge
{% else %}
queue: cpu_queue_premerge
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image_cu121 }}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
- "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag {{ docker_image_cu121 }} --target test --progress plain ."
- "docker push {{ docker_image_cu121 }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- block: Build CUDA 11.8 image
key: block-build-cu118
depends_on: ~
- label: ":docker: build image CUDA 11.8"
key: image-build-cu118
depends_on: block-build-cu118
agents:
{% if branch == "main" %}
queue: cpu_queue_postmerge
{% else %}
queue: cpu_queue_premerge
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image_cu118 }}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
- "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag {{ docker_image_cu118 }} --target test --progress plain ."
- "docker push {{ docker_image_cu118 }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
{% for step in steps %}
{% if step.fast_check_only != true %}
{% set ns = namespace(blocked=1) %}
{% if run_all == "1" %}
{% set ns.blocked = 0 %}
{% endif %}
{% if nightly == "1" %}
{% set ns.blocked = 0 %}
{% endif %}
{% if step.source_file_dependencies %}
{% for source_file in step.source_file_dependencies %}
{% for file in list_file_diff %}
{% if source_file in file %}
{% set ns.blocked = 0 %}
{% endif %}
{% endfor %}
{% endfor %}
{% else %}
{% set ns.blocked = 0 %}
{% endif %}
{% if ns.blocked == 1 or (step.optional and nightly != "1") %}
- block: "Run {{ step.label }}"
depends_on: image-build
key: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }}
{% endif %}
- label: "{{ step.label }}"
{% if ns.blocked == 1 or (step.optional and nightly != "1") %}
depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }}
{% else %}
depends_on: image-build
{% endif %}
agents:
{% if step.label == "Documentation Build" %}
queue: small_cpu_queue_premerge
{% elif step.no_gpu %}
queue: cpu_queue_premerge
{% elif step.gpu == "a100" %}
queue: a100_queue
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
queue: gpu_4_queue
{% else %}
queue: gpu_1_queue
{% endif %}
{% if step.num_nodes >= 2%} {# for multi-node test #}
commands:
- ./.buildkite/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ docker_image }} {% for command in step.commands %}"{{ (command | join(" && ")) | safe }}" {% endfor %}
{% endif %}
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
{% if step.num_nodes < 2 %}
plugins:
{% if step.gpu != "a100" %}
- docker#v5.2.0: {# for GPU test #}
image: {{ docker_image }}
always-pull: true
propagate-environment: true
{% if not step.no_gpu %}
gpus: all
{% endif %}
{% if step.label == "Benchmarks" %}
mount-buildkite-agent: true
{% endif %}
command: ["bash", "-xc", "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label == "Core" %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
environment:
- VLLM_USAGE_SOURCE=ci-test
- HF_HOME={{ hf_home }}
- HF_TOKEN
{% if branch == "main" %}
- BUILDKITE_ANALYTICS_TOKEN
{% endif %}
{% if step.label == "Speculative decoding tests" %}
- VLLM_ATTENTION_BACKEND=XFORMERS
{% endif %}
volumes:
- /dev/shm:/dev/shm
- {{ hf_home }}:{{ hf_home }}
{% else %} {# A100 is managed on EKS #}
- kubernetes:
podSpec:
priorityClassName: ci
containers:
- image: {{ docker_image }}
command:
- bash -c '(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label in ["Basic Correctness Test", "Basic Models Test", "Entrypoints Test", "Metrics, Tracing Test", "Async Engine, Inputs, Utils, Worker Test", "Samplers Test", "Engine Test"] %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'
resources:
limits:
nvidia.com/gpu: {{ step.num_gpus or 1 }}
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: {{ hf_home }}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: {{ hf_home }}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: {{ hf_home }}
type: Directory
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
- group: "AMD Tests"
depends_on: ~
steps:
- label: "AMD: :docker: build image"
depends_on: ~
soft_fail: true
commands:
# Handle the introduction of test target in Dockerfile.rocm
- "grep -i 'from base as test' Dockerfile.rocm && docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain . || docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ."
- "docker push {{ docker_image_amd }}"
key: "amd-build"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fails
limit: 1
agents:
queue: amd-cpu
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
depends_on: amd-build
agents:
{% if step.amd_gpu_type and step.amd_gpu_type=="mi300"%}
queue: amd_mi300
{% else%}
queue: amd_gpu
{% endif%}
command: bash .buildkite/run-amd-test.sh "(command rocm-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
priority: 100
soft_fail: true
{% endif %}
{% endfor %}
- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
- block: "Run Intel CPU test"
depends_on: ~
key: block-intel-cpu
- label: "Intel CPU Test"
depends_on: block-intel-cpu
soft_fail: true
agents:
queue: intel-cpu
command: bash .buildkite/run-cpu-test.sh
- label: "Intel HPU Test"
depends_on: ~
soft_fail: true
agents:
queue: intel-hpu
command: bash .buildkite/run-hpu-test.sh
- block: "Run Intel GPU test"
depends_on: ~
key: block-intel-gpu
- label: "Intel GPU Test"
soft_fail: true
depends_on: block-intel-gpu
agents:
queue: intel-gpu
command: bash .buildkite/run-xpu-test.sh
- label: "IBM Power(ppc64le) CPU Test"
depends_on: ~
soft_fail: true
agents:
queue: ibm-ppc64le
command: bash .buildkite/run-cpu-test-ppc64le.sh
{% if nightly == "1" %}
- label: "GH200 Test"
depends_on: ~
soft_fail: true
agents:
queue: gh200_queue
command: nvidia-smi && bash .buildkite/run-gh200-test.sh
{% endif %}
- label: "TPU Test"
depends_on: ~
soft_fail: True
agents:
queue: tpu_queue
commands:
- if [[ -f ".buildkite/run-tpu-test.sh" ]]; then bash .buildkite/run-tpu-test.sh; fi
- yes | docker system prune -a
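A minimal sketch of how a template like the one above can be rendered locally with plain Jinja2, assuming it is saved as a.j2 next to a test-pipeline.yaml that supplies the steps list; the split filter is registered by hand because it is not a Jinja2 built-in, and the real pipeline generator may wire this up differently:

import os

import yaml
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("."))
# `split` is not a built-in Jinja2 filter; register one for the
# `list_file_diff | split("|")` expression at the top of the template.
env.filters["split"] = lambda value, sep: value.split(sep)

template = env.get_template("a.j2")

with open("test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]
for step in steps:
    # The template compares step.num_nodes numerically, so give it a default.
    step.setdefault("num_nodes", 1)

print(template.render(
    steps=steps,
    branch=os.environ.get("BUILDKITE_BRANCH", ""),
    run_all=os.environ.get("RUN_ALL", "0"),
    nightly=os.environ.get("NIGHTLY", "0"),
    # Pipe-separated list of changed files; the template splits it itself.
    list_file_diff=os.environ.get("LIST_FILE_DIFF", ""),
))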

hfs3.py
View File

@@ -1,208 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import shutil
import boto3
from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ModelTransfer:
def __init__(self,
model_id,
s3_bucket,
aws_access_key_id=None,
aws_secret_access_key=None,
aws_region=None):
"""
Initialize the ModelTransfer class.
Args:
model_id (str): HuggingFace model ID
s3_bucket (str): Name of the S3 bucket
aws_access_key_id (str, optional): AWS access key ID. Defaults to None.
aws_secret_access_key (str, optional): AWS secret access key. Defaults to None.
aws_region (str, optional): AWS region. Defaults to None.
"""
self.model_id = model_id
self.s3_bucket = s3_bucket
self.model_name = model_id.split('/')[-1]
# Initialize S3 client
self.s3_client = boto3.client(
's3',
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
region_name=aws_region)
# Initialize Hugging Face API
self.hf_api = HfApi()
def download_model(self, local_dir):
"""
Download the model from HuggingFace.
Args:
local_dir (str): Local directory to save the model
Returns:
str: Path to the downloaded model directory
"""
logger.info("Downloading model %s...", self.model_id)
try:
local_dir_with_model = os.path.join(local_dir, self.model_name)
snapshot_download(repo_id=self.model_id,
local_dir=local_dir_with_model,
local_dir_use_symlinks=False,
token=os.getenv("HF_TOKEN"))
logger.info("Model downloaded successfully to %s",
local_dir_with_model)
return local_dir_with_model
except Exception as e:
logger.error("Error downloading model: %s", str(e))
raise
def upload_to_s3(self, local_dir):
"""
Upload the model directory to S3.
Args:
local_dir (str): Local directory containing the model files
"""
logger.info("Uploading model to S3 bucket %s...", self.s3_bucket)
try:
# Walk through all files in the directory
for root, _, files in os.walk(local_dir):
for filename in files:
# Get the full local path
local_path = os.path.join(root, filename)
# Calculate S3 path (preserve directory structure)
relative_path = os.path.relpath(local_path, local_dir)
s3_path = f"{self.model_name}/{relative_path}"
# Upload file with progress bar
file_size = os.path.getsize(local_path)
with tqdm(total=file_size,
unit='B',
unit_scale=True,
desc=f"Uploading {filename}") as pbar:
self.s3_client.upload_file(
local_path,
self.s3_bucket,
s3_path,
Callback=lambda bytes_transferred: pbar.update(
bytes_transferred))
logger.info("Uploaded %s to s3://%s/%s", filename,
self.s3_bucket, s3_path)
logger.info("Model upload completed successfully!")
except Exception as e:
logger.error("Error uploading to S3: %s", str(e))
raise
# "ibm/PowerMoE-3b", "internlm/internlm-chat-7b",
# "internlm/internlm2-chat-7b", "OpenGVLab/Mono-InternVL-2B",
# "internlm/internlm3-8b-instruct", "inceptionai/jais-13b-chat",
# "ai21labs/AI21-Jamba-1.5-Mini", "meta-llama/Meta-Llama-3-8B",
# "decapoda-research/llama-7b-hf", "state-spaces/mamba-130m-hf",
# "tiiuae/falcon-mamba-7b-instruct", "openbmb/MiniCPM-2B-sft-bf16",
# "openbmb/MiniCPM3-4B", "mistralai/Mistral-7B-Instruct-v0.1",
# "mistralai/Mixtral-8x7B-Instruct-v0.1",
# "mistral-community/Mixtral-8x22B-v0.1-AWQ", "mpt", "mosaicml/mpt-7b",
# "nvidia/Minitron-8B-Base", "allenai/OLMo-1B-hf",
# "shanearora/OLMo-7B-1124-hf", "allenai/OLMoE-1B-7B-0924-Instruct",
# "facebook/opt-iml-max-1.3b", "OrionStarAI/Orion-14B-Chat",
# "adept/persimmon-8b-chat", "microsoft/phi-2",
# "microsoft/Phi-3-mini-4k-instruct",
# "microsoft/Phi-3-small-8k-instruct", "microsoft/Phi-3.5-MoE-instruct",
# "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat",
# "tiiuae/falcon-40b", "stabilityai/stablelm-zephyr-3b",
# "stabilityai/stablelm-3b-4e1t", "bigcode/starcoder2-3b",
# "upstage/solar-pro-preview-instruct", "Tele-AI/TeleChat2-3B",
# "xverse/XVERSE-7B-Chat", "facebook/bart-base",
# "facebook/bart-large-cnn", "microsoft/Florence-2-base",
# "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2",
# "parasail-ai/GritLM-7B-vllm", "internlm/internlm2-1_8b-reward",
# "ai21labs/Jamba-tiny-reward-dev", "llama",
# "intfloat/e5-mistral-7b-instruct",
# "ssmits/Qwen2-7B-Instruct-embed-base", "Qwen/Qwen2.5-Math-RM-72B",
# "Qwen/Qwen2.5-Math-PRM-7B", "jason9693/Qwen2.5-1.5B-apeach",
# "sentence-transformers/stsb-roberta-base-v2",
# "sentence-transformers/all-roberta-large-v1",
# "intfloat/multilingual-e5-large", "royokong/e5-v",
# "TIGER-Lab/VLM2Vec-Full", "MrLight/dse-qwen2-2b-mrl-v1",
# "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
# "cross-encoder/ms-marco-MiniLM-L-6-v2",
# "cross-encoder/quora-roberta-base", "BAAI/bge-reranker-v2-m3",
# "THUDM/glm-4v-9b", "chatglm2-6b", "deepseek-ai/deepseek-vl2-tiny",
# "adept/fuyu-8b", "h2oai/h2ovl-mississippi-800m",
# "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3",
# "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
# "llava-hf/LLaVA-NeXT-Video-7B-hf",
# "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
# "TIGER-Lab/Mantis-8B-siglip-llama3", "openbmb/MiniCPM-o-2_6",
# "openbmb/MiniCPM-V-2_6", "allenai/Molmo-7B-D-0924",
# "nvidia/NVLM-D-72B", "google/paligemma-3b-pt-224",
# "microsoft/Phi-3-vision-128k-instruct", "mistralai/Pixtral-12B-2409",
# "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-Audio-7B-Instruct",
# "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
# "fixie-ai/ultravox-v0_5-llama-3_2-1b",
# "meta-llama/Llama-3.2-11B-Vision-Instruct", "openai/whisper-large-v3",
# "JackFram/llama-68m", "JackFram/llama-68m", "JackFram/llama-160m",
# "ArthurZ/Ilama-3.2-1B"
def main():
# Configuration
MODEL_ID = [
"HuggingFaceH4/zephyr-7b-beta",
"llava-hf/llava-1.5-7b-hf",
"ArthurZ/Ilama-3.2-1B",
"meta-llama/Llama-2-7b-hf",
]
S3_BUCKET = "vllm-ci-model-weights"
# Local directory to temporarily store the model
LOCAL_DIR = "/home/ec2-user/models"
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = "us-west-2"
# Create transfer object
for model_id in MODEL_ID:
transfer = ModelTransfer(model_id=model_id,
s3_bucket=S3_BUCKET,
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
aws_region=AWS_REGION)
try:
# Create local directory if it doesn't exist
os.makedirs(LOCAL_DIR, exist_ok=True)
# Download model
model_dir = transfer.download_model(LOCAL_DIR)
# Upload to S3 and cleanup
transfer.upload_to_s3(model_dir)
shutil.rmtree(model_dir)
except Exception as e:
logger.error("Error in transfer process: %s", str(e))
raise
if __name__ == "__main__":
main()
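A quick way to sanity-check a transfer is to list what landed under the model's prefix in S3; a hedged sketch, assuming the short-name key layout used by upload_to_s3 above (model_in_bucket is a hypothetical helper):

import boto3

def model_in_bucket(model_id: str, bucket: str = "vllm-ci-model-weights") -> bool:
    """Return True if at least one object exists under the model's S3 prefix."""
    # upload_to_s3 above keys objects by the short model name (after the final "/").
    prefix = model_id.split("/")[-1] + "/"
    s3 = boto3.client("s3")
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    return resp.get("KeyCount", 0) > 0

if __name__ == "__main__":
    for model_id in ("HuggingFaceH4/zephyr-7b-beta", "llava-hf/llava-1.5-7b-hf"):
        print(model_id, "->", model_in_bucket(model_id))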

View File

@@ -97,8 +97,8 @@ def test_models(
"test_suite", [
("distilbert/distilgpt2", "ray", "", "L4"),
("distilbert/distilgpt2", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("distilbert/distilgpt2", "ray", "", "A100"),
("distilbert/distilgpt2", "mp", "", "A100"),
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),

View File

@@ -13,7 +13,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
MODELS = [
"distilbert/distilgpt2",
@@ -142,9 +141,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
metrics_tag_content = stat_logger.labels["model_name"]
if served_model_name is None or served_model_name == []:
actual_model_name = model
assert metrics_tag_content == f"{MODEL_WEIGHTS_S3_BUCKET}/{actual_model_name}", ( # noqa: E501
f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
assert metrics_tag_content == model, (
f"Metrics tag model_name is wrong! expect: {model!r}\n"
f"actual: {metrics_tag_content!r}")
else:
assert metrics_tag_content == served_model_name[0], (

View File

@@ -52,7 +52,7 @@ from .conftest import (get_output_from_llm_generator,
[{
# Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a tokenizer.
"model": "JackFram/llama-160m",
"model": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -61,14 +61,14 @@ from .conftest import (get_output_from_llm_generator,
"per_test_common_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
# Chunked prefill enabled with small value
# to make sure we get mixed batches.
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -119,7 +119,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
for token_ids in batch_token_ids] == ([output_len] * batch_size)
# Expect detokenized string to match.
tok = AutoTokenizer.from_pretrained("JackFram/llama-160m")
tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
expected_tokens = tok.decode(actual_token_ids)
print(f"{actual_token_ids=}")
@@ -135,20 +135,27 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Print spec metrics.
"disable_log_stats": False,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-68m",
},
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
"disable_logprobs_during_spec_decoding": False
}, {
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -208,7 +215,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-160m",
"model_name": "JackFram/llama-68m",
},
{
"model_name": "JackFram/llama-160m",
@@ -217,12 +224,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -261,20 +268,27 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
# Skip cuda graph recording for fast test.
"enforce_eager": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-68m",
},
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -322,12 +336,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -377,12 +391,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -435,12 +449,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -500,12 +514,12 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -553,7 +567,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
# Artificially limit the draft model max model len; this forces vLLM
@@ -562,7 +576,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -613,13 +627,13 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
"enable_chunked_prefill": True,
@@ -651,7 +665,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -662,14 +676,14 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"enable_chunked_prefill": False,
}
# Try a range of common k, as well as large speculation.
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
] + [{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -715,7 +729,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
"enable_chunked_prefill": False
@@ -723,7 +737,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# Try a range of common k.
for k in [1, 2, 3]
] + [{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
"enable_chunked_prefill": True,

View File

@@ -1,71 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
MODELS_ON_S3 = [
"adept/fuyu-8b",
"ai21labs/AI21-Jamba-1.5-Mini",
"ai21labs/Jamba-tiny-random",
"ai21labs/Jamba-tiny-reward-dev",
"allenai/Molmo-7B-D-0924",
"allenai/OLMo-1B-hf",
"allenai/OLMoE-1B-7B-0924-Instruct",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"ArthurZ/Ilama-3.2-1B",
"BAAI/bge-base-en-v1.5",
"BAAI/bge-multilingual-gemma2",
"BAAI/bge-reranker-v2-m3",
"bigcode/starcoder2-3b",
"cross-encoder/ms-marco-MiniLM-L-6-v2",
"cross-encoder/quora-roberta-base",
"deepseek-ai/deepseek-vl2-tiny",
"distilbert/distilgpt2",
"facebook/bart-base",
"facebook/bart-large-cnn",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"google/gemma-1.1-2b-it",
"google/gemma-2-2b-it",
"google/paligemma-3b-pt-224",
"h2oai/h2ovl-mississippi-800m",
"HuggingFaceM4/Idefics3-8B-Llama3",
"internlm/internlm2-1_8b-reward",
"intfloat/e5-mistral-7b-instruct",
"intfloat/multilingual-e5-large",
"JackFram/llama-160m",
"jason9693/Qwen2.5-1.5B-apeach",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"google/gemma-2-2b-it",
"google/gemma-1.1-2b-it",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"ai21labs/Jamba-tiny-random",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"microsoft/Phi-3.5-vision-instruct",
"meta-llama/Meta-Llama-3-8B",
"microsoft/phi-2",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-small-8k-instruct",
"microsoft/Phi-3-vision-128k-instruct",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"microsoft/Phi-3.5-MoE-instruct",
"microsoft/Phi-3.5-vision-instruct",
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistralai/Pixtral-12B-2409",
"mistral-community/Mixtral-8x22B-v0.1-AWQ",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
"Qwen/Qwen2.5-1.5B-Instruct",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nvidia/NVLM-D-72B",
"openai-community/gpt2",
"openai/whisper-large-v3",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"OpenGVLab/InternVL2-1B",
"OrionStarAI/Orion-14B-Chat",
"parasail-ai/GritLM-7B-vllm",
"Qwen/Qwen1.5-MoE-A2.7B-Chat",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-1.5B-Instruct",
"Qwen/Qwen2.5-Math-PRM-7B",
"Qwen/Qwen2.5-Math-RM-72B",
"Qwen/Qwen2.5-VL-3B-Instruct",
"royokong/e5-v",
"sentence-transformers/all-roberta-large-v1",
"sentence-transformers/stsb-roberta-base-v2",
"shanearora/OLMo-7B-1124-hf",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"ssmits/Qwen2-7B-Instruct-embed-base",
"stabilityai/stablelm-3b-4e1t",
"stabilityai/stablelm-zephyr-3b",
"state-spaces/mamba-130m-hf",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"THUDM/glm-4v-9b",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"TIGER-Lab/VLM2Vec-Full",
"tiiuae/falcon-40b",
"tiiuae/falcon-mamba-7b-instruct",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"upstage/solar-pro-preview-instruct",
]
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
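MODEL_WEIGHTS_S3_BUCKET pairs with the allow-list above; a hedged sketch of how a test helper could resolve a HuggingFace model ID to its mirrored S3 location (maybe_s3_path is a hypothetical helper and the exact key layout is an assumption):

from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3

def maybe_s3_path(model_id: str) -> str:
    """Return the S3 URI for a mirrored model, or the original HF ID otherwise."""
    if model_id in MODELS_ON_S3:
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model_id}"
    return model_id

print(maybe_s3_path("distilbert/distilgpt2"))  # s3://vllm-ci-model-weights/distilbert/distilgpt2
print(maybe_s3_path("some-org/unmirrored-model"))  # returned unchanged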