EC2 Default User 2025-02-22 00:22:12 +00:00
parent 8c506d7c76
commit afa691378a
6 changed files with 161 additions and 642 deletions

a.j2
View File

@@ -1,345 +0,0 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% if branch == "main" %}
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %}
{% set docker_image_cu121 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu121" %}
{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %}
{% endif %}
{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}
{% set list_file_diff = list_file_diff | split("|") %}
steps:
- label: ":docker: build image"
key: image-build
depends_on: ~
agents:
{% if branch == "main" %}
queue: cpu_queue_postmerge
{% else %}
queue: cpu_queue_premerge
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image }}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
- "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- block: Build CUDA 12.1 image
key: block-build-cu121
depends_on: ~
- label: ":docker: build image CUDA 12.1"
key: image-build-cu121
depends_on: block-build-cu121
agents:
{% if branch == "main" %}
queue: cpu_queue_postmerge
{% else %}
queue: cpu_queue_premerge
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image_cu121 }}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
- "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag {{ docker_image_cu121 }} --target test --progress plain ."
- "docker push {{ docker_image_cu121 }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- block: Build CUDA 11.8 image
key: block-build-cu118
depends_on: ~
- label: ":docker: build image CUDA 11.8"
key: image-build-cu118
depends_on: block-build-cu118
agents:
{% if branch == "main" %}
queue: cpu_queue_postmerge
{% else %}
queue: cpu_queue_premerge
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image_cu118 }}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
- "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag {{ docker_image_cu118 }} --target test --progress plain ."
- "docker push {{ docker_image_cu118 }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
{% for step in steps %}
{% if step.fast_check_only != true %}
{% set ns = namespace(blocked=1) %}
{% if run_all == "1" %}
{% set ns.blocked = 0 %}
{% endif %}
{% if nightly == "1" %}
{% set ns.blocked = 0 %}
{% endif %}
{% if step.source_file_dependencies %}
{% for source_file in step.source_file_dependencies %}
{% for file in list_file_diff %}
{% if source_file in file %}
{% set ns.blocked = 0 %}
{% endif %}
{% endfor %}
{% endfor %}
{% else %}
{% set ns.blocked = 0 %}
{% endif %}
{% if ns.blocked == 1 or (step.optional and nightly != "1") %}
- block: "Run {{ step.label }}"
depends_on: image-build
key: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }}
{% endif %}
- label: "{{ step.label }}"
{% if ns.blocked == 1 or (step.optional and nightly != "1") %}
depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }}
{% else %}
depends_on: image-build
{% endif %}
agents:
{% if step.label == "Documentation Build" %}
queue: small_cpu_queue_premerge
{% elif step.no_gpu %}
queue: cpu_queue_premerge
{% elif step.gpu == "a100" %}
queue: a100_queue
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
queue: gpu_4_queue
{% else %}
queue: gpu_1_queue
{% endif %}
{% if step.num_nodes >= 2%} {# for multi-node test #}
commands:
- ./.buildkite/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ docker_image }} {% for command in step.commands %}"{{ (command | join(" && ")) | safe }}" {% endfor %}
{% endif %}
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
{% if step.num_nodes < 2 %}
plugins:
{% if step.gpu != "a100" %}
- docker#v5.2.0: {# for GPU test #}
image: {{ docker_image }}
always-pull: true
propagate-environment: true
{% if not step.no_gpu %}
gpus: all
{% endif %}
{% if step.label == "Benchmarks" %}
mount-buildkite-agent: true
{% endif %}
command: ["bash", "-xc", "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label == "Core" %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
environment:
- VLLM_USAGE_SOURCE=ci-test
- HF_HOME={{ hf_home }}
- HF_TOKEN
{% if branch == "main" %}
- BUILDKITE_ANALYTICS_TOKEN
{% endif %}
{% if step.label == "Speculative decoding tests" %}
- VLLM_ATTENTION_BACKEND=XFORMERS
{% endif %}
volumes:
- /dev/shm:/dev/shm
- {{ hf_home }}:{{ hf_home }}
{% else %} {# A100 is managed on EKS #}
- kubernetes:
podSpec:
priorityClassName: ci
containers:
- image: {{ docker_image }}
command:
- bash -c '(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label in ["Basic Correctness Test", "Basic Models Test", "Entrypoints Test", "Metrics, Tracing Test", "Async Engine, Inputs, Utils, Worker Test", "Samplers Test", "Engine Test"] %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'
resources:
limits:
nvidia.com/gpu: {{ step.num_gpus or 1 }}
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: {{ hf_home }}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: {{ hf_home }}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: {{ hf_home }}
type: Directory
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
- group: "AMD Tests"
depends_on: ~
steps:
- label: "AMD: :docker: build image"
depends_on: ~
soft_fail: true
commands:
# Handle the introduction of test target in Dockerfile.rocm
- "grep -i 'from base as test' Dockerfile.rocm && docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain . || docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ."
- "docker push {{ docker_image_amd }}"
key: "amd-build"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fails
limit: 1
agents:
queue: amd-cpu
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
depends_on: amd-build
agents:
{% if step.amd_gpu_type and step.amd_gpu_type=="mi300"%}
queue: amd_mi300
{% else%}
queue: amd_gpu
{% endif%}
command: bash .buildkite/run-amd-test.sh "(command rocm-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
priority: 100
soft_fail: true
{% endif %}
{% endfor %}
- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
- block: "Run Intel CPU test"
depends_on: ~
key: block-intel-cpu
- label: "Intel CPU Test"
depends_on: block-intel-cpu
soft_fail: true
agents:
queue: intel-cpu
command: bash .buildkite/run-cpu-test.sh
- label: "Intel HPU Test"
depends_on: ~
soft_fail: true
agents:
queue: intel-hpu
command: bash .buildkite/run-hpu-test.sh
- block: "Run Intel GPU test"
depends_on: ~
key: block-intel-gpu
- label: "Intel GPU Test"
soft_fail: true
depends_on: block-intel-gpu
agents:
queue: intel-gpu
command: bash .buildkite/run-xpu-test.sh
- label: "IBM Power(ppc64le) CPU Test"
depends_on: ~
soft_fail: true
agents:
queue: ibm-ppc64le
command: bash .buildkite/run-cpu-test-ppc64le.sh
{% if nightly == "1" %}
- label: "GH200 Test"
depends_on: ~
soft_fail: true
agents:
queue: gh200_queue
command: nvidia-smi && bash .buildkite/run-gh200-test.sh
{% endif %}
- label: "TPU Test"
depends_on: ~
soft_fail: True
agents:
queue: tpu_queue
commands:
- if [[ -f ".buildkite/run-tpu-test.sh" ]]; then bash .buildkite/run-tpu-test.sh; fi
- yes | docker system prune -a
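A minimal sketch of how a template like the one above can be rendered locally with plain Jinja2, assuming it is saved as a.j2 next to a test-pipeline.yaml that supplies the steps list; the split filter is registered by hand because it is not a Jinja2 built-in, and the real pipeline generator may wire this up differently:

import os

import yaml
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("."))
# `split` is not a built-in Jinja2 filter; register one for the
# `list_file_diff | split("|")` expression at the top of the template.
env.filters["split"] = lambda value, sep: value.split(sep)

template = env.get_template("a.j2")

with open("test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]
for step in steps:
    # The template compares step.num_nodes numerically, so give it a default.
    step.setdefault("num_nodes", 1)

print(template.render(
    steps=steps,
    branch=os.environ.get("BUILDKITE_BRANCH", ""),
    run_all=os.environ.get("RUN_ALL", "0"),
    nightly=os.environ.get("NIGHTLY", "0"),
    # Pipe-separated list of changed files; the template splits it itself.
    list_file_diff=os.environ.get("LIST_FILE_DIFF", ""),
))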

hfs3.py
View File

@@ -1,208 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import shutil
import boto3
from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ModelTransfer:
def __init__(self,
model_id,
s3_bucket,
aws_access_key_id=None,
aws_secret_access_key=None,
aws_region=None):
"""
Initialize the ModelTransfer class.
Args:
model_id (str): HuggingFace model ID
s3_bucket (str): Name of the S3 bucket
aws_access_key_id (str, optional): AWS access key ID. Defaults to None.
aws_secret_access_key (str, optional): AWS secret access key. Defaults to None.
aws_region (str, optional): AWS region. Defaults to None.
"""
self.model_id = model_id
self.s3_bucket = s3_bucket
self.model_name = model_id.split('/')[-1]
# Initialize S3 client
self.s3_client = boto3.client(
's3',
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
region_name=aws_region)
# Initialize Hugging Face API
self.hf_api = HfApi()
def download_model(self, local_dir):
"""
Download the model from HuggingFace.
Args:
local_dir (str): Local directory to save the model
Returns:
str: Path to the downloaded model directory
"""
logger.info("Downloading model %s...", self.model_id)
try:
local_dir_with_model = os.path.join(local_dir, self.model_name)
snapshot_download(repo_id=self.model_id,
local_dir=local_dir_with_model,
local_dir_use_symlinks=False,
token=os.getenv("HF_TOKEN"))
logger.info("Model downloaded successfully to %s",
local_dir_with_model)
return local_dir_with_model
except Exception as e:
logger.error("Error downloading model: %s", str(e))
raise
def upload_to_s3(self, local_dir):
"""
Upload the model directory to S3.
Args:
local_dir (str): Local directory containing the model files
"""
logger.info("Uploading model to S3 bucket %s...", self.s3_bucket)
try:
# Walk through all files in the directory
for root, _, files in os.walk(local_dir):
for filename in files:
# Get the full local path
local_path = os.path.join(root, filename)
# Calculate S3 path (preserve directory structure)
relative_path = os.path.relpath(local_path, local_dir)
s3_path = f"{self.model_name}/{relative_path}"
# Upload file with progress bar
file_size = os.path.getsize(local_path)
with tqdm(total=file_size,
unit='B',
unit_scale=True,
desc=f"Uploading {filename}") as pbar:
self.s3_client.upload_file(
local_path,
self.s3_bucket,
s3_path,
Callback=lambda bytes_transferred: pbar.update(
bytes_transferred))
logger.info("Uploaded %s to s3://%s/%s", filename,
self.s3_bucket, s3_path)
logger.info("Model upload completed successfully!")
except Exception as e:
logger.error("Error uploading to S3: %s", str(e))
raise
# "ibm/PowerMoE-3b", "internlm/internlm-chat-7b",
# "internlm/internlm2-chat-7b", "OpenGVLab/Mono-InternVL-2B",
# "internlm/internlm3-8b-instruct", "inceptionai/jais-13b-chat",
# "ai21labs/AI21-Jamba-1.5-Mini", "meta-llama/Meta-Llama-3-8B",
# "decapoda-research/llama-7b-hf", "state-spaces/mamba-130m-hf",
# "tiiuae/falcon-mamba-7b-instruct", "openbmb/MiniCPM-2B-sft-bf16",
# "openbmb/MiniCPM3-4B", "mistralai/Mistral-7B-Instruct-v0.1",
# "mistralai/Mixtral-8x7B-Instruct-v0.1",
# "mistral-community/Mixtral-8x22B-v0.1-AWQ", "mpt", "mosaicml/mpt-7b",
# "nvidia/Minitron-8B-Base", "allenai/OLMo-1B-hf",
# "shanearora/OLMo-7B-1124-hf", "allenai/OLMoE-1B-7B-0924-Instruct",
# "facebook/opt-iml-max-1.3b", "OrionStarAI/Orion-14B-Chat",
# "adept/persimmon-8b-chat", "microsoft/phi-2",
# "microsoft/Phi-3-mini-4k-instruct",
# "microsoft/Phi-3-small-8k-instruct", "microsoft/Phi-3.5-MoE-instruct",
# "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat",
# "tiiuae/falcon-40b", "stabilityai/stablelm-zephyr-3b",
# "stabilityai/stablelm-3b-4e1t", "bigcode/starcoder2-3b",
# "upstage/solar-pro-preview-instruct", "Tele-AI/TeleChat2-3B",
# "xverse/XVERSE-7B-Chat", "facebook/bart-base",
# "facebook/bart-large-cnn", "microsoft/Florence-2-base",
# "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2",
# "parasail-ai/GritLM-7B-vllm", "internlm/internlm2-1_8b-reward",
# "ai21labs/Jamba-tiny-reward-dev", "llama",
# "intfloat/e5-mistral-7b-instruct",
# "ssmits/Qwen2-7B-Instruct-embed-base", "Qwen/Qwen2.5-Math-RM-72B",
# "Qwen/Qwen2.5-Math-PRM-7B", "jason9693/Qwen2.5-1.5B-apeach",
# "sentence-transformers/stsb-roberta-base-v2",
# "sentence-transformers/all-roberta-large-v1",
# "intfloat/multilingual-e5-large", "royokong/e5-v",
# "TIGER-Lab/VLM2Vec-Full", "MrLight/dse-qwen2-2b-mrl-v1",
# "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
# "cross-encoder/ms-marco-MiniLM-L-6-v2",
# "cross-encoder/quora-roberta-base", "BAAI/bge-reranker-v2-m3",
# "THUDM/glm-4v-9b", "chatglm2-6b", "deepseek-ai/deepseek-vl2-tiny",
# "adept/fuyu-8b", "h2oai/h2ovl-mississippi-800m",
# "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3",
# "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
# "llava-hf/LLaVA-NeXT-Video-7B-hf",
# "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
# "TIGER-Lab/Mantis-8B-siglip-llama3", "openbmb/MiniCPM-o-2_6",
# "openbmb/MiniCPM-V-2_6", "allenai/Molmo-7B-D-0924",
# "nvidia/NVLM-D-72B", "google/paligemma-3b-pt-224",
# "microsoft/Phi-3-vision-128k-instruct", "mistralai/Pixtral-12B-2409",
# "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-Audio-7B-Instruct",
# "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
# "fixie-ai/ultravox-v0_5-llama-3_2-1b",
# "meta-llama/Llama-3.2-11B-Vision-Instruct", "openai/whisper-large-v3",
# "JackFram/llama-68m", "JackFram/llama-68m", "JackFram/llama-160m",
# "ArthurZ/Ilama-3.2-1B"
def main():
# Configuration
MODEL_ID = [
"HuggingFaceH4/zephyr-7b-beta",
"llava-hf/llava-1.5-7b-hf",
"ArthurZ/Ilama-3.2-1B",
"meta-llama/Llama-2-7b-hf",
]
S3_BUCKET = "vllm-ci-model-weights"
# Local directory to temporarily store the model
LOCAL_DIR = "/home/ec2-user/models"
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = "us-west-2"
# Create transfer object
for model_id in MODEL_ID:
transfer = ModelTransfer(model_id=model_id,
s3_bucket=S3_BUCKET,
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
aws_region=AWS_REGION)
try:
# Create local directory if it doesn't exist
os.makedirs(LOCAL_DIR, exist_ok=True)
# Download model
model_dir = transfer.download_model(LOCAL_DIR)
# Upload to S3 and cleanup
transfer.upload_to_s3(model_dir)
shutil.rmtree(model_dir)
except Exception as e:
logger.error("Error in transfer process: %s", str(e))
raise
if __name__ == "__main__":
main()
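A quick way to sanity-check a transfer is to list what landed under the model's prefix in S3; a hedged sketch, assuming the short-name key layout used by upload_to_s3 above (model_in_bucket is a hypothetical helper):

import boto3

def model_in_bucket(model_id: str, bucket: str = "vllm-ci-model-weights") -> bool:
    """Return True if at least one object exists under the model's S3 prefix."""
    # upload_to_s3 above keys objects by the short model name (after the final "/").
    prefix = model_id.split("/")[-1] + "/"
    s3 = boto3.client("s3")
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    return resp.get("KeyCount", 0) > 0

if __name__ == "__main__":
    for model_id in ("HuggingFaceH4/zephyr-7b-beta", "llava-hf/llava-1.5-7b-hf"):
        print(model_id, "->", model_in_bucket(model_id))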

View File

@@ -97,8 +97,8 @@ def test_models(
"test_suite", [
("distilbert/distilgpt2", "ray", "", "L4"),
("distilbert/distilgpt2", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("distilbert/distilgpt2", "ray", "", "A100"),
("distilbert/distilgpt2", "mp", "", "A100"),
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),

View File

@@ -13,7 +13,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
MODELS = [
"distilbert/distilgpt2",
@@ -142,9 +141,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
metrics_tag_content = stat_logger.labels["model_name"]
if served_model_name is None or served_model_name == []:
actual_model_name = model
assert metrics_tag_content == f"{MODEL_WEIGHTS_S3_BUCKET}/{actual_model_name}", ( # noqa: E501
f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
assert metrics_tag_content == model, (
f"Metrics tag model_name is wrong! expect: {model!r}\n"
f"actual: {metrics_tag_content!r}")
else:
assert metrics_tag_content == served_model_name[0], (

View File

@@ -52,7 +52,7 @@ from .conftest import (get_output_from_llm_generator,
[{
# Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a tokenizer.
"model": "JackFram/llama-160m",
"model": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -61,14 +61,14 @@ from .conftest import (get_output_from_llm_generator,
"per_test_common_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
# Chunked prefill enabled with small value
# to make sure we get mixed batches.
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -119,7 +119,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
for token_ids in batch_token_ids] == ([output_len] * batch_size)
# Expect detokenized string to match.
tok = AutoTokenizer.from_pretrained("JackFram/llama-160m")
tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
expected_tokens = tok.decode(actual_token_ids)
print(f"{actual_token_ids=}")
@@ -135,20 +135,27 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Print spec metrics.
"disable_log_stats": False,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-68m",
},
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
"disable_logprobs_during_spec_decoding": False
}, {
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -208,7 +215,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-160m",
"model_name": "JackFram/llama-68m",
},
{
"model_name": "JackFram/llama-160m",
@@ -217,12 +224,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -261,20 +268,27 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
# Skip cuda graph recording for fast test.
"enforce_eager": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-68m",
},
{
"model_name": "JackFram/llama-160m",
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -322,12 +336,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -377,12 +391,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -435,12 +449,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -500,12 +514,12 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -553,7 +567,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
# Artificially limit the draft model max model len; this forces vLLM
@@ -562,7 +576,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -613,13 +627,13 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
"enable_chunked_prefill": False,
},
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
"enable_chunked_prefill": True,
@@ -651,7 +665,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": "JackFram/llama-68m",
# Skip cuda graph recording for fast test.
"enforce_eager": True,
@@ -662,14 +676,14 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"enable_chunked_prefill": False,
}
# Try a range of common k, as well as large speculation.
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
] + [{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4,
@@ -715,7 +729,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
"enable_chunked_prefill": False
@@ -723,7 +737,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# Try a range of common k.
for k in [1, 2, 3]
] + [{
"speculative_model": "JackFram/llama-160m",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler",
"enable_chunked_prefill": True,

View File

@@ -1,71 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
MODELS_ON_S3 = [
"adept/fuyu-8b",
"ai21labs/AI21-Jamba-1.5-Mini",
"ai21labs/Jamba-tiny-random",
"ai21labs/Jamba-tiny-reward-dev",
"allenai/Molmo-7B-D-0924",
"allenai/OLMo-1B-hf",
"allenai/OLMoE-1B-7B-0924-Instruct",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"ArthurZ/Ilama-3.2-1B",
"BAAI/bge-base-en-v1.5",
"BAAI/bge-multilingual-gemma2",
"BAAI/bge-reranker-v2-m3",
"bigcode/starcoder2-3b",
"cross-encoder/ms-marco-MiniLM-L-6-v2",
"cross-encoder/quora-roberta-base",
"deepseek-ai/deepseek-vl2-tiny",
"distilbert/distilgpt2",
"facebook/bart-base",
"facebook/bart-large-cnn",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"google/gemma-1.1-2b-it",
"google/gemma-2-2b-it",
"google/paligemma-3b-pt-224",
"h2oai/h2ovl-mississippi-800m",
"HuggingFaceM4/Idefics3-8B-Llama3",
"internlm/internlm2-1_8b-reward",
"intfloat/e5-mistral-7b-instruct",
"intfloat/multilingual-e5-large",
"JackFram/llama-160m",
"jason9693/Qwen2.5-1.5B-apeach",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"google/gemma-2-2b-it",
"google/gemma-1.1-2b-it",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"ai21labs/Jamba-tiny-random",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"microsoft/Phi-3.5-vision-instruct",
"meta-llama/Meta-Llama-3-8B",
"microsoft/phi-2",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-small-8k-instruct",
"microsoft/Phi-3-vision-128k-instruct",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"microsoft/Phi-3.5-MoE-instruct",
"microsoft/Phi-3.5-vision-instruct",
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistralai/Pixtral-12B-2409",
"mistral-community/Mixtral-8x22B-v0.1-AWQ",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
"Qwen/Qwen2.5-1.5B-Instruct",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nvidia/NVLM-D-72B",
"openai-community/gpt2",
"openai/whisper-large-v3",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"OpenGVLab/InternVL2-1B",
"OrionStarAI/Orion-14B-Chat",
"parasail-ai/GritLM-7B-vllm",
"Qwen/Qwen1.5-MoE-A2.7B-Chat",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-1.5B-Instruct",
"Qwen/Qwen2.5-Math-PRM-7B",
"Qwen/Qwen2.5-Math-RM-72B",
"Qwen/Qwen2.5-VL-3B-Instruct",
"royokong/e5-v",
"sentence-transformers/all-roberta-large-v1",
"sentence-transformers/stsb-roberta-base-v2",
"shanearora/OLMo-7B-1124-hf",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"ssmits/Qwen2-7B-Instruct-embed-base",
"stabilityai/stablelm-3b-4e1t",
"stabilityai/stablelm-zephyr-3b",
"state-spaces/mamba-130m-hf",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"THUDM/glm-4v-9b",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"TIGER-Lab/VLM2Vec-Full",
"tiiuae/falcon-40b",
"tiiuae/falcon-mamba-7b-instruct",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"upstage/solar-pro-preview-instruct",
]
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
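MODEL_WEIGHTS_S3_BUCKET pairs with the allow-list above; a hedged sketch of how a test helper could resolve a HuggingFace model ID to its mirrored S3 location (maybe_s3_path is a hypothetical helper and the exact key layout is an assumption):

from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3

def maybe_s3_path(model_id: str) -> str:
    """Return the S3 URI for a mirrored model, or the original HF ID otherwise."""
    if model_id in MODELS_ON_S3:
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model_id}"
    return model_id

print(maybe_s3_path("distilbert/distilgpt2"))  # s3://vllm-ci-model-weights/distilbert/distilgpt2
print(maybe_s3_path("some-org/unmirrored-model"))  # returned unchanged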