mirror of https://github.com/vllm-project/vllm
parent 8c506d7c76
commit afa691378a
a.j2
@@ -1,345 +0,0 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% if branch == "main" %}
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %}
{% set docker_image_cu121 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu121" %}
{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %}
{% endif %}
{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}
{% set list_file_diff = list_file_diff | split("|") %}

steps:
  - label: ":docker: build image"
    key: image-build
    depends_on: ~
    agents:
      {% if branch == "main" %}
      queue: cpu_queue_postmerge
      {% else %}
      queue: cpu_queue_premerge
      {% endif %}
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - |
        #!/bin/bash
        if [[ -z $(docker manifest inspect {{ docker_image }}) ]]; then
          echo "Image not found, proceeding with build..."
        else
          echo "Image found"
          exit 0
        fi
      - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - block: Build CUDA 12.1 image
    key: block-build-cu121
    depends_on: ~

  - label: ":docker: build image CUDA 12.1"
    key: image-build-cu121
    depends_on: block-build-cu121
    agents:
      {% if branch == "main" %}
      queue: cpu_queue_postmerge
      {% else %}
      queue: cpu_queue_premerge
      {% endif %}
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - |
        #!/bin/bash
        if [[ -z $(docker manifest inspect {{ docker_image_cu121 }}) ]]; then
          echo "Image not found, proceeding with build..."
        else
          echo "Image found"
          exit 0
        fi
      - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag {{ docker_image_cu121 }} --target test --progress plain ."
      - "docker push {{ docker_image_cu121 }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - block: Build CUDA 11.8 image
    key: block-build-cu118
    depends_on: ~

  - label: ":docker: build image CUDA 11.8"
    key: image-build-cu118
    depends_on: block-build-cu118
    agents:
      {% if branch == "main" %}
      queue: cpu_queue_postmerge
      {% else %}
      queue: cpu_queue_premerge
      {% endif %}
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - |
        #!/bin/bash
        if [[ -z $(docker manifest inspect {{ docker_image_cu118 }}) ]]; then
          echo "Image not found, proceeding with build..."
        else
          echo "Image found"
          exit 0
        fi
      - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag {{ docker_image_cu118 }} --target test --progress plain ."
      - "docker push {{ docker_image_cu118 }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

{% for step in steps %}
{% if step.fast_check_only != true %}

{% set ns = namespace(blocked=1) %}

{% if run_all == "1" %}
{% set ns.blocked = 0 %}
{% endif %}

{% if nightly == "1" %}
{% set ns.blocked = 0 %}
{% endif %}

{% if step.source_file_dependencies %}
{% for source_file in step.source_file_dependencies %}
{% for file in list_file_diff %}
{% if source_file in file %}
{% set ns.blocked = 0 %}
{% endif %}
{% endfor %}
{% endfor %}
{% else %}
{% set ns.blocked = 0 %}
{% endif %}

{% if ns.blocked == 1 or (step.optional and nightly != "1") %}
  - block: "Run {{ step.label }}"
    depends_on: image-build
    key: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }}
{% endif %}

  - label: "{{ step.label }}"
    {% if ns.blocked == 1 or (step.optional and nightly != "1") %}
    depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }}
    {% else %}
    depends_on: image-build
    {% endif %}
    agents:
      {% if step.label == "Documentation Build" %}
      queue: small_cpu_queue_premerge
      {% elif step.no_gpu %}
      queue: cpu_queue_premerge
      {% elif step.gpu == "a100" %}
      queue: a100_queue
      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
      queue: gpu_4_queue
      {% else %}
      queue: gpu_1_queue
      {% endif %}
    {% if step.num_nodes >= 2 %} {# for multi-node test #}
    commands:
      - ./.buildkite/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ docker_image }} {% for command in step.commands %}"{{ (command | join(" && ")) | safe }}" {% endfor %}
    {% endif %}
    soft_fail: {{ step.soft_fail or false }}
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 1
        - exit_status: -10  # Agent was lost
          limit: 1
    {% if step.num_nodes < 2 %}
    plugins:
      {% if step.gpu != "a100" %}
      - docker#v5.2.0: {# for GPU test #}
          image: {{ docker_image }}
          always-pull: true
          propagate-environment: true
          {% if not step.no_gpu %}
          gpus: all
          {% endif %}
          {% if step.label == "Benchmarks" %}
          mount-buildkite-agent: true
          {% endif %}
          command: ["bash", "-xc", "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label == "Core" %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
          environment:
            - VLLM_USAGE_SOURCE=ci-test
            - HF_HOME={{ hf_home }}
            - HF_TOKEN
            {% if branch == "main" %}
            - BUILDKITE_ANALYTICS_TOKEN
            {% endif %}
            {% if step.label == "Speculative decoding tests" %}
            - VLLM_ATTENTION_BACKEND=XFORMERS
            {% endif %}
          volumes:
            - /dev/shm:/dev/shm
            - {{ hf_home }}:{{ hf_home }}
      {% else %} {# A100 is managed on EKS #}
      - kubernetes:
          podSpec:
            priorityClassName: ci
            containers:
              - image: {{ docker_image }}
                command:
                  - bash -c '(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 {% if step.label in ["Basic Correctness Test", "Basic Models Test", "Entrypoints Test", "Metrics, Tracing Test", "Async Engine, Inputs, Utils, Worker Test", "Samplers Test", "Engine Test"] %}&& export VLLM_CI_USE_S3=1{% endif %} && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'
                resources:
                  limits:
                    nvidia.com/gpu: {{ step.num_gpus or 1 }}
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                  - name: hf-cache
                    mountPath: {{ hf_home }}
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: {{ hf_home }}
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory
              - name: hf-cache
                hostPath:
                  path: {{ hf_home }}
                  type: Directory
      {% endif %}
    {% endif %}
{% endif %}
{% endfor %}

  - group: "AMD Tests"
    depends_on: ~
    steps:
      - label: "AMD: :docker: build image"
        depends_on: ~
        soft_fail: true
        commands:
          # Handle the introduction of test target in Dockerfile.rocm
          - "grep -i 'from base as test' Dockerfile.rocm && docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain . || docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ."
          - "docker push {{ docker_image_amd }}"
        key: "amd-build"
        env:
          DOCKER_BUILDKIT: "1"
        retry:
          automatic:
            - exit_status: -1  # Agent was lost
              limit: 1
            - exit_status: -10  # Agent was lost
              limit: 1
            - exit_status: 1  # Machine occasionally fails
              limit: 1
        agents:
          queue: amd-cpu

{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
      - label: "AMD: {{ step.label }}"
        depends_on: amd-build
        agents:
          {% if step.amd_gpu_type and step.amd_gpu_type == "mi300" %}
          queue: amd_mi300
          {% else %}
          queue: amd_gpu
          {% endif %}

        command: bash .buildkite/run-amd-test.sh "(command rocm-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
        env:
          DOCKER_BUILDKIT: "1"
        priority: 100
        soft_fail: true

{% endif %}
{% endfor %}

  - label: "Neuron Test"
    depends_on: ~
    agents:
      queue: neuron
    command: bash .buildkite/run-neuron-test.sh
    soft_fail: true

  - block: "Run Intel CPU test"
    depends_on: ~
    key: block-intel-cpu

  - label: "Intel CPU Test"
    depends_on: block-intel-cpu
    soft_fail: true
    agents:
      queue: intel-cpu
    command: bash .buildkite/run-cpu-test.sh

  - label: "Intel HPU Test"
    depends_on: ~
    soft_fail: true
    agents:
      queue: intel-hpu
    command: bash .buildkite/run-hpu-test.sh

  - block: "Run Intel GPU test"
    depends_on: ~
    key: block-intel-gpu

  - label: "Intel GPU Test"
    soft_fail: true
    depends_on: block-intel-gpu
    agents:
      queue: intel-gpu
    command: bash .buildkite/run-xpu-test.sh

  - label: "IBM Power(ppc64le) CPU Test"
    depends_on: ~
    soft_fail: true
    agents:
      queue: ibm-ppc64le
    command: bash .buildkite/run-cpu-test-ppc64le.sh

{% if nightly == "1" %}
  - label: "GH200 Test"
    depends_on: ~
    soft_fail: true
    agents:
      queue: gh200_queue
    command: nvidia-smi && bash .buildkite/run-gh200-test.sh
{% endif %}

  - label: "TPU Test"
    depends_on: ~
    soft_fail: True
    agents:
      queue: tpu_queue
    commands:
      - if [[ -f ".buildkite/run-tpu-test.sh" ]]; then bash .buildkite/run-tpu-test.sh; fi
      - yes | docker system prune -a
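A rough sketch of how a pipeline template like the one above can be rendered outside Buildkite follows; this is not the project's actual bootstrap script, and the file name, variable values, and the sample step are assumptions made for illustration. The only non-standard piece the template needs is a "split" filter, which is not a Jinja2 built-in.

# Hypothetical local render of the template above; the step definition and
# file-diff string are made up for illustration only.
import jinja2

env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
# The template pipes list_file_diff through a "split" filter, so the
# renderer has to register one.
env.filters["split"] = lambda value, sep: value.split(sep)

pipeline_yaml = env.get_template("a.j2").render(
    branch="main",
    run_all="0",
    nightly="0",
    list_file_diff="vllm/engine/llm_engine.py|tests/test_config.py",
    steps=[{
        "label": "Engine Test",
        "num_nodes": 1,
        "source_file_dependencies": ["vllm/engine"],
        "commands": ["pytest -v -s engine"],
    }],
)
print(pipeline_yaml)

Because the sample step's source_file_dependencies match the fake file diff, the rendered step depends directly on image-build instead of being gated behind a block step.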
hfs3.py
@@ -1,208 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import shutil

import boto3
from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ModelTransfer:

    def __init__(self,
                 model_id,
                 s3_bucket,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_region=None):
        """
        Initialize the ModelTransfer class.

        Args:
            model_id (str): HuggingFace model ID
            s3_bucket (str): Name of the S3 bucket
            aws_access_key_id (str, optional)
            aws_secret_access_key (str, optional)
            aws_region (str, optional): AWS region. Defaults to None.
        """
        self.model_id = model_id
        self.s3_bucket = s3_bucket
        self.model_name = model_id.split('/')[-1]

        # Initialize S3 client
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region)

        # Initialize Hugging Face API
        self.hf_api = HfApi()

    def download_model(self, local_dir):
        """
        Download the model from HuggingFace.

        Args:
            local_dir (str): Local directory to save the model

        Returns:
            str: Path to the downloaded model directory
        """
        logger.info("Downloading model %s...", self.model_id)

        try:
            local_dir_with_model = os.path.join(local_dir, self.model_name)
            snapshot_download(repo_id=self.model_id,
                              local_dir=local_dir_with_model,
                              local_dir_use_symlinks=False,
                              token=os.getenv("HF_TOKEN"))
            logger.info("Model downloaded successfully to %s",
                        local_dir_with_model)
            return local_dir_with_model

        except Exception as e:
            logger.error("Error downloading model: %s", str(e))
            raise

    def upload_to_s3(self, local_dir):
        """
        Upload the model directory to S3.

        Args:
            local_dir (str): Local directory containing the model files
        """
        logger.info("Uploading model to S3 bucket %s...", self.s3_bucket)

        try:
            # Walk through all files in the directory
            for root, _, files in os.walk(local_dir):
                for filename in files:
                    # Get the full local path
                    local_path = os.path.join(root, filename)

                    # Calculate S3 path (preserve directory structure)
                    relative_path = os.path.relpath(local_path, local_dir)
                    s3_path = f"{self.model_name}/{relative_path}"

                    # Upload file with progress bar
                    file_size = os.path.getsize(local_path)
                    with tqdm(total=file_size,
                              unit='B',
                              unit_scale=True,
                              desc=f"Uploading {filename}") as pbar:
                        self.s3_client.upload_file(
                            local_path,
                            self.s3_bucket,
                            s3_path,
                            Callback=lambda bytes_transferred: pbar.update(
                                bytes_transferred))

                    logger.info("Uploaded %s to s3://%s/%s", filename,
                                self.s3_bucket, s3_path)

            logger.info("Model upload completed successfully!")

        except Exception as e:
            logger.error("Error uploading to S3: %s", str(e))
            raise


# "ibm/PowerMoE-3b", "internlm/internlm-chat-7b",
# "internlm/internlm2-chat-7b", "OpenGVLab/Mono-InternVL-2B",
# "internlm/internlm3-8b-instruct", "inceptionai/jais-13b-chat",
# "ai21labs/AI21-Jamba-1.5-Mini", "meta-llama/Meta-Llama-3-8B",
# "decapoda-research/llama-7b-hf", "state-spaces/mamba-130m-hf",
# "tiiuae/falcon-mamba-7b-instruct", "openbmb/MiniCPM-2B-sft-bf16",
# "openbmb/MiniCPM3-4B", "mistralai/Mistral-7B-Instruct-v0.1",
# "mistralai/Mixtral-8x7B-Instruct-v0.1",
# "mistral-community/Mixtral-8x22B-v0.1-AWQ", "mpt", "mosaicml/mpt-7b",
# "nvidia/Minitron-8B-Base", "allenai/OLMo-1B-hf",
# "shanearora/OLMo-7B-1124-hf", "allenai/OLMoE-1B-7B-0924-Instruct",
# "facebook/opt-iml-max-1.3b", "OrionStarAI/Orion-14B-Chat",
# "adept/persimmon-8b-chat", "microsoft/phi-2",
# "microsoft/Phi-3-mini-4k-instruct",
# "microsoft/Phi-3-small-8k-instruct", "microsoft/Phi-3.5-MoE-instruct",
# "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat",
# "tiiuae/falcon-40b", "stabilityai/stablelm-zephyr-3b",
# "stabilityai/stablelm-3b-4e1t", "bigcode/starcoder2-3b",
# "upstage/solar-pro-preview-instruct", "Tele-AI/TeleChat2-3B",
# "xverse/XVERSE-7B-Chat", "facebook/bart-base",
# "facebook/bart-large-cnn", "microsoft/Florence-2-base",
# "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2",
# "parasail-ai/GritLM-7B-vllm", "internlm/internlm2-1_8b-reward",
# "ai21labs/Jamba-tiny-reward-dev", "llama",
# "intfloat/e5-mistral-7b-instruct",
# "ssmits/Qwen2-7B-Instruct-embed-base", "Qwen/Qwen2.5-Math-RM-72B",
# "Qwen/Qwen2.5-Math-PRM-7B", "jason9693/Qwen2.5-1.5B-apeach",
# "sentence-transformers/stsb-roberta-base-v2",
# "sentence-transformers/all-roberta-large-v1",
# "intfloat/multilingual-e5-large", "royokong/e5-v",
# "TIGER-Lab/VLM2Vec-Full", "MrLight/dse-qwen2-2b-mrl-v1",
# "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
# "cross-encoder/ms-marco-MiniLM-L-6-v2",
# "cross-encoder/quora-roberta-base", "BAAI/bge-reranker-v2-m3",
# "THUDM/glm-4v-9b", "chatglm2-6b", "deepseek-ai/deepseek-vl2-tiny",
# "adept/fuyu-8b", "h2oai/h2ovl-mississippi-800m",
# "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3",
# "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
# "llava-hf/LLaVA-NeXT-Video-7B-hf",
# "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
# "TIGER-Lab/Mantis-8B-siglip-llama3", "openbmb/MiniCPM-o-2_6",
# "openbmb/MiniCPM-V-2_6", "allenai/Molmo-7B-D-0924",
# "nvidia/NVLM-D-72B", "google/paligemma-3b-pt-224",
# "microsoft/Phi-3-vision-128k-instruct", "mistralai/Pixtral-12B-2409",
# "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-Audio-7B-Instruct",
# "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
# "fixie-ai/ultravox-v0_5-llama-3_2-1b",
# "meta-llama/Llama-3.2-11B-Vision-Instruct", "openai/whisper-large-v3",
# "JackFram/llama-68m", "JackFram/llama-68m", "JackFram/llama-160m",
# "ArthurZ/Ilama-3.2-1B"


def main():
    # Configuration
    MODEL_ID = [
        "HuggingFaceH4/zephyr-7b-beta",
        "llava-hf/llava-1.5-7b-hf",
        "ArthurZ/Ilama-3.2-1B",
        "meta-llama/Llama-2-7b-hf",
    ]
    S3_BUCKET = "vllm-ci-model-weights"
    # Local directory to temporarily store the model
    LOCAL_DIR = "/home/ec2-user/models"

    AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
    AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
    AWS_REGION = "us-west-2"

    # Create transfer object
    for model_id in MODEL_ID:
        transfer = ModelTransfer(model_id=model_id,
                                 s3_bucket=S3_BUCKET,
                                 aws_access_key_id=AWS_ACCESS_KEY_ID,
                                 aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                                 aws_region=AWS_REGION)

        try:
            # Create local directory if it doesn't exist
            os.makedirs(LOCAL_DIR, exist_ok=True)

            # Download model
            model_dir = transfer.download_model(LOCAL_DIR)

            # Upload to S3 and cleanup
            transfer.upload_to_s3(model_dir)
            shutil.rmtree(model_dir)

        except Exception as e:
            logger.error("Error in transfer process: %s", str(e))
            raise


if __name__ == "__main__":
    main()
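A hedged usage sketch for the ModelTransfer class above, mirroring what main() does for a single model; the local directory is an assumption, while the bucket name, region, and environment variable names are the ones the script already uses.

import os

# One-off transfer of a single model (illustrative values only).
transfer = ModelTransfer(
    model_id="JackFram/llama-68m",
    s3_bucket="vllm-ci-model-weights",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    aws_region="us-west-2",
)
# HF_TOKEN must be set in the environment for gated repositories.
local_copy = transfer.download_model("/tmp/models")
transfer.upload_to_s3(local_copy)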
@@ -97,8 +97,8 @@ def test_models(
    "test_suite", [
        ("distilbert/distilgpt2", "ray", "", "L4"),
        ("distilbert/distilgpt2", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
        ("distilbert/distilgpt2", "ray", "", "A100"),
        ("distilbert/distilgpt2", "mp", "", "A100"),
        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
@@ -13,7 +13,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET

MODELS = [
    "distilbert/distilgpt2",

@@ -142,9 +141,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
    metrics_tag_content = stat_logger.labels["model_name"]

    if served_model_name is None or served_model_name == []:
        actual_model_name = model
        assert metrics_tag_content == f"{MODEL_WEIGHTS_S3_BUCKET}/{actual_model_name}", (  # noqa: E501
            f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
        assert metrics_tag_content == model, (
            f"Metrics tag model_name is wrong! expect: {model!r}\n"
            f"actual: {metrics_tag_content!r}")
    else:
        assert metrics_tag_content == served_model_name[0], (
@@ -52,7 +52,7 @@ from .conftest import (get_output_from_llm_generator,
    [{
        # Use a small model for a fast test.
        # Note this is repeated in the test body; to initialize a tokenizer.
        "model": "JackFram/llama-160m",
        "model": "JackFram/llama-68m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

@@ -61,14 +61,14 @@ from .conftest import (get_output_from_llm_generator,
    "per_test_common_llm_kwargs",
    [
        {
            "speculative_model": "JackFram/llama-160m",
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": 5,
            "enable_chunked_prefill": False,
        },
        {
            # Chunked prefill enabled with small value
            # to make sure we get mixed batches.
            "speculative_model": "JackFram/llama-160m",
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": 5,
            "enable_chunked_prefill": True,
            "max_num_batched_tokens": 4,

@@ -119,7 +119,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
            for token_ids in batch_token_ids] == ([output_len] * batch_size)

    # Expect detokenized string to match.
    tok = AutoTokenizer.from_pretrained("JackFram/llama-160m")
    tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
    for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
        expected_tokens = tok.decode(actual_token_ids)
        print(f"{actual_token_ids=}")

@@ -135,20 +135,27 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
        # Print spec metrics.
        "disable_log_stats": False,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
        "model_name": "JackFram/llama-160m",
    },
])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        # Try two different tiny base models.
        # Note that one is equal to the draft model, another isn't.
        {
            "model_name": "JackFram/llama-68m",
        },
        {
            "model_name": "JackFram/llama-160m",
        },
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
    [{
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": False,
        "disable_logprobs_during_spec_decoding": False
    }, {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 3,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -208,7 +215,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
        # Try two different tiny base models.
        # Note that one is equal to the draft model, another isn't.
        {
            "model_name": "JackFram/llama-160m",
            "model_name": "JackFram/llama-68m",
        },
        {
            "model_name": "JackFram/llama-160m",

@@ -217,12 +224,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": False,
    },
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -261,20 +268,27 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
        "model_name": "JackFram/llama-160m",
    },
])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        # Try two different tiny base models.
        # Note that one is equal to the draft model, another isn't.
        {
            "model_name": "JackFram/llama-68m",
        },
        {
            "model_name": "JackFram/llama-160m",
        },
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": False,
    },
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -322,12 +336,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": False,
    },
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -377,12 +391,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": False,
    },
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -435,12 +449,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": False,
    },
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -500,12 +514,12 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": False,
    },
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -553,7 +567,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
    "test_llm_kwargs",
    [
        {
            "speculative_model": "JackFram/llama-160m",
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": 5,

            # Artificially limit the draft model max model len; this forces vLLM

@@ -562,7 +576,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
            "enable_chunked_prefill": False,
        },
        {
            "speculative_model": "JackFram/llama-160m",
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": 5,
            "enable_chunked_prefill": True,
            "max_num_batched_tokens": 4,

@@ -613,13 +627,13 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "speculative_disable_by_batch_size": 2,
        "enable_chunked_prefill": False,
    },
    {
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": 5,
        "speculative_disable_by_batch_size": 2,
        "enable_chunked_prefill": True,

@@ -651,7 +665,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": "JackFram/llama-160m",
        "model_name": "JackFram/llama-68m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

@@ -662,14 +676,14 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
    "test_llm_kwargs",
    [
        {
            "speculative_model": "JackFram/llama-160m",
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": k,
            "enable_chunked_prefill": False,
        }
        # Try a range of common k, as well as large speculation.
        for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
    ] + [{
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": k,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4,

@@ -715,7 +729,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
    "test_llm_kwargs",
    [
        {
            "speculative_model": "JackFram/llama-160m",
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": k,
            "spec_decoding_acceptance_method": "typical_acceptance_sampler",
            "enable_chunked_prefill": False

@@ -723,7 +737,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
        # Try a range of common k.
        for k in [1, 2, 3]
    ] + [{
        "speculative_model": "JackFram/llama-160m",
        "speculative_model": "JackFram/llama-68m",
        "num_speculative_tokens": k,
        "spec_decoding_acceptance_method": "typical_acceptance_sampler",
        "enable_chunked_prefill": True,
@@ -1,71 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
MODELS_ON_S3 = [
    "adept/fuyu-8b",
    "ai21labs/AI21-Jamba-1.5-Mini",
    "ai21labs/Jamba-tiny-random",
    "ai21labs/Jamba-tiny-reward-dev",
    "allenai/Molmo-7B-D-0924",
    "allenai/OLMo-1B-hf",
    "allenai/OLMoE-1B-7B-0924-Instruct",
    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
    "AMead10/Llama-3.2-1B-Instruct-AWQ",
    "ArthurZ/Ilama-3.2-1B",
    "BAAI/bge-base-en-v1.5",
    "BAAI/bge-multilingual-gemma2",
    "BAAI/bge-reranker-v2-m3",
    "bigcode/starcoder2-3b",
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "cross-encoder/quora-roberta-base",
    "deepseek-ai/deepseek-vl2-tiny",
    "distilbert/distilgpt2",
    "facebook/bart-base",
    "facebook/bart-large-cnn",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    "google/gemma-1.1-2b-it",
    "google/gemma-2-2b-it",
    "google/paligemma-3b-pt-224",
    "h2oai/h2ovl-mississippi-800m",
    "HuggingFaceM4/Idefics3-8B-Llama3",
    "internlm/internlm2-1_8b-reward",
    "intfloat/e5-mistral-7b-instruct",
    "intfloat/multilingual-e5-large",
    "JackFram/llama-160m",
    "jason9693/Qwen2.5-1.5B-apeach",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "google/gemma-2-2b-it",
    "google/gemma-1.1-2b-it",
    "openai-community/gpt2",
    "ArthurZ/Ilama-3.2-1B",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "ai21labs/Jamba-tiny-random",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    "nm-testing/Phi-3-mini-128k-instruct-FP8",
    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
    "microsoft/Phi-3.5-vision-instruct",
    "meta-llama/Meta-Llama-3-8B",
    "microsoft/phi-2",
    "microsoft/Phi-3-mini-4k-instruct",
    "microsoft/Phi-3-small-8k-instruct",
    "microsoft/Phi-3-vision-128k-instruct",
    "AMead10/Llama-3.2-1B-Instruct-AWQ",
    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
    "microsoft/Phi-3.5-MoE-instruct",
    "microsoft/Phi-3.5-vision-instruct",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "mistralai/Pixtral-12B-2409",
    "mistral-community/Mixtral-8x22B-v0.1-AWQ",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
    "nm-testing/Phi-3-mini-128k-instruct-FP8",
    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
    "nvidia/NVLM-D-72B",
    "openai-community/gpt2",
    "openai/whisper-large-v3",
    "openbmb/MiniCPM-o-2_6",
    "openbmb/MiniCPM-V-2_6",
    "OpenGVLab/InternVL2-1B",
    "OrionStarAI/Orion-14B-Chat",
    "parasail-ai/GritLM-7B-vllm",
    "Qwen/Qwen1.5-MoE-A2.7B-Chat",
    "Qwen/Qwen2-7B-Instruct",
    "Qwen/Qwen2-Audio-7B-Instruct",
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen/Qwen2.5-Math-PRM-7B",
    "Qwen/Qwen2.5-Math-RM-72B",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "royokong/e5-v",
    "sentence-transformers/all-roberta-large-v1",
    "sentence-transformers/stsb-roberta-base-v2",
    "shanearora/OLMo-7B-1124-hf",
    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
    "ssmits/Qwen2-7B-Instruct-embed-base",
    "stabilityai/stablelm-3b-4e1t",
    "stabilityai/stablelm-zephyr-3b",
    "state-spaces/mamba-130m-hf",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    "THUDM/glm-4v-9b",
    "TIGER-Lab/Mantis-8B-siglip-llama3",
    "TIGER-Lab/VLM2Vec-Full",
    "tiiuae/falcon-40b",
    "tiiuae/falcon-mamba-7b-instruct",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "upstage/solar-pro-preview-instruct",
]

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
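The test_metrics hunk earlier in this diff compares the reported model name against f"{MODEL_WEIGHTS_S3_BUCKET}/{actual_model_name}". A small sketch of that mapping follows; the helper name is hypothetical and not part of vllm.test_utils, and the model list is trimmed down for the example.

# Hypothetical helper showing how a test could rewrite a Hugging Face model ID
# to the CI bucket path; only the two constants come from the file above.
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
MODELS_ON_S3 = ["distilbert/distilgpt2", "JackFram/llama-160m"]  # subset for the example


def resolve_model_path(model_id: str) -> str:
    """Return the CI S3 URI for mirrored models, otherwise the original HF ID."""
    if model_id in MODELS_ON_S3:
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model_id}"
    return model_id


assert resolve_model_path("distilbert/distilgpt2") == (
    "s3://vllm-ci-model-weights/distilbert/distilgpt2")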