mirror of https://github.com/vllm-project/vllm
[mypy] Enable type checking for test directory (#5017)
This commit is contained in:
parent 1b8a0d71cf
commit 0e9164b40a
@@ -47,5 +47,5 @@ jobs:
         mypy vllm/model_executor --config-file pyproject.toml
         mypy vllm/lora --config-file pyproject.toml
         mypy vllm/logging --config-file pyproject.toml
-        mypy vllm/model_executor --config-file pyproject.toml
+        mypy tests --config-file pyproject.toml

@@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,

@@ -200,12 +200,12 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
-    itls = []
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             # We use the tokenizer to count the number of output tokens for all

@@ -265,7 +265,7 @@ async def benchmark(
     disable_tqdm: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
     else:
         raise ValueError(f"Unknown backend: {backend}")

@@ -292,7 +292,7 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

     benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
         request_func_input = RequestFuncInput(

@@ -310,7 +310,7 @@ async def benchmark(
                     pbar=pbar)))
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

-    if not disable_tqdm:
+    if pbar is not None:
         pbar.close()

     benchmark_duration = time.perf_counter() - benchmark_start_time

@@ -466,7 +466,7 @@ def main(args: argparse.Namespace):

     # Save config and results to json
     if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}

         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")

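Note: nearly every hunk in this commit follows the same pattern as the ones above: an
empty list or dict is given an explicit element type, because mypy cannot infer one
from an empty literal. A minimal standalone sketch of the pattern (the function and
variable names below are illustrative, not taken from the diff):

    from typing import List

    def keep_non_negative(samples: List[float]) -> List[float]:
        # Without the annotation, mypy reports "Need type annotation for 'kept'".
        kept: List[float] = []
        for s in samples:
            if s >= 0:
                kept.append(s)
        return kept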
@@ -108,8 +108,8 @@ def run_vllm(
     )

     # Add the requests to the engine.
-    prompts = []
-    sampling_params = []
+    prompts: List[str] = []
+    sampling_params: List[SamplingParams] = []
     for prompt, _, output_len in requests:
         prompts.append(prompt)
         sampling_params.append(

@@ -86,9 +86,9 @@ def dequant_no_scale(
 # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
 # the generic pytorch version.
 # Just visual comparison.
-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:

-    n = parts.sum().item()
+    n = int(parts.sum().item())

     device = torch.device('cuda:0')

@@ -204,7 +204,7 @@ def main():
     sys.stdout = sys.__stdout__


-def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
              methods):

     # I didn't see visible improvements from increasing these, but feel free :)

@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
     print('')


-def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
                nbooks: int, bits: int, method) -> float:

-    n = parts.sum().item()
+    n = int(parts.sum().item())

     device = torch.device('cuda:0')

@@ -1,4 +1,5 @@
 import argparse
+from typing import List

 import torch
 import torch.utils.benchmark as benchmark

@@ -23,8 +24,9 @@ ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]


-def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
-              size_m, size_k, size_n):
+def bench_run(results: List[benchmark.Measurement], model: str,
+              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
+              size_m: int, size_k: int, size_n: int):
     label = "Quant Matmul"

     sub_label = ("{}, act={} k_full={}, b={}, g={}, "

@@ -156,7 +158,7 @@ def main(args):
     for i, model in enumerate(args.models):
         print(f"[{i}] {model}")

-    results = []
+    results: List[benchmark.Measurement] = []

     for model in args.models:
         for layer in WEIGHT_SHAPES[model]:

@@ -1,7 +1,7 @@
 import argparse
 import time
 from datetime import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, TypedDict

 import ray
 import torch

@@ -12,8 +12,17 @@ from transformers import AutoConfig
 from vllm.model_executor.layers.fused_moe.fused_moe import *


+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+
 def benchmark_config(
-    config: Dict[str, int],
+    config: BenchmarkConfig,
     num_tokens: int,
     num_experts: int,
     shard_intermediate_size: int,

@@ -92,7 +101,7 @@ def benchmark_config(
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)

-    latencies = []
+    latencies: List[float] = []
     for i in range(num_iters):
         prepare(i)
         torch.cuda.synchronize()

@@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
     # Reduced search space for faster tuning.
     # TODO(woosuk): Increase the search space and use a performance model to
     # prune the search space.
-    configs = []
+    configs: List[BenchmarkConfig] = []
    for num_stages in [2, 3, 4, 5]:
        for block_m in [16, 32, 64, 128, 256]:
            for block_k in [64, 128, 256]:

@@ -175,8 +184,8 @@ class BenchmarkWorker:
         topk: int,
         dtype: torch.dtype,
         use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
+        search_space: List[BenchmarkConfig],
+    ) -> BenchmarkConfig:
         best_config = None
         best_time = float("inf")
         for config in tqdm(search_space):

@@ -199,10 +208,11 @@ class BenchmarkWorker:
                 best_config = config
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
         return best_config


-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],

@@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:


 def save_configs(
-    configs: Dict[int, Dict[str, int]],
+    configs: Dict[int, BenchmarkConfig],
     num_experts: int,
     shard_intermediate_size: int,
     hidden_size: int,

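Note: benchmark_moe.py previously passed tuning configs around as plain Dict[str, int];
the BenchmarkConfig TypedDict introduced above lets mypy check the individual keys.
A rough standalone illustration of why a TypedDict helps (TileConfig and apply are
made-up names, not part of the diff):

    from typing import TypedDict

    class TileConfig(TypedDict):
        BLOCK_SIZE_M: int
        num_warps: int

    def apply(config: TileConfig) -> int:
        # Both keys are known to exist and to be ints; a typo such as
        # config["BLOCK_SIZE_X"] would be flagged by mypy at check time.
        return config["BLOCK_SIZE_M"] * config["num_warps"]

    print(apply({"BLOCK_SIZE_M": 64, "num_warps": 4}))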
@@ -1,7 +1,7 @@
 import argparse
 import random
 import time
-from typing import Optional
+from typing import List, Optional

 import torch

@@ -54,14 +54,17 @@ def main(

     # Create the block tables.
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
+    block_tables_lst: List[List[int]] = []
     for _ in range(num_seqs):
         block_table = [
             random.randint(0, NUM_BLOCKS - 1)
             for _ in range(max_num_blocks_per_seq)
         ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst,
+                                dtype=torch.int,
+                                device=device)

     # Create the KV cache.
     key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,

@@ -1,11 +1,12 @@
 import argparse
 from itertools import accumulate
-from typing import Optional
+from typing import List, Optional

 import nvtx
 import torch

-from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
+                                                         get_rope)


 def benchmark_rope_kernels_multi_lora(

@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
     })
     # non-batched RoPE takes only one scaling factor, we create multiple
     # instances to simulate the same behavior
-    non_batched_ropes = []
+    non_batched_ropes: List[RotaryEmbedding] = []
     for scaling_factor in scaling_factors:
         non_batched_ropes.append(
             get_rope(head_size, rotary_dim, max_position, base, is_neox_style,

@@ -2,7 +2,7 @@ import argparse
 import glob
 import json
 import os
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple

 import numpy as np
 import torch

@@ -19,7 +19,7 @@ def _prepare_hf_weights(
     quantized_model_dir: str,
     load_format: str = "auto",
     fall_back_to_pt: bool = True,
-) -> Tuple[str, List[str], bool]:
+) -> Tuple[List[str], bool]:
     if not os.path.isdir(quantized_model_dir):
         raise FileNotFoundError(
             f"The quantized model directory `{quantized_model_dir}` "

@@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str,


 def _kv_scales_extractor(
-        hf_tensor_files: Iterable[str],
+        hf_tensor_files: List[str],
         use_safetensors: bool,
         rank_keyword: str = "rank",
         expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:

@@ -115,7 +115,7 @@ def _kv_scales_extractor(
     for char in rank_keyword:
         assert not char.isdecimal(
         ), f"Rank keyword {rank_keyword} contains a numeric character!"
-    rank_scales_map = {}
+    rank_scales_map: Dict[int, Dict[int, float]] = {}
     for tensor_file in hf_tensor_files:
         try:
             rank_idx = tensor_file.find(rank_keyword)

@@ -141,7 +141,7 @@ def _kv_scales_extractor(
             raise

         if rank not in rank_scales_map:
-            layer_scales_map = {}
+            layer_scales_map: Dict[int, float] = {}
             rank_scales_map[rank] = layer_scales_map
         else:
             raise RuntimeError(

@@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str,
                         "does not exist.")
     metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))

-    result = {}
+    result: Dict[str, Any] = {}
     for file in metadata_files:
         with open(file) as f:
             try:

@@ -5,7 +5,7 @@ distributively on a multi-nodes cluster.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """

-from typing import Dict
+from typing import Any, Dict, List

 import numpy as np
 import ray

@@ -40,8 +40,8 @@ class LLMPredictor:
         # The output is a list of RequestOutput objects that contain the prompt,
         # generated text, and other information.
         outputs = self.llm.generate(batch["text"], sampling_params)
-        prompt = []
-        generated_text = []
+        prompt: List[str] = []
+        generated_text: List[str] = []
         for output in outputs:
             prompt.append(output.prompt)
             generated_text.append(' '.join([o.text for o in output.outputs]))

@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
             pg, placement_group_capture_child_tasks=True))


-resources_kwarg = {}
+resources_kwarg: Dict[str, Any] = {}
 if tensor_parallel_size == 1:
     # For tensor_parallel_size == 1, we simply set num_gpus=1.
     resources_kwarg["num_gpus"] = 1

@@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/model_executor --config-file pyproject.toml
 mypy vllm/lora --config-file pyproject.toml
 mypy vllm/logging --config-file pyproject.toml
-mypy vllm/model_executor --config-file pyproject.toml
+mypy tests --config-file pyproject.toml


 # If git diff returns a file that is in the skip list, the file may be checked anyway:

@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 from vllm.core.block.block_table import BlockTable

@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
     token_ids = list(range(sequence_len))
     num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

-    block_tables = []
+    block_tables: List[BlockTable] = []
     for i in range(5):
         assert allocator.get_num_free_blocks(
             device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc

@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
     num_immutable_blocks_per_alloc = len(
         chunked_tokens) - num_mutable_blocks_per_alloc

-    block_tables = []
+    block_tables: List[BlockTable] = []
     for alloc_i in range(1, 6):

         block_tables.append(

@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
     )
     block_table.allocate(token_ids=token_ids, device=Device.GPU)

-    appended_so_far = []
+    appended_so_far: List[int] = []
     for append in chunk_list(token_ids_to_append, append_size):
         block_table.append_token_ids(append)
         appended_so_far.extend(append)

@@ -123,7 +123,7 @@ class TestPrefixCachingBlock:
             num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
         """Helper method which creates a chain of blocks.
         """
-        blocks = []
+        blocks: List[PrefixCachingBlock] = []
         num_blocks = math.ceil(
             len(token_ids) / block_size) + num_empty_trailing_blocks

@@ -608,7 +608,7 @@ class TestPrefixCachingBlockAllocator:
     ) -> List[PrefixCachingBlock]:
         """Helper method which creates a chain of blocks.
         """
-        blocks = []
+        blocks: List[Block] = []
         num_blocks = math.ceil(len(token_ids) / block_size)

         if num_blocks == 0:

@@ -483,11 +483,11 @@ def test_chunked_prefill_preempt():
     # The request should be preempted.
     scheduler.block_manager.can_append_slots = MagicMock()

-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group1(seq_group, num_lookahead_slots):
         return seq_group.request_id != "1"

     scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group1)

     # The running prefill is now preempted.
     _, out = schedule_and_update_computed_tokens(scheduler)

@@ -505,11 +505,11 @@ def test_chunked_prefill_preempt():
     assert seq_group.get_num_uncomputed_tokens() == 30

     # We should be able to run prefill twice as it is chunked.
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group2(seq_group, num_lookahead_slots):
         return True

     scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group2)
     _, out = schedule_and_update_computed_tokens(scheduler)
     assert len(out.scheduled_seq_groups) == 1
     assert out.num_prefill_groups == 1

@@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs():
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
     scheduler = Scheduler(scheduler_config, cache_config, None)
-    running = []
+    running: List[SequenceGroup] = []

     _, seq_group = create_dummy_prompt("1", prompt_length=65)
     scheduler.add_seq_group(seq_group)

@@ -1,6 +1,6 @@
 import time
 from collections import deque
-from typing import List
+from typing import Deque, List, Set, Tuple
 from unittest.mock import MagicMock

 import pytest  # noqa

@@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group():

     # Add multiple seq groups to scheduler.
     num_seq_group = 4
-    request_ids = set()
+    request_ids: Set[str] = set()
     for i in range(num_seq_group):
         _, seq_group = create_dummy_prompt(str(i), block_size)
         scheduler.add_seq_group(seq_group)

@@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len():
     Test prompt longer than max_prompt_len is aborted.
     """
     scheduler = initialize_scheduler(max_model_len=30)
-    _, seq_group = create_dummy_prompt(0, prompt_length=60)
+    _, seq_group = create_dummy_prompt("0", prompt_length=60)
     waiting = deque([seq_group])
     budget = create_token_budget()
     remaining_waiting, output = scheduler._schedule_prefills(

@@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget():
     Test token budget respected.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(token_budget=0)
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)

@@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs():
     Test max seq respected.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(max_num_seqs=2)
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)

@@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora():
     """
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
     scheduler = initialize_scheduler(lora_config=lora_config)
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(token_budget=120)
-    curr_loras = set()
+    curr_loras: Set[int] = set()
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,

@@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity():
     Test sequence cannot be scheduled due to block manager has no capacity.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget()
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)

@@ -536,7 +536,7 @@ def test_decode_schedule_preempted():
     Test decodes cannot be scheduled and preempted.
     """
     scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     for i in range(3):

@@ -577,7 +577,7 @@ def test_decode_swap_beam_search():
     Test best_of > 1 swap out blocks
     """
     scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     budget = create_token_budget()

@@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update():
     """
     scheduler = initialize_scheduler()
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     scheduler._allocate_and_set_running(seq_group)

@@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update():

 def test_schedule_swapped_simple():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)

@@ -683,10 +683,10 @@ def test_schedule_swapped_simple():

 def test_schedule_swapped_max_token_budget():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)

@@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget():

 def test_schedule_swapped_max_seqs():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(4):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
         scheduler._allocate_and_set_running(seq_group)

@@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs():
 def test_schedule_swapped_max_loras():
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
     scheduler = initialize_scheduler(lora_config=lora_config)
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
-    curr_loras = set()
-    blocks_to_swap_out = []
+    curr_loras: Set[int] = set()
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,

@@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras():

 def test_schedule_swapped_cannot_swap_in():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)

@@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in():

 def test_infeasible_swap():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)

@@ -834,13 +834,13 @@ def test_infeasible_swap():

 def test_schedule_swapped_blocks_to_copy():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     scheduler._swap_out(seq_group, blocks_to_swap_out)
     swapped.append(seq_group)

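Note: the scheduler tests above build their work queues from bare deque(), set() and []
literals, which mypy cannot type on its own; hence the Deque[SequenceGroup], Set[str]
and List[Tuple[int, int]] annotations. The same idea in isolation (the element types
here are placeholders, not taken from vllm):

    from collections import deque
    from typing import Deque, List, Set, Tuple

    waiting: Deque[str] = deque()            # an empty deque gives mypy no element type
    request_ids: Set[str] = set()            # likewise for an empty set
    blocks_to_swap_out: List[Tuple[int, int]] = []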
@@ -1,5 +1,7 @@
 import time
-from typing import Iterable, Optional, Tuple
+from typing import List, Optional
+from typing import Sequence as GenericSequence
+from typing import Tuple

 from vllm import SamplingParams
 from vllm.lora.request import LoRARequest

@@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder(
     lora_request: Optional[LoRARequest] = None,
     use_beam_search: bool = False,
     best_of: int = 1,
-) -> Tuple[Sequence, SequenceGroup]:
+) -> Tuple[Sequence, Sequence, SequenceGroup]:
     if not block_size:
         block_size = decoder_prompt_length

@@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder(

 def create_seq_group(
         seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
         request_id: str = '0',
         seq_id_start: int = 0,
         sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:

@@ -98,7 +100,7 @@ def create_seq_group(

     prompt_token_ids = [0] * seq_prompt_len

-    seqs = []
+    seqs: List[Sequence] = []
     for seq_id_offset, output_len in enumerate(seq_output_lens):
         seq = Sequence(
             seq_id=seq_id_start + seq_id_offset,

@@ -125,7 +127,7 @@ def create_seq_group(

 def create_seq_group_encoder_decoder(
         seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
         request_id: str = '0',
         seq_id_start: int = 0,
         sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:

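Note: tests/core/utils.py also uses vllm's own Sequence class, so typing.Sequence is
re-imported above under the alias GenericSequence to keep the two names apart. A sketch
of the same trick with a stand-in class (names other than GenericSequence are
illustrative):

    from typing import Sequence as GenericSequence

    class Sequence:  # stand-in for vllm.sequence.Sequence
        pass

    def total_len(lengths: GenericSequence[int]) -> int:
        # Accepts any read-only sequence: list, tuple, range, ...
        return sum(lengths)

    print(total_len((128, 256)))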
@@ -1,5 +1,6 @@
 import multiprocessing
 import os
+from typing import Dict, List

 import pytest
 import torch

@@ -17,9 +18,9 @@ from vllm.utils import update_environment_variables

 def distributed_run(fn, world_size):
     number_of_processes = world_size
-    processes = []
+    processes: List[multiprocessing.Process] = []
     for i in range(number_of_processes):
-        env = {}
+        env: Dict[str, str] = {}
         env['RANK'] = str(i)
         env['LOCAL_RANK'] = str(i)
         env['WORLD_SIZE'] = str(number_of_processes)

@@ -6,7 +6,7 @@ from vllm.utils import cuda_device_count_stateless


 @ray.remote
-class _CUDADeviceCountStatelessTestActor():
+class _CUDADeviceCountStatelessTestActor:

     def get_count(self):
         return cuda_device_count_stateless()

@@ -22,7 +22,8 @@ def test_cuda_device_count_stateless():
     """Test that cuda_device_count_stateless changes return value if
     CUDA_VISIBLE_DEVICES is changed."""

-    actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
+    actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
+        num_gpus=2).remote()
     assert sorted(ray.get(
         actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"]
     assert ray.get(actor.get_count.remote()) == 2

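Note: ray.remote wraps the actor class in a dynamically generated handle, so mypy may
not recognise .options() on it; the diff silences that single call with a trailing
"# type: ignore" rather than excluding the whole file. A sketch of the pattern (Counter
is a made-up actor, and whether the ignore is needed depends on the installed ray
version and its type stubs):

    import ray

    @ray.remote
    class Counter:

        def get(self) -> int:
            return 1

    counter = Counter.options(num_cpus=1).remote()  # type: ignore
    print(ray.get(counter.get.remote()))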
@@ -1,6 +1,7 @@
 # imports for guided decoding tests
 import json
 import re
+from typing import List

 import jsonschema
 import openai  # use the official client for correctness check

@@ -453,7 +454,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
        max_tokens=5,
        temperature=0.0,
        stream=True)
-    chunks = []
+    chunks: List[str] = []
     finish_reason_count = 0
     async for chunk in stream:
         chunks.append(chunk.choices[0].text)

@@ -499,7 +500,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
         temperature=0.0,
         stream=True,
     )
-    chunks = []
+    chunks: List[str] = []
     finish_reason_count = 0
     async for chunk in stream:
         delta = chunk.choices[0].delta

@@ -72,27 +72,27 @@ def ref_single_query_cached_kv_attention(
     block_size = value_cache.shape[3]
     num_seqs = query.shape[0]

-    block_tables = block_tables.cpu().tolist()
-    seq_lens = seq_lens.cpu().tolist()
+    block_tables_lst = block_tables.cpu().tolist()
+    seq_lens_lst = seq_lens.cpu().tolist()
     for i in range(num_seqs):
         q = query[i].unsqueeze(0)
-        block_table = block_tables[i]
-        seq_len = int(seq_lens[i])
+        block_table = block_tables_lst[i]
+        seq_len = int(seq_lens_lst[i])

-        keys = []
-        values = []
+        keys_lst: List[torch.Tensor] = []
+        values_lst: List[torch.Tensor] = []
         for j in range(seq_len):
             block_number = int(block_table[j // block_size])
             block_offset = j % block_size

             k = key_cache[block_number, :, :, block_offset, :]
             k = k.reshape(num_kv_heads, head_size)
-            keys.append(k)
+            keys_lst.append(k)

             v = value_cache[block_number, :, :, block_offset]
-            values.append(v)
-        keys = torch.stack(keys, dim=0)
-        values = torch.stack(values, dim=0)
+            values_lst.append(v)
+        keys = torch.stack(keys_lst, dim=0)
+        values = torch.stack(values_lst, dim=0)
         if num_queries_per_kv > 1:
             # Handle MQA and GQA
             keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)

@@ -157,14 +157,15 @@ def test_paged_attention(

     # Create the block tables.
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
+    block_tables_lst: List[List[int]] = []
     for _ in range(num_seqs):
         block_table = [
             random.randint(0, NUM_BLOCKS - 1)
             for _ in range(max_num_blocks_per_seq)
         ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int)
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst, dtype=torch.int)

     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,

@@ -283,7 +284,7 @@ def ref_multi_query_kv_attention(
     dtype: torch.dtype,
 ) -> torch.Tensor:
     num_seqs = len(cu_seq_lens) - 1
-    ref_outputs = []
+    ref_outputs: List[torch.Tensor] = []
     for i in range(num_seqs):
         start_idx = cu_seq_lens[i]
         end_idx = cu_seq_lens[i + 1]

@@ -303,8 +304,8 @@ def ref_multi_query_kv_attention(
             attn_mask=attn_mask,
         )
         ref_outputs.append(ref_output)
-    ref_output = torch.cat(ref_outputs, dim=0)
-    return ref_output
+
+    return torch.cat(ref_outputs, dim=0)


 # TODO(woosuk): Add tests for USE_ALIBI=True.

@@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention(
     block_size = value_cache.shape[3]
     num_seqs = query.shape[0]

-    block_tables = block_tables.cpu().tolist()
-    seq_lens = seq_lens.cpu().tolist()
+    block_tables_lst = block_tables.cpu().tolist()
+    seq_lens_lst = seq_lens.cpu().tolist()
     for i in range(num_seqs):
         q = query[i].unsqueeze(0)
-        block_table = block_tables[i]
-        seq_len = int(seq_lens[i])
+        block_table = block_tables_lst[i]
+        seq_len = int(seq_lens_lst[i])

-        keys = []
-        values = []
+        keys_lst: List[torch.Tensor] = []
+        values_lst: List[torch.Tensor] = []
         for j in range(seq_len):
             block_number = int(block_table[j // block_size])
             block_offset = j % block_size

             k = key_cache[block_number, :, :, block_offset, :]
             k = k.reshape(num_kv_heads, head_size)
-            keys.append(k)
+            keys_lst.append(k)

             v = value_cache[block_number, :, :, block_offset]
-            values.append(v)
-        keys = torch.stack(keys, dim=0)
-        values = torch.stack(values, dim=0)
+            values_lst.append(v)
+        keys = torch.stack(keys_lst, dim=0)
+        values = torch.stack(values_lst, dim=0)
         if num_queries_per_kv > 1:
             # Handle MQA and GQA
             keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)

@@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill(
     value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)

     ref_output = ref_multi_query_kv_attention(
-        cu_seq_lens,
+        cu_seq_lens.tolist(),
         query,
         key,
         value,

@@ -1,5 +1,5 @@
 import random
-from typing import Tuple
+from typing import List, Tuple

 import pytest
 import torch

@@ -63,7 +63,7 @@ def test_copy_blocks(
     src_blocks = random.sample(range(num_blocks), num_mappings)
     remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
     dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
-    block_mapping = []
+    block_mapping: List[Tuple[int, int]] = []
     for i in range(num_mappings):
         src = src_blocks[i]
         dst1 = dst_blocks[2 * i]

@@ -131,8 +131,8 @@ def test_reshape_and_cache(
     torch.set_default_device(device)
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
-    slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long)

     qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
     _, key, value = qkv.unbind(dim=1)

@@ -170,12 +170,12 @@ def test_reshape_and_cache(
     # Run the reference implementation.
     reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
     block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    block_indicies = block_indicies.cpu().tolist()
+    block_indicies_lst = block_indicies.cpu().tolist()
     block_offsets = slot_mapping % block_size
-    block_offsets = block_offsets.cpu().tolist()
+    block_offsets_lst = block_offsets.cpu().tolist()
     for i in range(num_tokens):
-        block_idx = block_indicies[i]
-        block_offset = block_offsets[i]
+        block_idx = block_indicies_lst[i]
+        block_offset = block_offsets_lst[i]
         cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
         cloned_value_cache[block_idx, :, :, block_offset] = value[i]

@@ -224,8 +224,10 @@ def test_reshape_and_cache_flash(

     # Create a random slot mapping.
     num_slots = block_size * num_blocks
-    slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device)
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst,
+                                dtype=torch.long,
+                                device=device)

     qkv = torch.randn(num_tokens,
                       3,

@@ -257,13 +259,13 @@ def test_reshape_and_cache_flash(
                                           slot_mapping, kv_cache_dtype)

     # Run the reference implementation.
-    block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor')
-    block_indicies = block_indicies.cpu().tolist()
+    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
+    block_indicies_lst = block_indicies.cpu().tolist()
     block_offsets = slot_mapping % block_size
-    block_offsets = block_offsets.cpu().tolist()
+    block_offsets_lst = block_offsets.cpu().tolist()
     for i in range(num_tokens):
-        block_idx = block_indicies[i]
-        block_offset = block_offsets[i]
+        block_idx = block_indicies_lst[i]
+        block_offset = block_offsets_lst[i]
         cloned_key_cache[block_idx, block_offset, :, :] = key[i]
         cloned_value_cache[block_idx, block_offset, :, :] = value[i]

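Note: several kernel tests reused a single variable for both the intermediate Python
list and the tensor built from it (block_tables, slot_mapping, keys, values, ...).
mypy assigns one type per variable, so the diff gives the list a *_lst name and keeps
the original name for the tensor. A minimal illustration of the pattern:

    from typing import List

    import torch

    block_tables_lst: List[List[int]] = [[0, 1], [2, 3]]
    # Reusing the name block_tables for both the list and the tensor would make
    # mypy see two incompatible types for the same variable.
    block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
    print(block_tables.shape)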
@@ -17,13 +17,13 @@ capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]


-def to_fp8(tensor: torch.tensor):
+def to_fp8(tensor: torch.Tensor):
     finfo = torch.finfo(torch.float8_e4m3fn)
     return torch.round(tensor.clamp(
         min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)


-def to_int8(tensor: torch.tensor):
+def to_int8(tensor: torch.Tensor):
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)

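Note: a few helpers annotated arguments with torch.tensor, which is the factory
function rather than a type; the diff switches them to the torch.Tensor class so the
annotation is meaningful to mypy. For example, mirroring the to_int8 helper above
(the return annotation is added here for clarity and is not part of the diff):

    import torch

    def to_int8(tensor: torch.Tensor) -> torch.Tensor:
        # Clamp to the int8 range before casting.
        return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)

    print(to_int8(torch.tensor([300.0, -300.0, 7.4])))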
@@ -25,7 +25,7 @@ def ref_paged_attn(
     block_tables = block_tables.cpu().numpy()
     _, block_size, num_kv_heads, head_size = key_cache.shape

-    outputs = []
+    outputs: List[torch.Tensor] = []
     start_idx = 0
     for i in range(num_seqs):
         query_len = query_lens[i]

@@ -70,7 +70,7 @@ def ref_paged_attn(
 @pytest.mark.parametrize("dtype", DTYPES)
 @torch.inference_mode
 def test_flash_attn_with_paged_kv(
-    kv_lens: List[Tuple[int, int]],
+    kv_lens: List[int],
     num_heads: Tuple[int, int],
     head_size: int,
     dtype: torch.dtype,

@@ -1,5 +1,5 @@
 from itertools import accumulate, product
-from typing import List, Optional
+from typing import Dict, List, Optional

 import pytest
 import torch

@@ -126,7 +126,7 @@ def test_batched_rotary_embedding(
         query,
         key,
         offsets=torch.zeros(batch_size * seq_len,
-                            dtype=int,
+                            dtype=torch.long,
                             device=device))
     # Compare the results.
     assert torch.allclose(out_query,

@@ -214,20 +214,16 @@ def test_batched_rotary_embedding_multi_lora(
 def test_rope_module_cache():
     MAX_POSITIONS = [123, 1234]
     BASES = [10000, 1000000]
-    ROPE_SCALINGS = [
-        None, {
-            "type": "linear",
-            "factor": (1, )
-        }, {
-            "type": "dynamic",
-            "factor": 1
-        }
-    ]
-    settings = [
-        HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
-        ROPE_SCALINGS, DTYPES
-    ]
-    rope_setting_id_map = {}
+    ROPE_SCALINGS = (None, {
+        "type": "linear",
+        "factor": (1, )
+    }, {
+        "type": "dynamic",
+        "factor": 1
+    })
+    settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
+                ROPE_SCALINGS, DTYPES)
+    rope_setting_id_map: Dict[str, int] = {}
     for setting in product(*settings):
         head_size, rotary_dim, max_position, base, \
             is_neox_stype, rope_scaling, dtype = setting

@@ -2,6 +2,7 @@ import contextlib
 import gc
 import tempfile
 from collections import OrderedDict
+from typing import Dict, List, TypedDict
 from unittest.mock import MagicMock, patch

 import pytest

@@ -24,7 +25,18 @@ from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model

-LONG_LORA_INFOS = [{
+
+class ContextIDInfo(TypedDict):
+    lora_id: int
+    context_length: str
+
+
+class ContextInfo(TypedDict):
+    lora: str
+    context_length: str
+
+
+LONG_LORA_INFOS: List[ContextIDInfo] = [{
     "lora_id": 1,
     "context_length": "16k",
 }, {

@@ -207,7 +219,7 @@ def long_context_infos(long_context_lora_files_16k_1,
                        long_context_lora_files_16k_2,
                        long_context_lora_files_32k):
     cleanup()
-    infos = {}
+    infos: Dict[int, ContextInfo] = {}
     for lora_checkpoint_info in LONG_LORA_INFOS:
         lora_id = lora_checkpoint_info["lora_id"]
         if lora_id == 1:

@@ -226,7 +238,7 @@ def long_context_infos(long_context_lora_files_16k_1,


 @pytest.fixture
-def llama_2_7b_engine_extra_embeddings() -> nn.Module:
+def llama_2_7b_engine_extra_embeddings():
     cleanup()
     get_model_old = get_model

@@ -244,7 +256,6 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module:


 @pytest.fixture
-def llama_2_7b_model_extra_embeddings(
-        llama_2_7b_engine_extra_embeddings) -> nn.Module:
+def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
     yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
            model_runner.model)

File diff suppressed because one or more lines are too long
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 import vllm

@@ -10,7 +12,7 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(

@@ -30,7 +32,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()

@@ -1,3 +1,5 @@
+from typing import List
+
 import vllm
 from vllm.lora.request import LoRARequest

@@ -6,7 +8,7 @@ MODEL_PATH = "THUDM/chatglm3-6b"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(

@@ -26,7 +28,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()

@@ -1,10 +1,12 @@
+from typing import List
+
 import vllm
 from vllm.lora.request import LoRARequest

 MODEL_PATH = "google/gemma-7b"


-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         "Quote: Imagination is",
         "Quote: Be yourself;",

@@ -17,7 +19,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()

@@ -26,7 +26,7 @@ def get_lora_model(model_id: str, target_modules: List[str], rank: int):
     return lora_model


-def do_sample(llm,
+def do_sample(llm: vllm.LLM,
               lora_path: Optional[str] = None,
               lora_id: Optional[int] = None,
               logprobs: int = 0,

@@ -42,8 +42,8 @@ def do_sample(llm,
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
-    generated_logprobs = []
+    generated_texts: List[str] = []
+    generated_logprobs: List[List[List[int]]] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text

@@ -109,7 +109,7 @@ def populate_loras(

     for slot_idx, lora_id in enumerate(id_to_index):
         if lora_id is not None:
-            subloras = []
+            subloras: List[LoRALayerWeights] = []
             sublora_len = layer_weights.shape[0] // repeats
             for i in range(repeats):
                 sublora = DummyLoRAManager().init_random_lora(

@@ -158,7 +158,10 @@ def create_random_inputs(

     low, high = input_range

-    inputs, index_mapping, prompt_mapping = [], [], []
+    inputs: List[torch.Tensor] = []
+    index_mapping: List[int] = []
+    prompt_mapping: List[int] = []
+
     for _ in range(num_inputs):
         if input_type == torch.int:
             inputs.append(

@@ -222,7 +225,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:

     lora_result = lora_embedding(torch.cat(inputs))

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         lora = lora_dict[lora_id]
         result = embedding(input_)

@@ -356,7 +359,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,

     lora_result = lora_embedding(torch.cat(original_inputs))

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, original_input_, lora_id in zip(inputs, original_inputs,
                                                 prompt_mapping):
         lora = lora_dict[lora_id]

@@ -482,7 +485,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,

     logits_processor.org_vocab_size = (vocab_size +
                                        lora_config.lora_extra_vocab_size)
-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         lora = lora_dict[lora_id]
         result = logits_processor._get_logits(hidden_states=input_,

@@ -598,7 +601,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,

     lora_result = lora_linear(torch.cat(inputs))[0]

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         lora = lora_dict[lora_id]
         result = linear(input_)[0]

@@ -729,7 +732,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,

     lora_result = lora_linear(torch.cat(inputs))[0]

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         result = linear(input_)[0]
         subloras = sublora_dict[lora_id]

@@ -885,9 +888,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed):
     computed_added_vocab_size = 0
     vocab_size_padded = -1

-    all_org_tokens = []
-    all_added_tokens = []
-    token_ids = []
+    all_org_tokens: List[int] = []
+    all_added_tokens: List[int] = []
+    token_ids: List[int] = []

     for tp_rank in range(tp_size):
         with patch(

@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 import ray

@@ -9,7 +11,7 @@ from .conftest import cleanup
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"


-def do_sample(llm, lora_path: str, lora_id: int):
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501

@@ -27,7 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int):
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text

@@ -77,7 +77,7 @@ def evaluate_json_response(model_response, golden_response):


 def generate(
-    llm,
+    llm: vllm.LLM,
     inputs: Tuple[str, SamplingParams, Optional[LoRARequest]],
 ):
     prompts, sampling_param, lora_request = inputs

@@ -159,7 +159,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
     non-batched generation.
     """
     # Create non batched results first to compare against batched results
-    non_batched_results = []
+    non_batched_results: List[str] = []

     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]

@@ -172,7 +172,8 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
     # Create batched results
     # Each element of the batch must be
     # (prompt, prompt_sampling_params, prompt_lora_request)
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         batched_prompts.extend([

@@ -196,7 +197,8 @@ def test_self_consistency(lora_llm, long_context_infos):
     num_loras = len(long_context_infos)

     # Create results in order of long_context_infos
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         batched_prompts.extend([

@@ -244,7 +246,7 @@ def test_quality(lora_llm, long_context_infos):
     The test is expected to run for about 1 minute on a p4de.24xlarge
     instance.
     """
-    scores = []
+    scores: List[float] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         for prompt_and_response in prompts_and_responses[context_len]:

@@ -277,7 +279,8 @@ def test_max_len(lora_llm, long_context_infos):
         generate(lora_llm, (bad_prompt, sampling_params, lora_request))

     # Also test batched
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id_with_bad_inputs in long_context_infos:
         for lora_id, info in long_context_infos.items():
             context_len = info["context_length"]

@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 from vllm.lora.models import LoRAModel

@@ -17,7 +19,7 @@ def test_load_checkpoints(
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
     embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
     embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
-    expected_lora_modules = []
+    expected_lora_modules: List[str] = []
     for module in supported_lora_modules:
         if module in packed_modules_mapping:
             expected_lora_modules.extend(packed_modules_mapping[module])

@@ -1,5 +1,5 @@
 import os
-from typing import List
+from typing import Dict, List

 import pytest
 import torch

@@ -62,7 +62,7 @@ def test_from_lora_tensors(sql_lora_files):

 def create_lora(lora_id: int, model: nn.Module,
                 sub_modules: List[str]) -> LoRAModel:
-    loras = {}
+    loras: Dict[str, LoRALayerWeights] = {}
     for name in sub_modules:
         w = model.get_submodule(name).weight
         loras[name] = LoRALayerWeights(

@@ -83,7 +83,7 @@ def create_packed_lora(
     empty_replaced_module_name=None,
 ) -> LoRAModel:
     w = model.get_submodule(module_name).weight
-    loras = {}
+    loras: Dict[str, LoRALayerWeights] = {}
     for replaced_module_name in replaced_module_names:
         if replaced_module_name == empty_replaced_module_name:
             continue

@ -1,3 +1,5 @@
|
|||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
@ -7,7 +9,7 @@ from vllm.lora.request import LoRARequest
|
|||
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int):
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
|
||||
prompts = [
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
@@ -20,7 +22,7 @@ def do_sample(llm, lora_path: str, lora_id: int):
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts = []
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()

@@ -1,3 +1,5 @@
from typing import List

import vllm
from vllm.lora.request import LoRARequest

@@ -6,7 +8,7 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501


def do_sample(llm, lora_path: str, lora_id: int) -> str:
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
prompts = [
PROMPT_TEMPLATE.format(
sql_prompt=
@@ -35,7 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
if lora_id else None,
)
# Print the outputs.
generated_texts = []
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()

@@ -25,7 +25,10 @@ MODELS: List[ModelWithQuantization] = [
]


def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
max_tokens: int = 256) -> List[str]:
raw_prompts = [
"Give me an orange-ish brown color",
"Give me a neon pink color",
@@ -45,7 +48,7 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts = []
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text

@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import Dict, List, Optional

import torch

@@ -9,13 +9,13 @@ class DummyLoRAManager:

def __init__(self):
super().__init__()
self._loras = {}
self._loras: Dict[str, LoRALayerWeights] = {}

def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
self._loras[module_name] = lora

def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
return self._loras.get(module_name, None)
def get_module_lora(self, module_name: str) -> LoRALayerWeights:
return self._loras[module_name]

def init_random_lora(self,
module_name: str,
@@ -68,11 +68,11 @@ class DummyLoRAManager:
module_name: str,
input_dim: int,
output_dims: List[int],
noop_lora_index: List[int] = None,
rank=8,
noop_lora_index: Optional[List[int]] = None,
rank: int = 8,
):
base_loras = []
noop_lora_index = set(noop_lora_index or [])
base_loras: List[LoRALayerWeights] = []
noop_lora_index_set = set(noop_lora_index or [])

for i, out_dim in enumerate(output_dims):
base_lora = self.init_lora(
@@ -80,7 +80,7 @@ class DummyLoRAManager:
input_dim,
out_dim,
rank=rank,
noop=i in noop_lora_index,
noop=i in noop_lora_index_set,
)
base_loras.append(base_lora)
packed_lora = PackedLoRALayerWeights.pack(base_loras)

@@ -3,6 +3,7 @@
Note: these tests will only pass on L4 GPU.
"""
import os
from typing import List

import pytest
import torch
@@ -100,7 +101,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
]

params = SamplingParams(max_tokens=20, temperature=0)
generations = []
generations: List[str] = []
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:

@@ -2,8 +2,11 @@

Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
from typing import List

import pytest

from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device

@@ -43,7 +46,7 @@ def test_block_allocator(
def test_eviction(num_blocks: int, ):
block_size = 16
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
blocks = []
blocks: List[PhysicalTokenBlock] = []

for i in range(num_blocks):
# use i as the block_hash

@@ -4,6 +4,7 @@ Run `pytest tests/quantization/test_configs.py --forked`.
"""

from dataclasses import dataclass
from typing import Tuple

import pytest

@@ -51,7 +52,7 @@ MODEL_ARG_EXPTYPES = [


@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: str) -> None:
def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:
model_path, quantization_arg, expected_type = model_arg_exptype

try:

@@ -1,3 +1,5 @@
from typing import List

import pytest
import torch

@@ -62,21 +64,22 @@ def test_get_prompt_logprobs(
for logprobs in result.outputs[0].logprobs:
assert len(logprobs) == num_top_logprobs
output_text = result.outputs[0].text
output_string_from_most_likely_tokens = []
output_string_from_most_likely_tokens_lst: List[str] = []
for top_logprobs in result.outputs[0].logprobs:
top_logprob = next(iter(top_logprobs.values()))
output_string_from_most_likely_tokens.append(
output_string_from_most_likely_tokens_lst.append(
top_logprob.decoded_token)

if detokenize:
output_string_from_most_likely_tokens = "".join(
output_string_from_most_likely_tokens)
output_string_from_most_likely_tokens_lst)
assert output_text == output_string_from_most_likely_tokens, (
"The output text from the top logprob for each token position "
"should be the same as the output text in the result.")
else:
assert output_text == ''
assert output_string_from_most_likely_tokens == [None] * max_tokens
assert output_string_from_most_likely_tokens_lst == ([None] *
max_tokens)

# The first prompt logprob is always None
assert result.prompt_logprobs[0] is None

@@ -246,8 +246,8 @@ def test_rejection_sampling_approximates_target_distribution(
draft_and_target_probs_equal)

sample_sizes = [10, 100, 1_000, 10_000, 100_000]
distance_wrt_reference = []
distance_wrt_target = []
distance_wrt_reference: List[float] = []
distance_wrt_target: List[float] = []

for num_samples in sample_sizes:
(reference_vs_rejsample_dist,

@@ -1,6 +1,6 @@
import itertools
import random
from typing import List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
from unittest.mock import patch

import pytest
@@ -49,8 +49,8 @@ def _do_sample(
sampling_params: SamplingParams,
device: str,
):
seq_group_metadata_list = []
seq_lens = []
seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
@@ -212,7 +212,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
batch_size = random.randint(1, 128)

expected_penalization = []
sequence_metadata_list = []
sequence_metadata_list: List[SequenceGroupMetadata] = []
# 20% chance to generate seq group metadata list with all prompts
is_prompt = random.random() < 0.2
while batch_size > 0:
@@ -232,8 +232,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids)

seq_data = {}
seq_group_penalization = []
seq_data: Dict[int, SequenceData] = {}
seq_group_penalization: List[bool] = []
for _ in range(num_seqs):
num_input = random.randint(1, 100)
num_generated = 0 if is_prompt else random.randint(1, 100)
@@ -392,17 +392,16 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
else:
test_cases = [generate_test_case()]

def run_test_case(*,
expected_penalization=None,
seq_group_metadata_list=None):
def run_test_case(*, expected_penalization: List[bool],
seq_group_metadata_list: List[SequenceGroupMetadata]):
assert expected_penalization, \
"Invalid test case, need expected_penalization"
assert seq_group_metadata_list, \
"Invalid test case, need seq_group_metadata_list"

batch_size = 0
seq_lens = []
sampling_params_per_row = []
seq_lens: List[int] = []
sampling_params_per_row: List[SamplingParams] = []
for sgm in seq_group_metadata_list:
sampling_params = sgm.sampling_params

@@ -472,15 +471,15 @@ def test_sampler_mixed(seed: int, device: str):
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler = _prepare_test(batch_size)

seq_group_metadata_list = []
seq_group_metadata_list: List[SequenceGroupMetadata] = []
expected_tokens: List[Optional[List[int]]] = []
seq_lens = []
seq_lens: List[int] = []
for i in range(batch_size):
expected: Optional[List[int]] = None
sampling_type = random.randint(0, 3)
if sampling_type == 0:
sampling_params = SamplingParams(temperature=0)
expected = [torch.argmax(fake_logits[i], dim=-1).item()]
expected = [int(torch.argmax(fake_logits[i], dim=-1).item())]
elif sampling_type in (1, 2):
n = random.randint(1, 10)
sampling_params = SamplingParams(
@@ -536,15 +535,18 @@ def test_sampler_mixed(seed: int, device: str):
]
continue

expected_tokens_item = expected_tokens[i]
assert expected_tokens_item is not None

for n, nth_output in enumerate(sequence_output.samples):
if (metadata.sampling_params.temperature == 0
or metadata.sampling_params.seed is not None):
# Ensure exact matches for greedy or random with seed
assert nth_output.output_token == expected_tokens[i][n]
assert nth_output.output_token == expected_tokens_item[n]
else:
# For non-seeded random check that one of the high-logit
# tokens were chosen
assert nth_output.output_token in expected_tokens[i]
assert nth_output.output_token in expected_tokens_item

# Test batch
test_sampling()
@@ -588,8 +590,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
warpers = generation_model._get_logits_warper(generation_config)
assert len(warpers) == 2 # top_p and top_k

seq_group_metadata_list = []
seq_lens = []
seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
@@ -622,6 +624,9 @@ def test_sampler_top_k_top_p(seed: int, device: str):

with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

assert sample_probs is not None

hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
assert torch.allclose(hf_probs, sample_probs, atol=1e-5)

@ -118,16 +118,17 @@ class AsyncLLM:
|
|||
raise ValueError("The lengths of prompts and "
|
||||
"sampling_params must be the same.")
|
||||
|
||||
async def get_output(prompt, sampling_param) -> str:
|
||||
async def get_output(prompt, sampling_param) -> RequestOutput:
|
||||
request_id = random_uuid()
|
||||
results_generator = self.llm_engine.generate(
|
||||
prompt, sampling_param, request_id)
|
||||
final_output = None
|
||||
async for request_output in results_generator:
|
||||
final_output = request_output
|
||||
assert final_output is not None
|
||||
return final_output
|
||||
|
||||
outputs = []
|
||||
outputs: List[RequestOutput] = []
|
||||
try:
|
||||
for i in range(num_requests):
|
||||
prompt = prompts[i] if prompts is not None else None
|
||||
|
@ -208,8 +209,8 @@ def maybe_assert_ngram_worker(llm):
|
|||
def get_output_from_llm_generator(
|
||||
llm_generator, prompts,
|
||||
sampling_params) -> Tuple[List[str], List[List[int]]]:
|
||||
tokens = []
|
||||
token_ids = []
|
||||
tokens: List[str] = []
|
||||
token_ids: List[List[int]] = []
|
||||
for llm in llm_generator():
|
||||
maybe_assert_ngram_worker(llm)
|
||||
|
||||
|
@ -300,8 +301,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int],
|
|||
nvmlInit()
|
||||
start_time = time.time()
|
||||
while True:
|
||||
output = {}
|
||||
output_raw = {}
|
||||
output: Dict[int, str] = {}
|
||||
output_raw: Dict[int, float] = {}
|
||||
for device in devices:
|
||||
dev_handle = nvmlDeviceGetHandleByIndex(device)
|
||||
mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
@ -38,14 +40,14 @@ def test_get_token_ids_to_score(k: int):
|
|||
device='cuda',
|
||||
)
|
||||
|
||||
expected_output = [
|
||||
expected_output: List[List[int]] = [
|
||||
[],
|
||||
]
|
||||
for i in range(proposal_token_ids.shape[0]):
|
||||
expected_output.append(proposal_token_ids[:i + 1].tolist())
|
||||
|
||||
scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)
|
||||
actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access
|
||||
actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access
|
||||
|
||||
actual_output = [
|
||||
x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
import random
|
||||
from typing import Dict, List
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput
|
||||
from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
||||
from vllm.spec_decode.top1_proposer import Top1Proposer
|
||||
from vllm.worker.worker import Worker
|
||||
|
@ -210,7 +211,7 @@ def test_same_output_for_multi_step():
|
|||
|
||||
# Run single-step repeatedly.
|
||||
zero_kv_cache(worker.cache_engine)
|
||||
single_step_output = []
|
||||
single_step_output: List[SamplerOutput] = []
|
||||
continuations = [[1] for _ in prompts]
|
||||
set_random_seed(seed)
|
||||
|
||||
|
@ -232,11 +233,15 @@ def test_same_output_for_multi_step():
|
|||
continuations[i].append(seq_group_output.samples[0].output_token)
|
||||
|
||||
# Get token ids and logprobs for comparison.
|
||||
multi_step_output_logprobs = [[] for _ in prompts]
|
||||
single_step_output_logprobs = [[] for _ in prompts]
|
||||
multi_step_output_logprobs: List[List[Dict[int,
|
||||
Logprob]]] = [[]
|
||||
for _ in prompts]
|
||||
single_step_output_logprobs: List[List[Dict[int,
|
||||
Logprob]]] = [[]
|
||||
for _ in prompts]
|
||||
|
||||
multi_step_output_token_ids = [[] for _ in prompts]
|
||||
single_step_output_token_ids = [[] for _ in prompts]
|
||||
multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts]
|
||||
single_step_output_token_ids: List[List[int]] = [[] for _ in prompts]
|
||||
for i, _ in enumerate(prompts):
|
||||
for multi_step, single_step in zip(multi_step_output,
|
||||
single_step_output):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import random
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, List
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
@ -7,7 +8,7 @@ import torch
|
|||
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput
|
||||
from vllm.spec_decode.interfaces import SpeculativeProposals
|
||||
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
|
||||
SpecDecodeWorkerMetrics)
|
||||
|
@ -103,7 +104,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
|
|||
seq_group_metadata_list=seq_group_metadata_list,
|
||||
num_lookahead_slots=k))
|
||||
|
||||
seen_contexts = []
|
||||
seen_contexts: List[List[int]] = []
|
||||
|
||||
call_args_list = target_worker.execute_model.call_args_list
|
||||
assert len(call_args_list) == 1
|
||||
|
@ -116,7 +117,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
|
|||
for seq_data in seq_group_metadata.seq_data.values():
|
||||
seen_contexts.append(seq_data.get_token_ids())
|
||||
|
||||
expected_seen_contexts = []
|
||||
expected_seen_contexts: List[List[int]] = []
|
||||
|
||||
for prompt, prev_generated, draft_tokens in zip(
|
||||
prompts, prev_output_tokens, proposal_token_ids.tolist()):
|
||||
|
@ -310,8 +311,14 @@ def test_correctly_formats_output(k: int, batch_size: int):
|
|||
next(iter(seq_group_metadata.seq_data.keys()))
|
||||
for seq_group_metadata in seq_group_metadata_list
|
||||
]
|
||||
actual_output_by_seq = {seq_id: [] for seq_id in seq_ids}
|
||||
expected_output_by_seq = {seq_id: [] for seq_id in seq_ids}
|
||||
actual_output_by_seq: Dict[int, List[SequenceOutput]] = {
|
||||
seq_id: []
|
||||
for seq_id in seq_ids
|
||||
}
|
||||
expected_output_by_seq: Dict[int, List[SequenceOutput]] = {
|
||||
seq_id: []
|
||||
for seq_id in seq_ids
|
||||
}
|
||||
|
||||
for step in output:
|
||||
for seq_group in step:
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from itertools import count
|
||||
from typing import Dict, Iterable, List, Optional, Union
|
||||
from typing import Callable, Dict, List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import TypeVar, Union
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import torch
|
||||
|
@ -14,6 +16,8 @@ from vllm.utils import get_distributed_init_method, get_ip, get_open_port
|
|||
from vllm.worker.cache_engine import CacheEngine
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
T = TypeVar("T", bound=Worker)
|
||||
|
||||
|
||||
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
|
||||
return (seq_len + block_size - 1) // block_size
|
||||
|
@ -56,13 +60,13 @@ def zero_kv_cache(cache_engine: CacheEngine):
|
|||
value_blocks.zero_()
|
||||
|
||||
|
||||
def create_worker(cls: type,
|
||||
def create_worker(cls: Callable[..., T],
|
||||
model_name: str,
|
||||
block_size: int,
|
||||
num_gpu_blocks: int,
|
||||
seed: int,
|
||||
is_driver_worker: bool = True,
|
||||
enforce_eager: bool = True):
|
||||
enforce_eager: bool = True) -> T:
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
seed=seed,
|
||||
|
@ -159,8 +163,8 @@ def assert_logprobs_dict_allclose(
|
|||
|
||||
def create_sampler_output_list(
|
||||
token_ids: torch.Tensor,
|
||||
probs: Iterable[Optional[torch.Tensor]],
|
||||
logprobs: Iterable[Optional[torch.Tensor]],
|
||||
probs: GenericSequence[Optional[torch.Tensor]],
|
||||
logprobs: GenericSequence[Optional[torch.Tensor]],
|
||||
seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]:
|
||||
num_steps, batch_size = token_ids.shape
|
||||
token_ids_by_step = token_ids.tolist()
|
||||
|
|
|
@ -51,7 +51,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
|
|||
max_input_length=None,
|
||||
)
|
||||
|
||||
hashes = []
|
||||
hashes: List[List[List[int]]] = []
|
||||
|
||||
for prefix in prefixes:
|
||||
for lora_int_id in concurrent_lora_int_ids:
|
||||
|
|
|
@ -47,6 +47,7 @@ def test_default_vllm_root_logger_configuration():
|
|||
assert not logger.propagate
|
||||
|
||||
handler = logger.handlers[0]
|
||||
assert isinstance(handler, logging.StreamHandler)
|
||||
assert handler.stream == sys.stdout
|
||||
assert handler.level == logging.INFO
|
||||
|
||||
|
|
|
@ -153,8 +153,8 @@ def test_decode_sequence_logprobs(complete_sequence: str,
|
|||
# Run sequentially.
|
||||
seq = create_sequence()
|
||||
dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
|
||||
sequential_logprobs_text_chosen_token = []
|
||||
sequential_logprobs_text_other_token = []
|
||||
sequential_logprobs_text_chosen_token: List[str] = []
|
||||
sequential_logprobs_text_other_token: List[str] = []
|
||||
for new_token, logprobs in zip(complete_sequence_token_ids,
|
||||
dummy_logprobs):
|
||||
seq.append_token_id(new_token, logprobs)
|
||||
|
|
|
@ -79,7 +79,7 @@ class RemoteOpenAIServer:
|
|||
self.host = str(args.host or 'localhost')
|
||||
self.port = int(args.port)
|
||||
|
||||
self._runner = self._RemoteRunner.remote(
|
||||
self._runner = self._RemoteRunner.remote( # type: ignore
|
||||
cli_args,
|
||||
wait_url=self.url_for("health"),
|
||||
wait_timeout=self.MAX_SERVER_START_WAIT_S)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
@ -35,8 +37,8 @@ def test_prepare_prompt(batch_size):
|
|||
enable_chunked_prefill=False,
|
||||
)
|
||||
|
||||
seq_lens = []
|
||||
seq_group_metadata_list = []
|
||||
seq_lens: List[int] = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
block_tables = {0: [1]}
|
||||
for i in range(batch_size):
|
||||
# make sure all tokens fit into one block
|
||||
|
@ -151,15 +153,14 @@ def test_prepare_decode_cuda_graph(batch_size):
|
|||
enable_chunked_prefill=False,
|
||||
)
|
||||
|
||||
context_lens = []
|
||||
seq_group_metadata_list = []
|
||||
context_lens: List[int] = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
# Assume each seq group finishes prefill.
|
||||
for i in range(batch_size):
|
||||
# make sure all tokens fit into one block
|
||||
context_len = i % (model_runner.block_size - 1) + 1
|
||||
context_lens.append(context_len)
|
||||
seq_data = list(range(context_len))
|
||||
seq_data = SequenceData(seq_data)
|
||||
seq_data = SequenceData(list(range(context_len)))
|
||||
seq_data.update_num_computed_tokens(context_len)
|
||||
# Append one token ID since prefill is finished.
|
||||
seq_data.append_token_id(1, 0)
|
||||
|
@ -257,7 +258,7 @@ def test_empty_seq_group():
|
|||
dtype="float16",
|
||||
enforce_eager=False,
|
||||
)
|
||||
seq_group_metadata_list = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
model_input = model_runner._prepare_model_input(seq_group_metadata_list)
|
||||
input_tokens, input_positions, attn_metadata, slot_mapping = (
|
||||
model_input.input_tokens,
|
||||
|
@ -310,10 +311,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
|
|||
)
|
||||
|
||||
# Add prefill requests.
|
||||
seq_lens = []
|
||||
seq_group_metadata_list = []
|
||||
prefill_metadata_list = []
|
||||
decode_metadata_list = []
|
||||
seq_lens: List[int] = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
prefill_metadata_list: List[SequenceGroupMetadata] = []
|
||||
decode_metadata_list: List[SequenceGroupMetadata] = []
|
||||
block_tables = {0: [1]}
|
||||
prefill_batch_size = batch_size // 2
|
||||
decode_batch_size = batch_size - prefill_batch_size
|
||||
|
|
|
@ -245,7 +245,7 @@ def _make_alibi_bias(
|
|||
dtype: torch.dtype,
|
||||
seq_lens: List[int],
|
||||
) -> List[torch.Tensor]:
|
||||
attn_biases = []
|
||||
attn_biases: List[torch.Tensor] = []
|
||||
for seq_len in seq_lens:
|
||||
bias = torch.arange(seq_len, dtype=dtype)
|
||||
# NOTE(zhuohan): HF uses
|
||||
|
@ -271,7 +271,7 @@ def _make_sliding_window_bias(
|
|||
window_size: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
) -> List[torch.Tensor]:
|
||||
attn_biases = []
|
||||
attn_biases: List[torch.Tensor] = []
|
||||
for seq_len in seq_lens:
|
||||
tensor = torch.full(
|
||||
(1, seq_len, seq_len),
|
||||
|
|
|
@ -431,8 +431,8 @@ def _make_alibi_bias(
|
|||
num_kv_heads: int,
|
||||
dtype: torch.dtype,
|
||||
seq_lens: List[int],
|
||||
) -> LowerTriangularMaskWithTensorBias:
|
||||
attn_biases = []
|
||||
) -> List[AttentionBias]:
|
||||
attn_biases: List[AttentionBias] = []
|
||||
for seq_len in seq_lens:
|
||||
bias = torch.arange(seq_len, dtype=dtype)
|
||||
# NOTE(zhuohan): HF uses
|
||||
|
|
|
@ -252,7 +252,7 @@ class BlockTable:
|
|||
def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block],
|
||||
token_ids: List[int],
|
||||
device: Device) -> List[Block]:
|
||||
blocks = []
|
||||
blocks: List[Block] = []
|
||||
for block_token_ids in chunk_list(token_ids, self._block_size):
|
||||
if len(block_token_ids) == self._block_size:
|
||||
# If the block is full, create an immutable block.
|
||||
|
|
|
@ -111,7 +111,7 @@ class NaiveBlockAllocator(BlockAllocator):
|
|||
"""
|
||||
source_blocks = get_all_blocks_recursively(last_block)
|
||||
|
||||
forked_blocks = []
|
||||
forked_blocks: List[Block] = []
|
||||
prev_block = None
|
||||
for block in source_blocks:
|
||||
|
||||
|
|
|
@ -271,7 +271,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
|
|||
"""
|
||||
source_blocks = get_all_blocks_recursively(last_block)
|
||||
|
||||
forked_blocks = []
|
||||
forked_blocks: List[Block] = []
|
||||
prev_block = None
|
||||
for block in source_blocks:
|
||||
refcount = self._refcounter.incr(block.block_id)
|
||||
|
|
|
@ -260,7 +260,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
|
|||
# at max extend.
|
||||
if self.enable_caching:
|
||||
block_table = self.block_tables[seq.seq_id]
|
||||
block_ids = []
|
||||
block_ids: List[Optional[int]] = []
|
||||
for block_id in block_table.physical_block_ids:
|
||||
block_ids.append(block_id)
|
||||
self.block_allocator.mark_blocks_as_accessed(
|
||||
|
|
|
@ -2,7 +2,7 @@ import ctypes
|
|||
import json
|
||||
import os
|
||||
from itertools import product
|
||||
from typing import Dict, Optional, Sequence
|
||||
from typing import Dict, List, Optional, Sequence
|
||||
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
@ -88,7 +88,7 @@ def consumer(batch_tgt: Sequence[int],
|
|||
def can_actually_p2p(
|
||||
batch_src: Sequence[int],
|
||||
batch_tgt: Sequence[int],
|
||||
):
|
||||
) -> Sequence[bool]:
|
||||
"""
|
||||
Usually, checking if P2P access is enabled can be done by
|
||||
`torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
|
||||
|
@ -138,7 +138,7 @@ def can_actually_p2p(
|
|||
p_tgt.start()
|
||||
p_src.join()
|
||||
p_tgt.join()
|
||||
result = []
|
||||
result: List[bool] = []
|
||||
for src, tgt in zip(batch_src, batch_tgt):
|
||||
a = result_queue.get()
|
||||
b = result_queue.get()
|
||||
|
@ -188,7 +188,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
|
|||
# only the local master process (with local_rank == 0) can
|
||||
# enter this block to calculate the cache
|
||||
logger.info("generating GPU P2P access cache in %s", path)
|
||||
cache = {}
|
||||
cache: Dict[str, bool] = {}
|
||||
ids = list(range(num_dev))
|
||||
# batch of all pairs of GPUs
|
||||
batch_src, batch_tgt = zip(*list(product(ids, ids)))
|
||||
|
|
|
@ -205,7 +205,7 @@ class NCCLLibrary:
|
|||
raise e
|
||||
|
||||
if so_file not in NCCLLibrary.path_to_dict_mapping:
|
||||
_funcs = {}
|
||||
_funcs: Dict[str, Any] = {}
|
||||
for func in NCCLLibrary.exported_functions:
|
||||
f = getattr(self.lib, func.name)
|
||||
f.restype = func.restype
|
||||
|
|
|
@ -2,7 +2,7 @@ import time
|
|||
from contextlib import contextmanager
|
||||
from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Type, TypeVar, Union
|
||||
from typing import Set, Type, TypeVar, Union
|
||||
|
||||
from transformers import GenerationConfig, PreTrainedTokenizer
|
||||
|
||||
|
@ -973,7 +973,7 @@ class LLMEngine:
|
|||
def remove_lora(self, lora_id: int) -> bool:
|
||||
return self.model_executor.remove_lora(lora_id)
|
||||
|
||||
def list_loras(self) -> List[int]:
|
||||
def list_loras(self) -> Set[int]:
|
||||
return self.model_executor.list_loras()
|
||||
|
||||
def check_health(self) -> None:
|
||||
|
|
|
@ -144,7 +144,7 @@ class Metrics:
|
|||
# end-metrics-definitions
|
||||
|
||||
|
||||
def build_1_2_5_buckets(max_value: int):
|
||||
def build_1_2_5_buckets(max_value: int) -> List[int]:
|
||||
"""
|
||||
Builds a list of buckets with increasing powers of 10 multiplied by
|
||||
mantissa values (1, 2, 5) until the value exceeds the specified maximum.
|
||||
|
@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int):
|
|||
"""
|
||||
mantissa_lst = [1, 2, 5]
|
||||
exponent = 0
|
||||
buckets = []
|
||||
buckets: List[int] = []
|
||||
while True:
|
||||
for m in mantissa_lst:
|
||||
value = m * 10**exponent
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Dict, List, Tuple, Union
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from vllm.config import SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
|
@ -146,8 +146,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
|
|||
|
||||
# Beam search case
|
||||
# Select the child sequences to keep in the sequence group.
|
||||
selected_child_seqs = []
|
||||
unselected_child_seqs = []
|
||||
selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
|
||||
unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
|
||||
beam_width = seq_group.sampling_params.best_of
|
||||
length_penalty = seq_group.sampling_params.length_penalty
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ import argparse
|
|||
import asyncio
|
||||
import sys
|
||||
from io import StringIO
|
||||
from typing import Awaitable, List
|
||||
|
||||
import aiohttp
|
||||
|
||||
|
@ -114,7 +115,7 @@ async def main(args):
|
|||
)
|
||||
|
||||
# Submit all requests in the file to the engine "concurrently".
|
||||
response_futures = []
|
||||
response_futures: List[Awaitable[BatchRequestOutput]] = []
|
||||
for request_json in (await read_file(args.input_file)).strip().split("\n"):
|
||||
request = BatchRequestInput.model_validate_json(request_json)
|
||||
response_futures.append(run_request(openai_serving_chat, request))
|
||||
|
|
|
@ -487,7 +487,7 @@ class OpenAIServingChat(OpenAIServing):
|
|||
final_res = res
|
||||
assert final_res is not None
|
||||
|
||||
choices = []
|
||||
choices: List[ChatCompletionResponseChoice] = []
|
||||
|
||||
role = self.get_chat_request_role(request)
|
||||
for output in final_res.outputs:
|
||||
|
|
|
@ -25,7 +25,7 @@ def request_output_to_embedding_response(
|
|||
created_time: int,
|
||||
model_name: str,
|
||||
) -> EmbeddingResponse:
|
||||
data = []
|
||||
data: List[EmbeddingResponseData] = []
|
||||
num_prompt_tokens = 0
|
||||
for idx, final_res in enumerate(final_res_batch):
|
||||
assert final_res is not None
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
|
||||
import torch
|
||||
|
||||
|
@ -120,7 +121,7 @@ class PackedLoRALayerWeights(LoRALayerWeights):
|
|||
|
||||
@classmethod
|
||||
def pack(
|
||||
cls, loras: List[Optional["LoRALayerWeights"]]
|
||||
cls, loras: GenericSequence[Optional["LoRALayerWeights"]]
|
||||
) -> "PackedLoRALayerWeights":
|
||||
"""Pack a list of LoRAs into a single LoRA.
|
||||
|
||||
|
|
|
@ -165,7 +165,7 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
|
|||
model = self._lora_manager.model
|
||||
supported_lora_modules = model.supported_lora_modules
|
||||
packed_modules_mapping = model.packed_modules_mapping
|
||||
expected_lora_modules = []
|
||||
expected_lora_modules: List[str] = []
|
||||
for module in supported_lora_modules:
|
||||
if module in packed_modules_mapping:
|
||||
expected_lora_modules.extend(
|
||||
|
|
|
@ -393,7 +393,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
|
|||
param_data.copy_(loaded_weight)
|
||||
return
|
||||
current_shard_offset = 0
|
||||
shard_offsets = []
|
||||
shard_offsets: List[Tuple[int, int, int]] = []
|
||||
for i, output_size in enumerate(self.output_sizes):
|
||||
shard_offsets.append((i, current_shard_offset, output_size))
|
||||
current_shard_offset += output_size
|
||||
|
|
|
@ -25,24 +25,25 @@ GPTQ_MARLIN_SUPPORTED_SYM = [True]
|
|||
|
||||
|
||||
# Permutations for Marlin scale shuffling
|
||||
def get_scale_perms(num_bits):
|
||||
scale_perm = []
|
||||
def get_scale_perms(num_bits: int):
|
||||
scale_perm: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm.extend([i + 8 * j for j in range(8)])
|
||||
scale_perm_single = []
|
||||
scale_perm_single: List[int] = []
|
||||
for i in range(4):
|
||||
scale_perm_single.extend(
|
||||
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
|
||||
return scale_perm, scale_perm_single
|
||||
|
||||
|
||||
def get_pack_factor(num_bits):
|
||||
def get_pack_factor(num_bits: int):
|
||||
assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
|
||||
), f"Unsupported num_bits = {num_bits}"
|
||||
return 32 // num_bits
|
||||
|
||||
|
||||
def marlin_permute_scales(s, size_k, size_n, group_size, num_bits):
|
||||
def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
|
||||
group_size: int, num_bits: int):
|
||||
scale_perm, scale_perm_single = get_scale_perms(num_bits)
|
||||
if group_size < size_k and group_size != -1:
|
||||
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
"""This file is used for /tests and /benchmarks"""
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
|
@ -11,10 +13,10 @@ import torch
|
|||
#
|
||||
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
|
||||
# (without the need to use ldmatrix instructions) # noqa: E501
|
||||
def get_perms_24(num_bits):
|
||||
perm_list = []
|
||||
def get_perms_24(num_bits: int):
|
||||
perm_list: List[int] = []
|
||||
for i in range(32):
|
||||
perm1 = []
|
||||
perm1: List[int] = []
|
||||
col = i // 4
|
||||
col_o = col // 2
|
||||
for block in [0, 1]:
|
||||
|
@ -39,18 +41,18 @@ def get_perms_24(num_bits):
|
|||
|
||||
perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
|
||||
perm = torch.from_numpy(perm)
|
||||
scale_perm = []
|
||||
scale_perm: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
|
||||
scale_perm_single = []
|
||||
scale_perm_single: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
|
||||
return perm, scale_perm, scale_perm_single
|
||||
|
||||
|
||||
marlin_24_perm = {}
|
||||
marlin_24_scale_perm = {}
|
||||
marlin_24_scale_perm_single = {}
|
||||
marlin_24_perm: Dict[int, torch.Tensor] = {}
|
||||
marlin_24_scale_perm: Dict[int, List[int]] = {}
|
||||
marlin_24_scale_perm_single: Dict[int, List[int]] = {}
|
||||
for num_bits in [4, 8]:
|
||||
perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
|
||||
marlin_24_perm[num_bits] = perm_24
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
"""This file is used for /tests and /benchmarks"""
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
|
@ -11,10 +13,10 @@ import torch
|
|||
#
|
||||
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
|
||||
# (without the need to use ldmatrix instructions) # noqa: E501
|
||||
def get_perms(num_bits):
|
||||
perm_list = []
|
||||
def get_perms(num_bits: int):
|
||||
perm_list: List[int] = []
|
||||
for i in range(32):
|
||||
perm1 = []
|
||||
perm1: List[int] = []
|
||||
col = i // 4
|
||||
for block in [0, 1]:
|
||||
for row in [
|
||||
|
@ -38,19 +40,19 @@ def get_perms(num_bits):
|
|||
|
||||
perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
|
||||
perm = torch.from_numpy(perm)
|
||||
scale_perm = []
|
||||
scale_perm: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm.extend([i + 8 * j for j in range(8)])
|
||||
scale_perm_single = []
|
||||
scale_perm_single: List[int] = []
|
||||
for i in range(4):
|
||||
scale_perm_single.extend(
|
||||
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
|
||||
return perm, scale_perm, scale_perm_single
|
||||
|
||||
|
||||
marlin_perm = {}
|
||||
marlin_scale_perm = {}
|
||||
marlin_scale_perm_single = {}
|
||||
marlin_perm: Dict[int, torch.Tensor] = {}
|
||||
marlin_scale_perm: Dict[int, List[int]] = {}
|
||||
marlin_scale_perm_single: Dict[int, List[int]] = {}
|
||||
for num_bits in [4, 8]:
|
||||
perm, scale_perm, scale_perm_single = get_perms(num_bits)
|
||||
marlin_perm[num_bits] = perm
|
||||
|
|
|
@ -174,7 +174,7 @@ def _apply_min_tokens_penalty(
|
|||
min_tokens = sampling_params.min_tokens
|
||||
token_ids_to_penalize = sampling_params.all_stop_token_ids
|
||||
if min_tokens > 0 and token_ids_to_penalize:
|
||||
seqs_to_penalize = []
|
||||
seqs_to_penalize: List[int] = []
|
||||
for j, seq_id in enumerate(seq_ids):
|
||||
seq_data = seq_group.seq_data[seq_id]
|
||||
if len(seq_data.output_token_ids) < min_tokens:
|
||||
|
@ -285,7 +285,7 @@ def _greedy_sample(
|
|||
same as the length of selected_seq_groups. If the corresponding
|
||||
seq_group has do_sample=False, tuple contains ([], [])
|
||||
"""
|
||||
samples = samples.tolist()
|
||||
samples_lst = samples.tolist()
|
||||
sample_idx = 0
|
||||
results: SampleResultType = []
|
||||
for seq_group in selected_seq_groups:
|
||||
|
@ -298,7 +298,7 @@ def _greedy_sample(
|
|||
assert num_parent_seqs == 1, (
|
||||
"Greedy sampling should have only one seq.")
|
||||
parent_ids = list(range(num_parent_seqs))
|
||||
next_token_ids = [samples[sample_idx]]
|
||||
next_token_ids = [samples_lst[sample_idx]]
|
||||
results.append((next_token_ids, parent_ids))
|
||||
sample_idx += num_parent_seqs
|
||||
return results
|
||||
|
@ -394,7 +394,7 @@ def _beam_search_sample(
|
|||
next_token_ids = next_token_ids.tolist()
|
||||
else:
|
||||
# Generation phase.
|
||||
cumulative_logprobs: List[int] = [
|
||||
cumulative_logprobs: List[float] = [
|
||||
seq_group.seq_data[seq_id].cumulative_logprob
|
||||
for seq_id in seq_ids
|
||||
]
|
||||
|
@ -466,8 +466,9 @@ def _sample_with_torch(
|
|||
categorized_seq_group_ids[sampling_type].append(i)
|
||||
|
||||
sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
|
||||
sample_metadata = {}
|
||||
multinomial_samples = {}
|
||||
sample_metadata: Dict[SamplingType,
|
||||
Tuple[List[int], List[SequenceGroupToSample]]] = {}
|
||||
multinomial_samples: Dict[SamplingType, torch.Tensor] = {}
|
||||
|
||||
# Create output tensor for sampled token ids.
|
||||
if include_gpu_probs_tensor:
|
||||
|
@ -494,7 +495,7 @@ def _sample_with_torch(
|
|||
greedy_samples = torch.argmax(logprobs[long_sample_indices],
|
||||
dim=-1)
|
||||
|
||||
if include_gpu_probs_tensor:
|
||||
if sampled_token_ids_tensor is not None:
|
||||
# Store sampled tokens in output tensor.
|
||||
sampled_token_ids_tensor[
|
||||
long_sample_indices] = greedy_samples.unsqueeze(-1)
|
||||
|
@ -522,7 +523,7 @@ def _sample_with_torch(
|
|||
probs[long_sample_indices], max_best_of_in_batch,
|
||||
**seeded_args)
|
||||
|
||||
if include_gpu_probs_tensor:
|
||||
if sampled_token_ids_tensor is not None:
|
||||
# Store sampled tokens in output tensor.
|
||||
sampled_token_ids_tensor[
|
||||
long_sample_indices] = multinomial_samples[sampling_type]
|
||||
|
@ -571,7 +572,9 @@ def _sample_with_triton_kernel(
|
|||
categorized_seq_group_ids[sampling_type].append(i)
|
||||
|
||||
sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
|
||||
sample_metadata = {}
|
||||
sample_metadata: Dict[SamplingType,
|
||||
Tuple[List[int], List[SequenceGroupToSample],
|
||||
torch.Tensor, torch.Tensor]] = {}
|
||||
max_best_of_in_batch = 1
|
||||
|
||||
# Counterintiutively, having two loops here is actually faster.
|
||||
|
@ -1008,14 +1011,14 @@ def _build_sampler_output(
|
|||
speculative decoding rejection sampling.
|
||||
"""
|
||||
|
||||
sampler_output = []
|
||||
sampler_output: List[CompletionSequenceGroupOutput] = []
|
||||
for (seq_group, sample_result, group_prompt_logprobs,
|
||||
group_sample_logprobs) in zip(sampling_metadata.seq_groups,
|
||||
sample_results, prompt_logprobs,
|
||||
sample_logprobs):
|
||||
seq_ids = seq_group.seq_ids
|
||||
next_token_ids, parent_ids = sample_result
|
||||
seq_outputs = []
|
||||
seq_outputs: List[SequenceOutput] = []
|
||||
for parent_id, next_token_id, logprobs in zip(parent_ids,
|
||||
next_token_ids,
|
||||
group_sample_logprobs):
|
||||
|
|
|
@ -68,7 +68,7 @@ def _get_model_initialization_kwargs(
|
|||
vision_language_config: Optional[VisionLanguageConfig]
|
||||
) -> Dict[str, Any]:
|
||||
"""Get extra kwargs for model initialization."""
|
||||
extra_kwargs = {}
|
||||
extra_kwargs: Dict[str, Any] = {}
|
||||
if hasattr(model_class, "supported_lora_modules"):
|
||||
extra_kwargs["lora_config"] = lora_config
|
||||
elif lora_config:
|
||||
|
@ -446,7 +446,8 @@ class ShardedStateLoader(BaseModelLoader):
|
|||
Filter out all tensors that share the same memory or a subset of the
|
||||
memory of another tensor.
|
||||
"""
|
||||
same_storage_groups = collections.defaultdict(list)
|
||||
same_storage_groups: Dict[Any, List[Tuple[
|
||||
str, torch.Tensor]]] = collections.defaultdict(list)
|
||||
for key, tensor in tensors.items():
|
||||
if tensor.numel():
|
||||
ptr = tensor.untyped_storage().data_ptr()
|
||||
|
@ -455,7 +456,7 @@ class ShardedStateLoader(BaseModelLoader):
|
|||
def get_end_ptr(tensor: torch.Tensor) -> int:
|
||||
return tensor.view(-1)[-1].data_ptr() + tensor.element_size()
|
||||
|
||||
result = {}
|
||||
result: Dict[str, torch.Tensor] = {}
|
||||
for group in same_storage_groups.values():
|
||||
for k, t in group:
|
||||
a, b = t.data_ptr(), get_end_ptr(t)
|
||||
|
|
|
@ -329,7 +329,7 @@ def np_cache_weights_iterator(
|
|||
# dumping the same model weights to numpy at the same time.
|
||||
with get_lock(model_name_or_path, cache_dir):
|
||||
if not os.path.exists(weight_names_file):
|
||||
weight_names = []
|
||||
weight_names: List[str] = []
|
||||
for bin_file in hf_weights_files:
|
||||
state = torch.load(bin_file, map_location="cpu")
|
||||
for name, param in state.items():
|
||||
|
|
|
@ -72,11 +72,11 @@ _MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS}
|
|||
_OOT_MODELS: Dict[str, Type[nn.Module]] = {}
|
||||
|
||||
# Models not supported by ROCm.
|
||||
_ROCM_UNSUPPORTED_MODELS = []
|
||||
_ROCM_UNSUPPORTED_MODELS: List[str] = []
|
||||
|
||||
# Models partially supported by ROCm.
|
||||
# Architecture -> Reason.
|
||||
_ROCM_PARTIALLY_SUPPORTED_MODELS = {
|
||||
_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
|
||||
"Qwen2ForCausalLM":
|
||||
"Sliding window attention is not yet supported in ROCm's flash attention",
|
||||
"MistralForCausalLM":
|
||||
|
|
|
@ -453,8 +453,8 @@ class ArcticForCausalLM(nn.Module):
|
|||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
|
||||
mlp_params_mapping = []
|
||||
expert_params_mapping = []
|
||||
mlp_params_mapping: List[Tuple[str, str, int]] = []
|
||||
expert_params_mapping: List[Tuple[str, str, int]] = []
|
||||
num_layers = self.config.num_hidden_layers
|
||||
|
||||
for layer in range(num_layers):
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
# This file is based on the LLama model definition file in transformers
|
||||
"""PyTorch Cohere model."""
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
from typing import Iterable, List, Optional, Set, Tuple
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
|
@ -352,7 +352,7 @@ class CohereForCausalLM(nn.Module):
|
|||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params = set()
|
||||
loaded_params: Set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for param_name, shard_name, shard_id in stacked_params_mapping:
|
||||
if shard_name not in name:
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
# limitations under the License.
|
||||
"""Inference-only Gemma model compatible with HuggingFace weights."""
|
||||
from functools import lru_cache
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
from typing import Iterable, List, Optional, Set, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
@ -363,7 +363,7 @@ class GemmaForCausalLM(nn.Module):
|
|||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params = set()
|
||||
loaded_params: Set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, shard_name, shard_id) in stacked_params_mapping:
|
||||
if shard_name not in name:
|
||||
|
|
|
@ -123,7 +123,7 @@ class SequenceData:
|
|||
output_token_ids = []
|
||||
|
||||
self.prompt_token_ids = prompt_token_ids
|
||||
self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids)
|
||||
self._prompt_token_ids_tuple = tuple(prompt_token_ids)
|
||||
self.output_token_ids = output_token_ids
|
||||
self.cumulative_logprob = 0.0
|
||||
# The number of tokens that are computed (that run against the model).
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import copy
|
||||
import weakref
|
||||
from typing import List, Tuple
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
|
||||
from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
|
||||
SequenceGroupMetadata)
|
||||
from vllm.spec_decode.interfaces import SpeculativeProposals
|
||||
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
|
||||
|
@ -71,7 +71,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
|
|||
sample_len)
|
||||
|
||||
# Run model sample_len times.
|
||||
model_outputs = []
|
||||
model_outputs: List[SamplerOutput] = []
|
||||
for _ in range(sample_len):
|
||||
model_output = super().execute_model(
|
||||
execute_model_req=copied_execute_model_req)
|
||||
|
@ -132,7 +132,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
|
|||
|
||||
# Shallow-copy the list of SequenceGroupMetadata. This allows us to
|
||||
# append tokens and change is_prompt without external side-effects.
|
||||
new_seq_group_metadata_list = []
|
||||
new_seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
|
||||
for old_seq_group_metadata in seq_group_metadata_list:
|
||||
# We must shallow-copy seq_group_metadata as is_prompt could change.
|
||||
|
@ -140,7 +140,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
|
|||
new_seq_group_metadata_list.append(seq_group_metadata)
|
||||
|
||||
# We must shallow-copy seq_data as we will append token ids
|
||||
new_seq_data = {}
|
||||
new_seq_data: Dict[int, SequenceData] = {}
|
||||
for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
|
||||
new_seq_data[seq_id] = copy.copy(old_seq_data)
|
||||
new_seq_data[
|
||||
|
|
|
@ -48,7 +48,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
|
|||
self,
|
||||
execute_model_req: ExecuteModelRequest,
|
||||
sample_len: int,
|
||||
) -> Tuple[Optional[List[SamplerOutput]], bool]:
|
||||
) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]:
|
||||
"""NGram match algo to pick proposal candidate. Returns the list of
|
||||
sampler output, one per SequenceGroupMetadata.
|
||||
|
||||
|
@ -58,8 +58,8 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
|
|||
self._raise_if_unsupported(execute_model_req)
|
||||
|
||||
has_spec_out = False
|
||||
token_id_list = []
|
||||
token_prob_list = []
|
||||
token_id_list: List[Optional[torch.Tensor]] = []
|
||||
token_prob_list: List[Optional[torch.Tensor]] = []
|
||||
for idx, seq_group_metadata in enumerate(
|
||||
execute_model_req.seq_group_metadata_list):
|
||||
seq_data = next(iter(seq_group_metadata.seq_data.values()))
|
||||
|
|
|
@ -7,8 +7,8 @@ from vllm.config import SpeculativeConfig
|
|||
from vllm.distributed.communication_op import broadcast_tensor_dict
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
|
||||
SequenceGroupMetadata)
|
||||
from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
|
||||
SamplerOutput, SequenceGroupMetadata)
|
||||
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
|
||||
from vllm.spec_decode.interfaces import (SpeculativeProposals,
|
||||
SpeculativeScorer, SpeculativeScores)
|
||||
|
@ -516,13 +516,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
|
|||
topk_indices_by_step = topk_indices_by_step.tolist()
|
||||
|
||||
# Construct the output on a per-step, per-sequence basis.
|
||||
sampler_output_list = []
|
||||
sampler_output_list: List[SamplerOutput] = []
|
||||
for step_index in range(num_steps):
|
||||
if all(token_id == -1
|
||||
for token_id in accepted_token_ids_by_step[step_index]):
|
||||
break
|
||||
|
||||
step_output_token_ids = []
|
||||
step_output_token_ids: List[CompletionSequenceGroupOutput] = []
|
||||
for sequence_index in range(batch_size):
|
||||
# Each sequence may have a different num_logprobs; retrieve it.
|
||||
num_logprobs = num_logprobs_per_seq[sequence_index]
|
||||
|
|
|
@ -26,10 +26,10 @@ def get_all_num_logprobs(
|
|||
sequence.
|
||||
"""
|
||||
|
||||
all_num_logprobs = []
|
||||
all_num_logprobs: List[int] = []
|
||||
for seq_group_metadata in seq_group_metadata_list:
|
||||
num_logprobs = seq_group_metadata.sampling_params.logprobs
|
||||
if seq_group_metadata.sampling_params.logprobs is None:
|
||||
if num_logprobs is None:
|
||||
num_logprobs = 0
|
||||
all_num_logprobs.append(num_logprobs)
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ class Detokenizer:
|
|||
read_offset = 0
|
||||
next_iter_prefix_offset = 0
|
||||
next_iter_read_offset = 0
|
||||
next_iter_tokens = []
|
||||
next_iter_tokens: List[str] = []
|
||||
prev_tokens = None
|
||||
|
||||
for token_position, prompt_logprobs_for_token in enumerate(
|
||||
|
|
|
@ -20,12 +20,13 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
|
|||
import numpy as np
|
||||
import psutil
|
||||
import torch
|
||||
import torch.types
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import enable_trace_function_call, init_logger
|
||||
|
||||
T = TypeVar("T")
|
||||
logger = init_logger(__name__)
|
||||
|
||||
STR_DTYPE_TO_TORCH_DTYPE = {
|
||||
|
@ -37,6 +38,10 @@ STR_DTYPE_TO_TORCH_DTYPE = {
|
|||
"fp8_e5m2": torch.uint8,
|
||||
}
|
||||
|
||||
P = ParamSpec('P')
|
||||
K = TypeVar("K")
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class Device(enum.Enum):
|
||||
GPU = enum.auto()
|
||||
|
@ -176,7 +181,7 @@ def random_uuid() -> str:
|
|||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def get_vllm_instance_id():
|
||||
def get_vllm_instance_id() -> str:
|
||||
"""
|
||||
If the environment variable VLLM_INSTANCE_ID is set, return it.
|
||||
Otherwise, return a random UUID.
|
||||
|
@ -192,7 +197,7 @@ def in_wsl() -> bool:
|
|||
return "microsoft" in " ".join(uname()).lower()
|
||||
|
||||
|
||||
def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
|
||||
def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]:
|
||||
"""Take a blocking function, and run it on in an executor thread.
|
||||
|
||||
This function prevents the blocking function from blocking the
|
||||
|
@ -200,7 +205,7 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
|
|||
The code in this function needs to be thread safe.
|
||||
"""
|
||||
|
||||
def _async_wrapper(*args, **kwargs) -> asyncio.Future:
|
||||
def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future:
|
||||
loop = asyncio.get_event_loop()
|
||||
p_func = partial(func, *args, **kwargs)
|
||||
return loop.run_in_executor(executor=None, func=p_func)
|
||||
|
@ -325,7 +330,7 @@ def update_environment_variables(envs: Dict[str, str]):
|
|||
os.environ[k] = v
|
||||
|
||||
|
||||
def chunk_list(lst, chunk_size):
|
||||
def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
|
||||
"""Yield successive chunk_size chunks from lst."""
|
||||
return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
|
||||
|
||||
|
@ -336,7 +341,7 @@ def cdiv(a: int, b: int) -> int:
|
|||
|
||||
|
||||
def _generate_random_fp8(
|
||||
tensor: torch.tensor,
|
||||
tensor: torch.Tensor,
|
||||
low: float,
|
||||
high: float,
|
||||
) -> None:
|
||||
|
@ -398,7 +403,10 @@ def create_kv_caches_with_random_flash(
|
|||
torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
|
||||
key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
|
||||
scale = head_size**-0.5
|
||||
key_caches, value_caches = [], []
|
||||
|
||||
key_caches: List[torch.Tensor] = []
|
||||
value_caches: List[torch.Tensor] = []
|
||||
|
||||
for _ in range(num_layers):
|
||||
key_value_cache = torch.empty(size=key_value_cache_shape,
|
||||
dtype=torch_dtype,
|
||||
|
@ -429,7 +437,7 @@ def create_kv_caches_with_random(
|
|||
scale = head_size**-0.5
|
||||
x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
|
||||
key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
|
||||
key_caches = []
|
||||
key_caches: List[torch.Tensor] = []
|
||||
for _ in range(num_layers):
|
||||
key_cache = torch.empty(size=key_cache_shape,
|
||||
dtype=torch_dtype,
|
||||
|
@ -444,7 +452,7 @@ def create_kv_caches_with_random(
|
|||
key_caches.append(key_cache)
|
||||
|
||||
value_cache_shape = (num_blocks, num_heads, head_size, block_size)
|
||||
value_caches = []
|
||||
value_caches: List[torch.Tensor] = []
|
||||
for _ in range(num_layers):
|
||||
value_cache = torch.empty(size=value_cache_shape,
|
||||
dtype=torch_dtype,
|
||||
|
@ -484,7 +492,7 @@ def is_pin_memory_available() -> bool:
|
|||
|
||||
class CudaMemoryProfiler:
|
||||
|
||||
def __init__(self, device=None):
|
||||
def __init__(self, device: Optional[torch.types.Device] = None):
|
||||
self.device = device
|
||||
|
||||
def current_memory_usage(self) -> float:
|
||||
|
@ -560,13 +568,13 @@ def get_dtype_size(dtype: torch.dtype) -> int:
|
|||
return torch.tensor([], dtype=dtype).element_size()
|
||||
|
||||
|
||||
def merge_dicts(dict1: Dict[Any, List[Any]],
|
||||
dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
|
||||
def merge_dicts(dict1: Dict[K, List[T]],
|
||||
dict2: Dict[K, List[T]]) -> Dict[K, List[T]]:
|
||||
"""Merge 2 dicts that have key -> List of items.
|
||||
|
||||
When a key conflicts, the values in dict1 is prioritized.
|
||||
"""
|
||||
merged_dict = defaultdict(list)
|
||||
merged_dict: Dict[K, List[T]] = defaultdict(list)
|
||||
|
||||
for key, value in dict1.items():
|
||||
merged_dict[key].extend(value)
|
||||
|
@ -577,7 +585,7 @@ def merge_dicts(dict1: Dict[Any, List[Any]],
|
|||
return dict(merged_dict)
|
||||
|
||||
|
||||
def init_cached_hf_modules():
|
||||
def init_cached_hf_modules() -> None:
|
||||
"""
|
||||
Lazy initialization of the Hugging Face modules.
|
||||
"""
|
||||
|
@ -613,7 +621,7 @@ def find_library(lib_name: str) -> str:
|
|||
return locs[0]
|
||||
|
||||
|
||||
def find_nccl_library():
|
||||
def find_nccl_library() -> str:
|
||||
"""
|
||||
We either use the library file specified by the `VLLM_NCCL_SO_PATH`
|
||||
environment variable, or we find the library file brought by PyTorch.
|
||||
|
|
|
@ -779,8 +779,8 @@ class ModelRunner:
|
|||
# that will have unique loras, an therefore the max amount of memory
|
||||
# consumption create dummy lora request copies from the lora request
|
||||
# passed in, which contains a lora from the lora warmup path.
|
||||
dummy_lora_requests = []
|
||||
dummy_lora_requests_per_seq = []
|
||||
dummy_lora_requests: List[LoRARequest] = []
|
||||
dummy_lora_requests_per_seq: List[LoRARequest] = []
|
||||
if self.lora_config:
|
||||
assert self.lora_manager is not None
|
||||
with self.lora_manager.dummy_lora_cache():
|
||||
|
|
|
@ -99,8 +99,8 @@ class WorkerWrapperBase:
|
|||
"""
|
||||
|
||||
def __init__(self,
|
||||
worker_module_name=None,
|
||||
worker_class_name=None,
|
||||
worker_module_name: str,
|
||||
worker_class_name: str,
|
||||
trust_remote_code: bool = False) -> None:
|
||||
self.worker_module_name = worker_module_name
|
||||
self.worker_class_name = worker_class_name
|
||||