mirror of https://github.com/vllm-project/vllm
[mypy] Enable type checking for test directory (#5017)
This commit is contained in:
parent 1b8a0d71cf
commit 0e9164b40a
@@ -47,5 +47,5 @@ jobs:
         mypy vllm/model_executor --config-file pyproject.toml
         mypy vllm/lora --config-file pyproject.toml
         mypy vllm/logging --config-file pyproject.toml
-        mypy vllm/model_executor --config-file pyproject.toml
+        mypy tests --config-file pyproject.toml

@@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,

@@ -200,12 +200,12 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
-    itls = []
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             # We use the tokenizer to count the number of output tokens for all

@@ -265,7 +265,7 @@ async def benchmark(
     disable_tqdm: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
     else:
         raise ValueError(f"Unknown backend: {backend}")

@@ -292,7 +292,7 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

     benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
         request_func_input = RequestFuncInput(

@@ -310,7 +310,7 @@ async def benchmark(
                     pbar=pbar)))
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

-    if not disable_tqdm:
+    if pbar is not None:
         pbar.close()

     benchmark_duration = time.perf_counter() - benchmark_start_time

@@ -466,7 +466,7 @@ def main(args: argparse.Namespace):

     # Save config and results to json
     if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}

         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")

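Note: nearly every hunk in this commit follows the same pattern as the ones above: an
empty list or dict is given an explicit element type, because mypy cannot infer one
from an empty literal. A minimal standalone sketch of the pattern (the function and
variable names below are illustrative, not taken from the diff):

    from typing import List

    def keep_non_negative(samples: List[float]) -> List[float]:
        # Without the annotation, mypy reports "Need type annotation for 'kept'".
        kept: List[float] = []
        for s in samples:
            if s >= 0:
                kept.append(s)
        return kept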
@@ -108,8 +108,8 @@ def run_vllm(
     )

     # Add the requests to the engine.
-    prompts = []
-    sampling_params = []
+    prompts: List[str] = []
+    sampling_params: List[SamplingParams] = []
     for prompt, _, output_len in requests:
         prompts.append(prompt)
         sampling_params.append(

@@ -86,9 +86,9 @@ def dequant_no_scale(
 # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
 # the generic pytorch version.
 # Just visual comparison.
-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:

-    n = parts.sum().item()
+    n = int(parts.sum().item())

     device = torch.device('cuda:0')

@@ -204,7 +204,7 @@ def main():
     sys.stdout = sys.__stdout__


-def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
              methods):

     # I didn't see visible improvements from increasing these, but feel free :)

@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
     print('')


-def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
                nbooks: int, bits: int, method) -> float:

-    n = parts.sum().item()
+    n = int(parts.sum().item())

     device = torch.device('cuda:0')

@@ -1,4 +1,5 @@
 import argparse
+from typing import List

 import torch
 import torch.utils.benchmark as benchmark

@@ -23,8 +24,9 @@ ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]


-def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
-              size_m, size_k, size_n):
+def bench_run(results: List[benchmark.Measurement], model: str,
+              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
+              size_m: int, size_k: int, size_n: int):
     label = "Quant Matmul"

     sub_label = ("{}, act={} k_full={}, b={}, g={}, "

@@ -156,7 +158,7 @@ def main(args):
     for i, model in enumerate(args.models):
         print(f"[{i}] {model}")

-    results = []
+    results: List[benchmark.Measurement] = []

     for model in args.models:
         for layer in WEIGHT_SHAPES[model]:

@@ -1,7 +1,7 @@
 import argparse
 import time
 from datetime import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, TypedDict

 import ray
 import torch

@@ -12,8 +12,17 @@ from transformers import AutoConfig
 from vllm.model_executor.layers.fused_moe.fused_moe import *


+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+
 def benchmark_config(
-    config: Dict[str, int],
+    config: BenchmarkConfig,
     num_tokens: int,
     num_experts: int,
     shard_intermediate_size: int,

@@ -92,7 +101,7 @@ def benchmark_config(
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)

-    latencies = []
+    latencies: List[float] = []
     for i in range(num_iters):
         prepare(i)
         torch.cuda.synchronize()

@@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
     # Reduced search space for faster tuning.
     # TODO(woosuk): Increase the search space and use a performance model to
     # prune the search space.
-    configs = []
+    configs: List[BenchmarkConfig] = []
    for num_stages in [2, 3, 4, 5]:
        for block_m in [16, 32, 64, 128, 256]:
            for block_k in [64, 128, 256]:

@@ -175,8 +184,8 @@ class BenchmarkWorker:
         topk: int,
         dtype: torch.dtype,
         use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
+        search_space: List[BenchmarkConfig],
+    ) -> BenchmarkConfig:
         best_config = None
         best_time = float("inf")
         for config in tqdm(search_space):

@@ -199,10 +208,11 @@ class BenchmarkWorker:
                 best_config = config
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
         return best_config


-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],

@@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:


 def save_configs(
-    configs: Dict[int, Dict[str, int]],
+    configs: Dict[int, BenchmarkConfig],
     num_experts: int,
     shard_intermediate_size: int,
     hidden_size: int,

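Note: benchmark_moe.py previously passed tuning configs around as plain Dict[str, int];
the BenchmarkConfig TypedDict introduced above lets mypy check the individual keys.
A rough standalone illustration of why a TypedDict helps (TileConfig and apply are
made-up names, not part of the diff):

    from typing import TypedDict

    class TileConfig(TypedDict):
        BLOCK_SIZE_M: int
        num_warps: int

    def apply(config: TileConfig) -> int:
        # Both keys are known to exist and to be ints; a typo such as
        # config["BLOCK_SIZE_X"] would be flagged by mypy at check time.
        return config["BLOCK_SIZE_M"] * config["num_warps"]

    print(apply({"BLOCK_SIZE_M": 64, "num_warps": 4}))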
@@ -1,7 +1,7 @@
 import argparse
 import random
 import time
-from typing import Optional
+from typing import List, Optional

 import torch

@@ -54,14 +54,17 @@ def main(

     # Create the block tables.
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
+    block_tables_lst: List[List[int]] = []
     for _ in range(num_seqs):
         block_table = [
             random.randint(0, NUM_BLOCKS - 1)
             for _ in range(max_num_blocks_per_seq)
         ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst,
+                                dtype=torch.int,
+                                device=device)

     # Create the KV cache.
     key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,

@@ -1,11 +1,12 @@
 import argparse
 from itertools import accumulate
-from typing import Optional
+from typing import List, Optional

 import nvtx
 import torch

-from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
+                                                         get_rope)


 def benchmark_rope_kernels_multi_lora(

@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
     })
     # non-batched RoPE takes only one scaling factor, we create multiple
     # instances to simulate the same behavior
-    non_batched_ropes = []
+    non_batched_ropes: List[RotaryEmbedding] = []
     for scaling_factor in scaling_factors:
         non_batched_ropes.append(
             get_rope(head_size, rotary_dim, max_position, base, is_neox_style,

@@ -2,7 +2,7 @@ import argparse
 import glob
 import json
 import os
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple

 import numpy as np
 import torch

@@ -19,7 +19,7 @@ def _prepare_hf_weights(
     quantized_model_dir: str,
     load_format: str = "auto",
     fall_back_to_pt: bool = True,
-) -> Tuple[str, List[str], bool]:
+) -> Tuple[List[str], bool]:
     if not os.path.isdir(quantized_model_dir):
         raise FileNotFoundError(
             f"The quantized model directory `{quantized_model_dir}` "

@@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str,


 def _kv_scales_extractor(
-        hf_tensor_files: Iterable[str],
+        hf_tensor_files: List[str],
         use_safetensors: bool,
         rank_keyword: str = "rank",
         expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:

@@ -115,7 +115,7 @@ def _kv_scales_extractor(
     for char in rank_keyword:
         assert not char.isdecimal(
         ), f"Rank keyword {rank_keyword} contains a numeric character!"
-    rank_scales_map = {}
+    rank_scales_map: Dict[int, Dict[int, float]] = {}
     for tensor_file in hf_tensor_files:
         try:
             rank_idx = tensor_file.find(rank_keyword)

@@ -141,7 +141,7 @@ def _kv_scales_extractor(
             raise

         if rank not in rank_scales_map:
-            layer_scales_map = {}
+            layer_scales_map: Dict[int, float] = {}
             rank_scales_map[rank] = layer_scales_map
         else:
             raise RuntimeError(

@@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str,
                         "does not exist.")
     metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))

-    result = {}
+    result: Dict[str, Any] = {}
     for file in metadata_files:
         with open(file) as f:
             try:

@@ -5,7 +5,7 @@ distributively on a multi-nodes cluster.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """

-from typing import Dict
+from typing import Any, Dict, List

 import numpy as np
 import ray

@@ -40,8 +40,8 @@ class LLMPredictor:
         # The output is a list of RequestOutput objects that contain the prompt,
         # generated text, and other information.
         outputs = self.llm.generate(batch["text"], sampling_params)
-        prompt = []
-        generated_text = []
+        prompt: List[str] = []
+        generated_text: List[str] = []
         for output in outputs:
             prompt.append(output.prompt)
             generated_text.append(' '.join([o.text for o in output.outputs]))

@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
             pg, placement_group_capture_child_tasks=True))


-resources_kwarg = {}
+resources_kwarg: Dict[str, Any] = {}
 if tensor_parallel_size == 1:
     # For tensor_parallel_size == 1, we simply set num_gpus=1.
     resources_kwarg["num_gpus"] = 1

@@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/model_executor --config-file pyproject.toml
 mypy vllm/lora --config-file pyproject.toml
 mypy vllm/logging --config-file pyproject.toml
-mypy vllm/model_executor --config-file pyproject.toml
+mypy tests --config-file pyproject.toml


 # If git diff returns a file that is in the skip list, the file may be checked anyway:

@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 from vllm.core.block.block_table import BlockTable

@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
     token_ids = list(range(sequence_len))
     num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

-    block_tables = []
+    block_tables: List[BlockTable] = []
     for i in range(5):
         assert allocator.get_num_free_blocks(
             device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc

@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
     num_immutable_blocks_per_alloc = len(
         chunked_tokens) - num_mutable_blocks_per_alloc

-    block_tables = []
+    block_tables: List[BlockTable] = []
     for alloc_i in range(1, 6):

         block_tables.append(

@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
     )
     block_table.allocate(token_ids=token_ids, device=Device.GPU)

-    appended_so_far = []
+    appended_so_far: List[int] = []
     for append in chunk_list(token_ids_to_append, append_size):
         block_table.append_token_ids(append)
         appended_so_far.extend(append)

@@ -123,7 +123,7 @@ class TestPrefixCachingBlock:
             num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
         """Helper method which creates a chain of blocks.
         """
-        blocks = []
+        blocks: List[PrefixCachingBlock] = []
         num_blocks = math.ceil(
             len(token_ids) / block_size) + num_empty_trailing_blocks

@@ -608,7 +608,7 @@ class TestPrefixCachingBlockAllocator:
     ) -> List[PrefixCachingBlock]:
         """Helper method which creates a chain of blocks.
         """
-        blocks = []
+        blocks: List[Block] = []
         num_blocks = math.ceil(len(token_ids) / block_size)

         if num_blocks == 0:

@@ -483,11 +483,11 @@ def test_chunked_prefill_preempt():
     # The request should be preempted.
     scheduler.block_manager.can_append_slots = MagicMock()

-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group1(seq_group, num_lookahead_slots):
         return seq_group.request_id != "1"

     scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group1)

     # The running prefill is now preempted.
     _, out = schedule_and_update_computed_tokens(scheduler)

@@ -505,11 +505,11 @@ def test_chunked_prefill_preempt():
     assert seq_group.get_num_uncomputed_tokens() == 30

     # We should be able to run prefill twice as it is chunked.
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
+    def cannot_append_second_group2(seq_group, num_lookahead_slots):
         return True

     scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
+        cannot_append_second_group2)
     _, out = schedule_and_update_computed_tokens(scheduler)
     assert len(out.scheduled_seq_groups) == 1
     assert out.num_prefill_groups == 1

@@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs():
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
     scheduler = Scheduler(scheduler_config, cache_config, None)
-    running = []
+    running: List[SequenceGroup] = []

     _, seq_group = create_dummy_prompt("1", prompt_length=65)
     scheduler.add_seq_group(seq_group)

@@ -1,6 +1,6 @@
 import time
 from collections import deque
-from typing import List
+from typing import Deque, List, Set, Tuple
 from unittest.mock import MagicMock

 import pytest  # noqa

@@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group():

     # Add multiple seq groups to scheduler.
     num_seq_group = 4
-    request_ids = set()
+    request_ids: Set[str] = set()
     for i in range(num_seq_group):
         _, seq_group = create_dummy_prompt(str(i), block_size)
         scheduler.add_seq_group(seq_group)

@@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len():
     Test prompt longer than max_prompt_len is aborted.
     """
     scheduler = initialize_scheduler(max_model_len=30)
-    _, seq_group = create_dummy_prompt(0, prompt_length=60)
+    _, seq_group = create_dummy_prompt("0", prompt_length=60)
     waiting = deque([seq_group])
     budget = create_token_budget()
     remaining_waiting, output = scheduler._schedule_prefills(

@@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget():
     Test token budget respected.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(token_budget=0)
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)

@@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs():
     Test max seq respected.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(max_num_seqs=2)
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)

@@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora():
     """
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
     scheduler = initialize_scheduler(lora_config=lora_config)
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget(token_budget=120)
-    curr_loras = set()
+    curr_loras: Set[int] = set()
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,

@@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity():
     Test sequence cannot be scheduled due to block manager has no capacity.
     """
     scheduler = initialize_scheduler()
-    waiting = deque()
+    waiting: Deque[SequenceGroup] = deque()
     budget = create_token_budget()
     for i in range(3):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)

@@ -536,7 +536,7 @@ def test_decode_schedule_preempted():
     Test decodes cannot be scheduled and preempted.
     """
     scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     for i in range(3):

@@ -577,7 +577,7 @@ def test_decode_swap_beam_search():
     Test best_of > 1 swap out blocks
     """
     scheduler = initialize_scheduler()
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     budget = create_token_budget()

@@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update():
     """
     scheduler = initialize_scheduler()
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    running = deque()
+    running: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     scheduler._allocate_and_set_running(seq_group)

@@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update():

 def test_schedule_swapped_simple():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)

@@ -683,10 +683,10 @@ def test_schedule_swapped_simple():

 def test_schedule_swapped_max_token_budget():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)

@@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget():

 def test_schedule_swapped_max_seqs():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(4):
         _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
         scheduler._allocate_and_set_running(seq_group)

@@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs():
 def test_schedule_swapped_max_loras():
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
     scheduler = initialize_scheduler(lora_config=lora_config)
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
-    curr_loras = set()
-    blocks_to_swap_out = []
+    curr_loras: Set[int] = set()
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,

@@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras():

 def test_schedule_swapped_cannot_swap_in():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)

@@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in():

 def test_infeasible_swap():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     for _ in range(2):
         _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)

@@ -834,13 +834,13 @@ def test_infeasible_swap():

 def test_schedule_swapped_blocks_to_copy():
     scheduler = initialize_scheduler()
-    swapped = deque()
+    swapped: Deque[SequenceGroup] = deque()
     policy = PolicyFactory.get_policy(policy_name="fcfs")
     curr_loras = None
     _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
-    blocks_to_swap_out = []
+    blocks_to_swap_out: List[Tuple[int, int]] = []
     scheduler._swap_out(seq_group, blocks_to_swap_out)
     swapped.append(seq_group)

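Note: the scheduler tests above build their work queues from bare deque(), set() and []
literals, which mypy cannot type on its own; hence the Deque[SequenceGroup], Set[str]
and List[Tuple[int, int]] annotations. The same idea in isolation (the element types
here are placeholders, not taken from vllm):

    from collections import deque
    from typing import Deque, List, Set, Tuple

    waiting: Deque[str] = deque()            # an empty deque gives mypy no element type
    request_ids: Set[str] = set()            # likewise for an empty set
    blocks_to_swap_out: List[Tuple[int, int]] = []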
@@ -1,5 +1,7 @@
 import time
-from typing import Iterable, Optional, Tuple
+from typing import List, Optional
+from typing import Sequence as GenericSequence
+from typing import Tuple

 from vllm import SamplingParams
 from vllm.lora.request import LoRARequest

@@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder(
     lora_request: Optional[LoRARequest] = None,
     use_beam_search: bool = False,
     best_of: int = 1,
-) -> Tuple[Sequence, SequenceGroup]:
+) -> Tuple[Sequence, Sequence, SequenceGroup]:
     if not block_size:
         block_size = decoder_prompt_length

@@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder(

 def create_seq_group(
         seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
         request_id: str = '0',
         seq_id_start: int = 0,
         sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:

@@ -98,7 +100,7 @@ def create_seq_group(

     prompt_token_ids = [0] * seq_prompt_len

-    seqs = []
+    seqs: List[Sequence] = []
     for seq_id_offset, output_len in enumerate(seq_output_lens):
         seq = Sequence(
             seq_id=seq_id_start + seq_id_offset,

@@ -125,7 +127,7 @@ def create_seq_group(

 def create_seq_group_encoder_decoder(
         seq_prompt_len: int = 1024,
-        seq_output_lens: Iterable[int] = (128, ),
+        seq_output_lens: GenericSequence[int] = (128, ),
         request_id: str = '0',
         seq_id_start: int = 0,
         sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:

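Note: tests/core/utils.py also uses vllm's own Sequence class, so typing.Sequence is
re-imported above under the alias GenericSequence to keep the two names apart. A sketch
of the same trick with a stand-in class (names other than GenericSequence are
illustrative):

    from typing import Sequence as GenericSequence

    class Sequence:  # stand-in for vllm.sequence.Sequence
        pass

    def total_len(lengths: GenericSequence[int]) -> int:
        # Accepts any read-only sequence: list, tuple, range, ...
        return sum(lengths)

    print(total_len((128, 256)))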
@@ -1,5 +1,6 @@
 import multiprocessing
 import os
+from typing import Dict, List

 import pytest
 import torch

@@ -17,9 +18,9 @@ from vllm.utils import update_environment_variables

 def distributed_run(fn, world_size):
     number_of_processes = world_size
-    processes = []
+    processes: List[multiprocessing.Process] = []
     for i in range(number_of_processes):
-        env = {}
+        env: Dict[str, str] = {}
         env['RANK'] = str(i)
         env['LOCAL_RANK'] = str(i)
         env['WORLD_SIZE'] = str(number_of_processes)

@@ -6,7 +6,7 @@ from vllm.utils import cuda_device_count_stateless


 @ray.remote
-class _CUDADeviceCountStatelessTestActor():
+class _CUDADeviceCountStatelessTestActor:

     def get_count(self):
         return cuda_device_count_stateless()

@@ -22,7 +22,8 @@ def test_cuda_device_count_stateless():
     """Test that cuda_device_count_stateless changes return value if
     CUDA_VISIBLE_DEVICES is changed."""

-    actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
+    actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
+        num_gpus=2).remote()
     assert sorted(ray.get(
         actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"]
     assert ray.get(actor.get_count.remote()) == 2

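Note: ray.remote wraps the actor class in a dynamically generated handle, so mypy may
not recognise .options() on it; the diff silences that single call with a trailing
"# type: ignore" rather than excluding the whole file. A sketch of the pattern (Counter
is a made-up actor, and whether the ignore is needed depends on the installed ray
version and its type stubs):

    import ray

    @ray.remote
    class Counter:

        def get(self) -> int:
            return 1

    counter = Counter.options(num_cpus=1).remote()  # type: ignore
    print(ray.get(counter.get.remote()))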
@@ -1,6 +1,7 @@
 # imports for guided decoding tests
 import json
 import re
+from typing import List

 import jsonschema
 import openai  # use the official client for correctness check

@@ -453,7 +454,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
        max_tokens=5,
        temperature=0.0,
        stream=True)
-    chunks = []
+    chunks: List[str] = []
     finish_reason_count = 0
     async for chunk in stream:
         chunks.append(chunk.choices[0].text)

@@ -499,7 +500,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
         temperature=0.0,
         stream=True,
     )
-    chunks = []
+    chunks: List[str] = []
     finish_reason_count = 0
     async for chunk in stream:
         delta = chunk.choices[0].delta

@@ -72,27 +72,27 @@ def ref_single_query_cached_kv_attention(
     block_size = value_cache.shape[3]
     num_seqs = query.shape[0]

-    block_tables = block_tables.cpu().tolist()
-    seq_lens = seq_lens.cpu().tolist()
+    block_tables_lst = block_tables.cpu().tolist()
+    seq_lens_lst = seq_lens.cpu().tolist()
     for i in range(num_seqs):
         q = query[i].unsqueeze(0)
-        block_table = block_tables[i]
-        seq_len = int(seq_lens[i])
+        block_table = block_tables_lst[i]
+        seq_len = int(seq_lens_lst[i])

-        keys = []
-        values = []
+        keys_lst: List[torch.Tensor] = []
+        values_lst: List[torch.Tensor] = []
         for j in range(seq_len):
             block_number = int(block_table[j // block_size])
             block_offset = j % block_size

             k = key_cache[block_number, :, :, block_offset, :]
             k = k.reshape(num_kv_heads, head_size)
-            keys.append(k)
+            keys_lst.append(k)

             v = value_cache[block_number, :, :, block_offset]
-            values.append(v)
-        keys = torch.stack(keys, dim=0)
-        values = torch.stack(values, dim=0)
+            values_lst.append(v)
+        keys = torch.stack(keys_lst, dim=0)
+        values = torch.stack(values_lst, dim=0)
         if num_queries_per_kv > 1:
             # Handle MQA and GQA
             keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)

@@ -157,14 +157,15 @@ def test_paged_attention(

     # Create the block tables.
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
+    block_tables_lst: List[List[int]] = []
     for _ in range(num_seqs):
         block_table = [
             random.randint(0, NUM_BLOCKS - 1)
             for _ in range(max_num_blocks_per_seq)
         ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int)
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst, dtype=torch.int)

     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,

@@ -283,7 +284,7 @@ def ref_multi_query_kv_attention(
     dtype: torch.dtype,
 ) -> torch.Tensor:
     num_seqs = len(cu_seq_lens) - 1
-    ref_outputs = []
+    ref_outputs: List[torch.Tensor] = []
     for i in range(num_seqs):
         start_idx = cu_seq_lens[i]
         end_idx = cu_seq_lens[i + 1]

@@ -303,8 +304,8 @@ def ref_multi_query_kv_attention(
             attn_mask=attn_mask,
         )
         ref_outputs.append(ref_output)
-    ref_output = torch.cat(ref_outputs, dim=0)
-    return ref_output
+
+    return torch.cat(ref_outputs, dim=0)


 # TODO(woosuk): Add tests for USE_ALIBI=True.

@@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention(
     block_size = value_cache.shape[3]
     num_seqs = query.shape[0]

-    block_tables = block_tables.cpu().tolist()
-    seq_lens = seq_lens.cpu().tolist()
+    block_tables_lst = block_tables.cpu().tolist()
+    seq_lens_lst = seq_lens.cpu().tolist()
     for i in range(num_seqs):
         q = query[i].unsqueeze(0)
-        block_table = block_tables[i]
-        seq_len = int(seq_lens[i])
+        block_table = block_tables_lst[i]
+        seq_len = int(seq_lens_lst[i])

-        keys = []
-        values = []
+        keys_lst: List[torch.Tensor] = []
+        values_lst: List[torch.Tensor] = []
         for j in range(seq_len):
             block_number = int(block_table[j // block_size])
             block_offset = j % block_size

             k = key_cache[block_number, :, :, block_offset, :]
             k = k.reshape(num_kv_heads, head_size)
-            keys.append(k)
+            keys_lst.append(k)

             v = value_cache[block_number, :, :, block_offset]
-            values.append(v)
-        keys = torch.stack(keys, dim=0)
-        values = torch.stack(values, dim=0)
+            values_lst.append(v)
+        keys = torch.stack(keys_lst, dim=0)
+        values = torch.stack(values_lst, dim=0)
         if num_queries_per_kv > 1:
             # Handle MQA and GQA
             keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)

@@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill(
     value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)

     ref_output = ref_multi_query_kv_attention(
-        cu_seq_lens,
+        cu_seq_lens.tolist(),
         query,
         key,
         value,

@@ -1,5 +1,5 @@
 import random
-from typing import Tuple
+from typing import List, Tuple

 import pytest
 import torch

@@ -63,7 +63,7 @@ def test_copy_blocks(
     src_blocks = random.sample(range(num_blocks), num_mappings)
     remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
     dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
-    block_mapping = []
+    block_mapping: List[Tuple[int, int]] = []
     for i in range(num_mappings):
         src = src_blocks[i]
         dst1 = dst_blocks[2 * i]

@@ -131,8 +131,8 @@ def test_reshape_and_cache(
     torch.set_default_device(device)
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
-    slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long)

     qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
     _, key, value = qkv.unbind(dim=1)

@@ -170,12 +170,12 @@ def test_reshape_and_cache(
     # Run the reference implementation.
     reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
     block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    block_indicies = block_indicies.cpu().tolist()
+    block_indicies_lst = block_indicies.cpu().tolist()
     block_offsets = slot_mapping % block_size
-    block_offsets = block_offsets.cpu().tolist()
+    block_offsets_lst = block_offsets.cpu().tolist()
     for i in range(num_tokens):
-        block_idx = block_indicies[i]
-        block_offset = block_offsets[i]
+        block_idx = block_indicies_lst[i]
+        block_offset = block_offsets_lst[i]
         cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
         cloned_value_cache[block_idx, :, :, block_offset] = value[i]

@@ -224,8 +224,10 @@ def test_reshape_and_cache_flash(

     # Create a random slot mapping.
     num_slots = block_size * num_blocks
-    slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device)
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst,
+                                dtype=torch.long,
+                                device=device)

     qkv = torch.randn(num_tokens,
                       3,

@@ -257,13 +259,13 @@ def test_reshape_and_cache_flash(
                                           slot_mapping, kv_cache_dtype)

     # Run the reference implementation.
-    block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor')
-    block_indicies = block_indicies.cpu().tolist()
+    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
+    block_indicies_lst = block_indicies.cpu().tolist()
     block_offsets = slot_mapping % block_size
-    block_offsets = block_offsets.cpu().tolist()
+    block_offsets_lst = block_offsets.cpu().tolist()
     for i in range(num_tokens):
-        block_idx = block_indicies[i]
-        block_offset = block_offsets[i]
+        block_idx = block_indicies_lst[i]
+        block_offset = block_offsets_lst[i]
         cloned_key_cache[block_idx, block_offset, :, :] = key[i]
         cloned_value_cache[block_idx, block_offset, :, :] = value[i]

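Note: several kernel tests reused a single variable for both the intermediate Python
list and the tensor built from it (block_tables, slot_mapping, keys, values, ...).
mypy assigns one type per variable, so the diff gives the list a *_lst name and keeps
the original name for the tensor. A minimal illustration of the pattern:

    from typing import List

    import torch

    block_tables_lst: List[List[int]] = [[0, 1], [2, 3]]
    # Reusing the name block_tables for both the list and the tensor would make
    # mypy see two incompatible types for the same variable.
    block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
    print(block_tables.shape)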
@@ -17,13 +17,13 @@ capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]


-def to_fp8(tensor: torch.tensor):
+def to_fp8(tensor: torch.Tensor):
     finfo = torch.finfo(torch.float8_e4m3fn)
     return torch.round(tensor.clamp(
         min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)


-def to_int8(tensor: torch.tensor):
+def to_int8(tensor: torch.Tensor):
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)

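Note: a few helpers annotated arguments with torch.tensor, which is the factory
function rather than a type; the diff switches them to the torch.Tensor class so the
annotation is meaningful to mypy. For example, mirroring the to_int8 helper above
(the return annotation is added here for clarity and is not part of the diff):

    import torch

    def to_int8(tensor: torch.Tensor) -> torch.Tensor:
        # Clamp to the int8 range before casting.
        return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)

    print(to_int8(torch.tensor([300.0, -300.0, 7.4])))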
@@ -25,7 +25,7 @@ def ref_paged_attn(
     block_tables = block_tables.cpu().numpy()
     _, block_size, num_kv_heads, head_size = key_cache.shape

-    outputs = []
+    outputs: List[torch.Tensor] = []
     start_idx = 0
     for i in range(num_seqs):
         query_len = query_lens[i]

@@ -70,7 +70,7 @@ def ref_paged_attn(
 @pytest.mark.parametrize("dtype", DTYPES)
 @torch.inference_mode
 def test_flash_attn_with_paged_kv(
-    kv_lens: List[Tuple[int, int]],
+    kv_lens: List[int],
     num_heads: Tuple[int, int],
     head_size: int,
     dtype: torch.dtype,

@@ -1,5 +1,5 @@
 from itertools import accumulate, product
-from typing import List, Optional
+from typing import Dict, List, Optional

 import pytest
 import torch

@@ -126,7 +126,7 @@ def test_batched_rotary_embedding(
         query,
         key,
         offsets=torch.zeros(batch_size * seq_len,
-                            dtype=int,
+                            dtype=torch.long,
                             device=device))
     # Compare the results.
     assert torch.allclose(out_query,

@@ -214,20 +214,16 @@ def test_batched_rotary_embedding_multi_lora(
 def test_rope_module_cache():
     MAX_POSITIONS = [123, 1234]
     BASES = [10000, 1000000]
-    ROPE_SCALINGS = [
-        None, {
-            "type": "linear",
-            "factor": (1, )
-        }, {
-            "type": "dynamic",
-            "factor": 1
-        }
-    ]
-    settings = [
-        HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
-        ROPE_SCALINGS, DTYPES
-    ]
-    rope_setting_id_map = {}
+    ROPE_SCALINGS = (None, {
+        "type": "linear",
+        "factor": (1, )
+    }, {
+        "type": "dynamic",
+        "factor": 1
+    })
+    settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
+                ROPE_SCALINGS, DTYPES)
+    rope_setting_id_map: Dict[str, int] = {}
     for setting in product(*settings):
         head_size, rotary_dim, max_position, base, \
             is_neox_stype, rope_scaling, dtype = setting

@@ -2,6 +2,7 @@ import contextlib
 import gc
 import tempfile
 from collections import OrderedDict
+from typing import Dict, List, TypedDict
 from unittest.mock import MagicMock, patch

 import pytest

@@ -24,7 +25,18 @@ from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model

-LONG_LORA_INFOS = [{
+
+class ContextIDInfo(TypedDict):
+    lora_id: int
+    context_length: str
+
+
+class ContextInfo(TypedDict):
+    lora: str
+    context_length: str
+
+
+LONG_LORA_INFOS: List[ContextIDInfo] = [{
     "lora_id": 1,
     "context_length": "16k",
 }, {

@@ -207,7 +219,7 @@ def long_context_infos(long_context_lora_files_16k_1,
                        long_context_lora_files_16k_2,
                        long_context_lora_files_32k):
     cleanup()
-    infos = {}
+    infos: Dict[int, ContextInfo] = {}
     for lora_checkpoint_info in LONG_LORA_INFOS:
         lora_id = lora_checkpoint_info["lora_id"]
         if lora_id == 1:

@@ -226,7 +238,7 @@ def long_context_infos(long_context_lora_files_16k_1,


 @pytest.fixture
-def llama_2_7b_engine_extra_embeddings() -> nn.Module:
+def llama_2_7b_engine_extra_embeddings():
     cleanup()
     get_model_old = get_model

@@ -244,7 +256,6 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module:


 @pytest.fixture
-def llama_2_7b_model_extra_embeddings(
-        llama_2_7b_engine_extra_embeddings) -> nn.Module:
+def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
     yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
            model_runner.model)

File diff suppressed because one or more lines are too long
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 import vllm

@@ -10,7 +12,7 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(

@@ -30,7 +32,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()

@@ -1,3 +1,5 @@
+from typing import List
+
 import vllm
 from vllm.lora.request import LoRARequest

@@ -6,7 +8,7 @@ MODEL_PATH = "THUDM/chatglm3-6b"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(

@@ -26,7 +28,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()

@@ -1,10 +1,12 @@
+from typing import List
+
 import vllm
 from vllm.lora.request import LoRARequest

 MODEL_PATH = "google/gemma-7b"


-def do_sample(llm, lora_path: str, lora_id: int) -> str:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         "Quote: Imagination is",
         "Quote: Be yourself;",

@@ -17,7 +19,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text.strip()

@@ -26,7 +26,7 @@ def get_lora_model(model_id: str, target_modules: List[str], rank: int):
     return lora_model


-def do_sample(llm,
+def do_sample(llm: vllm.LLM,
               lora_path: Optional[str] = None,
               lora_id: Optional[int] = None,
               logprobs: int = 0,

@@ -42,8 +42,8 @@ def do_sample(llm,
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
-    generated_logprobs = []
+    generated_texts: List[str] = []
+    generated_logprobs: List[List[List[int]]] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text

@@ -109,7 +109,7 @@ def populate_loras(

     for slot_idx, lora_id in enumerate(id_to_index):
         if lora_id is not None:
-            subloras = []
+            subloras: List[LoRALayerWeights] = []
             sublora_len = layer_weights.shape[0] // repeats
             for i in range(repeats):
                 sublora = DummyLoRAManager().init_random_lora(

@@ -158,7 +158,10 @@ def create_random_inputs(

     low, high = input_range

-    inputs, index_mapping, prompt_mapping = [], [], []
+    inputs: List[torch.Tensor] = []
+    index_mapping: List[int] = []
+    prompt_mapping: List[int] = []
+
     for _ in range(num_inputs):
         if input_type == torch.int:
             inputs.append(

@@ -222,7 +225,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:

     lora_result = lora_embedding(torch.cat(inputs))

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         lora = lora_dict[lora_id]
         result = embedding(input_)

@@ -356,7 +359,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,

     lora_result = lora_embedding(torch.cat(original_inputs))

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, original_input_, lora_id in zip(inputs, original_inputs,
                                                 prompt_mapping):
         lora = lora_dict[lora_id]

@@ -482,7 +485,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,

     logits_processor.org_vocab_size = (vocab_size +
                                        lora_config.lora_extra_vocab_size)
-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         lora = lora_dict[lora_id]
         result = logits_processor._get_logits(hidden_states=input_,

@@ -598,7 +601,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,

     lora_result = lora_linear(torch.cat(inputs))[0]

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         lora = lora_dict[lora_id]
         result = linear(input_)[0]

@@ -729,7 +732,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,

     lora_result = lora_linear(torch.cat(inputs))[0]

-    expected_results = []
+    expected_results: List[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
         result = linear(input_)[0]
         subloras = sublora_dict[lora_id]

@@ -885,9 +888,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed):
     computed_added_vocab_size = 0
     vocab_size_padded = -1

-    all_org_tokens = []
-    all_added_tokens = []
-    token_ids = []
+    all_org_tokens: List[int] = []
+    all_added_tokens: List[int] = []
+    token_ids: List[int] = []

     for tp_rank in range(tp_size):
         with patch(

@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 import ray

@@ -9,7 +11,7 @@ from .conftest import cleanup
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"


-def do_sample(llm, lora_path: str, lora_id: int):
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501

@@ -27,7 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int):
                                lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
                                if lora_id else None)
     # Print the outputs.
-    generated_texts = []
+    generated_texts: List[str] = []
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text

@@ -77,7 +77,7 @@ def evaluate_json_response(model_response, golden_response):


 def generate(
-    llm,
+    llm: vllm.LLM,
     inputs: Tuple[str, SamplingParams, Optional[LoRARequest]],
 ):
     prompts, sampling_param, lora_request = inputs

@@ -159,7 +159,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
     non-batched generation.
     """
     # Create non batched results first to compare against batched results
-    non_batched_results = []
+    non_batched_results: List[str] = []

     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]

@@ -172,7 +172,8 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
     # Create batched results
     # Each element of the batch must be
     # (prompt, prompt_sampling_params, prompt_lora_request)
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         batched_prompts.extend([

@@ -196,7 +197,8 @@ def test_self_consistency(lora_llm, long_context_infos):
     num_loras = len(long_context_infos)

     # Create results in order of long_context_infos
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         batched_prompts.extend([

@@ -244,7 +246,7 @@ def test_quality(lora_llm, long_context_infos):
     The test is expected to run for about 1 minute on a p4de.24xlarge
     instance.
     """
-    scores = []
+    scores: List[float] = []
     for lora_id, info in long_context_infos.items():
         context_len = info["context_length"]
         for prompt_and_response in prompts_and_responses[context_len]:

@@ -277,7 +279,8 @@ def test_max_len(lora_llm, long_context_infos):
         generate(lora_llm, (bad_prompt, sampling_params, lora_request))

     # Also test batched
-    batched_prompts = []
+    batched_prompts: List[Tuple[str, SamplingParams,
+                                Optional[LoRARequest]]] = []
     for lora_id_with_bad_inputs in long_context_infos:
         for lora_id, info in long_context_infos.items():
             context_len = info["context_length"]

@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 from vllm.lora.models import LoRAModel

@@ -17,7 +19,7 @@ def test_load_checkpoints(
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
     embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
     embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
-    expected_lora_modules = []
+    expected_lora_modules: List[str] = []
     for module in supported_lora_modules:
         if module in packed_modules_mapping:
             expected_lora_modules.extend(packed_modules_mapping[module])

@@ -1,5 +1,5 @@
 import os
-from typing import List
+from typing import Dict, List

 import pytest
 import torch

@@ -62,7 +62,7 @@ def test_from_lora_tensors(sql_lora_files):

 def create_lora(lora_id: int, model: nn.Module,
                 sub_modules: List[str]) -> LoRAModel:
-    loras = {}
+    loras: Dict[str, LoRALayerWeights] = {}
     for name in sub_modules:
         w = model.get_submodule(name).weight
         loras[name] = LoRALayerWeights(

@@ -83,7 +83,7 @@ def create_packed_lora(
     empty_replaced_module_name=None,
 ) -> LoRAModel:
     w = model.get_submodule(module_name).weight
-    loras = {}
+    loras: Dict[str, LoRALayerWeights] = {}
     for replaced_module_name in replaced_module_names:
         if replaced_module_name == empty_replaced_module_name:
             continue

@ -1,3 +1,5 @@
|
|||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
@ -7,7 +9,7 @@ from vllm.lora.request import LoRARequest
|
|||
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int):
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
|
||||
prompts = [
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
@@ -20,7 +22,7 @@ def do_sample(llm, lora_path: str, lora_id: int):
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts = []
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()

@@ -1,3 +1,5 @@
from typing import List

import vllm
from vllm.lora.request import LoRARequest

@@ -6,7 +8,7 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501


def do_sample(llm, lora_path: str, lora_id: int) -> str:
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
prompts = [
PROMPT_TEMPLATE.format(
sql_prompt=
@@ -35,7 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str:
if lora_id else None,
)
# Print the outputs.
generated_texts = []
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()

@@ -25,7 +25,10 @@ MODELS: List[ModelWithQuantization] = [
]


def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
max_tokens: int = 256) -> List[str]:
raw_prompts = [
"Give me an orange-ish brown color",
"Give me a neon pink color",
@@ -45,7 +48,7 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts = []
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text

@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import Dict, List, Optional

import torch

@@ -9,13 +9,13 @@ class DummyLoRAManager:

def __init__(self):
super().__init__()
self._loras = {}
self._loras: Dict[str, LoRALayerWeights] = {}

def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
self._loras[module_name] = lora

def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
return self._loras.get(module_name, None)
def get_module_lora(self, module_name: str) -> LoRALayerWeights:
return self._loras[module_name]

def init_random_lora(self,
module_name: str,
@@ -68,11 +68,11 @@ class DummyLoRAManager:
module_name: str,
input_dim: int,
output_dims: List[int],
noop_lora_index: List[int] = None,
rank=8,
noop_lora_index: Optional[List[int]] = None,
rank: int = 8,
):
base_loras = []
noop_lora_index = set(noop_lora_index or [])
base_loras: List[LoRALayerWeights] = []
noop_lora_index_set = set(noop_lora_index or [])

for i, out_dim in enumerate(output_dims):
base_lora = self.init_lora(
@@ -80,7 +80,7 @@ class DummyLoRAManager:
input_dim,
out_dim,
rank=rank,
noop=i in noop_lora_index,
noop=i in noop_lora_index_set,
)
base_loras.append(base_lora)
packed_lora = PackedLoRALayerWeights.pack(base_loras)

@@ -3,6 +3,7 @@
Note: these tests will only pass on L4 GPU.
"""
import os
from typing import List

import pytest
import torch
@@ -100,7 +101,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
]

params = SamplingParams(max_tokens=20, temperature=0)
generations = []
generations: List[str] = []
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:

@@ -2,8 +2,11 @@

Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
from typing import List

import pytest

from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device

@@ -43,7 +46,7 @@ def test_block_allocator(
def test_eviction(num_blocks: int, ):
block_size = 16
block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
blocks = []
blocks: List[PhysicalTokenBlock] = []

for i in range(num_blocks):
# use i as the block_hash

@@ -4,6 +4,7 @@ Run `pytest tests/quantization/test_configs.py --forked`.
"""

from dataclasses import dataclass
from typing import Tuple

import pytest

@@ -51,7 +52,7 @@ MODEL_ARG_EXPTYPES = [


@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: str) -> None:
def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:
model_path, quantization_arg, expected_type = model_arg_exptype

try:

@@ -1,3 +1,5 @@
from typing import List

import pytest
import torch

@@ -62,21 +64,22 @@ def test_get_prompt_logprobs(
for logprobs in result.outputs[0].logprobs:
assert len(logprobs) == num_top_logprobs
output_text = result.outputs[0].text
output_string_from_most_likely_tokens = []
output_string_from_most_likely_tokens_lst: List[str] = []
for top_logprobs in result.outputs[0].logprobs:
top_logprob = next(iter(top_logprobs.values()))
output_string_from_most_likely_tokens.append(
output_string_from_most_likely_tokens_lst.append(
top_logprob.decoded_token)

if detokenize:
output_string_from_most_likely_tokens = "".join(
output_string_from_most_likely_tokens)
output_string_from_most_likely_tokens_lst)
assert output_text == output_string_from_most_likely_tokens, (
"The output text from the top logprob for each token position "
"should be the same as the output text in the result.")
else:
assert output_text == ''
assert output_string_from_most_likely_tokens == [None] * max_tokens
assert output_string_from_most_likely_tokens_lst == ([None] *
max_tokens)

# The first prompt logprob is always None
assert result.prompt_logprobs[0] is None

@@ -246,8 +246,8 @@ def test_rejection_sampling_approximates_target_distribution(
draft_and_target_probs_equal)

sample_sizes = [10, 100, 1_000, 10_000, 100_000]
distance_wrt_reference = []
distance_wrt_target = []
distance_wrt_reference: List[float] = []
distance_wrt_target: List[float] = []

for num_samples in sample_sizes:
(reference_vs_rejsample_dist,

@@ -1,6 +1,6 @@
import itertools
import random
from typing import List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
from unittest.mock import patch

import pytest
@@ -49,8 +49,8 @@ def _do_sample(
sampling_params: SamplingParams,
device: str,
):
seq_group_metadata_list = []
seq_lens = []
seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
@@ -212,7 +212,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
batch_size = random.randint(1, 128)

expected_penalization = []
sequence_metadata_list = []
sequence_metadata_list: List[SequenceGroupMetadata] = []
# 20% chance to generate seq group metadata list with all prompts
is_prompt = random.random() < 0.2
while batch_size > 0:
@@ -232,8 +232,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids)

seq_data = {}
seq_group_penalization = []
seq_data: Dict[int, SequenceData] = {}
seq_group_penalization: List[bool] = []
for _ in range(num_seqs):
num_input = random.randint(1, 100)
num_generated = 0 if is_prompt else random.randint(1, 100)
@@ -392,17 +392,16 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
else:
test_cases = [generate_test_case()]

def run_test_case(*,
expected_penalization=None,
seq_group_metadata_list=None):
def run_test_case(*, expected_penalization: List[bool],
seq_group_metadata_list: List[SequenceGroupMetadata]):
assert expected_penalization, \
"Invalid test case, need expected_penalization"
assert seq_group_metadata_list, \
"Invalid test case, need seq_group_metadata_list"

batch_size = 0
seq_lens = []
sampling_params_per_row = []
seq_lens: List[int] = []
sampling_params_per_row: List[SamplingParams] = []
for sgm in seq_group_metadata_list:
sampling_params = sgm.sampling_params

@@ -472,15 +471,15 @@ def test_sampler_mixed(seed: int, device: str):
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler = _prepare_test(batch_size)

seq_group_metadata_list = []
seq_group_metadata_list: List[SequenceGroupMetadata] = []
expected_tokens: List[Optional[List[int]]] = []
seq_lens = []
seq_lens: List[int] = []
for i in range(batch_size):
expected: Optional[List[int]] = None
sampling_type = random.randint(0, 3)
if sampling_type == 0:
sampling_params = SamplingParams(temperature=0)
expected = [torch.argmax(fake_logits[i], dim=-1).item()]
expected = [int(torch.argmax(fake_logits[i], dim=-1).item())]
elif sampling_type in (1, 2):
n = random.randint(1, 10)
sampling_params = SamplingParams(
@@ -536,15 +535,18 @@ def test_sampler_mixed(seed: int, device: str):
]
continue

expected_tokens_item = expected_tokens[i]
assert expected_tokens_item is not None

for n, nth_output in enumerate(sequence_output.samples):
if (metadata.sampling_params.temperature == 0
or metadata.sampling_params.seed is not None):
# Ensure exact matches for greedy or random with seed
assert nth_output.output_token == expected_tokens[i][n]
assert nth_output.output_token == expected_tokens_item[n]
else:
# For non-seeded random check that one of the high-logit
# tokens were chosen
assert nth_output.output_token in expected_tokens[i]
assert nth_output.output_token in expected_tokens_item

# Test batch
test_sampling()
@@ -588,8 +590,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
warpers = generation_model._get_logits_warper(generation_config)
assert len(warpers) == 2 # top_p and top_k

seq_group_metadata_list = []
seq_lens = []
seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
@@ -622,6 +624,9 @@ def test_sampler_top_k_top_p(seed: int, device: str):

with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

assert sample_probs is not None

hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
assert torch.allclose(hf_probs, sample_probs, atol=1e-5)

@ -118,16 +118,17 @@ class AsyncLLM:
|
|||
raise ValueError("The lengths of prompts and "
|
||||
"sampling_params must be the same.")
|
||||
|
||||
async def get_output(prompt, sampling_param) -> str:
|
||||
async def get_output(prompt, sampling_param) -> RequestOutput:
|
||||
request_id = random_uuid()
|
||||
results_generator = self.llm_engine.generate(
|
||||
prompt, sampling_param, request_id)
|
||||
final_output = None
|
||||
async for request_output in results_generator:
|
||||
final_output = request_output
|
||||
assert final_output is not None
|
||||
return final_output
|
||||
|
||||
outputs = []
|
||||
outputs: List[RequestOutput] = []
|
||||
try:
|
||||
for i in range(num_requests):
|
||||
prompt = prompts[i] if prompts is not None else None
|
||||
|
@ -208,8 +209,8 @@ def maybe_assert_ngram_worker(llm):
|
|||
def get_output_from_llm_generator(
|
||||
llm_generator, prompts,
|
||||
sampling_params) -> Tuple[List[str], List[List[int]]]:
|
||||
tokens = []
|
||||
token_ids = []
|
||||
tokens: List[str] = []
|
||||
token_ids: List[List[int]] = []
|
||||
for llm in llm_generator():
|
||||
maybe_assert_ngram_worker(llm)
|
||||
|
||||
|
@ -300,8 +301,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int],
|
|||
nvmlInit()
|
||||
start_time = time.time()
|
||||
while True:
|
||||
output = {}
|
||||
output_raw = {}
|
||||
output: Dict[int, str] = {}
|
||||
output_raw: Dict[int, float] = {}
|
||||
for device in devices:
|
||||
dev_handle = nvmlDeviceGetHandleByIndex(device)
|
||||
mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
@ -38,14 +40,14 @@ def test_get_token_ids_to_score(k: int):
|
|||
device='cuda',
|
||||
)
|
||||
|
||||
expected_output = [
|
||||
expected_output: List[List[int]] = [
|
||||
[],
|
||||
]
|
||||
for i in range(proposal_token_ids.shape[0]):
|
||||
expected_output.append(proposal_token_ids[:i + 1].tolist())
|
||||
|
||||
scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)
|
||||
actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access
|
||||
actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access
|
||||
|
||||
actual_output = [
|
||||
x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
import random
|
||||
from typing import Dict, List
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput
|
||||
from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
||||
from vllm.spec_decode.top1_proposer import Top1Proposer
|
||||
from vllm.worker.worker import Worker
|
||||
|
@ -210,7 +211,7 @@ def test_same_output_for_multi_step():
|
|||
|
||||
# Run single-step repeatedly.
|
||||
zero_kv_cache(worker.cache_engine)
|
||||
single_step_output = []
|
||||
single_step_output: List[SamplerOutput] = []
|
||||
continuations = [[1] for _ in prompts]
|
||||
set_random_seed(seed)
|
||||
|
||||
|
@ -232,11 +233,15 @@ def test_same_output_for_multi_step():
|
|||
continuations[i].append(seq_group_output.samples[0].output_token)
|
||||
|
||||
# Get token ids and logprobs for comparison.
|
||||
multi_step_output_logprobs = [[] for _ in prompts]
|
||||
single_step_output_logprobs = [[] for _ in prompts]
|
||||
multi_step_output_logprobs: List[List[Dict[int,
|
||||
Logprob]]] = [[]
|
||||
for _ in prompts]
|
||||
single_step_output_logprobs: List[List[Dict[int,
|
||||
Logprob]]] = [[]
|
||||
for _ in prompts]
|
||||
|
||||
multi_step_output_token_ids = [[] for _ in prompts]
|
||||
single_step_output_token_ids = [[] for _ in prompts]
|
||||
multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts]
|
||||
single_step_output_token_ids: List[List[int]] = [[] for _ in prompts]
|
||||
for i, _ in enumerate(prompts):
|
||||
for multi_step, single_step in zip(multi_step_output,
|
||||
single_step_output):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import random
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, List
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
@ -7,7 +8,7 @@ import torch
|
|||
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput
|
||||
from vllm.spec_decode.interfaces import SpeculativeProposals
|
||||
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
|
||||
SpecDecodeWorkerMetrics)
|
||||
|
@ -103,7 +104,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
|
|||
seq_group_metadata_list=seq_group_metadata_list,
|
||||
num_lookahead_slots=k))
|
||||
|
||||
seen_contexts = []
|
||||
seen_contexts: List[List[int]] = []
|
||||
|
||||
call_args_list = target_worker.execute_model.call_args_list
|
||||
assert len(call_args_list) == 1
|
||||
|
@ -116,7 +117,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
|
|||
for seq_data in seq_group_metadata.seq_data.values():
|
||||
seen_contexts.append(seq_data.get_token_ids())
|
||||
|
||||
expected_seen_contexts = []
|
||||
expected_seen_contexts: List[List[int]] = []
|
||||
|
||||
for prompt, prev_generated, draft_tokens in zip(
|
||||
prompts, prev_output_tokens, proposal_token_ids.tolist()):
|
||||
|
@ -310,8 +311,14 @@ def test_correctly_formats_output(k: int, batch_size: int):
|
|||
next(iter(seq_group_metadata.seq_data.keys()))
|
||||
for seq_group_metadata in seq_group_metadata_list
|
||||
]
|
||||
actual_output_by_seq = {seq_id: [] for seq_id in seq_ids}
|
||||
expected_output_by_seq = {seq_id: [] for seq_id in seq_ids}
|
||||
actual_output_by_seq: Dict[int, List[SequenceOutput]] = {
|
||||
seq_id: []
|
||||
for seq_id in seq_ids
|
||||
}
|
||||
expected_output_by_seq: Dict[int, List[SequenceOutput]] = {
|
||||
seq_id: []
|
||||
for seq_id in seq_ids
|
||||
}
|
||||
|
||||
for step in output:
|
||||
for seq_group in step:
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from itertools import count
|
||||
from typing import Dict, Iterable, List, Optional, Union
|
||||
from typing import Callable, Dict, List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import TypeVar, Union
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import torch
|
||||
|
@ -14,6 +16,8 @@ from vllm.utils import get_distributed_init_method, get_ip, get_open_port
|
|||
from vllm.worker.cache_engine import CacheEngine
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
T = TypeVar("T", bound=Worker)
|
||||
|
||||
|
||||
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
|
||||
return (seq_len + block_size - 1) // block_size
|
||||
|
@ -56,13 +60,13 @@ def zero_kv_cache(cache_engine: CacheEngine):
|
|||
value_blocks.zero_()
|
||||
|
||||
|
||||
def create_worker(cls: type,
|
||||
def create_worker(cls: Callable[..., T],
|
||||
model_name: str,
|
||||
block_size: int,
|
||||
num_gpu_blocks: int,
|
||||
seed: int,
|
||||
is_driver_worker: bool = True,
|
||||
enforce_eager: bool = True):
|
||||
enforce_eager: bool = True) -> T:
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
seed=seed,
|
||||
|
@ -159,8 +163,8 @@ def assert_logprobs_dict_allclose(
|
|||
|
||||
def create_sampler_output_list(
|
||||
token_ids: torch.Tensor,
|
||||
probs: Iterable[Optional[torch.Tensor]],
|
||||
logprobs: Iterable[Optional[torch.Tensor]],
|
||||
probs: GenericSequence[Optional[torch.Tensor]],
|
||||
logprobs: GenericSequence[Optional[torch.Tensor]],
|
||||
seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]:
|
||||
num_steps, batch_size = token_ids.shape
|
||||
token_ids_by_step = token_ids.tolist()
|
||||
|
|
|
@ -51,7 +51,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
|
|||
max_input_length=None,
|
||||
)
|
||||
|
||||
hashes = []
|
||||
hashes: List[List[List[int]]] = []
|
||||
|
||||
for prefix in prefixes:
|
||||
for lora_int_id in concurrent_lora_int_ids:
|
||||
|
|
|
@ -47,6 +47,7 @@ def test_default_vllm_root_logger_configuration():
|
|||
assert not logger.propagate
|
||||
|
||||
handler = logger.handlers[0]
|
||||
assert isinstance(handler, logging.StreamHandler)
|
||||
assert handler.stream == sys.stdout
|
||||
assert handler.level == logging.INFO
|
||||
|
||||
|
|
|
@ -153,8 +153,8 @@ def test_decode_sequence_logprobs(complete_sequence: str,
|
|||
# Run sequentially.
|
||||
seq = create_sequence()
|
||||
dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
|
||||
sequential_logprobs_text_chosen_token = []
|
||||
sequential_logprobs_text_other_token = []
|
||||
sequential_logprobs_text_chosen_token: List[str] = []
|
||||
sequential_logprobs_text_other_token: List[str] = []
|
||||
for new_token, logprobs in zip(complete_sequence_token_ids,
|
||||
dummy_logprobs):
|
||||
seq.append_token_id(new_token, logprobs)
|
||||
|
|
|
@ -79,7 +79,7 @@ class RemoteOpenAIServer:
|
|||
self.host = str(args.host or 'localhost')
|
||||
self.port = int(args.port)
|
||||
|
||||
self._runner = self._RemoteRunner.remote(
|
||||
self._runner = self._RemoteRunner.remote( # type: ignore
|
||||
cli_args,
|
||||
wait_url=self.url_for("health"),
|
||||
wait_timeout=self.MAX_SERVER_START_WAIT_S)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
@ -35,8 +37,8 @@ def test_prepare_prompt(batch_size):
|
|||
enable_chunked_prefill=False,
|
||||
)
|
||||
|
||||
seq_lens = []
|
||||
seq_group_metadata_list = []
|
||||
seq_lens: List[int] = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
block_tables = {0: [1]}
|
||||
for i in range(batch_size):
|
||||
# make sure all tokens fit into one block
|
||||
|
@ -151,15 +153,14 @@ def test_prepare_decode_cuda_graph(batch_size):
|
|||
enable_chunked_prefill=False,
|
||||
)
|
||||
|
||||
context_lens = []
|
||||
seq_group_metadata_list = []
|
||||
context_lens: List[int] = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
# Assume each seq group finishes prefill.
|
||||
for i in range(batch_size):
|
||||
# make sure all tokens fit into one block
|
||||
context_len = i % (model_runner.block_size - 1) + 1
|
||||
context_lens.append(context_len)
|
||||
seq_data = list(range(context_len))
|
||||
seq_data = SequenceData(seq_data)
|
||||
seq_data = SequenceData(list(range(context_len)))
|
||||
seq_data.update_num_computed_tokens(context_len)
|
||||
# Append one token ID since prefill is finished.
|
||||
seq_data.append_token_id(1, 0)
|
||||
|
@ -257,7 +258,7 @@ def test_empty_seq_group():
|
|||
dtype="float16",
|
||||
enforce_eager=False,
|
||||
)
|
||||
seq_group_metadata_list = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
model_input = model_runner._prepare_model_input(seq_group_metadata_list)
|
||||
input_tokens, input_positions, attn_metadata, slot_mapping = (
|
||||
model_input.input_tokens,
|
||||
|
@ -310,10 +311,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
|
|||
)
|
||||
|
||||
# Add prefill requests.
|
||||
seq_lens = []
|
||||
seq_group_metadata_list = []
|
||||
prefill_metadata_list = []
|
||||
decode_metadata_list = []
|
||||
seq_lens: List[int] = []
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
prefill_metadata_list: List[SequenceGroupMetadata] = []
|
||||
decode_metadata_list: List[SequenceGroupMetadata] = []
|
||||
block_tables = {0: [1]}
|
||||
prefill_batch_size = batch_size // 2
|
||||
decode_batch_size = batch_size - prefill_batch_size
|
||||
|
|
|
@ -245,7 +245,7 @@ def _make_alibi_bias(
|
|||
dtype: torch.dtype,
|
||||
seq_lens: List[int],
|
||||
) -> List[torch.Tensor]:
|
||||
attn_biases = []
|
||||
attn_biases: List[torch.Tensor] = []
|
||||
for seq_len in seq_lens:
|
||||
bias = torch.arange(seq_len, dtype=dtype)
|
||||
# NOTE(zhuohan): HF uses
|
||||
|
@ -271,7 +271,7 @@ def _make_sliding_window_bias(
|
|||
window_size: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
) -> List[torch.Tensor]:
|
||||
attn_biases = []
|
||||
attn_biases: List[torch.Tensor] = []
|
||||
for seq_len in seq_lens:
|
||||
tensor = torch.full(
|
||||
(1, seq_len, seq_len),
|
||||
|
|
|
@ -431,8 +431,8 @@ def _make_alibi_bias(
|
|||
num_kv_heads: int,
|
||||
dtype: torch.dtype,
|
||||
seq_lens: List[int],
|
||||
) -> LowerTriangularMaskWithTensorBias:
|
||||
attn_biases = []
|
||||
) -> List[AttentionBias]:
|
||||
attn_biases: List[AttentionBias] = []
|
||||
for seq_len in seq_lens:
|
||||
bias = torch.arange(seq_len, dtype=dtype)
|
||||
# NOTE(zhuohan): HF uses
|
||||
|
|
|
@ -252,7 +252,7 @@ class BlockTable:
|
|||
def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block],
|
||||
token_ids: List[int],
|
||||
device: Device) -> List[Block]:
|
||||
blocks = []
|
||||
blocks: List[Block] = []
|
||||
for block_token_ids in chunk_list(token_ids, self._block_size):
|
||||
if len(block_token_ids) == self._block_size:
|
||||
# If the block is full, create an immutable block.
|
||||
|
|
|
@ -111,7 +111,7 @@ class NaiveBlockAllocator(BlockAllocator):
|
|||
"""
|
||||
source_blocks = get_all_blocks_recursively(last_block)
|
||||
|
||||
forked_blocks = []
|
||||
forked_blocks: List[Block] = []
|
||||
prev_block = None
|
||||
for block in source_blocks:
|
||||
|
||||
|
|
|
@ -271,7 +271,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
|
|||
"""
|
||||
source_blocks = get_all_blocks_recursively(last_block)
|
||||
|
||||
forked_blocks = []
|
||||
forked_blocks: List[Block] = []
|
||||
prev_block = None
|
||||
for block in source_blocks:
|
||||
refcount = self._refcounter.incr(block.block_id)
|
||||
|
|
|
@ -260,7 +260,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
|
|||
# at max extend.
|
||||
if self.enable_caching:
|
||||
block_table = self.block_tables[seq.seq_id]
|
||||
block_ids = []
|
||||
block_ids: List[Optional[int]] = []
|
||||
for block_id in block_table.physical_block_ids:
|
||||
block_ids.append(block_id)
|
||||
self.block_allocator.mark_blocks_as_accessed(
|
||||
|
|
|
@ -2,7 +2,7 @@ import ctypes
|
|||
import json
|
||||
import os
|
||||
from itertools import product
|
||||
from typing import Dict, Optional, Sequence
|
||||
from typing import Dict, List, Optional, Sequence
|
||||
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
@ -88,7 +88,7 @@ def consumer(batch_tgt: Sequence[int],
|
|||
def can_actually_p2p(
|
||||
batch_src: Sequence[int],
|
||||
batch_tgt: Sequence[int],
|
||||
):
|
||||
) -> Sequence[bool]:
|
||||
"""
|
||||
Usually, checking if P2P access is enabled can be done by
|
||||
`torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
|
||||
|
@ -138,7 +138,7 @@ def can_actually_p2p(
|
|||
p_tgt.start()
|
||||
p_src.join()
|
||||
p_tgt.join()
|
||||
result = []
|
||||
result: List[bool] = []
|
||||
for src, tgt in zip(batch_src, batch_tgt):
|
||||
a = result_queue.get()
|
||||
b = result_queue.get()
|
||||
|
@ -188,7 +188,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
|
|||
# only the local master process (with local_rank == 0) can
|
||||
# enter this block to calculate the cache
|
||||
logger.info("generating GPU P2P access cache in %s", path)
|
||||
cache = {}
|
||||
cache: Dict[str, bool] = {}
|
||||
ids = list(range(num_dev))
|
||||
# batch of all pairs of GPUs
|
||||
batch_src, batch_tgt = zip(*list(product(ids, ids)))
|
||||
|
|
|
@ -205,7 +205,7 @@ class NCCLLibrary:
|
|||
raise e
|
||||
|
||||
if so_file not in NCCLLibrary.path_to_dict_mapping:
|
||||
_funcs = {}
|
||||
_funcs: Dict[str, Any] = {}
|
||||
for func in NCCLLibrary.exported_functions:
|
||||
f = getattr(self.lib, func.name)
|
||||
f.restype = func.restype
|
||||
|
|
|
@ -2,7 +2,7 @@ import time
|
|||
from contextlib import contextmanager
|
||||
from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Type, TypeVar, Union
|
||||
from typing import Set, Type, TypeVar, Union
|
||||
|
||||
from transformers import GenerationConfig, PreTrainedTokenizer
|
||||
|
||||
|
@ -973,7 +973,7 @@ class LLMEngine:
|
|||
def remove_lora(self, lora_id: int) -> bool:
|
||||
return self.model_executor.remove_lora(lora_id)
|
||||
|
||||
def list_loras(self) -> List[int]:
|
||||
def list_loras(self) -> Set[int]:
|
||||
return self.model_executor.list_loras()
|
||||
|
||||
def check_health(self) -> None:
|
||||
|
|
|
@ -144,7 +144,7 @@ class Metrics:
|
|||
# end-metrics-definitions
|
||||
|
||||
|
||||
def build_1_2_5_buckets(max_value: int):
|
||||
def build_1_2_5_buckets(max_value: int) -> List[int]:
|
||||
"""
|
||||
Builds a list of buckets with increasing powers of 10 multiplied by
|
||||
mantissa values (1, 2, 5) until the value exceeds the specified maximum.
|
||||
|
@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int):
|
|||
"""
|
||||
mantissa_lst = [1, 2, 5]
|
||||
exponent = 0
|
||||
buckets = []
|
||||
buckets: List[int] = []
|
||||
while True:
|
||||
for m in mantissa_lst:
|
||||
value = m * 10**exponent
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Dict, List, Tuple, Union
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from vllm.config import SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
|
@ -146,8 +146,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
|
|||
|
||||
# Beam search case
|
||||
# Select the child sequences to keep in the sequence group.
|
||||
selected_child_seqs = []
|
||||
unselected_child_seqs = []
|
||||
selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
|
||||
unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
|
||||
beam_width = seq_group.sampling_params.best_of
|
||||
length_penalty = seq_group.sampling_params.length_penalty
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ import argparse
|
|||
import asyncio
|
||||
import sys
|
||||
from io import StringIO
|
||||
from typing import Awaitable, List
|
||||
|
||||
import aiohttp
|
||||
|
||||
|
@ -114,7 +115,7 @@ async def main(args):
|
|||
)
|
||||
|
||||
# Submit all requests in the file to the engine "concurrently".
|
||||
response_futures = []
|
||||
response_futures: List[Awaitable[BatchRequestOutput]] = []
|
||||
for request_json in (await read_file(args.input_file)).strip().split("\n"):
|
||||
request = BatchRequestInput.model_validate_json(request_json)
|
||||
response_futures.append(run_request(openai_serving_chat, request))
|
||||
|
|
|
@ -487,7 +487,7 @@ class OpenAIServingChat(OpenAIServing):
|
|||
final_res = res
|
||||
assert final_res is not None
|
||||
|
||||
choices = []
|
||||
choices: List[ChatCompletionResponseChoice] = []
|
||||
|
||||
role = self.get_chat_request_role(request)
|
||||
for output in final_res.outputs:
|
||||
|
|
|
@ -25,7 +25,7 @@ def request_output_to_embedding_response(
|
|||
created_time: int,
|
||||
model_name: str,
|
||||
) -> EmbeddingResponse:
|
||||
data = []
|
||||
data: List[EmbeddingResponseData] = []
|
||||
num_prompt_tokens = 0
|
||||
for idx, final_res in enumerate(final_res_batch):
|
||||
assert final_res is not None
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
|
||||
import torch
|
||||
|
||||
|
@ -120,7 +121,7 @@ class PackedLoRALayerWeights(LoRALayerWeights):
|
|||
|
||||
@classmethod
|
||||
def pack(
|
||||
cls, loras: List[Optional["LoRALayerWeights"]]
|
||||
cls, loras: GenericSequence[Optional["LoRALayerWeights"]]
|
||||
) -> "PackedLoRALayerWeights":
|
||||
"""Pack a list of LoRAs into a single LoRA.
|
||||
|
||||
|
|
|
@ -165,7 +165,7 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
|
|||
model = self._lora_manager.model
|
||||
supported_lora_modules = model.supported_lora_modules
|
||||
packed_modules_mapping = model.packed_modules_mapping
|
||||
expected_lora_modules = []
|
||||
expected_lora_modules: List[str] = []
|
||||
for module in supported_lora_modules:
|
||||
if module in packed_modules_mapping:
|
||||
expected_lora_modules.extend(
|
||||
|
|
|
@ -393,7 +393,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
|
|||
param_data.copy_(loaded_weight)
|
||||
return
|
||||
current_shard_offset = 0
|
||||
shard_offsets = []
|
||||
shard_offsets: List[Tuple[int, int, int]] = []
|
||||
for i, output_size in enumerate(self.output_sizes):
|
||||
shard_offsets.append((i, current_shard_offset, output_size))
|
||||
current_shard_offset += output_size
|
||||
|
|
|
@ -25,24 +25,25 @@ GPTQ_MARLIN_SUPPORTED_SYM = [True]
|
|||
|
||||
|
||||
# Permutations for Marlin scale shuffling
|
||||
def get_scale_perms(num_bits):
|
||||
scale_perm = []
|
||||
def get_scale_perms(num_bits: int):
|
||||
scale_perm: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm.extend([i + 8 * j for j in range(8)])
|
||||
scale_perm_single = []
|
||||
scale_perm_single: List[int] = []
|
||||
for i in range(4):
|
||||
scale_perm_single.extend(
|
||||
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
|
||||
return scale_perm, scale_perm_single
|
||||
|
||||
|
||||
def get_pack_factor(num_bits):
|
||||
def get_pack_factor(num_bits: int):
|
||||
assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
|
||||
), f"Unsupported num_bits = {num_bits}"
|
||||
return 32 // num_bits
|
||||
|
||||
|
||||
def marlin_permute_scales(s, size_k, size_n, group_size, num_bits):
|
||||
def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
|
||||
group_size: int, num_bits: int):
|
||||
scale_perm, scale_perm_single = get_scale_perms(num_bits)
|
||||
if group_size < size_k and group_size != -1:
|
||||
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
"""This file is used for /tests and /benchmarks"""
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
|
@ -11,10 +13,10 @@ import torch
|
|||
#
|
||||
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
|
||||
# (without the need to use ldmatrix instructions) # noqa: E501
|
||||
def get_perms_24(num_bits):
|
||||
perm_list = []
|
||||
def get_perms_24(num_bits: int):
|
||||
perm_list: List[int] = []
|
||||
for i in range(32):
|
||||
perm1 = []
|
||||
perm1: List[int] = []
|
||||
col = i // 4
|
||||
col_o = col // 2
|
||||
for block in [0, 1]:
|
||||
|
@ -39,18 +41,18 @@ def get_perms_24(num_bits):
|
|||
|
||||
perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
|
||||
perm = torch.from_numpy(perm)
|
||||
scale_perm = []
|
||||
scale_perm: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
|
||||
scale_perm_single = []
|
||||
scale_perm_single: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
|
||||
return perm, scale_perm, scale_perm_single
|
||||
|
||||
|
||||
marlin_24_perm = {}
|
||||
marlin_24_scale_perm = {}
|
||||
marlin_24_scale_perm_single = {}
|
||||
marlin_24_perm: Dict[int, torch.Tensor] = {}
|
||||
marlin_24_scale_perm: Dict[int, List[int]] = {}
|
||||
marlin_24_scale_perm_single: Dict[int, List[int]] = {}
|
||||
for num_bits in [4, 8]:
|
||||
perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
|
||||
marlin_24_perm[num_bits] = perm_24
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
"""This file is used for /tests and /benchmarks"""
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
|
@ -11,10 +13,10 @@ import torch
|
|||
#
|
||||
# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
|
||||
# (without the need to use ldmatrix instructions) # noqa: E501
|
||||
def get_perms(num_bits):
|
||||
perm_list = []
|
||||
def get_perms(num_bits: int):
|
||||
perm_list: List[int] = []
|
||||
for i in range(32):
|
||||
perm1 = []
|
||||
perm1: List[int] = []
|
||||
col = i // 4
|
||||
for block in [0, 1]:
|
||||
for row in [
|
||||
|
@ -38,19 +40,19 @@ def get_perms(num_bits):
|
|||
|
||||
perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
|
||||
perm = torch.from_numpy(perm)
|
||||
scale_perm = []
|
||||
scale_perm: List[int] = []
|
||||
for i in range(8):
|
||||
scale_perm.extend([i + 8 * j for j in range(8)])
|
||||
scale_perm_single = []
|
||||
scale_perm_single: List[int] = []
|
||||
for i in range(4):
|
||||
scale_perm_single.extend(
|
||||
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
|
||||
return perm, scale_perm, scale_perm_single
|
||||
|
||||
|
||||
marlin_perm = {}
|
||||
marlin_scale_perm = {}
|
||||
marlin_scale_perm_single = {}
|
||||
marlin_perm: Dict[int, torch.Tensor] = {}
|
||||
marlin_scale_perm: Dict[int, List[int]] = {}
|
||||
marlin_scale_perm_single: Dict[int, List[int]] = {}
|
||||
for num_bits in [4, 8]:
|
||||
perm, scale_perm, scale_perm_single = get_perms(num_bits)
|
||||
marlin_perm[num_bits] = perm
|
||||
|
|
|
@ -174,7 +174,7 @@ def _apply_min_tokens_penalty(
|
|||
min_tokens = sampling_params.min_tokens
|
||||
token_ids_to_penalize = sampling_params.all_stop_token_ids
|
||||
if min_tokens > 0 and token_ids_to_penalize:
|
||||
seqs_to_penalize = []
|
||||
seqs_to_penalize: List[int] = []
|
||||
for j, seq_id in enumerate(seq_ids):
|
||||
seq_data = seq_group.seq_data[seq_id]
|
||||
if len(seq_data.output_token_ids) < min_tokens:
|
||||
|
@ -285,7 +285,7 @@ def _greedy_sample(
|
|||
same as the length of selected_seq_groups. If the corresponding
|
||||
seq_group has do_sample=False, tuple contains ([], [])
|
||||
"""
|
||||
samples = samples.tolist()
|
||||
samples_lst = samples.tolist()
|
||||
sample_idx = 0
|
||||
results: SampleResultType = []
|
||||
for seq_group in selected_seq_groups:
|
||||
|
@ -298,7 +298,7 @@ def _greedy_sample(
|
|||
assert num_parent_seqs == 1, (
|
||||
"Greedy sampling should have only one seq.")
|
||||
parent_ids = list(range(num_parent_seqs))
|
||||
next_token_ids = [samples[sample_idx]]
|
||||
next_token_ids = [samples_lst[sample_idx]]
|
||||
results.append((next_token_ids, parent_ids))
|
||||
sample_idx += num_parent_seqs
|
||||
return results
|
||||
|
@ -394,7 +394,7 @@ def _beam_search_sample(
|
|||
next_token_ids = next_token_ids.tolist()
|
||||
else:
|
||||
# Generation phase.
|
||||
cumulative_logprobs: List[int] = [
|
||||
cumulative_logprobs: List[float] = [
|
||||
seq_group.seq_data[seq_id].cumulative_logprob
|
||||
for seq_id in seq_ids
|
||||
]
|
||||
|
@ -466,8 +466,9 @@ def _sample_with_torch(
|
|||
categorized_seq_group_ids[sampling_type].append(i)
|
||||
|
||||
sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
|
||||
sample_metadata = {}
|
||||
multinomial_samples = {}
|
||||
sample_metadata: Dict[SamplingType,
|
||||
Tuple[List[int], List[SequenceGroupToSample]]] = {}
|
||||
multinomial_samples: Dict[SamplingType, torch.Tensor] = {}
|
||||
|
||||
# Create output tensor for sampled token ids.
|
||||
if include_gpu_probs_tensor:
|
||||
|
@ -494,7 +495,7 @@ def _sample_with_torch(
|
|||
greedy_samples = torch.argmax(logprobs[long_sample_indices],
|
||||
dim=-1)
|
||||
|
||||
if include_gpu_probs_tensor:
|
||||
if sampled_token_ids_tensor is not None:
|
||||
# Store sampled tokens in output tensor.
|
||||
sampled_token_ids_tensor[
|
||||
long_sample_indices] = greedy_samples.unsqueeze(-1)
|
||||
|
@ -522,7 +523,7 @@ def _sample_with_torch(
|
|||
probs[long_sample_indices], max_best_of_in_batch,
|
||||
**seeded_args)
|
||||
|
||||
if include_gpu_probs_tensor:
|
||||
if sampled_token_ids_tensor is not None:
|
||||
# Store sampled tokens in output tensor.
|
||||
sampled_token_ids_tensor[
|
||||
long_sample_indices] = multinomial_samples[sampling_type]
|
||||
|
@ -571,7 +572,9 @@ def _sample_with_triton_kernel(
|
|||
categorized_seq_group_ids[sampling_type].append(i)
|
||||
|
||||
sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
|
||||
sample_metadata = {}
|
||||
sample_metadata: Dict[SamplingType,
|
||||
Tuple[List[int], List[SequenceGroupToSample],
|
||||
torch.Tensor, torch.Tensor]] = {}
|
||||
max_best_of_in_batch = 1
|
||||
|
||||
# Counterintiutively, having two loops here is actually faster.
|
||||
|
@ -1008,14 +1011,14 @@ def _build_sampler_output(
|
|||
speculative decoding rejection sampling.
|
||||
"""
|
||||
|
||||
sampler_output = []
|
||||
sampler_output: List[CompletionSequenceGroupOutput] = []
|
||||
for (seq_group, sample_result, group_prompt_logprobs,
|
||||
group_sample_logprobs) in zip(sampling_metadata.seq_groups,
|
||||
sample_results, prompt_logprobs,
|
||||
sample_logprobs):
|
||||
seq_ids = seq_group.seq_ids
|
||||
next_token_ids, parent_ids = sample_result
|
||||
seq_outputs = []
|
||||
seq_outputs: List[SequenceOutput] = []
|
||||
for parent_id, next_token_id, logprobs in zip(parent_ids,
|
||||
next_token_ids,
|
||||
group_sample_logprobs):
|
||||
|
|
|
@ -68,7 +68,7 @@ def _get_model_initialization_kwargs(
|
|||
vision_language_config: Optional[VisionLanguageConfig]
|
||||
) -> Dict[str, Any]:
|
||||
"""Get extra kwargs for model initialization."""
|
||||
extra_kwargs = {}
|
||||
extra_kwargs: Dict[str, Any] = {}
|
||||
if hasattr(model_class, "supported_lora_modules"):
|
||||
extra_kwargs["lora_config"] = lora_config
|
||||
elif lora_config:
|
||||
|
@ -446,7 +446,8 @@ class ShardedStateLoader(BaseModelLoader):
|
|||
Filter out all tensors that share the same memory or a subset of the
|
||||
memory of another tensor.
|
||||
"""
|
||||
same_storage_groups = collections.defaultdict(list)
|
||||
same_storage_groups: Dict[Any, List[Tuple[
|
||||
str, torch.Tensor]]] = collections.defaultdict(list)
|
||||
for key, tensor in tensors.items():
|
||||
if tensor.numel():
|
||||
ptr = tensor.untyped_storage().data_ptr()
|
||||
|
@ -455,7 +456,7 @@ class ShardedStateLoader(BaseModelLoader):
|
|||
def get_end_ptr(tensor: torch.Tensor) -> int:
|
||||
return tensor.view(-1)[-1].data_ptr() + tensor.element_size()
|
||||
|
||||
result = {}
|
||||
result: Dict[str, torch.Tensor] = {}
|
||||
for group in same_storage_groups.values():
|
||||
for k, t in group:
|
||||
a, b = t.data_ptr(), get_end_ptr(t)
|
||||
|
|
|
@ -329,7 +329,7 @@ def np_cache_weights_iterator(
|
|||
# dumping the same model weights to numpy at the same time.
|
||||
with get_lock(model_name_or_path, cache_dir):
|
||||
if not os.path.exists(weight_names_file):
|
||||
weight_names = []
|
||||
weight_names: List[str] = []
|
||||
for bin_file in hf_weights_files:
|
||||
state = torch.load(bin_file, map_location="cpu")
|
||||
for name, param in state.items():
|
||||
|
|
|
@ -72,11 +72,11 @@ _MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS}
|
|||
_OOT_MODELS: Dict[str, Type[nn.Module]] = {}
|
||||
|
||||
# Models not supported by ROCm.
|
||||
_ROCM_UNSUPPORTED_MODELS = []
|
||||
_ROCM_UNSUPPORTED_MODELS: List[str] = []
|
||||
|
||||
# Models partially supported by ROCm.
|
||||
# Architecture -> Reason.
|
||||
_ROCM_PARTIALLY_SUPPORTED_MODELS = {
|
||||
_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
|
||||
"Qwen2ForCausalLM":
|
||||
"Sliding window attention is not yet supported in ROCm's flash attention",
|
||||
"MistralForCausalLM":
|
||||
|
|
|
@ -453,8 +453,8 @@ class ArcticForCausalLM(nn.Module):
|
|||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
|
||||
mlp_params_mapping = []
|
||||
expert_params_mapping = []
|
||||
mlp_params_mapping: List[Tuple[str, str, int]] = []
|
||||
expert_params_mapping: List[Tuple[str, str, int]] = []
|
||||
num_layers = self.config.num_hidden_layers
|
||||
|
||||
for layer in range(num_layers):
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
# This file is based on the LLama model definition file in transformers
|
||||
"""PyTorch Cohere model."""
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
from typing import Iterable, List, Optional, Set, Tuple
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
|
@ -352,7 +352,7 @@ class CohereForCausalLM(nn.Module):
|
|||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params = set()
|
||||
loaded_params: Set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for param_name, shard_name, shard_id in stacked_params_mapping:
|
||||
if shard_name not in name:
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
# limitations under the License.
|
||||
"""Inference-only Gemma model compatible with HuggingFace weights."""
|
||||
from functools import lru_cache
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
from typing import Iterable, List, Optional, Set, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
@ -363,7 +363,7 @@ class GemmaForCausalLM(nn.Module):
|
|||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params = set()
|
||||
loaded_params: Set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, shard_name, shard_id) in stacked_params_mapping:
|
||||
if shard_name not in name:
|
||||
|
|
|
@ -123,7 +123,7 @@ class SequenceData:
|
|||
output_token_ids = []
|
||||
|
||||
self.prompt_token_ids = prompt_token_ids
|
||||
self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids)
|
||||
self._prompt_token_ids_tuple = tuple(prompt_token_ids)
|
||||
self.output_token_ids = output_token_ids
|
||||
self.cumulative_logprob = 0.0
|
||||
# The number of tokens that are computed (that run against the model).
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import copy
|
||||
import weakref
|
||||
from typing import List, Tuple
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
|
||||
from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
|
||||
SequenceGroupMetadata)
|
||||
from vllm.spec_decode.interfaces import SpeculativeProposals
|
||||
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
|
||||
|
@ -71,7 +71,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
|
|||
sample_len)
|
||||
|
||||
# Run model sample_len times.
|
||||
model_outputs = []
|
||||
model_outputs: List[SamplerOutput] = []
|
||||
for _ in range(sample_len):
|
||||
model_output = super().execute_model(
|
||||
execute_model_req=copied_execute_model_req)
|
||||
|
@ -132,7 +132,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
|
|||
|
||||
# Shallow-copy the list of SequenceGroupMetadata. This allows us to
|
||||
# append tokens and change is_prompt without external side-effects.
|
||||
new_seq_group_metadata_list = []
|
||||
new_seq_group_metadata_list: List[SequenceGroupMetadata] = []
|
||||
|
||||
for old_seq_group_metadata in seq_group_metadata_list:
|
||||
# We must shallow-copy seq_group_metadata as is_prompt could change.
|
||||
|
@ -140,7 +140,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
|
|||
new_seq_group_metadata_list.append(seq_group_metadata)
|
||||
|
||||
# We must shallow-copy seq_data as we will append token ids
|
||||
new_seq_data = {}
|
||||
new_seq_data: Dict[int, SequenceData] = {}
|
||||
for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
|
||||
new_seq_data[seq_id] = copy.copy(old_seq_data)
|
||||
new_seq_data[
|
||||
|
|
|
@ -48,7 +48,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
|
|||
self,
|
||||
execute_model_req: ExecuteModelRequest,
|
||||
sample_len: int,
|
||||
) -> Tuple[Optional[List[SamplerOutput]], bool]:
|
||||
) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]:
|
||||
"""NGram match algo to pick proposal candidate. Returns the list of
|
||||
sampler output, one per SequenceGroupMetadata.
|
||||
|
||||
|
@ -58,8 +58,8 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
|
|||
self._raise_if_unsupported(execute_model_req)
|
||||
|
||||
has_spec_out = False
|
||||
token_id_list = []
|
||||
token_prob_list = []
|
||||
token_id_list: List[Optional[torch.Tensor]] = []
|
||||
token_prob_list: List[Optional[torch.Tensor]] = []
|
||||
for idx, seq_group_metadata in enumerate(
|
||||
execute_model_req.seq_group_metadata_list):
|
||||
seq_data = next(iter(seq_group_metadata.seq_data.values()))
|
||||
|
|
|
@ -7,8 +7,8 @@ from vllm.config import SpeculativeConfig
|
|||
from vllm.distributed.communication_op import broadcast_tensor_dict
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
|
||||
SequenceGroupMetadata)
|
||||
from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
|
||||
SamplerOutput, SequenceGroupMetadata)
|
||||
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
|
||||
from vllm.spec_decode.interfaces import (SpeculativeProposals,
|
||||
SpeculativeScorer, SpeculativeScores)
|
||||
|
@ -516,13 +516,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
|
|||
topk_indices_by_step = topk_indices_by_step.tolist()
|
||||
|
||||
# Construct the output on a per-step, per-sequence basis.
|
||||
sampler_output_list = []
|
||||
sampler_output_list: List[SamplerOutput] = []
|
||||
for step_index in range(num_steps):
|
||||
if all(token_id == -1
|
||||
for token_id in accepted_token_ids_by_step[step_index]):
|
||||
break
|
||||
|
||||
step_output_token_ids = []
|
||||
step_output_token_ids: List[CompletionSequenceGroupOutput] = []
|
||||
for sequence_index in range(batch_size):
|
||||
# Each sequence may have a different num_logprobs; retrieve it.
|
||||
num_logprobs = num_logprobs_per_seq[sequence_index]
|
||||
|
|
|
@ -26,10 +26,10 @@ def get_all_num_logprobs(
|
|||
sequence.
|
||||
"""
|
||||
|
||||
all_num_logprobs = []
|
||||
all_num_logprobs: List[int] = []
|
||||
for seq_group_metadata in seq_group_metadata_list:
|
||||
num_logprobs = seq_group_metadata.sampling_params.logprobs
|
||||
if seq_group_metadata.sampling_params.logprobs is None:
|
||||
if num_logprobs is None:
|
||||
num_logprobs = 0
|
||||
all_num_logprobs.append(num_logprobs)
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ class Detokenizer:
|
|||
read_offset = 0
|
||||
next_iter_prefix_offset = 0
|
||||
next_iter_read_offset = 0
|
||||
next_iter_tokens = []
|
||||
next_iter_tokens: List[str] = []
|
||||
prev_tokens = None
|
||||
|
||||
for token_position, prompt_logprobs_for_token in enumerate(
|
||||
|
|
|
@ -20,12 +20,13 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
|
|||
import numpy as np
|
||||
import psutil
|
||||
import torch
|
||||
import torch.types
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import enable_trace_function_call, init_logger
|
||||
|
||||
T = TypeVar("T")
|
||||
logger = init_logger(__name__)
|
||||
|
||||
STR_DTYPE_TO_TORCH_DTYPE = {
|
||||
|
@ -37,6 +38,10 @@ STR_DTYPE_TO_TORCH_DTYPE = {
|
|||
"fp8_e5m2": torch.uint8,
|
||||
}
|
||||
|
||||
P = ParamSpec('P')
|
||||
K = TypeVar("K")
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class Device(enum.Enum):
|
||||
GPU = enum.auto()
|
||||
|
@ -176,7 +181,7 @@ def random_uuid() -> str:
|
|||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def get_vllm_instance_id():
|
||||
def get_vllm_instance_id() -> str:
|
||||
"""
|
||||
If the environment variable VLLM_INSTANCE_ID is set, return it.
|
||||
Otherwise, return a random UUID.
|
||||
|
@ -192,7 +197,7 @@ def in_wsl() -> bool:
|
|||
return "microsoft" in " ".join(uname()).lower()
|
||||
|
||||
|
||||
def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
|
||||
def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]:
|
||||
"""Take a blocking function, and run it on in an executor thread.
|
||||
|
||||
This function prevents the blocking function from blocking the
|
||||
|
@ -200,7 +205,7 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
|
|||
The code in this function needs to be thread safe.
|
||||
"""
|
||||
|
||||
def _async_wrapper(*args, **kwargs) -> asyncio.Future:
|
||||
def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future:
|
||||
loop = asyncio.get_event_loop()
|
||||
p_func = partial(func, *args, **kwargs)
|
||||
return loop.run_in_executor(executor=None, func=p_func)
|
||||
|
@ -325,7 +330,7 @@ def update_environment_variables(envs: Dict[str, str]):
|
|||
os.environ[k] = v
|
||||
|
||||
|
||||
def chunk_list(lst, chunk_size):
|
||||
def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
|
||||
"""Yield successive chunk_size chunks from lst."""
|
||||
return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
|
||||
|
||||
|
@ -336,7 +341,7 @@ def cdiv(a: int, b: int) -> int:
|
|||
|
||||
|
||||
def _generate_random_fp8(
|
||||
tensor: torch.tensor,
|
||||
tensor: torch.Tensor,
|
||||
low: float,
|
||||
high: float,
|
||||
) -> None:
|
||||
|
@ -398,7 +403,10 @@ def create_kv_caches_with_random_flash(
|
|||
torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
|
||||
key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
|
||||
scale = head_size**-0.5
|
||||
key_caches, value_caches = [], []
|
||||
|
||||
key_caches: List[torch.Tensor] = []
|
||||
value_caches: List[torch.Tensor] = []
|
||||
|
||||
for _ in range(num_layers):
|
||||
key_value_cache = torch.empty(size=key_value_cache_shape,
|
||||
dtype=torch_dtype,
|
||||
|
@ -429,7 +437,7 @@ def create_kv_caches_with_random(
|
|||
scale = head_size**-0.5
|
||||
x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
|
||||
key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
|
||||
key_caches = []
|
||||
key_caches: List[torch.Tensor] = []
|
||||
for _ in range(num_layers):
|
||||
key_cache = torch.empty(size=key_cache_shape,
|
||||
dtype=torch_dtype,
|
||||
|
@ -444,7 +452,7 @@ def create_kv_caches_with_random(
|
|||
key_caches.append(key_cache)
|
||||
|
||||
value_cache_shape = (num_blocks, num_heads, head_size, block_size)
|
||||
value_caches = []
|
||||
value_caches: List[torch.Tensor] = []
|
||||
for _ in range(num_layers):
|
||||
value_cache = torch.empty(size=value_cache_shape,
|
||||
dtype=torch_dtype,
|
||||
|
@ -484,7 +492,7 @@ def is_pin_memory_available() -> bool:
|
|||
|
||||
class CudaMemoryProfiler:
|
||||
|
||||
def __init__(self, device=None):
|
||||
def __init__(self, device: Optional[torch.types.Device] = None):
|
||||
self.device = device
|
||||
|
||||
def current_memory_usage(self) -> float:
|
||||
|
@ -560,13 +568,13 @@ def get_dtype_size(dtype: torch.dtype) -> int:
|
|||
return torch.tensor([], dtype=dtype).element_size()
|
||||
|
||||
|
||||
def merge_dicts(dict1: Dict[Any, List[Any]],
|
||||
dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
|
||||
def merge_dicts(dict1: Dict[K, List[T]],
|
||||
dict2: Dict[K, List[T]]) -> Dict[K, List[T]]:
|
||||
"""Merge 2 dicts that have key -> List of items.
|
||||
|
||||
When a key conflicts, the values in dict1 is prioritized.
|
||||
"""
|
||||
merged_dict = defaultdict(list)
|
||||
merged_dict: Dict[K, List[T]] = defaultdict(list)
|
||||
|
||||
for key, value in dict1.items():
|
||||
merged_dict[key].extend(value)
|
||||
|
@ -577,7 +585,7 @@ def merge_dicts(dict1: Dict[Any, List[Any]],
|
|||
return dict(merged_dict)
|
||||
|
||||
|
||||
def init_cached_hf_modules():
|
||||
def init_cached_hf_modules() -> None:
|
||||
"""
|
||||
Lazy initialization of the Hugging Face modules.
|
||||
"""
|
||||
|
@ -613,7 +621,7 @@ def find_library(lib_name: str) -> str:
|
|||
return locs[0]
|
||||
|
||||
|
||||
def find_nccl_library():
|
||||
def find_nccl_library() -> str:
|
||||
"""
|
||||
We either use the library file specified by the `VLLM_NCCL_SO_PATH`
|
||||
environment variable, or we find the library file brought by PyTorch.
|
||||
|
|
|
@ -779,8 +779,8 @@ class ModelRunner:
|
|||
# that will have unique loras, an therefore the max amount of memory
|
||||
# consumption create dummy lora request copies from the lora request
|
||||
# passed in, which contains a lora from the lora warmup path.
|
||||
dummy_lora_requests = []
|
||||
dummy_lora_requests_per_seq = []
|
||||
dummy_lora_requests: List[LoRARequest] = []
|
||||
dummy_lora_requests_per_seq: List[LoRARequest] = []
|
||||
if self.lora_config:
|
||||
assert self.lora_manager is not None
|
||||
with self.lora_manager.dummy_lora_cache():
|
||||
|
|
|
@ -99,8 +99,8 @@ class WorkerWrapperBase:
|
|||
"""
|
||||
|
||||
def __init__(self,
|
||||
worker_module_name=None,
|
||||
worker_class_name=None,
|
||||
worker_module_name: str,
|
||||
worker_class_name: str,
|
||||
trust_remote_code: bool = False) -> None:
|
||||
self.worker_module_name = worker_module_name
|
||||
self.worker_class_name = worker_class_name
|
||||