[Misc] `compressed-tensors` code reuse (#7277)

Kyle Sayers 2024-08-13 19:05:15 -04:00 committed by GitHub
parent 33e5d7e6b6
commit 373538f973
8 changed files with 13 additions and 85 deletions

requirements-common.txt

@@ -23,3 +23,4 @@ pyzmq
 librosa # Required for audio processing
 soundfile # Required for audio processing
 gguf == 0.9.1
+compressed-tensors == 0.5.0
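For reference, the classes vLLM previously duplicated in-tree are now imported straight from the pinned package. A minimal sketch (not part of the diff, assuming `pip install compressed-tensors==0.5.0`); the enum values are the ones the removed in-tree copies defined:

    from compressed_tensors import CompressionFormat
    from compressed_tensors.quantization import (QuantizationArgs,
                                                 QuantizationStrategy,
                                                 QuantizationType)

    print(CompressionFormat.pack_quantized.value)  # "pack-quantized"
    print(QuantizationType.FLOAT.value)            # "float"
    print(QuantizationStrategy.CHANNEL.value)      # "channel"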

requirements-test.txt

@@ -17,7 +17,7 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
-compressed-tensors==0.4.0 # required for compressed-tensors
+compressed-tensors==0.5.0 # required for compressed-tensors
 timm # required for internvl test
 # TODO: Add this after fully implementing llava(mantis)

tests/quantization/test_compressed_tensors.py

@@ -5,13 +5,12 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
 
 import pytest
 import torch
+from compressed_tensors.quantization import QuantizationType
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationType)
 
 
 @pytest.mark.parametrize("model_args", [

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

@@ -1,6 +1,10 @@
 from typing import Any, Dict, List, Optional
 
 import torch
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
 from pydantic import BaseModel
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -13,8 +17,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    CompressionFormat, QuantizationArgs, QuantizationStrategy,
-    QuantizationType, find_matched_target, is_activation_quantization_format,
+    find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
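The upstream config models keep the same fields as the in-tree versions deleted in this commit, so call sites are unchanged. An illustrative sketch (not from the diff; field names taken from the removed QuantizationArgs model, values hypothetical):

    from compressed_tensors.quantization import (QuantizationArgs,
                                                 QuantizationStrategy,
                                                 QuantizationType)

    # Per-channel symmetric int8 weight quantization, expressed with the
    # upstream pydantic model rather than the removed vLLM copy.
    weight_args = QuantizationArgs(num_bits=8,
                                   type=QuantizationType.INT,
                                   symmetric=True,
                                   strategy=QuantizationStrategy.CHANNEL,
                                   dynamic=False)
    print(weight_args.strategy)  # QuantizationStrategy.CHANNEL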

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py

@@ -1,11 +1,10 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py

@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_int8_linear, convert_to_channelwise)
 from vllm.model_executor.parameter import (BasevLLMParameter,

vllm/model_executor/layers/quantization/compressed_tensors/utils.py

@@ -1,85 +1,13 @@
 import re
-from enum import Enum
-from typing import Any, Dict, Iterable, Optional
+from typing import Iterable, Optional
 
-from pydantic import BaseModel, Field
+from compressed_tensors import CompressionFormat
 from torch.nn import Module
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     FUSED_LAYER_NAME_MAPPING)
 
-
-class CompressionFormat(Enum):
-    dense = "dense"
-    sparse_bitmask = "sparse-bitmask"
-    naive_quantized = "naive-quantized"
-    float_quantized = "float-quantized"
-    int_quantized = "int-quantized"
-    pack_quantized = "pack-quantized"
-    marlin_24 = "marlin-24"
-
-
-class QuantizationType(str, Enum):
-    """
-    Enum storing quantization type options
-    """
-    INT = "int"
-    FLOAT = "float"
-
-
-class QuantizationStrategy(str, Enum):
-    """
-    Enum storing quantization strategy options
-    """
-    TENSOR = "tensor"
-    CHANNEL = "channel"
-    GROUP = "group"
-    BLOCK = "block"
-    TOKEN = "token"
-
-
-class QuantizationArgs(BaseModel):
-    """
-    User facing arguments used to define a quantization config
-    for weights or activations
-
-    :param num_bits: quantization bit depth
-    :param type: dtype to quantized to, either int or float
-    :param symmetric: whether or not quantization scale is symmetric
-    :param strategy: string determining the scope of scale/zero-point to apply
-    :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block
-        strategy, must be of the format "2x4", "8x16", etc.
-    :param dynamic: set True to perform dynamic quantization -
-        values will not be calibrated during calibration phase,
-        instead during inference new quantization ranges will be
-        observed with every sample. Defaults to False for static
-        quantization. Note that enabling dynamic quantization
-        will change the default observer to a memoryless one
-    """
-    num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
-    symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
-    dynamic: bool = False
-    observer: str = Field(
-        default="minmax",
-        description=("The class to use to compute the quantization param - "
-                     "scale and zero-point'"),
-    )
-    observer_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=
-        ("optional dict of kwargs to be passed directly to torch quantization "
-         "Observers constructor excluding quantization range or symmetry"),
-    )
-
-
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,
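The kept helper above now reads its format names from the library enum. A hedged sketch of the equivalent standalone check (the list entries after the first are assumed, since the diff cuts off mid-list):

    from compressed_tensors import CompressionFormat

    # Assumed completion for illustration: formats that imply activation
    # quantization, based on the enum values shown in the removed class.
    _ACTIVATION_QUANTIZATION_FORMATS = [
        CompressionFormat.naive_quantized.value,
        CompressionFormat.int_quantized.value,
        CompressionFormat.float_quantized.value,
    ]

    def is_activation_quantization_format(format: str) -> bool:
        return format in _ACTIVATION_QUANTIZATION_FORMATS

    print(is_activation_quantization_format("int-quantized"))   # True
    print(is_activation_quantization_format("pack-quantized"))  # False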