mirror of https://github.com/vllm-project/vllm
[Frontend] OpenAI API server: Do not add bos token by default when encoding (#4688)
commit 0150a10630 (parent 8e7fb5d43a)
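Why the change matters, as a minimal sketch that is not part of this commit: most chat templates already render the BOS token into the prompt string, so encoding that string with the tokenizer default of add_special_tokens=True prepends a second BOS. The sketch below assumes a Hugging Face tokenizer whose chat template prepends BOS; the model name is illustrative only.

# Minimal sketch (not part of this commit). Assumes a Hugging Face tokenizer
# whose chat template already prepends BOS; the model name is illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
messages = [{"role": "user", "content": "Hello!"}]

# The chat template renders special tokens (including BOS) into the string.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Default encoding adds BOS again, yielding two BOS ids at the start.
double_bos = tokenizer(prompt).input_ids
# With add_special_tokens=False, only the BOS from the template remains.
single_bos = tokenizer(prompt, add_special_tokens=False).input_ids

print(double_bos[:2], single_bos[:2])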
vllm/entrypoints/openai/serving_chat.py
@@ -158,7 +158,7 @@ class OpenAIServingChat(OpenAIServing):
         try:
             # Tokenize/detokenize depending on prompt format (string/token list)
             prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt)
+                request, prompt=prompt, add_special_tokens=False)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
             decoding_config = await self.engine.get_decoding_config()

vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,7 @@
 import json
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 from pydantic import Field
 from typing_extensions import Annotated
@@ -165,13 +165,14 @@ class OpenAIServing:
         raise ValueError(f"The model `{request.model}` does not exist.")

     def _validate_prompt_and_tokenize(
-        self,
-        request: Union[ChatCompletionRequest, CompletionRequest,
-                       EmbeddingRequest],
-        prompt: Optional[str] = None,
-        prompt_ids: Optional[List[int]] = None,
-        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
-    ) -> Tuple[List[int], str]:
+            self,
+            request: Union[ChatCompletionRequest, CompletionRequest,
+                           EmbeddingRequest],
+            prompt: Optional[str] = None,
+            prompt_ids: Optional[List[int]] = None,
+            truncate_prompt_tokens: Optional[Annotated[int,
+                                                       Field(ge=1)]] = None,
+            add_special_tokens: bool = True) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
         if (prompt and prompt_ids):
@@ -179,10 +180,19 @@ class OpenAIServing:
                 "Only one of prompt or prompt_ids should be provided.")

         if prompt_ids is None:
-            tokenizer_kwargs = {} if truncate_prompt_tokens is None else {
-                "truncation": True,
-                "max_length": truncate_prompt_tokens,
+            # When using OpenAIServingChat for chat completions, the
+            # special tokens (e.g., BOS) have already been added by the
+            # chat template. Therefore, we do not need to add them again.
+            # Set add_special_tokens to False to avoid adding the BOS tokens
+            # again.
+            tokenizer_kwargs: Dict[str, Any] = {
+                "add_special_tokens": add_special_tokens
             }
+            if truncate_prompt_tokens is not None:
+                tokenizer_kwargs.update({
+                    "truncation": True,
+                    "max_length": truncate_prompt_tokens,
+                })
             input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
         elif truncate_prompt_tokens is not None:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
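For reference, a standalone sketch of the tokenizer-kwargs logic introduced above; tokenize_prompt is a hypothetical helper used only for illustration and is not part of vLLM:

# Hypothetical standalone helper mirroring the kwargs construction above;
# not part of vLLM, shown only to illustrate the new behaviour.
from typing import Any, Dict, List, Optional


def tokenize_prompt(tokenizer,
                    prompt: str,
                    truncate_prompt_tokens: Optional[int] = None,
                    add_special_tokens: bool = True) -> List[int]:
    # Chat requests pass add_special_tokens=False because the chat template
    # already inserted BOS; other requests keep the tokenizer default.
    tokenizer_kwargs: Dict[str, Any] = {
        "add_special_tokens": add_special_tokens
    }
    if truncate_prompt_tokens is not None:
        tokenizer_kwargs.update({
            "truncation": True,
            "max_length": truncate_prompt_tokens,
        })
    return tokenizer(prompt, **tokenizer_kwargs).input_ids

In the server itself, the chat-completion path now calls _validate_prompt_and_tokenize with add_special_tokens=False (the chat template already supplied BOS), while the completion and embedding paths keep the default of True.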