Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"add support for vllm api stop parameter" #3527

Merged
merged 6 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/llmtuner/api/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ async def create_chat_completion_response(
top_p=request.top_p,
max_new_tokens=request.max_tokens,
num_return_sequences=request.n,
stop=request.stop
)

prompt_length, response_length = 0, 0
Expand Down Expand Up @@ -155,6 +156,7 @@ async def create_stream_chat_completion_response(
temperature=request.temperature,
top_p=request.top_p,
max_new_tokens=request.max_tokens,
stop=request.stop
):
if len(new_token) != 0:
yield _create_stream_chat_completion_chunk(
Expand Down
3 changes: 2 additions & 1 deletion src/llmtuner/api/protocol.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import time
from enum import Enum, unique
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field
from typing_extensions import Literal
Expand Down Expand Up @@ -78,6 +78,7 @@ class ChatCompletionRequest(BaseModel):
n: int = 1
max_tokens: Optional[int] = None
stream: bool = False
stop: Union[Optional[str], List[str]] = None


class ChatCompletionResponseChoice(BaseModel):
Expand Down
3 changes: 3 additions & 0 deletions src/llmtuner/chat/vllm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ async def _generate(
repetition_penalty = input_kwargs.pop("repetition_penalty", None)
max_length = input_kwargs.pop("max_length", None)
max_new_tokens = input_kwargs.pop("max_new_tokens", None)
stop = input_kwargs.pop("stop", None)

generating_args = self.generating_args.copy()
generating_args.update(
Expand All @@ -105,6 +106,7 @@ async def _generate(
top_k=top_k or generating_args["top_k"],
num_return_sequences=num_return_sequences or 1,
repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
stop=stop or generating_args["stop"]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need to insert `stop` into `generating_args`.

)
)

Expand All @@ -125,6 +127,7 @@ async def _generate(
stop_token_ids=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
max_tokens=generating_args["max_new_tokens"],
skip_special_tokens=True,
stop=generating_args["stop"],
)

if self.processor is not None and image is not None:
Expand Down
7 changes: 5 additions & 2 deletions src/llmtuner/hparams/generating_args.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict
from typing import Any, Dict, Union, Optional, List


@dataclass
Expand Down Expand Up @@ -46,7 +46,10 @@ class GeneratingArguments:
default=1.0,
metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
)

stop: Union[Optional[str], List[str]] = field(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Every argument in `GeneratingArguments` should also exist in transformers' `GenerationConfig` (https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/text_generation#transformers.GenerationConfig), but `stop` currently does not, so we will delete this field.

default=None,
metadata={"help": "List of strings or string that stop the generation when they are generated. The returned output will not contain the stop strings."},
)
def to_dict(self) -> Dict[str, Any]:
args = asdict(self)
if args.get("max_new_tokens", -1) > 0:
Expand Down