[Bugfix][Refactor] Unify model management in frontend #11660
Merged (+365 −307)
The diff below shows changes from 5 of the 8 commits.

Commits:
47632a7  :recycle: consolidate models in api layer (joerunde)
43a9ef2  :recycle: is supported -> is base model (joerunde)
9a44d9d  :test_tube: Test dynamic lora loading (joerunde)
5a39962  :white_check_mark: fixup tests (joerunde)
898fe8b  :recycle: Move OpenAIServingModels to separate file (joerunde)
8c1d8ff  :art: fmt (joerunde)
c12e632  :recycle: move model path classes to serving_models (joerunde)
55e774f  :art: fmt (joerunde)
File diff:
@@ -59,6 +59,7 @@
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
 from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
@@ -269,6 +270,10 @@ def base(request: Request) -> OpenAIServing:
     return tokenization(request)


+def models(request: Request) -> OpenAIServingModels:
+    return request.app.state.openai_serving_models
+
+
 def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat

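The new models(request) accessor follows the same pattern as the existing base/chat/completion helpers: the handler object is attached to app.state once when the app is built and read back on every request. A minimal, self-contained sketch of that FastAPI pattern; the class name here is an illustrative stand-in, not vLLM's code:

from fastapi import FastAPI, Request

class ModelsHandler:
    """Illustrative stand-in for the object stored on app.state."""

app = FastAPI()
# Populated once when the app is built (vLLM does this in init_app_state);
# every request then shares the same instance.
app.state.openai_serving_models = ModelsHandler()

def models(request: Request) -> ModelsHandler:
    # Per-request accessor: a plain lookup on the shared application state.
    return request.app.state.openai_serving_models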
@@ -336,10 +341,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):

 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
-    handler = base(raw_request)
+    handler = models(raw_request)

-    models = await handler.show_available_models()
-    return JSONResponse(content=models.model_dump())
+    models_ = await handler.show_available_models()
+    return JSONResponse(content=models_.model_dump())


 @router.get("/version")
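For context, OpenAIServingModels (imported above from vllm.entrypoints.openai.serving_models) is the object that /v1/models now delegates to; its definition is not part of this diff. Below is a hypothetical, simplified stand-in for that kind of consolidated registry, with invented names, only to illustrate why a single handler can list both base models and dynamically loaded adapters:

# Hypothetical sketch of a consolidated model registry (class and field
# names invented for illustration; see serving_models.py for the real one).
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class BaseModelPath:
    name: str
    model_path: str

@dataclass
class ModelRegistry:
    base_model_paths: List[BaseModelPath]
    # adapter name -> local path, for adapters loaded at runtime
    lora_adapters: Dict[str, str] = field(default_factory=dict)

    async def show_available_models(self) -> dict:
        # Base models and dynamically loaded adapters live in one place,
        # so every frontend handler sees the same view of "models".
        data = [{"id": m.name, "object": "model", "parent": None}
                for m in self.base_model_paths]
        data += [{"id": name, "object": "model",
                  "parent": self.base_model_paths[0].name}
                 for name in self.lora_adapters]
        return {"object": "list", "data": data}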
@@ -505,26 +510,22 @@ async def stop_profile(raw_request: Request):
 @router.post("/v1/load_lora_adapter")
 async def load_lora_adapter(request: LoadLoraAdapterRequest,
                             raw_request: Request):
-    for route in [chat, completion, embedding]:
-        handler = route(raw_request)
-        if handler is not None:
-            response = await handler.load_lora_adapter(request)
-            if isinstance(response, ErrorResponse):
-                return JSONResponse(content=response.model_dump(),
-                                    status_code=response.code)
+    handler = models(raw_request)
+    response = await handler.load_lora_adapter(request)
+    if isinstance(response, ErrorResponse):
+        return JSONResponse(content=response.model_dump(),
+                            status_code=response.code)
Review comment on lines +514 to +518: Note for reviewer: Here's where we no longer need to loop over all handlers to register adapters with each one.

     return Response(status_code=200, content=response)


 @router.post("/v1/unload_lora_adapter")
 async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
                               raw_request: Request):
-    for route in [chat, completion, embedding]:
-        handler = route(raw_request)
-        if handler is not None:
-            response = await handler.unload_lora_adapter(request)
-            if isinstance(response, ErrorResponse):
-                return JSONResponse(content=response.model_dump(),
-                                    status_code=response.code)
+    handler = models(raw_request)
+    response = await handler.unload_lora_adapter(request)
+    if isinstance(response, ErrorResponse):
+        return JSONResponse(content=response.model_dump(),
+                            status_code=response.code)

     return Response(status_code=200, content=response)
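As the review comment above notes, adapter registration now goes through the single models handler instead of being repeated for the chat, completion, and embedding handlers. From a client's point of view the endpoints are unchanged. A hedged usage example follows; the request body fields lora_name and lora_path, the server URL, and the requirement that runtime LoRA updating be enabled on the server are assumptions, not shown in this diff:

# Hedged client-side example: load a LoRA adapter at runtime, then confirm
# it shows up in /v1/models. Field names and the server URL are assumed.
import requests

BASE = "http://localhost:8000"

resp = requests.post(f"{BASE}/v1/load_lora_adapter",
                     json={"lora_name": "my-adapter",
                           "lora_path": "/path/to/adapter"})
resp.raise_for_status()

model_ids = [m["id"] for m in requests.get(f"{BASE}/v1/models").json()["data"]]
assert "my-adapter" in model_ids

# Unloading goes through the same consolidated handler.
requests.post(f"{BASE}/v1/unload_lora_adapter",
              json={"lora_name": "my-adapter"})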
@@ -628,13 +629,18 @@ def init_app_state(
     resolved_chat_template = load_chat_template(args.chat_template)
     logger.info("Using supplied chat template:\n%s", resolved_chat_template)

+    state.openai_serving_models = OpenAIServingModels(
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+        lora_modules=args.lora_modules,
+        prompt_adapters=args.prompt_adapters,
+    )
+    # TODO: The chat template is now broken for lora adapters :(
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         args.response_role,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -646,39 +652,36 @@ def init_app_state(
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
+        state.openai_serving_models,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
     ) if model_config.runner_type == "pooling" else None
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
     ) if model_config.task == "embed" else None
     state.openai_serving_scores = OpenAIServingScores(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger
     ) if model_config.task == "score" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
Review comment: Note for reviewers: This was the missing test case that covers dynamically loaded adapters showing up in the response from /v1/models.
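A hedged sketch of what a test along those lines might look like, written against a generic OpenAI-compatible server fixture; the fixture names, adapter path, and request fields are assumptions rather than the PR's actual test code:

# Hypothetical pytest-style sketch (not the PR's actual test): a dynamically
# loaded adapter should appear in the /v1/models response.
import requests

def test_dynamic_lora_listed_in_models(server_url: str, adapter_path: str):
    # Load an adapter at runtime through the consolidated models handler.
    resp = requests.post(
        f"{server_url}/v1/load_lora_adapter",
        json={"lora_name": "dynamic-adapter", "lora_path": adapter_path},
    )
    assert resp.status_code == 200

    # The adapter should now be listed alongside the base model.
    listed = requests.get(f"{server_url}/v1/models").json()
    model_ids = {model["id"] for model in listed["data"]}
    assert "dynamic-adapter" in model_ids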