From e3072104fad7c9c5384a6fb2de983d5974751cd0 Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Wed, 8 Jan 2025 15:50:38 -0700
Subject: [PATCH] :test_tube: stress test dynamic loras

Signed-off-by: Joe Runde
---
 .../entrypoints/openai/test_lora_adapters.py | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 72bc9708eb85a..46a064f6d9e68 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -167,6 +167,39 @@ async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI,
                       })
 
 
+@pytest.mark.asyncio
+async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
+                                      zephyr_lora_files):
+    """Validate that many LoRA adapters can be dynamically registered
+    and used for inference concurrently"""
+
+    # This test file configures the server with --max-cpu-loras=2 and this test
+    # will concurrently load 10 adapters, so it should flex the LRU cache
+    async def load_and_run_adapter(adapter_name: str):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": adapter_name,
+                              "lora_path": str(zephyr_lora_files)
+                          })
+        for _ in range(3):
+            await client.completions.create(
+                model=adapter_name,
+                prompt=["Hello there", "Foo bar bazz buzz"],
+                max_tokens=5,
+            )
+
+    lora_tasks = []
+    for i in range(10):
+        lora_tasks.append(
+            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+
+    results = await asyncio.gather(*lora_tasks, return_exceptions=True)
+
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
+
+
 @pytest.mark.asyncio
 async def test_loading_invalid_adapters_does_not_break_others(
         client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files):
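
Note: the test fans out ten coroutines and collects their outcomes with
asyncio.gather(..., return_exceptions=True), which returns each raised
exception as a regular result value so a single failing adapter surfaces in
the final assertion instead of becoming an unretrieved task exception. Below
is a minimal, standalone sketch of that pattern; the worker coroutine and
adapter names are illustrative only and not part of the vLLM test suite.

    import asyncio

    async def worker(name: str) -> str:
        # Stand-in for load_and_run_adapter: one adapter fails on purpose
        # to show how gather surfaces the exception as a return value.
        if name == "adapter_3":
            raise RuntimeError(f"{name} failed to load")
        await asyncio.sleep(0.01)
        return name

    async def main() -> None:
        tasks = [
            asyncio.create_task(worker(f"adapter_{i}")) for i in range(10)
        ]
        # return_exceptions=True converts raised exceptions into results,
        # so one failure neither cancels the other tasks nor goes unnoticed.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for r in results:
            if isinstance(r, Exception):
                print(f"task failed: {r!r}")
            else:
                print(f"task succeeded: {r}")

    asyncio.run(main())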