# test/srt/test_openai_server.py

"""
First run the following command to launch the server.
Note that TinyLlama adopts different chat templates in different versions.
For v0.4, the chat template is chatml.

python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
--port 30000 --chat-template chatml

Output example:
The capital of France is Paris.
The capital of the United States is Washington, D.C.
The capital of Canada is Ottawa.
The capital of Japan is Tokyo
"""

import argparse

import openai


def test_completion(args, echo, logprobs):
    """Request a non-streaming text completion and sanity-check the response.

    Args:
        args: Parsed command-line namespace; only ``args.base_url`` is read.
        echo: Forwarded as the ``echo`` request parameter. When truthy, the
            returned text must start with the prompt.
        logprobs: Forwarded as the ``logprobs`` request parameter. When
            truthy, the response must carry log-probability data.

    Raises:
        AssertionError: If any check on the response fails.
    """
    prompt = "The capital of France is"
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.completions.create(
        model="default",
        prompt=prompt,
        temperature=0,
        max_tokens=32,
        echo=echo,
        logprobs=logprobs,
    )
    choice = response.choices[0]
    text = choice.text
    print(text)
    if echo:
        assert text.startswith(prompt)
    if logprobs:
        assert choice.logprobs
        first_logprob = choice.logprobs.token_logprobs[0]
        # Identity comparison is the correct way to test for the None
        # singleton; ``==`` can be overridden by the compared object.
        if echo:
            # With echo the first entry belongs to the first prompt token,
            # which is expected to have no log-probability (presumably
            # because nothing precedes it -- confirm against the server).
            assert first_logprob is None
        else:
            assert first_logprob is not None
    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0


def test_completion_stream(args, echo, logprobs):
    """Stream a text completion, printing and checking each chunk.

    Args:
        args: Parsed command-line namespace; only ``args.base_url`` is read.
        echo: Forwarded as the ``echo`` request parameter. When truthy, the
            text of the first chunk must start with the prompt.
        logprobs: Forwarded as the ``logprobs`` request parameter. When
            truthy, each chunk is printed on its own line together with its
            token log-probabilities; otherwise the text is printed inline.

    Raises:
        AssertionError: If any chunk fails the id or usage checks, or the
            first chunk fails the echo check.
    """
    prompt = "The capital of France is"
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    stream = client.completions.create(
        model="default",
        prompt=prompt,
        temperature=0,
        max_tokens=32,
        stream=True,
        echo=echo,
        logprobs=logprobs,
    )
    for position, chunk in enumerate(stream):
        choice = chunk.choices[0]
        # Only the very first chunk is expected to carry the echoed prompt.
        if position == 0 and echo:
            assert choice.text.startswith(prompt)
        if logprobs:
            print(f"{choice.text:12s}\t{choice.logprobs.token_logprobs}", flush=True)
        else:
            print(choice.text, end="", flush=True)
        assert chunk.id
        usage = chunk.usage
        assert usage.prompt_tokens > 0
        assert usage.completion_tokens > 0
        assert usage.total_tokens > 0
    # Finish the line left open by the inline prints.
    print()


def test_chat_completion(args):
    """Request a non-streaming chat completion and sanity-check the response.

    Args:
        args: Parsed command-line namespace; only ``args.base_url`` is read.

    Raises:
        AssertionError: If the response lacks an id, a creation timestamp,
            or positive token usage counts.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": "What is the capital of France?"},
    ]
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.chat.completions.create(
        model="default",
        messages=conversation,
        temperature=0,
        max_tokens=32,
    )
    reply = response.choices[0].message
    print(reply.content)
    assert response.id
    assert response.created
    usage = response.usage
    assert usage.prompt_tokens > 0
    assert usage.completion_tokens > 0
    assert usage.total_tokens > 0


def test_chat_completion_image(args):
    """Request a chat completion whose user message mixes text and an image URL.

    Args:
        args: Parsed command-line namespace; only ``args.base_url`` is read.

    Raises:
        AssertionError: If the response lacks an id, a creation timestamp,
            or positive token usage counts.
    """
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
        },
    }
    text_part = {"type": "text", "text": "Describe this image"}
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant"},
        # The text part comes first, followed by the image reference.
        {"role": "user", "content": [text_part, image_part]},
    ]
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.chat.completions.create(
        model="default",
        messages=conversation,
        temperature=0,
        max_tokens=32,
    )
    reply = response.choices[0].message
    print(reply.content)
    assert response.id
    assert response.created
    usage = response.usage
    assert usage.prompt_tokens > 0
    assert usage.completion_tokens > 0
    assert usage.total_tokens > 0


def test_chat_completion_stream(args):
    """Stream a chat completion and print the generated text as it arrives.

    The first chunk is only checked for the ``assistant`` role and is not
    printed; later chunks are printed when their delta has content.

    Args:
        args: Parsed command-line namespace; only ``args.base_url`` is read.

    Raises:
        AssertionError: If the first chunk's delta role is not ``assistant``.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": "List 3 countries and their capitals."},
    ]
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    stream = client.chat.completions.create(
        model="default",
        messages=conversation,
        temperature=0,
        max_tokens=64,
        stream=True,
    )
    for position, chunk in enumerate(stream):
        delta = chunk.choices[0].delta
        if position == 0:
            assert delta.role == "assistant"
            continue
        # Skip chunks whose delta has no (or empty) content.
        if delta.content:
            print(delta.content, end="", flush=True)
    # Finish the line left open by the inline prints.
    print()


if __name__ == "__main__":
    # Command-line options: the server endpoint and an opt-in image test.
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
    parser.add_argument(
        "--test-image", action="store_true", help="Enables testing image inputs"
    )
    args = parser.parse_args()

    # Run every echo/logprobs combination, first without streaming and then
    # with it. ``echo`` varies fastest so the call order is
    # (F, F), (T, F), (F, T), (T, T) as (echo, logprobs).
    for run_case in (test_completion, test_completion_stream):
        for with_logprobs in (False, True):
            for with_echo in (False, True):
                run_case(args, echo=with_echo, logprobs=with_logprobs)

    test_chat_completion(args)
    test_chat_completion_stream(args)

    # The image test only runs when requested with --test-image.
    if args.test_image:
        test_chat_completion_image(args)