From aa17e0a941ad3413b7b73098050c3152b80c6f97 Mon Sep 17 00:00:00 2001 From: shuaills Date: Sat, 21 Dec 2024 22:59:53 +0000 Subject: [PATCH 1/7] updated constrained decoding doc --- docs/backend/openai_api_completions.ipynb | 123 ++++++++++++++++++++-- 1 file changed, 114 insertions(+), 9 deletions(-) diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 067a046885d..50b2764445a 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,9 +36,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-21 21:57:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=2048, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=512968292, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, 
cuda_graph_max_bs=8, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n", + "[2024-12-21 21:57:37 TP0] Init torch distributed begin.\n", + "[2024-12-21 21:57:37 TP0] Load weight begin. avail mem=21.87 GB\n", + "[2024-12-21 21:57:41 TP0] Using model weights format ['*.safetensors']\n" + ] + } + ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -69,9 +92,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='1306d8dd0eb14493bbfd0c8b6f029435', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **France** - **Paris**\\n2. **Japan** - **Tokyo**\\n3. **Australia** - **Canberra**', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1734818316, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=39, prompt_tokens=18, total_tokens=57, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import openai\n", "\n", @@ -102,9 +138,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Ancient Rome's major achievements include:

1. Expansion: Rome expanded its territories through conquest, creating the largest empire the world had ever seen, spanning from Britain to Egypt and from Spain to Syria.
2. Law and Governance: Rome developed a system of laws, the Twelve Tables, which became the foundation of modern law. The Roman Republic and later the Roman Empire also established a system of governance that lasted for centuries.
3. Architecture and Engineering: Rome built impressive structures such as the Colosseum, Pantheon, and aqueducts, showcasing its engineering and architectural skills.
4. Language and Literature: Latin became the language of
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 14\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241m.\u001b[39mchat\u001b[38;5;241m.\u001b[39mcompletions\u001b[38;5;241m.\u001b[39mcreate(\n\u001b[1;32m 15\u001b[0m model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmeta-llama/Meta-Llama-3.1-8B-Instruct\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 16\u001b[0m messages\u001b[38;5;241m=\u001b[39m[\n\u001b[1;32m 17\u001b[0m {\n\u001b[1;32m 18\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrole\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 19\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGive me the information of the capital of France in the JSON format.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 20\u001b[0m },\n\u001b[1;32m 21\u001b[0m ],\n\u001b[1;32m 22\u001b[0m temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m 23\u001b[0m max_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m128\u001b[39m,\n\u001b[1;32m 24\u001b[0m response_format\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 25\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfoo\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mschema\u001b[39m\u001b[38;5;124m\"\u001b[39m: json\u001b[38;5;241m.\u001b[39mloads(json_schema)},\n\u001b[1;32m 27\u001b[0m },\n\u001b[1;32m 28\u001b[0m )\n\u001b[1;32m 30\u001b[0m print_highlight(response\u001b[38;5;241m.\u001b[39mchoices[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mmessage\u001b[38;5;241m.\u001b[39mcontent)\n", + "\u001b[0;31mNameError\u001b[0m: name 'client' is not defined" + ] + } + ], "source": [ "import json\n", "\n", @@ -585,6 +684,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -594,7 +698,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.10.16" } }, "nbformat": 4, From 88db12eeb32621d5fdf484907a667668d69ba00c Mon Sep 17 00:00:00 2001 From: shuaills Date: Sat, 21 Dec 2024 23:20:20 +0000 Subject: [PATCH 2/7] rm ipynb outputs --- docs/backend/openai_api_completions.ipynb | 109 ++-------------------- 1 file changed, 8 insertions(+), 101 deletions(-) diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 50b2764445a..61f58eb695f 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,32 +36,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-21 21:57:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=2048, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=512968292, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, 
cuda_graph_max_bs=8, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n", - "[2024-12-21 21:57:37 TP0] Init torch distributed begin.\n", - "[2024-12-21 21:57:37 TP0] Load weight begin. avail mem=21.87 GB\n", - "[2024-12-21 21:57:41 TP0] Using model weights format ['*.safetensors']\n" - ] - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -92,22 +69,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Response: ChatCompletion(id='1306d8dd0eb14493bbfd0c8b6f029435', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **France** - **Paris**\\n2. **Japan** - **Tokyo**\\n3. **Australia** - **Canberra**', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1734818316, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=39, prompt_tokens=18, total_tokens=57, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -138,54 +102,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Ancient Rome's major achievements include:

1. Expansion: Rome expanded its territories through conquest, creating the largest empire the world had ever seen, spanning from Britain to Egypt and from Spain to Syria.
2. Law and Governance: Rome developed a system of laws, the Twelve Tables, which became the foundation of modern law. The Roman Republic and later the Roman Empire also established a system of governance that lasted for centuries.
3. Architecture and Engineering: Rome built impressive structures such as the Colosseum, Pantheon, and aqueducts, showcasing its engineering and architectural skills.
4. Language and Literature: Latin became the language of
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 14\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241m.\u001b[39mchat\u001b[38;5;241m.\u001b[39mcompletions\u001b[38;5;241m.\u001b[39mcreate(\n\u001b[1;32m 15\u001b[0m model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmeta-llama/Meta-Llama-3.1-8B-Instruct\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 16\u001b[0m messages\u001b[38;5;241m=\u001b[39m[\n\u001b[1;32m 17\u001b[0m {\n\u001b[1;32m 18\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrole\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 19\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGive me the information of the capital of France in the JSON format.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 20\u001b[0m },\n\u001b[1;32m 21\u001b[0m ],\n\u001b[1;32m 22\u001b[0m temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m 23\u001b[0m max_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m128\u001b[39m,\n\u001b[1;32m 24\u001b[0m response_format\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 25\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfoo\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mschema\u001b[39m\u001b[38;5;124m\"\u001b[39m: json\u001b[38;5;241m.\u001b[39mloads(json_schema)},\n\u001b[1;32m 27\u001b[0m },\n\u001b[1;32m 28\u001b[0m )\n\u001b[1;32m 30\u001b[0m print_highlight(response\u001b[38;5;241m.\u001b[39mchoices[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mmessage\u001b[38;5;241m.\u001b[39mcontent)\n", - "\u001b[0;31mNameError\u001b[0m: name 'client' is not defined" - ] - } - ], + "outputs": [], "source": [ "import json\n", "\n", From 48d58aacb698e91c221ffcd3e6c675357c031bf3 Mon Sep 17 00:00:00 2001 From: shuaills Date: Sun, 22 Dec 2024 13:59:59 +0000 Subject: [PATCH 3/7] updated docs --- docs/backend/openai_api_completions.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 61f58eb695f..369e950b78e 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -223,7 +223,7 @@ "## Structured decoding (JSON, Regex)\n", "You can specify a JSON schema or a regular expression to constrain the model output. The model output will be guaranteed to follow the given constraints.\n", "\n", - "By default, SGlang uses outlines for structured decoding. To enable Xgrammar (which offers better performance and supports JSON but not regex patterns), add `--grammar-backend xgrammar` when launching the server:\n", + "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. 
To enable it:\n", "\n", "```bash\n", "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --grammar-backend xgrammar\n", From f26eb4f340c2e11b95b51330f7806a3e94bc3521 Mon Sep 17 00:00:00 2001 From: shuaills Date: Sun, 22 Dec 2024 14:04:25 +0000 Subject: [PATCH 4/7] updated docs --- docs/backend/openai_api_completions.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 369e950b78e..fba7682c5dc 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -223,10 +223,11 @@ "## Structured decoding (JSON, Regex)\n", "You can specify a JSON schema or a regular expression to constrain the model output. The model output will be guaranteed to follow the given constraints.\n", "\n", - "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. To enable it:\n", + "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. 
To enable it, add the `--grammar-backend xgrammar` when launching the server:\n", "\n", "```bash\n", - "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --grammar-backend xgrammar\n", + "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct \\\n", + "--grammar-backend xgrammar\n", "```\n", "\n", "### JSON" From 0eaf40105554f29ad6965c922476fd67474c25de Mon Sep 17 00:00:00 2001 From: shuaills Date: Sun, 22 Dec 2024 14:15:53 +0000 Subject: [PATCH 5/7] remove ipynb outputs --- docs/backend/openai_api_completions.ipynb | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index fba7682c5dc..6cb5d884e51 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -592,11 +592,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -606,8 +601,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" + "pygments_lexer": "ipython3" } }, "nbformat": 4, From f9d7349e544e16154cef5f2fbfcc25f9a782d700 Mon Sep 17 00:00:00 2001 From: shuaills Date: Sun, 22 Dec 2024 18:59:26 +0000 Subject: [PATCH 6/7] Update doc --- docs/backend/openai_api_completions.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 6cb5d884e51..9c756e02175 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -221,13 +221,13 @@ "metadata": {}, "source": [ "## Structured decoding (JSON, Regex)\n", - "You can specify a JSON schema or a regular expression to constrain the model output. 
The model output will be guaranteed to follow the given constraints.\n", + "You can define a JSON schema or regular expression to constrain the model's output, which depends on the grammar backend.\n", "\n", - "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. To enable it, add the `--grammar-backend xgrammar` when launching the server:\n", + "SGlang has two backends: outlines (default) and Xgrammar. Xgrammar enhances JSON decoding performance but does not support regular expressions. To use Xgrammar, add the `--grammar-backend xgrammar` when launching the server:\n", "\n", "```bash\n", - "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct \\\n", - "--grammar-backend xgrammar\n", + "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", + "--port 30000 --host 0.0.0.0 --grammar-backend xgrammar\n", "```\n", "\n", "### JSON" From a00ecb18abd7c5136fa6670595b445fdc2c8399c Mon Sep 17 00:00:00 2001 From: shuaills Date: Sun, 22 Dec 2024 23:35:41 +0000 Subject: [PATCH 7/7] update doc --- docs/backend/openai_api_completions.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 9c756e02175..9340f953f14 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -221,7 +221,7 @@ "metadata": {}, "source": [ "## Structured decoding (JSON, Regex)\n", - "You can define a JSON schema or regular expression to constrain the model's output, which depends on the grammar backend.\n", + "You can define a JSON schema or regular expression to constrain the model's output. The model output is guaranteed to follow the given constraints; which constraint types are supported depends on the grammar backend.\n", "\n", "SGlang has two backends: outlines (default) and Xgrammar. 
Xgrammar enhances JSON decoding performance but does not support regular expressions. To use Xgrammar, add the `--grammar-backend xgrammar` when launching the server:\n", "\n",