From 46c420868706589d8c725464b8c17fe73411e1db Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:50:29 +0000
Subject: [PATCH] [Doc] Rename offline inference examples (#11927)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Fred Reiss
---
 .buildkite/run-cpu-test.sh | 2 +-
 .buildkite/run-gh200-test.sh | 2 +-
 .buildkite/run-hpu-test.sh | 2 +-
 .buildkite/run-neuron-test.sh | 2 +-
 .buildkite/run-openvino-test.sh | 2 +-
 .buildkite/run-tpu-test.sh | 2 +-
 .buildkite/run-xpu-test.sh | 4 ++--
 .buildkite/test-pipeline.yaml | 20 +++++++++----------
 .../contributing/profiling/profiling_index.md | 2 +-
 docs/source/features/structured_outputs.md | 2 +-
 .../getting_started/installation/cpu-x86.md | 4 ++--
 docs/source/getting_started/quickstart.md | 2 +-
 docs/source/models/generative_models.md | 4 ++--
 docs/source/models/pooling_models.md | 6 +++---
 docs/source/serving/multimodal_inputs.md | 8 ++++----
 ...{offline_inference_arctic.py => arctic.py} | 0
 ...ce_audio_language.py => audio_language.py} | 0
 .../{offline_inference.py => basic.py} | 0
 ...y => basic_with_model_default_sampling.py} | 0
 .../{offline_inference_chat.py => chat.py} | 0
 ..._chat_with_tools.py => chat_with_tools.py} | 0
 ...ce_classification.py => classification.py} | 0
 .../{offline_inference_cli.py => cli.py} | 0
 ...nference_distributed.py => distributed.py} | 0
 ...ne_inference_embedding.py => embedding.py} | 0
 ..._encoder_decoder.py => encoder_decoder.py} | 0
 .../offline_inference/florence2_inference.py | 2 +-
 ...ence_mlpspeculator.py => mlpspeculator.py} | 0
 ...{offline_inference_neuron.py => neuron.py} | 0
 ...ization.py => neuron_int8_quantization.py} | 0
 .../openai_batch.md} | 18 ++++++++---------
 .../openai_example_batch.jsonl | 0
 ...ffline_inference_pixtral.py => pixtral.py} | 0
 ...rence_with_prefix.py => prefix_caching.py} | 0
 .../{offline_profile.py => profiling.py} | 2 +-
 ...ffline_inference_scoring.py => scoring.py} | 0
 ...e_with_profiler.py => simple_profiling.py} | 0
 ...tured_outputs.py => structured_outputs.py} | 0
 .../{offline_inference_tpu.py => tpu.py} | 0
 ..._vision_language.py => vision_language.py} | 0
 ...edding.py => vision_language_embedding.py} | 0
 ...mage.py => vision_language_multi_image.py} | 0
 ...ffline_inference_whisper.py => whisper.py} | 0
 tests/plugins_tests/test_platform_plugins.py | 2 +-
 tools/profiler/print_layerwise_table.py | 2 +-
 tools/profiler/visualize_layerwise_profile.py | 2 +-
 46 files changed, 46 insertions(+), 46 deletions(-)
 rename examples/offline_inference/{offline_inference_arctic.py => arctic.py} (100%)
 rename examples/offline_inference/{offline_inference_audio_language.py => audio_language.py} (100%)
 rename examples/offline_inference/{offline_inference.py => basic.py} (100%)
 rename examples/offline_inference/{offline_inference_with_default_generation_config.py => basic_with_model_default_sampling.py} (100%)
 rename examples/offline_inference/{offline_inference_chat.py => chat.py} (100%)
 rename examples/offline_inference/{offline_chat_with_tools.py => chat_with_tools.py} (100%)
 rename examples/offline_inference/{offline_inference_classification.py => classification.py} (100%)
 rename examples/offline_inference/{offline_inference_cli.py => cli.py} (100%)
 rename examples/offline_inference/{offline_inference_distributed.py => distributed.py} (100%)
 rename examples/offline_inference/{offline_inference_embedding.py => embedding.py} (100%)
 rename
examples/offline_inference/{offline_inference_encoder_decoder.py => encoder_decoder.py} (100%) rename examples/offline_inference/{offline_inference_mlpspeculator.py => mlpspeculator.py} (100%) rename examples/offline_inference/{offline_inference_neuron.py => neuron.py} (100%) rename examples/offline_inference/{offline_inference_neuron_int8_quantization.py => neuron_int8_quantization.py} (100%) rename examples/offline_inference/{offline_inference_openai/offline_inference_openai.md => openai/openai_batch.md} (92%) rename examples/offline_inference/{offline_inference_openai => openai}/openai_example_batch.jsonl (100%) rename examples/offline_inference/{offline_inference_pixtral.py => pixtral.py} (100%) rename examples/offline_inference/{offline_inference_with_prefix.py => prefix_caching.py} (100%) rename examples/offline_inference/{offline_profile.py => profiling.py} (99%) rename examples/offline_inference/{offline_inference_scoring.py => scoring.py} (100%) rename examples/offline_inference/{offline_inference_with_profiler.py => simple_profiling.py} (100%) rename examples/offline_inference/{offline_inference_structured_outputs.py => structured_outputs.py} (100%) rename examples/offline_inference/{offline_inference_tpu.py => tpu.py} (100%) rename examples/offline_inference/{offline_inference_vision_language.py => vision_language.py} (100%) rename examples/offline_inference/{offline_inference_vision_language_embedding.py => vision_language_embedding.py} (100%) rename examples/offline_inference/{offline_inference_vision_language_multi_image.py => vision_language_multi_image.py} (100%) rename examples/offline_inference/{offline_inference_whisper.py => whisper.py} (100%) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 5a285be039393..4ae66f6f3215a 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -30,7 +30,7 @@ function cpu_tests() { # offline inference docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference/offline_inference.py" + python3 examples/offline_inference/basic.py" # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 1e5ff77895a38..3e4e409466b8a 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/offline_inference.py + python3 examples/offline_inference/basic.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index a50570ab53438..8f3b08212fd6a 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py \ No newline at end of file diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 52d485939b1d0..189714ebb6d75 100644 --- 
a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 380f7a44a429a..6159b21ff8206 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index a8f021890f742..650af0fac4c61 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \ && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ && python3 /workspace/vllm/tests/tpu/test_compilation.py \ && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ - && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" + && python3 /workspace/vllm/examples/offline_inference/tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 160e10aa3bb9b..4d344e58db8ac 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference/offline_inference.py - python3 examples/offline_inference/offline_inference_cli.py -tp 2 + python3 examples/offline_inference/basic.py + python3 examples/offline_inference/cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7d13269540864..d3bd809cfdf24 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -187,19 +187,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/offline_inference.py + - python3 offline_inference/basic.py - python3 offline_inference/cpu_offload.py - - python3 offline_inference/offline_inference_chat.py - - python3 offline_inference/offline_inference_with_prefix.py + - python3 offline_inference/chat.py + - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/offline_inference_vision_language.py - - python3 offline_inference/offline_inference_vision_language_multi_image.py + - python3 offline_inference/vision_language.py + - python3 offline_inference/vision_language_multi_image.py - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/offline_inference_encoder_decoder.py - 
- python3 offline_inference/offline_inference_classification.py - - python3 offline_inference/offline_inference_embedding.py - - python3 offline_inference/offline_inference_scoring.py - - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/classification.py + - python3 offline_inference/embedding.py + - python3 offline_inference/scoring.py + - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 97de40ff469f1..001db86bdf555 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. ### OpenAI Server diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index a42c3dd64ad10..1d77c7339a33f 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index bb046dd0fd9dc..f4d3eec0377b1 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library $ find / -name *libtcmalloc* # find the dynamic link library path $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference/offline_inference.py # run vLLM +$ python examples/offline_inference/basic.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: @@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference/offline_inference.py +$ python examples/offline_inference/basic.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index d7d43785c6c24..6fd0083a9bb7b 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). 
See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 6a5a58ad74ab7..e4b4cd03a90d2 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 324b1f550e694..91db694be29a4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -88,7 +88,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -103,7 +103,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -125,7 +125,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: ## Online Serving diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index a06f121a6899a..53f5a274e39a3 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. 
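As an illustration of that audio path, here is a minimal sketch. The model name, chat-template handling, and the synthetic clip are assumptions for demonstration only and are not taken from this patch; only the `{"audio": (array, sampling_rate)}` shape of the multi-modal dictionary follows the sentence above.

```python
import numpy as np
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Assumption: any audio-capable model supported by vLLM; Ultravox is used here.
model_name = "fixie-ai/ultravox-v0_3"
llm = LLM(model=model_name)

# Build a chat prompt; the audio placeholder token is an assumption about this
# model's template and would differ for other audio models.
tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{"role": "user", "content": "<|audio|>\nWhat can you hear in this clip?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Stand-in clip: one second of silence; a real script would load a file instead.
sampling_rate = 16000
audio = np.zeros(sampling_rate, dtype=np.float32)

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"audio": (audio, sampling_rate)},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```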
-Full example: +Full example: ### Embedding diff --git a/examples/offline_inference/offline_inference_arctic.py b/examples/offline_inference/arctic.py similarity index 100% rename from examples/offline_inference/offline_inference_arctic.py rename to examples/offline_inference/arctic.py diff --git a/examples/offline_inference/offline_inference_audio_language.py b/examples/offline_inference/audio_language.py similarity index 100% rename from examples/offline_inference/offline_inference_audio_language.py rename to examples/offline_inference/audio_language.py diff --git a/examples/offline_inference/offline_inference.py b/examples/offline_inference/basic.py similarity index 100% rename from examples/offline_inference/offline_inference.py rename to examples/offline_inference/basic.py diff --git a/examples/offline_inference/offline_inference_with_default_generation_config.py b/examples/offline_inference/basic_with_model_default_sampling.py similarity index 100% rename from examples/offline_inference/offline_inference_with_default_generation_config.py rename to examples/offline_inference/basic_with_model_default_sampling.py diff --git a/examples/offline_inference/offline_inference_chat.py b/examples/offline_inference/chat.py similarity index 100% rename from examples/offline_inference/offline_inference_chat.py rename to examples/offline_inference/chat.py diff --git a/examples/offline_inference/offline_chat_with_tools.py b/examples/offline_inference/chat_with_tools.py similarity index 100% rename from examples/offline_inference/offline_chat_with_tools.py rename to examples/offline_inference/chat_with_tools.py diff --git a/examples/offline_inference/offline_inference_classification.py b/examples/offline_inference/classification.py similarity index 100% rename from examples/offline_inference/offline_inference_classification.py rename to examples/offline_inference/classification.py diff --git a/examples/offline_inference/offline_inference_cli.py b/examples/offline_inference/cli.py similarity index 100% rename from examples/offline_inference/offline_inference_cli.py rename to examples/offline_inference/cli.py diff --git a/examples/offline_inference/offline_inference_distributed.py b/examples/offline_inference/distributed.py similarity index 100% rename from examples/offline_inference/offline_inference_distributed.py rename to examples/offline_inference/distributed.py diff --git a/examples/offline_inference/offline_inference_embedding.py b/examples/offline_inference/embedding.py similarity index 100% rename from examples/offline_inference/offline_inference_embedding.py rename to examples/offline_inference/embedding.py diff --git a/examples/offline_inference/offline_inference_encoder_decoder.py b/examples/offline_inference/encoder_decoder.py similarity index 100% rename from examples/offline_inference/offline_inference_encoder_decoder.py rename to examples/offline_inference/encoder_decoder.py diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py index 49dd2c331db5a..c24096e90004b 100644 --- a/examples/offline_inference/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,7 @@ encoder/decoder models, specifically Florence-2 ''' # TODO(Isotr0py): -# Move to offline_inference/offline_inference_vision_language.py +# Move to offline_inference/vision_language.py # after porting vision backbone from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/offline_inference_mlpspeculator.py 
b/examples/offline_inference/mlpspeculator.py similarity index 100% rename from examples/offline_inference/offline_inference_mlpspeculator.py rename to examples/offline_inference/mlpspeculator.py diff --git a/examples/offline_inference/offline_inference_neuron.py b/examples/offline_inference/neuron.py similarity index 100% rename from examples/offline_inference/offline_inference_neuron.py rename to examples/offline_inference/neuron.py diff --git a/examples/offline_inference/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference/offline_inference_neuron_int8_quantization.py rename to examples/offline_inference/neuron_int8_quantization.py diff --git a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md b/examples/offline_inference/openai/openai_batch.md similarity index 92% rename from examples/offline_inference/offline_inference_openai/offline_inference_openai.md rename to examples/offline_inference/openai/openai_batch.md index 6278a1943fe4a..a4774e57cd9a5 100644 --- a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. 
You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. -For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
``` -aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl b/examples/offline_inference/openai/openai_example_batch.jsonl similarity index 100% rename from examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl rename to examples/offline_inference/openai/openai_example_batch.jsonl diff --git a/examples/offline_inference/offline_inference_pixtral.py b/examples/offline_inference/pixtral.py similarity index 100% rename from examples/offline_inference/offline_inference_pixtral.py rename to examples/offline_inference/pixtral.py diff --git a/examples/offline_inference/offline_inference_with_prefix.py b/examples/offline_inference/prefix_caching.py similarity index 100% rename from examples/offline_inference/offline_inference_with_prefix.py rename to examples/offline_inference/prefix_caching.py diff --git a/examples/offline_inference/offline_profile.py b/examples/offline_inference/profiling.py similarity index 99% rename from examples/offline_inference/offline_profile.py rename to examples/offline_inference/profiling.py index 187a05e4d70a2..8a94b5c2a8623 100644 --- a/examples/offline_inference/offline_profile.py +++ b/examples/offline_inference/profiling.py @@ -363,7 +363,7 @@ def abort_requests(): example: ``` - python examples/offline_inference/offline_profile.py \\ + python examples/offline_inference/profiling.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ --enforce-eager run_num_steps -n 2 diff --git a/examples/offline_inference/offline_inference_scoring.py b/examples/offline_inference/scoring.py similarity index 100% rename from examples/offline_inference/offline_inference_scoring.py rename to examples/offline_inference/scoring.py diff --git a/examples/offline_inference/offline_inference_with_profiler.py b/examples/offline_inference/simple_profiling.py similarity index 100% rename from examples/offline_inference/offline_inference_with_profiler.py rename to examples/offline_inference/simple_profiling.py diff --git a/examples/offline_inference/offline_inference_structured_outputs.py b/examples/offline_inference/structured_outputs.py similarity index 100% rename from examples/offline_inference/offline_inference_structured_outputs.py rename to examples/offline_inference/structured_outputs.py diff --git a/examples/offline_inference/offline_inference_tpu.py b/examples/offline_inference/tpu.py similarity index 100% rename from examples/offline_inference/offline_inference_tpu.py rename to examples/offline_inference/tpu.py diff --git a/examples/offline_inference/offline_inference_vision_language.py b/examples/offline_inference/vision_language.py similarity index 100% rename from examples/offline_inference/offline_inference_vision_language.py rename to examples/offline_inference/vision_language.py diff --git a/examples/offline_inference/offline_inference_vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py similarity index 100% rename from examples/offline_inference/offline_inference_vision_language_embedding.py rename to examples/offline_inference/vision_language_embedding.py diff --git 
a/examples/offline_inference/offline_inference_vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py similarity index 100% rename from examples/offline_inference/offline_inference_vision_language_multi_image.py rename to examples/offline_inference/vision_language_multi_image.py diff --git a/examples/offline_inference/offline_inference_whisper.py b/examples/offline_inference/whisper.py similarity index 100% rename from examples/offline_inference/offline_inference_whisper.py rename to examples/offline_inference/whisper.py diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 57518bd3e8299..69698b34c71a3 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -5,7 +5,7 @@ def test_platform_plugins(): import os example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), - "examples", "offline_inference/offline_inference.py") + "examples", "offline_inference/basic.py") runpy.run_path(example_file) # check if the plugin is loaded correctly diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 49366abc7fb56..54cd60c2bc95b 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0): type=str, required=True, help="json trace file output by " - "examples/offline_inference/offline_profile.py") + "examples/offline_inference/profiling.py") parser.add_argument("--phase", type=str, required=True, diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index fa88ed4204d8f..cb56ebd69a8c1 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -538,7 +538,7 @@ def make_plot_title_suffix(profile_json: dict) -> str: type=str, required=True, help="json trace file output by \ - examples/offline_inference/offline_profile.py") + examples/offline_inference/profiling.py") parser.add_argument("--output-directory", type=str, required=False,
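For reference, the renamed `examples/offline_inference/basic.py` that the build scripts above now invoke follows the standard vLLM offline-inference pattern described in the quickstart hunk. A minimal sketch is shown below; the exact prompts and sampling values are illustrative assumptions, since the file's contents are not changed (or shown) by this rename-only patch. Only the `LLM`/`SamplingParams` usage is implied by the quickstart text.

```python
from vllm import LLM, SamplingParams

# Illustrative prompts; the renamed example may use different ones.
prompts = ["Hello, my name is", "The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# facebook/opt-125m is the small model already used elsewhere in this pipeline.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```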