[Doc] Rename offline inference examples #11927

Merged Jan 10, 2025

Commits (26, all by hmellor):
- 18b1bf2 `offline_profile.py` -> `profiling.py`
- 960c15b `offline_inference.py` -> `basic.py`
- 803174b `offline_inference_openai.md` -> `openai_batch.md`
- 4cfb3c9 `offline_chat_with_tools.py` -> `chat_with_tools.py`
- 49301df `offline_inference_audio_language.py` -> `audio_language.py`
- a254f62 `offline_inference_classification.py` -> `classification.py`
- 657d478 `offline_inference_with_profiler.py` -> `simple_profiling.py`
- 5c16d0a `offline_inference_chat.py` -> `chat.py`
- dd34c9d `offline_inference_arctic.py` -> `arctic.py`
- 2c00f40 `offline_inference_cli.py` -> `cli.py`
- 4e22560 `offline_inference_distributed.py` -> `distributed.py`
- 0222a0b `offline_inference_embedding.py` -> `embedding.py`
- 687a8a1 `offline_inference_encoder_decoder.py` -> `encoder_decoder.py`
- 271aae9 `offline_inference_mlpspeculator.py` -> `mlpspeculator.py`
- 68ef6da `offline_inference_neuron_int8_quantization.py` -> `neuron_int8_quanti…
- 615f168 `offline_inference_neuron.py` -> `neuron.py`
- 4fa0b53 `offline_inference_scoring.py` -> `scoring.py`
- a65079c `offline_inference_vision_language_embedding.py` -> `vision_language_…
- f8af76f `offline_inference_vision_language_multi_image.py` -> `vision_languag…
- 49a496e `offline_inference_vision_language.py` -> `vision_language.py`
- 06e131c `offline_inference_structured_outputs.py` -> `structured_outputs.py`
- c03e772 `offline_inference_with_default_generation_config.py` -> `basic_with_…
- 7059db2 `offline_inference_pixtral.py` -> `pixtral.py`
- 60827c0 `offline_inference_tpu.py` -> `tpu.py`
- d4b5ea4 `offline_inference_with_prefix.py` -> `prefix_caching.py`
- cdad141 `offline_inference_whisper.py` -> `whisper.py`
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -30,7 +30,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/offline_inference.py"
python3 examples/offline_inference/basic.py"

# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
2 changes: 1 addition & 1 deletion .buildkite/run-gh200-test.sh
@@ -24,5 +24,5 @@ remove_docker_container

# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/offline_inference.py
python3 examples/offline_inference/basic.py
'
2 changes: 1 addition & 1 deletion .buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
2 changes: 1 addition & 1 deletion .buildkite/run-neuron-test.sh
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
2 changes: 1 addition & 1 deletion .buildkite/run-openvino-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
2 changes: 1 addition & 1 deletion .buildkite/run-tpu-test.sh
@@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
4 changes: 2 additions & 2 deletions .buildkite/run-xpu-test.sh
@@ -14,6 +14,6 @@ remove_docker_container

# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference/offline_inference.py
python3 examples/offline_inference/offline_inference_cli.py -tp 2
python3 examples/offline_inference/basic.py
python3 examples/offline_inference/cli.py -tp 2
'
20 changes: 10 additions & 10 deletions .buildkite/test-pipeline.yaml
@@ -187,19 +187,19 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/offline_inference.py
- python3 offline_inference/basic.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/offline_inference_chat.py
- python3 offline_inference/offline_inference_with_prefix.py
- python3 offline_inference/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/offline_inference_vision_language.py
- python3 offline_inference/offline_inference_vision_language_multi_image.py
- python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/offline_inference_encoder_decoder.py
- python3 offline_inference/offline_inference_classification.py
- python3 offline_inference/offline_inference_embedding.py
- python3 offline_inference/offline_inference_scoring.py
- python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/classification.py
- python3 offline_inference/embedding.py
- python3 offline_inference/scoring.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Prefix Caching Test # 9min
mirror_hardwares: [amd]
2 changes: 1 addition & 1 deletion docs/source/contributing/profiling/profiling_index.md
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve

### Offline Inference

Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
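
For context, the renamed example follows roughly this pattern; the sketch below is a minimal illustration rather than the file itself, and it assumes the `VLLM_TORCH_PROFILER_DIR` environment variable together with the `LLM.start_profile()`/`LLM.stop_profile()` hooks:

```python
import os
import time

# Assumption: setting this env var before constructing the LLM enables the torch profiler.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")

llm.start_profile()                              # begin collecting a profiler trace
outputs = llm.generate(prompts, sampling_params)
llm.stop_profile()                               # write the trace to VLLM_TORCH_PROFILER_DIR

for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")

# Give background workers a moment to flush the trace before the process exits.
time.sleep(10)
```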

### OpenAI Server

2 changes: 1 addition & 1 deletion docs/source/features/structured_outputs.md
@@ -257,4 +257,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```

Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
Full example: <gh-file:examples/offline_inference/structured_outputs.py>
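
For context, the guided-decoding flow that example demonstrates has roughly this shape; a minimal sketch, assuming the `GuidedDecodingParams` helper in `vllm.sampling_params` and an illustrative small model:

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Constrain generation to a fixed set of choices (one of several guided-decoding modes).
guided = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided, temperature=0.0)

llm = LLM(model="facebook/opt-125m")  # illustrative model choice

outputs = llm.generate(
    prompts="Classify this sentiment: vLLM is wonderful!",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```
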
4 changes: 2 additions & 2 deletions docs/source/getting_started/installation/cpu-x86.md
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
$ find / -name *libtcmalloc* # find the dynamic link library path
$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
$ python examples/offline_inference/offline_inference.py # run vLLM
$ python examples/offline_inference/basic.py # run vLLM
```

- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ

# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/offline_inference.py
$ python examples/offline_inference/basic.py
```

- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
2 changes: 1 addition & 1 deletion docs/source/getting_started/quickstart.md
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in

## Offline Batched Inference

With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>

The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:

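For context, the renamed `basic.py` script is essentially the short flow below; a minimal sketch (the prompts and model are illustrative):

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Load a small model and run batched generation over all prompts.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
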
4 changes: 2 additions & 2 deletions docs/source/models/generative_models.md
@@ -46,7 +46,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
A code example can be found here: <gh-file:examples/offline_inference/basic.py>

### `LLM.beam_search`

@@ -103,7 +103,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
A code example can be found here: <gh-file:examples/offline_inference/chat.py>

If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template:
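A sketch of what that can look like; the toy template string below is an assumption for illustration, not the documented example:

```python
from vllm import LLM

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

# Toy Jinja-style template; real templates normally come from the model's tokenizer config.
custom_template = (
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "assistant:"
)

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello world!"},
]

# Assumption: LLM.chat accepts an explicit chat_template string.
outputs = llm.chat(conversation, chat_template=custom_template)
print(outputs[0].outputs[0].text)
```
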
6 changes: 3 additions & 3 deletions docs/source/models/pooling_models.md
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
A code example can be found here: <gh-file:examples/offline_inference/embedding.py>

### `LLM.classify`

@@ -103,7 +103,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
A code example can be found here: <gh-file:examples/offline_inference/classification.py>

### `LLM.score`

@@ -125,7 +125,7 @@ score = output.outputs.score
print(f"Score: {score}")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
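
For context, the full round trip that the renamed `scoring.py` example covers looks roughly like this; a minimal sketch, with an illustrative cross-encoder model:

```python
from vllm import LLM

# Illustrative cross-encoder; any model supported for the scoring task should work.
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

(output,) = llm.score(
    "What is the capital of France?",
    "The capital of France is Paris.",
)
print(f"Score: {output.outputs.score}")
```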

## Online Inference

8 changes: 4 additions & 4 deletions docs/source/serving/multimodal_inputs.md
@@ -60,7 +60,7 @@ for o in outputs:
print(generated_text)
```

Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
Full example: <gh-file:examples/offline_inference/vision_language.py>

To substitute multiple images inside the same text prompt, you can pass in a list of images instead:

@@ -91,7 +91,7 @@ for o in outputs:
print(generated_text)
```

Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>

Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:

@@ -125,13 +125,13 @@ for o in outputs:
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input.

Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
Full example: <gh-file:examples/offline_inference/vision_language.py>

### Audio

You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.

Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
Full example: <gh-file:examples/offline_inference/audio_language.py>
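
For context, a minimal sketch of that audio path, assuming an Ultravox-style model and `librosa` for loading; the prompt string below is only a placeholder, since the real prompt depends on the model's chat template:

```python
import librosa
from vllm import LLM, SamplingParams

# Illustrative audio-capable model; the actual example may use a different one.
llm = LLM(model="fixie-ai/ultravox-v0_3", max_model_len=4096)

# librosa returns the (array, sampling_rate) pair described above.
audio, sampling_rate = librosa.load("question.wav", sr=None)

outputs = llm.generate(
    {
        "prompt": "<|audio|> What is being said in this clip?",  # placeholder prompt
        "multi_modal_data": {"audio": (audio, sampling_rate)},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```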

### Embedding

2 changes: 1 addition & 1 deletion examples/offline_inference/florence2_inference.py
@@ -3,7 +3,7 @@
encoder/decoder models, specifically Florence-2
'''
# TODO(Isotr0py):
# Move to offline_inference/offline_inference_vision_language.py
# Move to offline_inference/vision_language.py
# after porting vision backbone
from vllm import LLM, SamplingParams

@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format

The OpenAI batch file format consists of a series of json objects on new lines.

[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)

Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.

@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.

```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
```

Once you've created your batch file it should look like this

```
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
$ cat offline_inference/openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
You can run the batch with the following command, which will write its results to a file called `results.jsonl`

```
python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
```

### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl

The batch runner supports remote input and output urls that are accessible via http/https.

For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run

```
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
```

## Example 3: Integrating with AWS S3
@@ -90,21 +90,21 @@ To integrate with cloud blob storage, we recommend using presigned urls.
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.

```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
```

Once you've created your batch file it should look like this

```
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
$ cat offline_inference/openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

Now upload your batch file to your S3 bucket.

```
aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
```

### Step 2: Generate your presigned urls
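
This step is typically a small helper script; a sketch using `boto3`, with placeholder bucket and key names:

```python
import boto3

s3 = boto3.client("s3")

# URL the batch runner can read the input file from.
input_url = s3.generate_presigned_url(
    ClientMethod="get_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
    ExpiresIn=3600,
)

# URL the batch runner can write the results to.
output_url = s3.generate_presigned_url(
    ClientMethod="put_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
    ExpiresIn=3600,
)

print(f"-i {input_url} -o {output_url}")
```
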
@@ -363,7 +363,7 @@ def abort_requests():
example:
```
python examples/offline_inference/offline_profile.py \\
python examples/offline_inference/profiling.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2
2 changes: 1 addition & 1 deletion tests/plugins_tests/test_platform_plugins.py
@@ -5,7 +5,7 @@ def test_platform_plugins():
import os
example_file = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
"examples", "offline_inference/offline_inference.py")
"examples", "offline_inference/basic.py")
runpy.run_path(example_file)

# check if the plugin is loaded correctly
2 changes: 1 addition & 1 deletion tools/profiler/print_layerwise_table.py
@@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0):
type=str,
required=True,
help="json trace file output by "
"examples/offline_inference/offline_profile.py")
"examples/offline_inference/profiling.py")
parser.add_argument("--phase",
type=str,
required=True,
2 changes: 1 addition & 1 deletion tools/profiler/visualize_layerwise_profile.py
@@ -538,7 +538,7 @@ def make_plot_title_suffix(profile_json: dict) -> str:
type=str,
required=True,
help="json trace file output by \
examples/offline_inference/offline_profile.py")
examples/offline_inference/profiling.py")
parser.add_argument("--output-directory",
type=str,
required=False,