Skip to content

Commit

Permalink
[Doc] Rename offline inference examples (vllm-project#11927)
Browse files Browse the repository at this point in the history
Signed-off-by: Harry Mellor <[email protected]>
  • Loading branch information
hmellor committed Jan 12, 2025
1 parent c637753 commit e52243e
Show file tree
Hide file tree
Showing 46 changed files with 46 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/offline_inference.py"
python3 examples/offline_inference/basic.py"

# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/run-gh200-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ remove_docker_container

# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/offline_inference.py
python3 examples/offline_inference/basic.py
'
2 changes: 1 addition & 1 deletion .buildkite/run-hpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
2 changes: 1 addition & 1 deletion .buildkite/run-neuron-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
2 changes: 1 addition & 1 deletion .buildkite/run-openvino-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
2 changes: 1 addition & 1 deletion .buildkite/run-tpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
4 changes: 2 additions & 2 deletions .buildkite/run-xpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ remove_docker_container

# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference/offline_inference.py
python3 examples/offline_inference/offline_inference_cli.py -tp 2
python3 examples/offline_inference/basic.py
python3 examples/offline_inference/cli.py -tp 2
'
20 changes: 10 additions & 10 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,19 +187,19 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/offline_inference.py
- python3 offline_inference/basic.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/offline_inference_chat.py
- python3 offline_inference/offline_inference_with_prefix.py
- python3 offline_inference/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/offline_inference_vision_language.py
- python3 offline_inference/offline_inference_vision_language_multi_image.py
- python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/offline_inference_encoder_decoder.py
- python3 offline_inference/offline_inference_classification.py
- python3 offline_inference/offline_inference_embedding.py
- python3 offline_inference/offline_inference_scoring.py
- python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/classification.py
- python3 offline_inference/embedding.py
- python3 offline_inference/scoring.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Prefix Caching Test # 9min
mirror_hardwares: [amd]
Expand Down
2 changes: 1 addition & 1 deletion docs/source/contributing/profiling/profiling_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve

### Offline Inference

Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.

### OpenAI Server

Expand Down
2 changes: 1 addition & 1 deletion docs/source/features/structured_outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -257,4 +257,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```

Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
Full example: <gh-file:examples/offline_inference/structured_outputs.py>
4 changes: 2 additions & 2 deletions docs/source/getting_started/installation/cpu/x86.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
$ find / -name *libtcmalloc* # find the dynamic link library path
$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
$ python examples/offline_inference/offline_inference.py # run vLLM
$ python examples/offline_inference/basic.py # run vLLM
```

- When using online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
Expand Down Expand Up @@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ

# On this platform, it is recommended to bind OpenMP threads only to logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/offline_inference.py
$ python examples/offline_inference/basic.py
```

- If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/getting_started/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in

## Offline Batched Inference

With vLLM installed, you can start generating text for a list of input prompts (i.e. offline batch inference). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
With vLLM installed, you can start generating text for a list of input prompts (i.e. offline batch inference). See the example script: <gh-file:examples/offline_inference/basic.py>

The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:

Expand Down
4 changes: 2 additions & 2 deletions docs/source/models/generative_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
A code example can be found here: <gh-file:examples/offline_inference/basic.py>

### `LLM.beam_search`

Expand Down Expand Up @@ -103,7 +103,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
A code example can be found here: <gh-file:examples/offline_inference/chat.py>

If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template:
Expand Down
6 changes: 3 additions & 3 deletions docs/source/models/pooling_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
A code example can be found here: <gh-file:examples/offline_inference/embedding.py>

### `LLM.classify`

Expand All @@ -103,7 +103,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
A code example can be found here: <gh-file:examples/offline_inference/classification.py>

### `LLM.score`

Expand All @@ -125,7 +125,7 @@ score = output.outputs.score
print(f"Score: {score}")
```

A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
A code example can be found here: <gh-file:examples/offline_inference/scoring.py>

## Online Serving

Expand Down
8 changes: 4 additions & 4 deletions docs/source/serving/multimodal_inputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ for o in outputs:
print(generated_text)
```

Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
Full example: <gh-file:examples/offline_inference/vision_language.py>

To substitute multiple images inside the same text prompt, you can pass in a list of images instead:

Expand Down Expand Up @@ -91,7 +91,7 @@ for o in outputs:
print(generated_text)
```

Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>

Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:

Expand Down Expand Up @@ -125,13 +125,13 @@ for o in outputs:
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input.

Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
Full example: <gh-file:examples/offline_inference/vision_language.py>

### Audio

You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.

Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
Full example: <gh-file:examples/offline_inference/audio_language.py>

### Embedding

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion examples/offline_inference/florence2_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
encoder/decoder models, specifically Florence-2
'''
# TODO(Isotr0py):
# Move to offline_inference/offline_inference_vision_language.py
# Move to offline_inference/vision_language.py
# after porting vision backbone
from vllm import LLM, SamplingParams

Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format

The OpenAI batch file format consists of a series of json objects on new lines.

[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)

Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.

Expand All @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.

```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
```

Once you've created your batch file it should look like this

```
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
$ cat offline_inference/openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
Expand All @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
You can run the batch with the following command, which will write its results to a file called `results.jsonl`

```
python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
```

### Step 3: Check your results
Expand All @@ -66,10 +66,10 @@ $ cat results.jsonl

The batch runner supports remote input and output urls that are accessible via http/https.

For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run

```
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
```

## Example 3: Integrating with AWS S3
Expand All @@ -90,21 +90,21 @@ To integrate with cloud blob storage, we recommend using presigned urls.
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.

```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
```

Once you've created your batch file it should look like this

```
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
$ cat offline_inference/openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

Now upload your batch file to your S3 bucket.

```
aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
```

### Step 2: Generate your presigned urls
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ def abort_requests():
example:
```
python examples/offline_inference/offline_profile.py \\
python examples/offline_inference/profiling.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion tests/plugins_tests/test_platform_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def test_platform_plugins():
import os
example_file = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
"examples", "offline_inference/offline_inference.py")
"examples", "offline_inference/basic.py")
runpy.run_path(example_file)

# check if the plugin is loaded correctly
Expand Down
2 changes: 1 addition & 1 deletion tools/profiler/print_layerwise_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0):
type=str,
required=True,
help="json trace file output by "
"examples/offline_inference/offline_profile.py")
"examples/offline_inference/profiling.py")
parser.add_argument("--phase",
type=str,
required=True,
Expand Down
2 changes: 1 addition & 1 deletion tools/profiler/visualize_layerwise_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ def make_plot_title_suffix(profile_json: dict) -> str:
type=str,
required=True,
help="json trace file output by \
examples/offline_inference/offline_profile.py")
examples/offline_inference/profiling.py")
parser.add_argument("--output-directory",
type=str,
required=False,
Expand Down

0 comments on commit e52243e

Please sign in to comment.