diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 4f1729d46dae2..9925db7bea593 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -9,36 +9,33 @@ CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} # Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . # Setup cleanup -remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; } +remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 # offline inference - docker exec cpu-test-avx2-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference.py" + python3 examples/offline_inference/basic.py" # Run basic model test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install pytest pytest-asyncio \ - decord einops librosa peft Pillow sentence-transformers soundfile \ - transformers_stream_generator matplotlib datamodel_code_generator - pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pip install -r vllm/requirements-test.txt pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model @@ -46,26 +43,26 @@ function cpu_tests() { pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" # Run compressed-tensor test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # 
Run AWQ test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" - # online inference - docker exec cpu-test-"$NUMA_NODE" bash -c " + # online serving + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=$1 @@ -78,6 +75,12 @@ function cpu_tests() { --num-prompts 20 \ --endpoint /v1/completions \ --tokenizer facebook/opt-125m" + + # Run multi-lora tests + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + set -e + pytest -s -v \ + tests/lora/test_qwen2vl.py" } # All of CPU tests are expected to be finished less than 25 mins. diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 4fc6d089cc666..3e4e409466b8a 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference.py + python3 examples/offline_inference/basic.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index fa4f74fca7a11..8f3b08212fd6a 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py \ No newline at end of file diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 9259391aaed49..189714ebb6d75 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -3,6 +3,18 @@ # This script build the Neuron docker image and run the API server inside the container. # It serves a sanity check for compilation and basic model usage. 
set -e +set -v + +image_name="neuron/vllm-ci" +container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + +HF_CACHE="$(realpath ~)/huggingface" +mkdir -p "${HF_CACHE}" +HF_MOUNT="/root/.cache/huggingface" + +NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" +mkdir -p "${NEURON_COMPILE_CACHE_URL}" +NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" # Try building the docker image aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com @@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then last_build=$(cat /tmp/neuron-docker-build-timestamp) current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then + docker image prune -f docker system prune -f + rm -rf "${HF_MOUNT:?}/*" + rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else date "+%s" > /tmp/neuron-docker-build-timestamp fi -docker build -t neuron -f Dockerfile.neuron . +docker build -t "${image_name}" -f Dockerfile.neuron . # Setup cleanup -remove_docker_container() { docker rm -f neuron || true; } +remove_docker_container() { + docker image rm -f "${image_name}" || true; +} trap remove_docker_container EXIT -remove_docker_container # Run the image -docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ - --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & - -# Wait for the server to start -wait_for_server_to_start() { - timeout=300 - counter=0 - - while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do - sleep 1 - counter=$((counter + 1)) - if [ $counter -ge $timeout ]; then - echo "Timeout after $timeout seconds" - break - fi - done -} -wait_for_server_to_start - -# Test a simple prompt -curl -X POST -H "Content-Type: application/json" \ - localhost:8000/generate \ - -d '{"prompt": "San Francisco is a"}' +docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ + -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ + --name "${container_name}" \ + ${image_name} \ + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 6b12f424fd828..6159b21ff8206 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 770dad6ffa3a1..650af0fac4c61 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -14,4 +14,13 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. 
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ + && python3 -m pip install pytest \ + && python3 -m pip install lm_eval[api]==0.4.4 \ + && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \ + && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ + && python3 /workspace/vllm/tests/tpu/test_compilation.py \ + && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ + && python3 /workspace/vllm/examples/offline_inference/tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index e0a12afbe7320..4d344e58db8ac 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference.py - python3 examples/offline_inference_cli.py -tp 2 + python3 examples/offline_inference/basic.py + python3 examples/offline_inference/cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 529daf54faecf..74b287c7adbfa 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -38,7 +38,7 @@ steps: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/dev/sampling_params.html + - grep \"sig sig-object py\" build/html/api/inference_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min fast_check: true @@ -52,6 +52,7 @@ steps: - tests/worker - tests/standalone_tests/lazy_torch_compile.py commands: + - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git # Used by multimoda processing test - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine @@ -187,19 +188,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference.py - - python3 cpu_offload.py - - python3 offline_inference_chat.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 offline_inference_vision_language.py - - python3 offline_inference_vision_language_multi_image.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference_encoder_decoder.py - - python3 offline_inference_classification.py - - python3 offline_inference_embedding.py - - python3 offline_inference_scoring.py - - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 
offline_inference/basic.py + - python3 offline_inference/cpu_offload.py + - python3 offline_inference/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/vision_language.py + - python3 offline_inference/vision_language_multi_image.py + - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/classification.py + - python3 offline_inference/embedding.py + - python3 offline_inference/scoring.py + - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] @@ -214,6 +215,7 @@ steps: - vllm/model_executor/layers - vllm/sampling_metadata.py - tests/samplers + - tests/conftest.py commands: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers @@ -229,20 +231,22 @@ steps: - pytest -v -s test_logits_processor.py - pytest -v -s model_executor/test_guided_processors.py -- label: Speculative decoding tests # 30min +- label: Speculative decoding tests # 40min source_file_dependencies: - vllm/spec_decode - tests/spec_decode + - vllm/model_executor/models/eagle.py commands: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py + - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each mirror_hardwares: [amd] source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py parallelism: 4 - label: "PyTorch Fullgraph Smoke Test" # 9min @@ -367,6 +371,7 @@ steps: - tests/models/encoder_decoder/vision_language commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model @@ -535,6 +540,7 @@ steps: # requires multi-GPU testing for validation. 
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_minicpmv_tp.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/doc-lint.yml similarity index 93% rename from .github/workflows/sphinx-lint.yml rename to .github/workflows/doc-lint.yml index e0bb24276a653..2f5ee8bbfd8c5 100644 --- a/.github/workflows/sphinx-lint.yml +++ b/.github/workflows/doc-lint.yml @@ -13,7 +13,7 @@ on: - "docs/**" jobs: - sphinx-lint: + doc-lint: runs-on: ubuntu-latest strategy: matrix: @@ -29,4 +29,4 @@ jobs: python -m pip install --upgrade pip pip install -r requirements-lint.txt - name: Linting docs - run: tools/sphinx-lint.sh + run: tools/doc-lint.sh diff --git a/.gitignore b/.gitignore index bb7e4d5b244a8..89dab8f13bab1 100644 --- a/.gitignore +++ b/.gitignore @@ -79,10 +79,7 @@ instance/ # Sphinx documentation docs/_build/ -docs/source/getting_started/examples/*.rst -!**/*.template.rst -docs/source/getting_started/examples/*.md -!**/*.template.md +docs/source/getting_started/examples/ # PyBuilder .pybuilder/ diff --git a/Dockerfile b/Dockerfile index 088314eb38dbe..4542bc9cf0bd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,8 @@ # to run the OpenAI compatible server. # Please update any changes made here to -# docs/source/dev/dockerfile/dockerfile.md and -# docs/source/assets/dev/dockerfile-stages-dependency.png +# docs/source/contributing/dockerfile/dockerfile.md and +# docs/source/assets/contributing/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### @@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image # define sagemaker first, so it is not default from `docker build` FROM vllm-openai-base AS vllm-sagemaker -COPY examples/sagemaker-entrypoint.sh . +COPY examples/online_serving/sagemaker-entrypoint.sh . RUN chmod +x sagemaker-entrypoint.sh ENTRYPOINT ["./sagemaker-entrypoint.sh"] diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 269139fe90f0b..e9cb82889decd 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -15,8 +15,8 @@ RUN apt-get update && \ ffmpeg libsm6 libxext6 libgl1 ### Mount Point ### -# When launching the container, mount the code directory to /app -ARG APP_MOUNT=/app +# When launching the container, mount the code directory to /workspace +ARG APP_MOUNT=/workspace VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT}/vllm @@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install sentencepiece transformers==4.45.2 -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install pytest COPY . . 
ARG GIT_REPO_CHECK=0 @@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils +# overwrite entrypoint to run bash script +RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py + CMD ["/bin/bash"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 8bd188ffde408..32bcbfa9cc168 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install -U pip # install build requirements RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt # build vLLM with OpenVINO backend diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index 971248577983f..d3cd1c7b313bc 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,7 +4,7 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 +RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba @@ -18,9 +18,8 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi -# These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ + RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ diff --git a/README.md b/README.md index f83c9d759b359..67c557bfe13a9 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,10 @@ Easy, fast, and cheap LLM serving for everyone --- +The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui) + +--- + *Latest News* 🔥 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). @@ -37,7 +41,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. 
vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. @@ -77,7 +81,7 @@ pip install vllm Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more. - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) -- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) +- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) ## Contributing @@ -90,28 +94,33 @@ vLLM is a community project. Our compute resources for development and testing a - +Cash Donations: - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita AI - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/SECURITY.md b/SECURITY.md index ad3f1f16ab560..de0032d26c87b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). 
--- diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..9d71e4ecc4a37 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -417,14 +417,35 @@ def get_model(pretrained_model_name_or_path: str) -> str: def get_tokenizer( - pretrained_model_name_or_path: str, trust_remote_code: bool + pretrained_model_name_or_path: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + **kwargs, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path): pretrained_model_name_or_path = get_model( pretrained_model_name_or_path) - return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError( + "Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + if tokenizer_mode == "mistral": + try: + from vllm.transformers_utils.tokenizer import MistralTokenizer + except ImportError as e: + raise ImportError("MistralTokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use mistral tokenizer mode.") from e + return MistralTokenizer.from_pretrained( + str(pretrained_model_name_or_path)) + else: + return AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) ASYNC_REQUEST_FUNCS = { diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 0a14aedd5feba..77c4f6aa927e4 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -13,6 +13,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser @@ -40,6 +41,20 @@ def main(args: argparse.Namespace): "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + )) + def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: with torch.profiler.profile( @@ -49,15 +64,11 @@ def run_to_completion(profile_dir: Optional[str] = None): ], on_trace_ready=torch.profiler.tensorboard_trace_handler( str(profile_dir))) as p: - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) + llm_generate() + print(p.key_averages().table(sort_by="self_cuda_time_total")) else: start_time = time.perf_counter() - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm_generate() end_time = time.perf_counter() latency = end_time - start_time return latency diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 13477ef535e86..0b8fba38156f1 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -2,8 +2,7 @@ Offline benchmark to test the long document QA throughput. 
Example usage: - # This command run the vllm with 50GB CPU memory for offloading - # The workload samples 8 different prompts with a default input + # This workload samples 8 different prompts with a default input # length of 20000 tokens, then replicates each prompt 2 times # in random order. python benchmark_long_document_qa_throughput.py \ diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 5e9381f712e10..3ab421a89c935 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -10,7 +10,8 @@ --model meta-llama/Llama-2-7b-chat-hf \ --enable-prefix-caching \ --num-prompts 1 \ - --repeat-count 100 + --repeat-count 100 \ + --input-length-range 128:256 ShareGPT example usage: # This command samples 20 prompts with input lengths diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 68f7ca1af05ad..714abca2a5ff7 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(MACOSX_FOUND TRUE) +endif() + + # # Define environment variables for special configurations # @@ -13,6 +18,9 @@ endif() include_directories("${CMAKE_SOURCE_DIR}/csrc") + +set (ENABLE_NUMA TRUE) + # # Check the compile flags # @@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") "-mf16c" ) endif() -list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" - "-DVLLM_CPU_EXTENSION") -execute_process(COMMAND cat /proc/cpuinfo - RESULT_VARIABLE CPUINFO_RET - OUTPUT_VARIABLE CPUINFO) +if(MACOSX_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-Xpreprocessor" + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +else() + list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +endif() -if (NOT CPUINFO_RET EQUAL 0) - message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") +if (NOT MACOSX_FOUND) + execute_process(COMMAND cat /proc/cpuinfo + RESULT_VARIABLE CPUINFO_RET + OUTPUT_VARIABLE CPUINFO) + if (NOT CPUINFO_RET EQUAL 0) + message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") + endif() endif() + function (find_isa CPUINFO TARGET OUT) string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) if(NOT ISA_FOUND EQUAL -1) @@ -54,12 +72,17 @@ endfunction() is_avx512_disabled(AVX512_DISABLED) -find_isa(${CPUINFO} "avx2" AVX2_FOUND) -find_isa(${CPUINFO} "avx512f" AVX512_FOUND) -find_isa(${CPUINFO} "POWER10" POWER10_FOUND) -find_isa(${CPUINFO} "POWER9" POWER9_FOUND) -find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support -find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(APPLE_SILICON_FOUND TRUE) +else() + find_isa(${CPUINFO} "avx2" AVX2_FOUND) + find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "POWER10" POWER10_FOUND) + find_isa(${CPUINFO} "POWER9" POWER9_FOUND) + find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support + find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +endif() + if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -103,6 +126,9 @@ elseif (ASIMD_FOUND) set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") endif() list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) +elseif(APPLE_SILICON_FOUND) + message(STATUS "Apple Silicon Detected") + set(ENABLE_NUMA OFF) else() message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() @@ 
-139,7 +165,12 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") -list(APPEND LIBS numa) +if(ENABLE_NUMA) + list(APPEND LIBS numa) +else() + message(STATUS "NUMA is disabled") + add_compile_definitions(-DVLLM_NUMA_DISABLED) +endif() # # _C extension diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index 73e0f8cb2e0fb..ae062a5b86892 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec { vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); } } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. if (remainder > 0) { float16x8_t temp = reg.val[full_blocks]; - for (int i = 0; i < remainder; ++i) { - reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) + { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; } } } diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 1138a55df2f05..42a1c1d924bac 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -1,10 +1,22 @@ -#include -#include -#include -#include +#ifndef VLLM_NUMA_DISABLED + #include + #include + #include + #include +#endif #include "cpu_types.hpp" +#ifdef VLLM_NUMA_DISABLED +std::string init_cpu_threads_env(const std::string& cpu_ids) { + return std::string( + "Warning: NUMA is not enabled in this build. 
`init_cpu_threads_env` has " + "no effect to setup thread affinity."); +} + +#endif + +#ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); TORCH_CHECK(omp_cpu_mask->size > 0); @@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { omp_lock_t writelock; omp_init_lock(&writelock); -#pragma omp parallel for schedule(static, 1) + #pragma omp parallel for schedule(static, 1) for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { cpu_set_t mask; CPU_ZERO(&mask); @@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } +#endif \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf1020d5..5b801f79d1f26 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -18,3 +18,7 @@ help: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + rm -rf "$(SOURCEDIR)/getting_started/examples" diff --git a/docs/README.md b/docs/README.md index 46488c9bb0b92..1a44c1341f4fb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -16,4 +16,5 @@ make html ```bash python -m http.server -d build/html/ ``` + Launch your browser and open localhost:8000. diff --git a/docs/dev-docker/README.md b/docs/dev-docker/README.md index 11c0ef04fd8f7..c3496358c15d9 100644 --- a/docs/dev-docker/README.md +++ b/docs/dev-docker/README.md @@ -1,6 +1,6 @@ # vllm FP8 Latency and Throughput benchmarks on AMD MI300x -Documentation for vLLM Inferencing on AMD Instinct platforms. +Documentation for vLLM Inferencing on AMD Instinct platforms. ## Overview @@ -10,11 +10,9 @@ This documentation shows some reference performance numbers and the steps to rep It includes: - - ROCm™ 6.3 - - - vLLM 0.6.3 - - - PyTorch 2.6dev (nightly) +- ROCm™ 6.3 +- vLLM 0.6.3 +- PyTorch 2.6dev (nightly) ## System configuration @@ -39,16 +37,15 @@ The performance data below was measured on a server with MI300X accelerators wit | Power cap | 750 W | | SCLK/MCLK | 2100 Mhz / 1300 Mhz | -## Pull latest +## Pull latest You can pull the image with `docker pull rocm/vllm-dev:main` ### What is New - - ROCm 6.3 support - - Potential bug with Tunable Ops not saving due to a PyTorch issue - - +- ROCm 6.3 support +- Potential bug with Tunable Ops not saving due to a PyTorch issue + Gemms are tuned using PyTorch's Tunable Ops feature (https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md) The gemms are automatically enabled in the docker image, and all stored gemm configs are kept in /app/_gemm_csv in the same image @@ -56,23 +53,24 @@ The gemms are automatically enabled in the docker image, and all stored gemm co ### Use pre-quantized models -To make it easier to run fp8 Llama 3.1 models on MI300X, the quantized checkpoints are available on AMD Huggingface space as follows +To make it easier to run fp8 Llama 3.1 models on MI300X, the quantized checkpoints are available on AMD Huggingface space as follows -- https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV -- https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV -- https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV -- https://huggingface.co/amd/grok-1-FP8-KV +- +- +- +- -Currently these models are private. Please join https://huggingface.co/amd to access. +Currently these models are private. Please join to access. 
Download the model you want to run. -These FP8 quantized checkpoints were generated with AMD’s Quark Quantizer. For more information about Quark, please refer to https://quark.docs.amd.com/latest/quark_example_torch_llm_gen.html +These FP8 quantized checkpoints were generated with AMD’s Quark Quantizer. For more information about Quark, please refer to ### Quantize your own models -This step is optional for you to use quantized models on your own. Take Llama 3.1 405B as an example. -Download the Model View the Llama-3.1-405B model at https://huggingface.co/meta-llama/Llama-3.1-405B. Ensure that you have been granted access, and apply for it if you do not have access. +This step is optional for you to use quantized models on your own. Take Llama 3.1 405B as an example. + +Download the Model View the Llama-3.1-405B model at . Ensure that you have been granted access, and apply for it if you do not have access. If you do not already have a HuggingFace token, open your user profile (https://huggingface.co/settings/profile), select "Access Tokens", press "+ Create New Token", and create a new Read token. @@ -100,27 +98,29 @@ Similarly, you can download Llama-3.1-70B and Llama-3.1-8B. Run the quantization script in the example folder using the following command line: export MODEL_DIR = [local model checkpoint folder] or meta-llama/Llama-3.1-405B-Instruct + #### single GPU - python3 quantize_quark.py \ - --model_dir $MODEL_DIR \ - --output_dir Llama-3.1-405B-Instruct-FP8-KV \ - --quant_scheme w_fp8_a_fp8 \ - --kv_cache_dtype fp8 \ - --num_calib_data 128 \ - --model_export quark_safetensors \ - --no_weight_matrix_merge - -#### If model size is too large for single GPU, please use multi GPU instead. - python3 quantize_quark.py \ - --model_dir $MODEL_DIR \ - --output_dir Llama-3.1-405B-Instruct-FP8-KV \ - --quant_scheme w_fp8_a_fp8 \ - --kv_cache_dtype fp8 \ - --num_calib_data 128 \ - --model_export quark_safetensors \ - --no_weight_matrix_merge \ - --multi_gpu + python3 quantize_quark.py \ + --model_dir $MODEL_DIR \ + --output_dir Llama-3.1-405B-Instruct-FP8-KV \ + --quant_scheme w_fp8_a_fp8 \ + --kv_cache_dtype fp8 \ + --num_calib_data 128 \ + --model_export quark_safetensors \ + --no_weight_matrix_merge + +#### If model size is too large for single GPU, please use multi GPU instead + + python3 quantize_quark.py \ + --model_dir $MODEL_DIR \ + --output_dir Llama-3.1-405B-Instruct-FP8-KV \ + --quant_scheme w_fp8_a_fp8 \ + --kv_cache_dtype fp8 \ + --num_calib_data 128 \ + --model_export quark_safetensors \ + --no_weight_matrix_merge \ + --multi_gpu ### Launch AMD vLLM Docker @@ -135,7 +135,7 @@ Download and launch the docker, ### Benchmark with AMD vLLM Docker -There are some system settings to be configured for optimum performance on MI300X. +There are some system settings to be configured for optimum performance on MI300X. #### NUMA balancing setting @@ -160,15 +160,16 @@ Some environment variables enhance the performance of the vLLM kernels and PyTor export NCCL_MIN_NCHANNELS=112 export VLLM_FP8_PADDING=1 -You can set both PYTORCH_TUNABLEOP_ENABLED and PYTORCH_TUNABLEOP_TUNING to 1 to performance GEMM tuning for the 1st benchmark run. -It will take some time to complete the tuning during the benchmark. After tuning, it will generate several csv files as the performance lookup database. For the subsequent benchmark runs, you can keep +You can set both PYTORCH_TUNABLEOP_ENABLED and PYTORCH_TUNABLEOP_TUNING to 1 to performance GEMM tuning for the 1st benchmark run. 
+It will take some time to complete the tuning during the benchmark. After tuning, it will generate several csv files as the performance lookup database. For the subsequent benchmark runs, you can keep -PYTORCH_TUNABLEOP_ENABLED as 1 and set -PYTORCH_TUNABLEOP_TUNING to 0 to use the selected kernels. +PYTORCH_TUNABLEOP_ENABLED as 1 and set +PYTORCH_TUNABLEOP_TUNING to 0 to use the selected kernels. ##### vLLM engine performance settings -vLLM provides a number of engine options which can be changed to improve performance. -Refer https://docs.vllm.ai/en/stable/models/engine_args.html for the complete list of vLLM engine options. + +vLLM provides a number of engine options which can be changed to improve performance. +Refer for the complete list of vLLM engine options. Below is a list of options which are useful: - **--max-model-len** : Maximum context length supported by the model instance. Can be set to a lower value than model configuration value to improve performance and gpu memory utilization. - **--max-num-batched-tokens** : The maximum prefill size, i.e., how many prompt tokens can be packed together in a single prefill. Set to a higher value to improve prefill performance at the cost of higher gpu memory utilization. 65536 works well for LLama models. @@ -179,6 +180,7 @@ Below is a list of options which are useful: Note: vLLM's server creation command line (vllm serve) supports the above parameters as command line arguments. ##### Online Gemm Tuning + Online Gemm tuning for small decode batch sizes can improve performance in some cases. e.g. Llama 70B upto Batch size 8 If you want to do limited online tuning use --enforce-eager and tune for particular batch sizes. See example below. @@ -239,8 +241,8 @@ If you want to run Meta-Llama-3.1-405B FP16, please run --input-len 128 \ --output-len 128 -You can change various input-len, output-len, batch size and run the benchmark as well. When output-len is 1, it measures prefill latency (TTFT). -Decoding latency (TPOT) can be calculated based on the measured latency. +You can change various input-len, output-len, batch size and run the benchmark as well. When output-len is 1, it measures prefill latency (TTFT). +Decoding latency (TPOT) can be calculated based on the measured latency. For more information about the parameters, please run @@ -261,7 +263,7 @@ Benchmark Meta-Llama-3.1-405B FP8 with input 128 tokens, output 128 tokens and t --num-scheduler-steps 10 \ --tensor-parallel-size 8 \ --input-len 128 \ - --output-len 128 + --output-len 128 If you want to run Meta-Llama-3.1-405B FP16, please run @@ -294,23 +296,23 @@ For more information about the parameters, please run /app/vllm/benchmarks/benchmark_throughput.py -h -Tensor parallelism (TP) parameters depends on the model size. For Llama 3.1 70B and 8B model, TP 1 can be used as well for MI300X. In general, TP 8 and 1 is recommended to achieve the optimum performance. +Tensor parallelism (TP) parameters depends on the model size. For Llama 3.1 70B and 8B model, TP 1 can be used as well for MI300X. In general, TP 8 and 1 is recommended to achieve the optimum performance. 
##### Online Server Benchmark - + Make the following changes if required - + /app/vllm/benchmarks/backend_request_func.py - + line 242 + "ignore_eos": True, - + /app/vllm/benchmarks/benchmark_serving.py line 245 - interval = np.random.exponential(1.0 / request_rate) line 245 + ## interval = np.random.exponential(1.0 / request_rate) line 246 + interval = 1.0 / request_rate - + Benchmark Meta-Llama-3.1-70B with input 4096 tokens, output 512 tokens and tensor parallelism 8 as an example, - + vllm serve /data/llm/Meta-Llama-3.1-70B-Instruct-FP8-KV \ --swap-space 16 \ --disable-log-requests \ @@ -322,11 +324,11 @@ Benchmark Meta-Llama-3.1-70B with input 4096 tokens, output 512 tokens and tenso --max-num-batched-tokens 65536 \ --gpu-memory-utilization 0.99 \ --num_scheduler-steps 10 - + Change port (for example --port 8005) if port=8000 is currently being used by other processes. - + run client in a separate terminal. Use port_id from previous step else port-id=8000. - + python /app/vllm/benchmarks/benchmark_serving.py \ --port 8000 \ --model /data/llm/Meta-Llama-3.1-70B-Instruct-FP8-KV \ @@ -336,18 +338,18 @@ run client in a separate terminal. Use port_id from previous step else port-id=8 --request-rate 1 \ --num-prompts 500 \ --percentile-metrics ttft,tpot,itl,e2el - + Once all prompts are processed, terminate the server gracefully (ctrl+c). - + ##### CPX mode - + Currently only CPX-NPS1 mode is supported. So ONLY tp=1 is supported in CPX mode. But multiple instances can be started simultaneously (if needed) in CPX-NPS1 mode. - + Set GPUs in CPX mode - + rocm-smi --setcomputepartition cpx - + Example of running Llama3.1-8B on 1 CPX-NPS1 GPU with input 4096 and output 512. As mentioned above, tp=1. HIP_VISIBLE_DEVICES=0 \ @@ -363,42 +365,43 @@ Example of running Llama3.1-8B on 1 CPX-NPS1 GPU with input 4096 and output 512. --output-json \ --quantization fp8 \ --gpu-memory-utilization 0.99 - + Set GPU to SPX mode. rocm-smi --setcomputepartition spx ### Speculative Decoding -Speculative decoding is one of the key features in vLLM. It has been supported on MI300. Here below is an example of the performance benchmark w/wo speculative decoding for Llama 3.1 405B with Llama 3.1 8B as the draft model. +Speculative decoding is one of the key features in vLLM. It has been supported on MI300. Here below is an example of the performance benchmark w/wo speculative decoding for Llama 3.1 405B with Llama 3.1 8B as the draft model. -Without Speculative Decoding - +Without Speculative Decoding - python benchmark_latency.py --model /models/models--amd--Meta-Llama-3.1-405B-Instruct-FP8-KV/ --max-model-len 26720 -tp 8 --batch-size 1 --use-v2-block-manager --input-len 1024 --output-len 128 -With Speculative Decoding - +With Speculative Decoding - python benchmark_latency.py --model /models/models--amd--Meta-Llama-3.1-405B-Instruct-FP8-KV/ --max-model-len 26720 -tp 8 --batch-size 1 --use-v2-block-manager --input-len 1024 --output-len 128 --speculative-model /models/models--amd--Meta-Llama-3.1-8B-Instruct-FP8-KV/ --num-speculative-tokens 5 -You should see some performance improvement about the e2e latency. +You should see some performance improvement about the e2e latency. 
### MMLU_PRO_Biology Accuracy Eval - + ### fp16 + vllm (pretrained=models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 64 - + | Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr| |-------|------:|--------------|-----:|-----------|---|-----:|---|-----:| |biology| 0|custom-extract| 5|exact_match|↑ |0.8466|± |0.0135| - + ### fp8 + vllm (pretrained=models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,quantization=fp8,quantized_weights_path=/llama.safetensors,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 32 - + | Tasks |Version| Filter |n-shot| Metric | |Value| |Stderr| |-------|------:|--------------|-----:|-----------|---|----:|---|-----:| |biology| 0|custom-extract| 5|exact_match|↑ |0.848|± |0.0134| - ## Performance ### LLaMA2/3 *MLPerf* 70B @@ -408,18 +411,18 @@ Please refer to the MLPerf instructions for recreating the MLPerf numbers. ## Version ### Release Notes + 20240906a: Legacy quantization formats required `--quantization fp8_rocm` as a flag instead of `--quantization fp8` Updated: -vLLM: https://github.com/ROCm/vllm/commit/2c60adc83981ada77a77b2adda78ef109d2e2e2b +vLLM: + ### Docker Manifest To reproduce the release docker: -``` -git clone https://github.com/ROCm/vllm.git -cd vllm -git checkout 2c60adc83981ada77a77b2adda78ef109d2e2e2b -docker build -f Dockerfile.rocm -t --build-arg BUILD_HIPBLASLT=1 --build-arg USE_CYTHON=1 . -``` + git clone https://github.com/ROCm/vllm.git + cd vllm + git checkout 2c60adc83981ada77a77b2adda78ef109d2e2e2b + docker build -f Dockerfile.rocm -t --build-arg BUILD_HIPBLASLT=1 --build-arg USE_CYTHON=1 . diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 25a700033cc9e..8217bc3ba3ded 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -3,6 +3,8 @@ sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 myst-parser==3.0.1 sphinx-argparse==0.4.0 +sphinx-design==0.6.1 +sphinx-togglebutton==0.3.2 msgspec cloudpickle diff --git a/docs/source/dev/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md similarity index 100% rename from docs/source/dev/engine/async_llm_engine.md rename to docs/source/api/engine/async_llm_engine.md diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/api/engine/index.md similarity index 100% rename from docs/source/dev/engine/engine_index.md rename to docs/source/api/engine/index.md diff --git a/docs/source/dev/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md similarity index 100% rename from docs/source/dev/engine/llm_engine.md rename to docs/source/api/engine/llm_engine.md diff --git a/docs/source/api/inference_params.md b/docs/source/api/inference_params.md new file mode 100644 index 0000000000000..181c30cab9c4a --- /dev/null +++ b/docs/source/api/inference_params.md @@ -0,0 +1,21 @@ +# Inference Parameters + +Inference parameters for vLLM APIs. + +(sampling-params)= + +## Sampling Parameters + +```{eval-rst} +.. autoclass:: vllm.SamplingParams + :members: +``` + +(pooling-params)= + +## Pooling Parameters + +```{eval-rst} +.. 
autoclass:: vllm.PoolingParams + :members: +``` diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md new file mode 100644 index 0000000000000..e103a51d0070d --- /dev/null +++ b/docs/source/api/model/adapters.md @@ -0,0 +1,9 @@ +# Model Adapters + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.adapters + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md new file mode 100644 index 0000000000000..113792147be7c --- /dev/null +++ b/docs/source/api/model/index.md @@ -0,0 +1,11 @@ +# Model Development + +## Submodules + +```{toctree} +:maxdepth: 1 + +interfaces_base +interfaces +adapters +``` diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md new file mode 100644 index 0000000000000..55bee57f64faa --- /dev/null +++ b/docs/source/api/model/interfaces.md @@ -0,0 +1,9 @@ +# Optional Interfaces + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.interfaces + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md new file mode 100644 index 0000000000000..75d58d34228e9 --- /dev/null +++ b/docs/source/api/model/interfaces_base.md @@ -0,0 +1,9 @@ +# Base Model Interfaces + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.interfaces_base + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md new file mode 100644 index 0000000000000..14efdb506d76f --- /dev/null +++ b/docs/source/api/multimodal/index.md @@ -0,0 +1,28 @@ +(multi-modality)= + +# Multi-Modality + +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. + +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. + +Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). + +## Module Contents + +```{eval-rst} +.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY +``` + +## Submodules + +```{toctree} +:maxdepth: 1 + +inputs +parse +processing +profiling +registry +``` diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md new file mode 100644 index 0000000000000..76b2fb95a5009 --- /dev/null +++ b/docs/source/api/multimodal/inputs.md @@ -0,0 +1,49 @@ +# Input Definitions + +## User-facing inputs + +```{eval-rst} +.. autodata:: vllm.multimodal.inputs.MultiModalDataDict +``` + +## Internal data structures + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.PlaceholderRange + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autodata:: vllm.multimodal.inputs.NestedTensors +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs + :members: + :show-inheritance: +``` + +```{eval-rst} +.. 
autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 + :members: + :show-inheritance: +``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md new file mode 100644 index 0000000000000..4676139efe626 --- /dev/null +++ b/docs/source/api/multimodal/parse.md @@ -0,0 +1,9 @@ +# Data Parsing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.parse + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md new file mode 100644 index 0000000000000..0d81c8d3966ee --- /dev/null +++ b/docs/source/api/multimodal/processing.md @@ -0,0 +1,9 @@ +# Data Processing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.processing + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md new file mode 100644 index 0000000000000..b455145212202 --- /dev/null +++ b/docs/source/api/multimodal/profiling.md @@ -0,0 +1,9 @@ +# Memory Profiling + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.profiling + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md new file mode 100644 index 0000000000000..0737a4385cf32 --- /dev/null +++ b/docs/source/api/multimodal/registry.md @@ -0,0 +1,9 @@ +# Registry + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.registry + :members: + :member-order: bysource +``` diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/api/offline_inference/index.md similarity index 100% rename from docs/source/dev/offline_inference/offline_index.md rename to docs/source/api/offline_inference/index.md diff --git a/docs/source/dev/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md similarity index 100% rename from docs/source/dev/offline_inference/llm.md rename to docs/source/api/offline_inference/llm.md diff --git a/docs/source/dev/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md similarity index 100% rename from docs/source/dev/offline_inference/llm_inputs.md rename to docs/source/api/offline_inference/llm_inputs.md diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png similarity index 100% rename from docs/source/assets/dev/dockerfile-stages-dependency.png rename to docs/source/assets/contributing/dockerfile-stages-dependency.png diff --git a/docs/source/serving/architecture_helm_deployment.png b/docs/source/assets/deployment/architecture_helm_deployment.png similarity index 100% rename from docs/source/serving/architecture_helm_deployment.png rename to docs/source/assets/deployment/architecture_helm_deployment.png diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index c6f83b3a92ca0..fb93e65673dff 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -5,26 +5,34 @@ vLLM is a community project. 
Our compute resources for development and testing a +Cash Donations: + - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: + - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita AI - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/docs/source/conf.py b/docs/source/conf.py index 71394c5302a39..7aa52db092e36 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,11 @@ "sphinx.ext.autosummary", "myst_parser", "sphinxarg.ext", + "sphinx_design", + "sphinx_togglebutton", +] +myst_enable_extensions = [ + "colon_fence", ] # Add any paths that contain templates here, relative to this directory. @@ -51,7 +56,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns: List[str] = ["**/*.template.md"] +exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 7ffec83333d7d..cb142318b8724 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -1,7 +1,7 @@ # Dockerfile We provide a to construct the image for running an OpenAI compatible server with vLLM. -More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md). +More information about deploying with Docker can be found [here](#deployment-docker). Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: @@ -17,7 +17,7 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png + > ```{figure} /assets/contributing/dockerfile-stages-dependency.png > :align: center > :alt: query > :width: 100% diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index 002808ac5fbbd..b9b92fd027f6e 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -1,6 +1,6 @@ (new-model-basic)= -# Basic Implementation +# Implementing a Basic Model This guide walks you through the steps to implement a basic vLLM model. @@ -57,7 +57,17 @@ class MyModelForCausalLM(nn.Module): ### Computation Code -Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. +- Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model. + +```python +class MyModel(nn.Module): + ... 
+ + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + ... +``` + +- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. ```python def forward( diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md index a2d601c83cf47..fe018b61b08cf 100644 --- a/docs/source/contributing/model/index.md +++ b/docs/source/contributing/model/index.md @@ -2,7 +2,7 @@ # Adding a New Model -This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. +This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. ```{toctree} :caption: Contents @@ -10,6 +10,7 @@ This section provides more information on how to integrate a [HuggingFace Transf basic registration +tests multimodal ``` diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index e5dcd1223b361..e5fd9a2877ceb 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -1,6 +1,6 @@ -(enabling-multimodal-inputs)= +(supports-multimodal)= -# Enabling Multimodal Inputs +# Multi-Modal Support This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). @@ -9,7 +9,78 @@ This document walks you through the steps to extend a basic model so that it acc It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic). Further update the model as follows: -- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + + More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it. + +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. + + ```python + class YourModelForImage2Seq(nn.Module): + ... 
+ + def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: + + assert self.vision_encoder is not None + image_features = self.vision_encoder(image_input) + return self.multi_modal_projector(image_features) + + def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]: + + # Validate the multimodal input keyword arguments + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + # Run multimodal inputs through encoder and projector + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + ``` + + ```{important} + The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. + ``` + +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. + + ```python + from .utils import merge_multimodal_embeddings + + class YourModelForImage2Seq(nn.Module): + ... + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + # `get_input_embeddings` should already be implemented for the language + # model as one of the requirements of basic vLLM model implementation. + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_index) + + return inputs_embeds + ``` + +- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. ```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal @@ -23,117 +94,359 @@ Further update the model as follows: Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. ``` -- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward` - for each input tensor that corresponds to a multi-modal input, as shown in the following example: +## 2. Specify processing information - ```diff - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - ``` +Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo` +to provide basic information related to HF processing. -## 2. Register input mappers +### Maximum number of input items -For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper `. -This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`. 
+You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits` +to return the maximum number of input items for each modality supported by the model. -```diff - from vllm.model_executor.models.interfaces import SupportsMultiModal -+ from vllm.multimodal import MULTIMODAL_REGISTRY +For example, if the model supports any number of images but only one video per prompt: -+ @MULTIMODAL_REGISTRY.register_image_input_mapper() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +```python +def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": 1} ``` -A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. +### Maximum number of placeholder feature tokens + +Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item` +to return the maximum number of placeholder feature tokens per input item for each modality. + +When calling the model, the output embeddings from the visual encoder are assigned to the input positions +containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal +to the size of the output embeddings. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at the code of HF's `LlavaForConditionalGeneration`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 +n_image_tokens = (input_ids == self.config.image_token_index).sum().item() +n_image_features = image_features.shape[0] * image_features.shape[1] + +if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) +special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) +) +image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) +inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) +``` -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) +The number of placeholder feature tokens per image is `image_features.shape[1]`. +`image_features` is calculated inside the `get_image_features` method: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 +image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + +selected_image_feature = image_outputs.hidden_states[vision_feature_layer] +if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] +elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature +else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") +image_features = self.multi_modal_projector(selected_image_feature) +return image_features ``` -## 3. Register maximum number of multi-modal tokens +We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower +(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). +Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. 
+The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention +mechanism doesn't change the sequence length of the output hidden states. + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 +hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) +hidden_states = self.pre_layrnorm(hidden_states) + +encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, +) +``` -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item -and register it via {meth}`INPUT_REGISTRY.register_dummy_data `. +To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 +target_dtype = self.patch_embedding.weight.dtype +patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] +patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + +class_embeds = self.class_embedding.expand(batch_size, 1, -1) +embeddings = torch.cat([class_embeds, patch_embeds], dim=1) +if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) +else: + embeddings = embeddings + self.position_embedding(self.position_ids) +return embeddings +``` -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY +We can infer that `embeddings.shape[1] == self.num_positions`, where - @MULTIMODAL_REGISTRY.register_image_input_mapper() -+ @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 +self.num_patches = (self.image_size // self.patch_size) ** 2 +self.num_positions = self.num_patches + 1 ``` -Here are some examples: +Overall, the number of placeholder feature tokens for an image can be calculated as: -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) +```python +def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, +) -> int: + hf_config = self.get_hf_config() + hf_processor = self.get_hf_processor() -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) + image_size = hf_config.vision_config.image_size + patch_size = hf_config.vision_config.patch_size + + num_image_tokens = (image_size // patch_size) ** 2 + 1 + if hf_processor.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + return num_image_tokens ``` -## 4. (Optional) Register dummy data +Notice that the number of image tokens doesn't depend on the image width and height. +So, we can calculate the maximum number of image tokens using any image size: -During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. 
-In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data `. +```python +def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + width = height = hf_config.image_size + return ImageSize(width=width, height=height) -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY +def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() -+ @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) +``` + +And thus, we can override the method as: + +```python +def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} ``` ```{note} -The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. +Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. ``` -Here are some examples: +::: +:::: + +## 3. Specify dummy inputs + +Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for +HF processing as well as memory profiling. + +### For memory profiling + +Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` +to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of +the model so that vLLM can reserve the correct amount of memory for it. + +Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based +on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava +Making use of the `get_image_size_with_most_features` method implemented in the previous section: + +```python +def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], +) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + hf_config = self.get_hf_config() + target_width, target_height = self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) +``` + +::: +:::: + +## 4. Specify processing details -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) +Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` +to fill in the missing details about HF processing. ```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) +[Multi-Modal Data Processing](#mm-processing) ``` -## 5. 
(Optional) Register input processor +### Multi-modal fields + +Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to +return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at the model's `forward` method: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L387-L404 +def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, +) -> Union[Tuple, LlavaCausalLMOutputWithPast]: +``` -Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor. -This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call. -You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor `. +The only related keyword argument is `pixel_values` which directly corresponds to input images. +The shape of `pixel_values` is `(N, C, H, W)` where `N` is the number of images. +So, we override the method as follows: + +```python +def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], +) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + ) +``` -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY +```{note} +Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports +pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. +``` - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() -+ @INPUT_REGISTRY.register_input_processor() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +::: +:::: + +### Prompt replacements + +Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to +return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances. + +Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace +operation performed by the HF processor. 
+ +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at HF's `LlavaProcessor`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 +prompt_strings = [] +for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) ``` -A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. -Here are some examples: +It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). +Based on this, we override the method as follows: + +```python +def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, +) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] +``` -- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) +::: +:::: -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) +## 5. Register processor-related classes + +After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2), +{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3), +and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4), +decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor ` +to register them to the multi-modal registry: + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder) + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md index cf1cdb0c9de0f..d6c9e4181dfee 100644 --- a/docs/source/contributing/model/registration.md +++ b/docs/source/contributing/model/registration.md @@ -1,9 +1,9 @@ (new-model-registration)= -# Model Registration +# Registering a Model to vLLM vLLM relies on a model registry to determine how to run each model. -A list of pre-registered architectures can be found on the [Supported Models](#supported-models) page. +A list of pre-registered architectures can be found [here](#supported-models). If your model is not on this list, you must register it to vLLM. This page provides detailed instructions on how to do so. @@ -15,8 +15,7 @@ This gives you the ability to modify the codebase and test your model. After you have implemented your model (see [tutorial](#new-model-basic)), put it into the directory. Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. 
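For illustration, an entry in `_VLLM_MODELS` generally maps the architecture name (as it appears in the model's `config.json`) to the module under `vllm/model_executor/models/` and the class implementing it. The sketch below is an assumption based on the registry layout at the time of writing, with placeholder names; verify it against the actual registry file before copying it.

```python
# Hypothetical sketch of an in-tree registry entry (placeholder names).
# The architecture name maps to (module name, class name) so that the
# model module is only imported when that architecture is requested.
_VLLM_MODELS = {
    "YourModelForCausalLM": ("your_model", "YourModelForCausalLM"),
}
```

Keeping the mapping as plain strings keeps model imports lazy, so architectures that are never requested do not have to be imported at startup.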
-You should also include an example HuggingFace repository for this model in to run the unit tests. -Finally, update the [Supported Models](#supported-models) documentation page to promote your model! +Finally, update our [list of supported models](#supported-models) to promote your model! ```{important} The list of models in each section should be maintained in alphabetical order. @@ -48,7 +47,7 @@ ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCaus ```{important} If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that [here](#enabling-multimodal-inputs). +Read more about that [here](#supports-multimodal). ``` ```{note} diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md new file mode 100644 index 0000000000000..74c933b2f45da --- /dev/null +++ b/docs/source/contributing/model/tests.md @@ -0,0 +1,63 @@ +(new-model-tests)= + +# Writing Unit Tests + +This page explains how to write unit tests to verify the implementation of your model. + +## Required Tests + +These tests are necessary to get your PR merged into vLLM library. +Without them, the CI for your PR will fail. + +### Model loading + +Include an example HuggingFace repository for your model in . +This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. + +```{important} +The list of models in each section should be maintained in alphabetical order. +``` + +```{tip} +If your model requires a development version of HF Transformers, you can set +`min_transformers_version` to skip the test in CI until the model is released. +``` + +## Optional Tests + +These tests are optional to get your PR merged into vLLM library. +Passing these tests provides more confidence that your implementation is correct, and helps avoid future regressions. + +### Model correctness + +These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of . + +#### Generative models + +For [generative models](#generative-models), there are two levels of correctness tests, as defined in : + +- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. +- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. + +#### Pooling models + +For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in . + +(mm-processing-tests)= + +### Multi-modal processing + +#### Common tests + +Adding your model to verifies that the following input combinations result in the same outputs: + +- Text + multi-modal data +- Tokens + multi-modal data +- Text + cached multi-modal data +- Tokens + cached multi-modal data + +#### Model-specific tests + +You can add a new file under to run tests that only apply to your model. + +For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in . 
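As a reference for the correctness tests described above, here is a minimal sketch of a logprobs-similarity test for a generative model. It assumes the `hf_runner`, `vllm_runner`, and `example_prompts` pytest fixtures and the `check_logprobs_close` helper used by the existing model tests; treat the import path, method names, and model name as assumptions to verify against the current test utilities.

```python
import pytest

# Assumed location of the helper; adjust to where it lives in the test tree.
from tests.models.utils import check_logprobs_close

MODELS = ["your-org/your-model"]  # placeholder HF repository


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model, dtype,
                max_tokens, num_logprobs) -> None:
    # Reference outputs from HF Transformers.
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    # Outputs from vLLM for the same prompts.
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # vLLM's top-k logprobs should contain HF's chosen tokens and vice versa.
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
```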
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index c960790f47a13..e92104399342d 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -37,8 +37,6 @@ pytest tests/ Currently, the repository is not fully checked by `mypy`. ``` -# Contribution Guidelines - ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 46210957c19ec..001db86bdf555 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. ### OpenAI Server diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md new file mode 100644 index 0000000000000..422dc13e6a644 --- /dev/null +++ b/docs/source/contributing/vulnerability_management.md @@ -0,0 +1,43 @@ +# Vulnerability Management + +## Reporting Vulnerabilities + +As mentioned in the [security +policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security +vulnerabilities may be reported privately to the project via +[GitHub](https://github.com/vllm-project/vllm/security/advisories/new). + +## Vulnerability Management Team + +Once a vulnerability has been reported to the project, the Vulnerability +Management Team (VMT) is responsible for managing the vulnerability. The VMT is +responsible for: + +- Triaging the vulnerability. +- Coordinating with reporters and project maintainers on vulnerability analysis + and resolution. +- Drafting of security advisories for confirmed vulnerabilities, as appropriate. +- Coordination with project maintainers on a coordinated release of the fix and + security advisory. + +### Security Advisories + +Advisories are published via GitHub through the same system used to report +vulnerabilities. More information on the process can be found in the [GitHub +documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories). + +### Team Members + +We prefer to keep all vulnerability-related communication on the security report +on GitHub. However, if you need to contact the VMT directly for an urgent issue, +you may contact the following individuals: + +- Simon Mo - simon.mo@hey.com +- Russell Bryant - rbryant@redhat.com + +## Slack Discussion + +You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai) +to discuss security-related topics. However, please do not disclose any +vulnerabilities in this channel. If you need to report a vulnerability, please +use the GitHub security advisory system or contact a VMT member privately. 
diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/deployment/docker.md similarity index 92% rename from docs/source/serving/deploying_with_docker.md rename to docs/source/deployment/docker.md index 844bd27800c7a..9e301483ef7f9 100644 --- a/docs/source/serving/deploying_with_docker.md +++ b/docs/source/deployment/docker.md @@ -1,6 +1,8 @@ -(deploying-with-docker)= +(deployment-docker)= -# Deploying with Docker +# Using Docker + +(deployment-docker-pre-built-image)= ## Use vLLM's Official Docker Image @@ -23,13 +25,15 @@ container to access the host's shared memory. vLLM uses PyTorch, which uses shar memory to share data between processes under the hood, particularly for tensor parallel inference. ``` +(deployment-docker-build-image-from-source)= + ## Building vLLM's Docker Image from Source You can build and run vLLM from source via the provided . To build vLLM: ```console -$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai ``` ```{note} diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/deployment/frameworks/bentoml.md similarity index 71% rename from docs/source/serving/deploying_with_bentoml.md rename to docs/source/deployment/frameworks/bentoml.md index dfa0de4f0f6d7..2bf435bda8380 100644 --- a/docs/source/serving/deploying_with_bentoml.md +++ b/docs/source/deployment/frameworks/bentoml.md @@ -1,7 +1,7 @@ -(deploying-with-bentoml)= +(deployment-bentoml)= -# Deploying with BentoML +# BentoML -[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. +[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html). diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md similarity index 93% rename from docs/source/serving/deploying_with_cerebrium.md rename to docs/source/deployment/frameworks/cerebrium.md index 950064c8c1b10..5787c4a407bfb 100644 --- a/docs/source/serving/deploying_with_cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -1,6 +1,6 @@ -(deploying-with-cerebrium)= +(deployment-cerebrium)= -# Deploying with Cerebrium +# Cerebrium ```{raw} html

@@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr To install the Cerebrium client, run: ```console -$ pip install cerebrium -$ cerebrium login +pip install cerebrium +cerebrium login ``` Next, create your Cerebrium project, run: ```console -$ cerebrium init vllm-project +cerebrium init vllm-project ``` Next, to install the required packages, add the following to your cerebrium.toml: @@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): Then, run the following code to deploy it to the cloud: ```console -$ cerebrium deploy +cerebrium deploy ``` -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`) +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`) ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/deployment/frameworks/dstack.md similarity index 95% rename from docs/source/serving/deploying_with_dstack.md rename to docs/source/deployment/frameworks/dstack.md index 381f5f786ca2c..b42a34125c6d7 100644 --- a/docs/source/serving/deploying_with_dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -1,6 +1,6 @@ -(deploying-with-dstack)= +(deployment-dstack)= -# Deploying with dstack +# dstack ```{raw} html

@@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```console -$ pip install "dstack[all] -$ dstack server +pip install "dstack[all] +dstack server ``` Next, to configure your dstack project, run: ```console -$ mkdir -p vllm-dstack -$ cd vllm-dstack -$ dstack init +mkdir -p vllm-dstack +cd vllm-dstack +dstack init ``` Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/deployment/frameworks/helm.md similarity index 98% rename from docs/source/serving/deploying_with_helm.md rename to docs/source/deployment/frameworks/helm.md index 7286a0a88968f..18ed293191468 100644 --- a/docs/source/serving/deploying_with_helm.md +++ b/docs/source/deployment/frameworks/helm.md @@ -1,6 +1,6 @@ -(deploying-with-helm)= +(deployment-helm)= -# Deploying with Helm +# Helm A Helm chart to deploy vLLM for Kubernetes @@ -38,7 +38,7 @@ chart **including persistent volumes** and deletes the release. ## Architecture -```{image} architecture_helm_deployment.png +```{image} /assets/deployment/architecture_helm_deployment.png ``` ## Values diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md new file mode 100644 index 0000000000000..964782763f6b3 --- /dev/null +++ b/docs/source/deployment/frameworks/index.md @@ -0,0 +1,14 @@ +# Using other frameworks + +```{toctree} +:maxdepth: 1 + +bentoml +cerebrium +dstack +helm +lws +modal +skypilot +triton +``` diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/deployment/frameworks/lws.md similarity index 91% rename from docs/source/serving/deploying_with_lws.md rename to docs/source/deployment/frameworks/lws.md index 22bab419eaca3..349fa83fbcb9d 100644 --- a/docs/source/serving/deploying_with_lws.md +++ b/docs/source/deployment/frameworks/lws.md @@ -1,6 +1,6 @@ -(deploying-with-lws)= +(deployment-lws)= -# Deploying with LWS +# LWS LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference. diff --git a/docs/source/deployment/frameworks/modal.md b/docs/source/deployment/frameworks/modal.md new file mode 100644 index 0000000000000..e7c42088e36a9 --- /dev/null +++ b/docs/source/deployment/frameworks/modal.md @@ -0,0 +1,7 @@ +(deployment-modal)= + +# Modal + +vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. + +For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference). diff --git a/docs/source/serving/run_on_sky.md b/docs/source/deployment/frameworks/skypilot.md similarity index 94% rename from docs/source/serving/run_on_sky.md rename to docs/source/deployment/frameworks/skypilot.md index 115873ae49292..051fc2f2a8d4e 100644 --- a/docs/source/serving/run_on_sky.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -1,6 +1,6 @@ -(on-cloud)= +(deployment-skypilot)= -# Deploying and scaling up with SkyPilot +# SkyPilot ```{raw} html

@@ -12,9 +12,9 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet ## Prerequisites -- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`. +- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-8B-Instruct`. - Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). -- Check that {code}`sky check` shows clouds or Kubernetes are enabled. +- Check that `sky check` shows clouds or Kubernetes are enabled. ```console pip install skypilot-nightly @@ -61,7 +61,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://localhost:8081/v1 \ @@ -321,7 +321,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://$ENDPOINT/v1 \ @@ -334,12 +334,12 @@ run: | 1. Start the chat web UI: -```console -sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) -``` + ```console + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + ``` 2. Then, we can access the GUI at the returned gradio link: -```console -| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live -``` + ```console + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + ``` diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/deployment/frameworks/triton.md similarity index 87% rename from docs/source/serving/deploying_with_triton.md rename to docs/source/deployment/frameworks/triton.md index 9b0a6f1d54ae8..94d87120159c6 100644 --- a/docs/source/serving/deploying_with_triton.md +++ b/docs/source/deployment/frameworks/triton.md @@ -1,5 +1,5 @@ -(deploying-with-triton)= +(deployment-triton)= -# Deploying with NVIDIA Triton +# NVIDIA Triton The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. 
diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md new file mode 100644 index 0000000000000..d47ede8967547 --- /dev/null +++ b/docs/source/deployment/integrations/index.md @@ -0,0 +1,9 @@ +# External Integrations + +```{toctree} +:maxdepth: 1 + +kserve +kubeai +llamastack +``` diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/deployment/integrations/kserve.md similarity index 85% rename from docs/source/serving/deploying_with_kserve.md rename to docs/source/deployment/integrations/kserve.md index feaeb5d0ec8a2..c780fd74e8f55 100644 --- a/docs/source/serving/deploying_with_kserve.md +++ b/docs/source/deployment/integrations/kserve.md @@ -1,6 +1,6 @@ -(deploying-with-kserve)= +(deployment-kserve)= -# Deploying with KServe +# KServe vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/deployment/integrations/kubeai.md similarity index 93% rename from docs/source/serving/deploying_with_kubeai.md rename to docs/source/deployment/integrations/kubeai.md index 3609d7e05acd3..2f5772e075d87 100644 --- a/docs/source/serving/deploying_with_kubeai.md +++ b/docs/source/deployment/integrations/kubeai.md @@ -1,6 +1,6 @@ -(deploying-with-kubeai)= +(deployment-kubeai)= -# Deploying with KubeAI +# KubeAI [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/deployment/integrations/llamastack.md similarity index 92% rename from docs/source/serving/serving_with_llamastack.md rename to docs/source/deployment/integrations/llamastack.md index 71dadca7ad47c..a6c3569637abf 100644 --- a/docs/source/serving/serving_with_llamastack.md +++ b/docs/source/deployment/integrations/llamastack.md @@ -1,13 +1,13 @@ -(run-on-llamastack)= +(deployment-llamastack)= -# Serving with Llama Stack +# Llama Stack vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . To install Llama Stack, run ```console -$ pip install llama-stack -q +pip install llama-stack -q ``` ## Inference using OpenAI Compatible API diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md new file mode 100644 index 0000000000000..cbc95c20ff4b3 --- /dev/null +++ b/docs/source/deployment/k8s.md @@ -0,0 +1,249 @@ +(deployment-k8s)= + +# Using Kubernetes + +Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. + +## Prerequisites + +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` +- Available GPU resources in your cluster + +## Deployment Steps + +1. 
Create a PVC, Secret and Deployment for vLLM + + PVC is used to store the model cache and it is optional, you can use hostPath or other storage options + + ```yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: mistral-7b + namespace: default + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: default + volumeMode: Filesystem + ``` + + Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models + + ```yaml + apiVersion: v1 + kind: Secret + metadata: + name: hf-token-secret + namespace: default + type: Opaque + stringData: + token: "REPLACE_WITH_TOKEN" + ``` + + Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. + + Here are two examples for using NVIDIA GPU and AMD GPU. + + NVIDIA GPU: + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 + ``` + + AMD GPU: + + You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + # PVC + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. 
+ - name: shm + emptyDir: + medium: Memory + sizeLimit: "8Gi" + hostNetwork: true + hostIPC: true + containers: + - name: mistral-7b + image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + securityContext: + seccompProfile: + type: Unconfined + runAsGroup: 44 + capabilities: + add: + - SYS_PTRACE + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + amd.com/gpu: "1" + requests: + cpu: "6" + memory: 6G + amd.com/gpu: "1" + volumeMounts: + - name: cache-volume + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm + ``` + + You can get the full example with steps and sample yaml files from . + +2. Create a Kubernetes Service for vLLM + + Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + + ```yaml + apiVersion: v1 + kind: Service + metadata: + name: mistral-7b + namespace: default + spec: + ports: + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP + ``` + +3. Deploy and Test + + Apply the deployment and service configurations using `kubectl apply -f `: + + ```console + kubectl apply -f deployment.yaml + kubectl apply -f service.yaml + ``` + + To test the deployment, run the following `curl` command: + + ```console + curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' + ``` + + If the service is correctly deployed, you should receive a response from the vLLM model. + +## Conclusion + +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/deployment/nginx.md similarity index 99% rename from docs/source/serving/deploying_with_nginx.md rename to docs/source/deployment/nginx.md index a1f00d8536465..a58f791c2997b 100644 --- a/docs/source/serving/deploying_with_nginx.md +++ b/docs/source/deployment/nginx.md @@ -1,6 +1,6 @@ (nginxloadbalancer)= -# Deploying with Nginx Loadbalancer +# Using Nginx This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 2f1280c047672..cec503ef2f77d 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -53,11 +53,11 @@ for output in outputs: ``` More API details can be found in the {doc}`Offline Inference -` section of the API docs. +` section of the API docs. The code for the `LLM` class can be found in . -### OpenAI-compatible API server +### OpenAI-Compatible API Server The second primary interface to vLLM is via its OpenAI-compatible API server. 
This server can be started using the `vllm serve` command. diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md index 4398536b2b4ad..3928e0c16568b 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -2,11 +2,11 @@ # Automatic Prefix Caching -The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. +The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. -``` +```text Block 1 Block 2 Block 3 [A gentle breeze stirred] [the leaves as children] [laughed in the distance] Block 1: |<--- block tokens ---->| @@ -14,19 +14,16 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->| Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| ``` - In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping: -``` +```text hash(prefix tokens + block tokens) <--> KV Block ``` With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. - This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. - ## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. 
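To make the mapping above concrete, here is a deliberately simplified sketch, an illustration only and not vLLM's actual block manager, of a global table keyed by `hash(prefix tokens + block tokens)`:

```python
# Illustrative sketch: a toy global cache keyed by the hash of
# (prefix tokens, block tokens), so shared prefixes map to shared blocks.
from typing import Dict, List, Tuple

# hash value -> physical KV block (a token tuple stands in for the real keys/values)
global_block_table: Dict[int, Tuple[str, ...]] = {}


def lookup_or_allocate(prefix: List[str], block_tokens: List[str]) -> Tuple[str, ...]:
    block_hash = hash((tuple(prefix), tuple(block_tokens)))
    if block_hash not in global_block_table:
        # Cache miss: "compute" and store the block.
        global_block_table[block_hash] = tuple(block_tokens)
    # Cache hit: reuse the block computed by an earlier request.
    return global_block_table[block_hash]


# Two requests whose first block is "A gentle breeze stirred" share one physical block.
block_a = lookup_or_allocate([], ["A", "gentle", "breeze", "stirred"])
block_b = lookup_or_allocate([], ["A", "gentle", "breeze", "stirred"])
assert block_a is block_b
```

Because the key covers the prefix as well as the block's own tokens, two requests that share a prefix naturally resolve to the same physical block, which is exactly what enables the sharing described above.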
@@ -41,5 +38,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: -- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. -- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. +* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. +* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md deleted file mode 100644 index bb16920e3d0c0..0000000000000 --- a/docs/source/design/input_processing/input_processing_pipeline.md +++ /dev/null @@ -1,19 +0,0 @@ -(input-processing-pipeline)= - -# Input Processing Pipeline - -1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`). - -2. Tokenize the data if necessary. - -3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `. - - - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. - -4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`. - -5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`. - -6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `. - - - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md deleted file mode 100644 index cb415366e5a66..0000000000000 --- a/docs/source/design/input_processing/model_inputs_index.md +++ /dev/null @@ -1,43 +0,0 @@ -(input-processing)= - -# Input Processing - -```{eval-rst} -.. currentmodule:: vllm.inputs -``` - -Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via -{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. - -Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input -data in addition to input prompt, but it can be extended to text-only language models when needed. 
- -## Guides - -```{toctree} -:maxdepth: 1 - -input_processing_pipeline -``` - -## Module Contents - -### LLM Engine Inputs - -```{eval-rst} -.. autoclass:: vllm.inputs.DecoderOnlyInputs - :members: - :show-inheritance: -``` - -### Registry - -```{eval-rst} -.. autodata:: vllm.inputs.INPUT_REGISTRY -``` - -```{eval-rst} -.. automodule:: vllm.inputs.registry - :members: - :show-inheritance: -``` diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md new file mode 100644 index 0000000000000..a0d01205e638c --- /dev/null +++ b/docs/source/design/mm_processing.md @@ -0,0 +1,64 @@ +(mm-processing)= + +# Multi-Modal Data Processing + +To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. ``) and multi-modal inputs (e.g. the raw input image) based on the outputs of the HF processor. + +Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`: + +## Prompt Replacement Detection + +One of the main responsibilities of the HF processor is to replace input placeholder tokens (e.g. `` for a single image) with feature placeholder tokens (e.g. `...`, the number of which equals the feature size). The information about which tokens have been replaced is key to finding the correspondence between placeholder feature tokens and multi-modal inputs. + +In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptReplacement` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. Given this specification, we can automatically detect whether HF has replaced the input placeholder tokens by checking whether the feature placeholder tokens exist in the prompt. + +## Tokenized Prompt Inputs + +To enable tokenization in a separate process, we support passing input token IDs alongside multi-modal data. + +### The problem + +Consider that HF processors follow these main steps: + +1. Tokenize the text +2. Process multi-modal inputs +3. Perform prompt replacement + +And we require that: + +- For text + multi-modal inputs, apply all steps 1--3. + - For tokenized + multi-modal inputs, apply only steps 2--3. + +How can we achieve this without rewriting HF processors? We can try to call the HF processor several times on different inputs: + +- For text + multi-modal inputs, simply call the HF processor directly. + - For tokenized + multi-modal inputs, call the processor only on the multi-modal inputs. + +While HF processors support text + multi-modal inputs natively, this is not so for tokenized + multi-modal inputs: an error is thrown if the number of input placeholder tokens does not correspond to the number of multi-modal inputs. + +Moreover, since the tokenized text has not passed through the HF processor, we have to apply step 3 ourselves to keep the output tokens and multi-modal data consistent with each other. + +(mm-dummy-text)= + +### Dummy text + +We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and pass them in together to obtain the processed multi-modal data.
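The sketch below illustrates this workaround with hypothetical stand-ins (`IMAGE_PLACEHOLDER`, `hf_process`) in place of a real model-specific HF processor; only the {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` hook named above is the actual vLLM interface. Given a tokenized prompt and its images, we build dummy text with the right number of placeholders, run only that through the (stand-in) HF processor, and keep the caller's token IDs:

```python
# Hypothetical stand-ins for illustration only; the real logic lives in the
# model's HF processor and vLLM's BaseMultiModalProcessor.
IMAGE_PLACEHOLDER = "<image>"  # assumed per-model placeholder string


def hf_process(text: str, images: list) -> dict:
    """Toy stand-in for an HF processor call (tokenization + image features)."""
    if text.count(IMAGE_PLACEHOLDER) != len(images):
        raise ValueError("placeholder count does not match the number of images")
    return {
        "input_ids": list(range(len(text))),  # fake token IDs
        "pixel_values": [f"features({i})" for i, _ in enumerate(images)],
    }


def get_dummy_text(num_images: int) -> str:
    """Model-specific rule: one placeholder per multi-modal item."""
    return IMAGE_PLACEHOLDER * num_images


def process_tokenized_inputs(token_ids: list, images: list) -> dict:
    """Tokenized prompt + images: process dummy text so the placeholder count
    matches, then keep only the multi-modal outputs."""
    dummy_outputs = hf_process(get_dummy_text(len(images)), images)
    return {
        "input_ids": token_ids,  # keep the caller's pre-tokenized prompt
        "pixel_values": dummy_outputs["pixel_values"],  # keep processed images
    }


print(process_tokenized_inputs([101, 7592, 102], images=["img0", "img1"]))
```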
+ +(mm-automatic-prompt-replacement)= + +### Automatic prompt replacement + +We address the second issue by implementing model-agnostic code in +{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_replacements` to automatically replace input placeholder tokens with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. + +### Summary + +With the help of dummy text and automatic prompt replacement, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`. + +## Processor Output Caching + +Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again. + +When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache. + +Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt replacement code, we apply [automatic prompt replacement](#mm-automatic-prompt-replacement) afterwards to keep the output tokens and multi-modal data consistent with each other. diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md deleted file mode 100644 index bcccd284879bb..0000000000000 --- a/docs/source/design/multimodal/adding_multimodal_plugin.md +++ /dev/null @@ -1,16 +0,0 @@ -(adding-multimodal-plugin)= - -# Adding a Multimodal Plugin - -This document teaches you how to add a new modality to vLLM. - -Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. -For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`. - -The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s. - -```{note} -This article is a work in progress. -``` - -% TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md deleted file mode 100644 index e4f2171e84ff7..0000000000000 --- a/docs/source/design/multimodal/multimodal_index.md +++ /dev/null @@ -1,83 +0,0 @@ -(multi-modality)= - -# Multi-Modality - -```{eval-rst} -.. currentmodule:: vllm.multimodal -``` - -vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. - -Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) -via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. - -Currently, vLLM only has built-in support for image data. 
You can extend vLLM to process additional modalities -by following [this guide](#adding-multimodal-plugin). - -Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). - -## Guides - -```{toctree} -:maxdepth: 1 - -adding_multimodal_plugin -``` - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal -``` - -### Registry - -```{eval-rst} -.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalRegistry - :members: - :show-inheritance: -``` - -### Base Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.base - :members: - :show-inheritance: -``` - -### Input Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.inputs - :members: - :show-inheritance: -``` - -### Audio Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.audio - :members: - :show-inheritance: -``` - -### Image Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.image - :members: - :show-inheritance: -``` - -### Video Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.video - :members: - :show-inheritance: -``` diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index da87638e5b743..c2cdb75ea08a7 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -21,7 +21,7 @@ This document describes how vLLM deals with these challenges. ## Multiprocessing Methods -[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include: +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python 3.14. diff --git a/docs/source/dev/pooling_params.md b/docs/source/dev/pooling_params.md deleted file mode 100644 index 74b2c57443e4b..0000000000000 --- a/docs/source/dev/pooling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Pooling Parameters - -```{eval-rst} -.. autoclass:: vllm.PoolingParams - :members: -``` diff --git a/docs/source/dev/sampling_params.md b/docs/source/dev/sampling_params.md deleted file mode 100644 index bdc36af5153db..0000000000000 --- a/docs/source/dev/sampling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Sampling Parameters - -```{eval-rst} -.. autoclass:: vllm.SamplingParams - :members: -``` diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 8d8f7dca2e5b5..86a82eb36df33 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -322,7 +322,9 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar ``` -### Feature x Hardware +(feature-x-hardware)= + +## Feature x Hardware ```{list-table} :header-rows: 1 @@ -359,7 +361,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](gh-pr:4830) + - ✅ - ✅ * - prmpt adptr - ✅ diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index 05226f2dec87c..efa2efc66192e 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -1,8 +1,12 @@ (disagg-prefill)= -# Disaggregated prefilling (experimental) +# Disaggregated Prefilling (experimental) -This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. 
+This page introduces you the disaggregated prefilling feature in vLLM. + +```{note} +This feature is experimental and subject to change. +``` ## Why disaggregated prefilling? @@ -17,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput. ## Usage example -Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. +Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. ## Benchmarks diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index cf06916d70f44..b00d05147bb32 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -47,7 +47,7 @@ outputs = llm.generate( ) ``` -Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. ## Serving LoRA Adapters diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index c02fbf0605a8c..404505eb3890e 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage. You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). ```console -$ pip install autoawq +pip install autoawq ``` After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index f7f41726f3725..7525e8e7866c3 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal Below are the steps to utilize BitsAndBytes with vLLM. ```console -$ pip install bitsandbytes>=0.45.0 +pip install bitsandbytes>=0.45.0 ``` vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. @@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and You can find bitsandbytes quantized models on . And usually, these repositories have a config.json file that includes a quantization_config section. -## Read quantized checkpoint. 
+## Read quantized checkpoint ```python from vllm import LLM @@ -37,10 +37,11 @@ model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` + ## OpenAI Compatible Server Append the following to your 4bit model arguments: -``` +```console --quantization bitsandbytes --load-format bitsandbytes ``` diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index b2eda74fd1e3b..da49cd2747228 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process @@ -98,7 +98,7 @@ tokenizer.save_pretrained(SAVE_DIR) Install `vllm` and `lm-evaluation-harness`: ```console -$ pip install vllm lm-eval==0.4.4 +pip install vllm lm-eval==0.4.4 ``` Load and run the model in `vllm`: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index 20a48d8c1cf18..bdc6d9da11ab4 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -17,7 +17,7 @@ unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). To install AMMO (AlgorithMic Model Optimization): ```console -$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo ``` Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index eebf11dfc1b2b..640997cf4bc39 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: ```console -$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 ``` You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: ```console -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` ```{warning} diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index 1ac50ba987dda..82a15d76d352f 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index 8c52c97a41e48..ab7b2f302bd13 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -1,6 +1,6 @@ (spec-decode)= -# Speculative decoding +# Speculative Decoding ```{warning} Please note that speculative decoding in vLLM is not yet optimized and does @@ -159,6 +159,70 @@ A variety of speculative models of this type are available on HF hub: - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) - [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator) +## Speculating using EAGLE based draft models + +The following code configures vLLM to use speculative decoding where proposals are generated by +an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="meta-llama/Meta-Llama-3-8B-Instruct", + tensor_parallel_size=4, + speculative_model="path/to/modified/eagle/model", + speculative_draft_tensor_parallel_size=1, +) + +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +``` + +A few important things to consider when using the EAGLE based draft models: + +1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be + used directly with vLLM due to differences in the expected layer names and model definition. + To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) + to convert them. Note that this script does not modify the model's weights. + + In the above example, use the script to first convert + the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model + and then use the converted checkpoint as the draft model in vLLM. + +2. The EAGLE based draft models need to be run without tensor parallelism + (i.e. speculative_draft_tensor_parallel_size is set to 1), although + it is possible to run the main model using tensor parallelism (see example above). + +3. 
When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is + reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under + investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565). + +A variety of EAGLE draft models are available on the Hugging Face hub: + +| Base Model | EAGLE on Hugging Face | # EAGLE Parameters | +|---------------------------------------------------------------------|-------------------------------------------|--------------------| +| Vicuna-7B-v1.3 | yuhuili/EAGLE-Vicuna-7B-v1.3 | 0.24B | +| Vicuna-13B-v1.3 | yuhuili/EAGLE-Vicuna-13B-v1.3 | 0.37B | +| Vicuna-33B-v1.3 | yuhuili/EAGLE-Vicuna-33B-v1.3 | 0.56B | +| LLaMA2-Chat 7B | yuhuili/EAGLE-llama2-chat-7B | 0.24B | +| LLaMA2-Chat 13B | yuhuili/EAGLE-llama2-chat-13B | 0.37B | +| LLaMA2-Chat 70B | yuhuili/EAGLE-llama2-chat-70B | 0.99B | +| Mixtral-8x7B-Instruct-v0.1 | yuhuili/EAGLE-mixtral-instruct-8x7B | 0.28B | +| LLaMA3-Instruct 8B | yuhuili/EAGLE-LLaMA3-Instruct-8B | 0.25B | +| LLaMA3-Instruct 70B | yuhuili/EAGLE-LLaMA3-Instruct-70B | 0.99B | +| Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B | +| Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B | + ## Lossless guarantees of Speculative Decoding In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of @@ -184,8 +248,6 @@ speculative decoding, breaking down the guarantees into three key areas: same request across runs. For more details, see the FAQ section titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). -**Conclusion** - While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -193,8 +255,6 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. -**Mitigation Strategies** - For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 26c09bb0d8a0c..1d77c7339a33f 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -5,7 +5,7 @@ vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. This document shows you some examples of the different options that are available to generate structured outputs. -## Online Inference (OpenAI API) +## Online Serving (OpenAI API) You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. 
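For example, the guided decoding options can be passed to an OpenAI client through `extra_body`. The following minimal sketch assumes a vLLM OpenAI-compatible server is already running locally and serving `Qwen/Qwen2.5-3B-Instruct` (substitute whichever model your server hosts); the full examples linked below cover the remaining options:

```python
from openai import OpenAI

# Assumes a vLLM OpenAI-compatible server is already running locally, e.g.:
#   vllm serve Qwen/Qwen2.5-3B-Instruct
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{
        "role": "user",
        "content": "Classify this sentiment: vLLM is wonderful!",
    }],
    # vLLM-specific parameter: restrict the output to one of the given choices.
    extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```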
@@ -131,7 +131,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -Full example: +Full example: ## Experimental Automatic Parsing (OpenAI API) @@ -239,7 +239,7 @@ The main available options inside `GuidedDecodingParams` are: - `backend` - `whitespace_pattern` -These parameters can be used in the same way as the parameters from the Online Inference examples above. +These parameters can be used in the same way as the parameters from the Online Serving examples above. One example for the usage of the `choices` parameter is shown below: ```python @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 062f2021eb62a..027ddb6d5eda3 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -55,21 +55,24 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ``` Example output: -``` + +```text Function called: get_weather Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"} Result: Getting the weather for San Francisco, CA in fahrenheit... ``` This example demonstrates: -- Setting up the server with tool calling enabled -- Defining an actual function to handle tool calls -- Making a request with `tool_choice="auto"` -- Handling the structured response and executing the corresponding function + +* Setting up the server with tool calling enabled +* Defining an actual function to handle tool calls +* Making a request with `tool_choice="auto"` +* Handling the structured response and executing the corresponding function You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. Remember that it's the callers responsibility to: + 1. Define appropriate tools in the request 2. Include relevant context in the chat messages 3. Handle the tool calls in your application logic @@ -77,20 +80,21 @@ Remember that it's the callers responsibility to: For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below. ## Named Function Calling + vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a high-quality one. -vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. 
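A minimal request might look like the sketch below (assuming a vLLM server is already running locally; the model name is a placeholder for whichever tool-capable model the server is serving):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City and state, e.g. 'San Francisco, CA'",
                },
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location"],
        },
    },
}]

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder: use the served model
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    # Force a call to this specific tool; guided decoding guarantees a parsable call.
    tool_choice={"type": "function", "function": {"name": "get_weather"}},
)

tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
```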
- ## Automatic Function Calling To enable this feature, you should set the following flags: + * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. * `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers @@ -104,28 +108,28 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! - ### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. + * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` * `NousResearch/Hermes-3-*` - _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge step in their creation_. Flags: `--tool-call-parser hermes` - ### Mistral Models (`mistral`) Supported models: + * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. Known issues: + 1. Mistral 7B struggles to generate parallel tool calls correctly. 2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is much shorter than what vLLM generates. Since an exception is thrown when this condition @@ -136,13 +140,12 @@ it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated * `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt when tools are provided, that results in much better reliability when working with parallel tool calling. - Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` - ### Llama Models (`llama3_json`) Supported models: + * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct` @@ -152,6 +155,7 @@ The tool calling that is supported is the [JSON based tool calling](https://llam Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: + 1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -164,6 +168,7 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool #### IBM Granite Supported models: + * `ibm-granite/granite-3.0-8b-instruct` Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` @@ -182,42 +187,45 @@ Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/t `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. - ### InternLM Models (`internlm`) Supported models: + * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well Known issues: + * Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. 
Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` - ### Jamba Models (`jamba`) + AI21's Jamba-1.5 models are supported. + * `ai21labs/AI21-Jamba-1.5-Mini` * `ai21labs/AI21-Jamba-1.5-Large` - Flags: `--tool-call-parser jamba` - ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: + ```python [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] ``` Limitations: + * The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) * Llama's smaller models struggle to use tools effectively. Example supported models: + * `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) @@ -231,7 +239,6 @@ Llama's smaller models frequently fail to emit tool calls in the correct format. --- - ## How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -284,7 +291,8 @@ class ExampleToolParser(ToolParser): ``` Then you can use this plugin in the command line like this. -``` + +```console --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index aef32f7559f74..aaa13d0fb6d3f 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -1,54 +1,239 @@ +import itertools import re +from dataclasses import dataclass, field from pathlib import Path +ROOT_DIR = Path(__file__).parent.parent.parent.resolve() +ROOT_DIR_RELATIVE = '../../../..' +EXAMPLE_DIR = ROOT_DIR / "examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples" + def fix_case(text: str) -> str: - subs = [ - ("api", "API"), - ("llm", "LLM"), - ("vllm", "vLLM"), - ("openai", "OpenAI"), - ("multilora", "MultiLoRA"), - ] - for sub in subs: - text = re.sub(*sub, text, flags=re.IGNORECASE) + subs = { + "api": "API", + "Cli": "CLI", + "cpu": "CPU", + "llm": "LLM", + "tpu": "TPU", + "aqlm": "AQLM", + "gguf": "GGUF", + "lora": "LoRA", + "vllm": "vLLM", + "openai": "OpenAI", + "multilora": "MultiLoRA", + "mlpspeculator": "MLPSpeculator", + r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 + r"int\d+": lambda x: x.group(0).upper(), # e.g. 
int8, int16 + } + for pattern, repl in subs.items(): + text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) return text -def generate_title(filename: str) -> str: - # Turn filename into a title - title = filename.replace("_", " ").title() - # Handle acronyms and names - title = fix_case(title) - return f"# {title}" +@dataclass +class Index: + """ + Index class to generate a structured document index. + + Attributes: + path (Path): The path save the index file to. + title (str): The title of the index. + description (str): A brief description of the index. + caption (str): An optional caption for the table of contents. + maxdepth (int): The maximum depth of the table of contents. Defaults to 1. + documents (list[str]): A list of document paths to include in the index. Defaults to an empty list. + + Methods: + generate() -> str: + Generates the index content as a string in the specified format. + """ # noqa: E501 + path: Path + title: str + description: str + caption: str + maxdepth: int = 1 + documents: list[str] = field(default_factory=list) + + def generate(self) -> str: + content = f"# {self.title}\n\n{self.description}\n\n" + content += "```{toctree}\n" + content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" + content += "\n".join(self.documents) + "\n```\n" + return content + + +@dataclass +class Example: + """ + Example class for generating documentation content from a given path. + + Attributes: + path (Path): The path to the main directory or file. + category (str): The category of the document. + main_file (Path): The main file in the directory. + other_files (list[Path]): List of other files in the directory. + title (str): The title of the document. + + Methods: + __post_init__(): Initializes the main_file, other_files, and title attributes. + determine_main_file() -> Path: Determines the main file in the given path. + determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. + determine_title() -> str: Determines the title of the document. + generate() -> str: Generates the documentation content. + """ # noqa: E501 + path: Path + category: str = None + main_file: Path = field(init=False) + other_files: list[Path] = field(init=False) + title: str = field(init=False) + + def __post_init__(self): + self.main_file = self.determine_main_file() + self.other_files = self.determine_other_files() + self.title = self.determine_title() + + def determine_main_file(self) -> Path: + """ + Determines the main file in the given path. + If the path is a file, it returns the path itself. Otherwise, it searches + for Markdown files (*.md) in the directory and returns the first one found. + Returns: + Path: The main file path, either the original path if it's a file or the first + Markdown file found in the directory. + Raises: + IndexError: If no Markdown files are found in the directory. + """ # noqa: E501 + return self.path if self.path.is_file() else list( + self.path.glob("*.md")).pop() + + def determine_other_files(self) -> list[Path]: + """ + Determine other files in the directory excluding the main file. + + This method checks if the given path is a file. If it is, it returns an empty list. + Otherwise, it recursively searches through the directory and returns a list of all + files that are not the main file. + + Returns: + list[Path]: A list of Path objects representing the other files in the directory. 
+ """ # noqa: E501 + if self.path.is_file(): + return [] + is_other_file = lambda file: file.is_file() and file != self.main_file + return [file for file in self.path.rglob("*") if is_other_file(file)] + + def determine_title(self) -> str: + return fix_case(self.path.stem.replace("_", " ").title()) + + def generate(self) -> str: + # Convert the path to a relative path from __file__ + make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to( + ROOT_DIR) + + content = f"Source .\n\n" + include = "include" if self.main_file.suffix == ".md" else \ + "literalinclude" + if include == "literalinclude": + content += f"# {self.title}\n\n" + content += f":::{{{include}}} {make_relative(self.main_file)}\n" + if include == "literalinclude": + content += f":language: {self.main_file.suffix[1:]}\n" + content += ":::\n\n" + + if not self.other_files: + return content + + content += "## Example materials\n\n" + for file in self.other_files: + include = "include" if file.suffix == ".md" else "literalinclude" + content += f":::{{admonition}} {file.relative_to(self.path)}\n" + content += ":class: dropdown\n\n" + content += f":::{{{include}}} {make_relative(file)}\n:::\n" + content += ":::\n\n" + + return content def generate_examples(): - root_dir = Path(__file__).parent.parent.parent.resolve() - - # Source paths - script_dir = root_dir / "examples" - script_paths = sorted(script_dir.glob("*.py")) - - # Destination paths - doc_dir = root_dir / "docs/source/getting_started/examples" - doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths] - - # Generate the example docs for each example script - for script_path, doc_path in zip(script_paths, doc_paths): - # Make script_path relative to doc_path and call it include_path - include_path = '../../../..' 
/ script_path.relative_to(root_dir) - content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source: .\n\n" - f"```{{literalinclude}} {include_path}\n" - ":language: python\n" - ":linenos:\n```") + # Create the EXAMPLE_DOC_DIR if it doesn't exist + if not EXAMPLE_DOC_DIR.exists(): + EXAMPLE_DOC_DIR.mkdir(parents=True) + + # Create empty indices + examples_index = Index( + path=EXAMPLE_DOC_DIR / "examples_index.md", + title="Examples", + description= + "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using from examples found in .", # noqa: E501 + caption="Examples", + maxdepth=2) + # Category indices stored in reverse order because they are inserted into + # examples_index.documents at index 0 in order + category_indices = { + "other": + Index( + path=EXAMPLE_DOC_DIR / "examples_other_index.md", + title="Other", + description= + "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 + caption="Examples", + ), + "online_serving": + Index( + path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md", + title="Online Serving", + description= + "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 + caption="Examples", + ), + "offline_inference": + Index( + path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", + title="Offline Inference", + description= + "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 + caption="Examples", + ), + } + + examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] + # Find categorised examples + for category in category_indices: + category_dir = EXAMPLE_DIR / category + globs = [category_dir.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path, category)) + # Find examples in subdirectories + for path in category_dir.glob("*/*.md"): + examples.append(Example(path.parent, category)) + # Find uncategorised examples + globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path)) + # Find examples in subdirectories + for path in EXAMPLE_DIR.glob("*/*.md"): + # Skip categorised examples + if path.parent.name in category_indices: + continue + examples.append(Example(path.parent)) + + # Generate the example documentation + for example in sorted(examples, key=lambda e: e.path.stem): + doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" with open(doc_path, "w+") as f: - f.write(content) - - # Generate the toctree for the example scripts - with open(doc_dir / "examples_index.template.md") as f: - examples_index = f.read() - with open(doc_dir / "examples_index.md", "w+") as f: - example_docs = "\n".join(path.stem + ".md" for path in script_paths) - f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) + f.write(example.generate()) + # Add the example to the appropriate index + index = category_indices.get(example.category, examples_index) + index.documents.append(example.path.stem) + + # Generate the index files + for category_index in category_indices.values(): + if category_index.documents: + examples_index.documents.insert(0, category_index.path.name) + with open(category_index.path, "w+") as f: + f.write(category_index.generate()) + + with open(examples_index.path, "w+") as f: + f.write(examples_index.generate()) diff --git 
a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md deleted file mode 100644 index de7a91c0ffa48..0000000000000 --- a/docs/source/getting_started/examples/examples_index.template.md +++ /dev/null @@ -1,8 +0,0 @@ -# Examples - -```{toctree} -:maxdepth: 1 -:caption: Scripts - -%EXAMPLE_DOCS% -``` \ No newline at end of file diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md index fde2954f10c59..4751b325e6fc4 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/source/getting_started/faq.md @@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. -**Mitigation Strategies** +## Mitigation Strategies - For improved stability and reduced variance, use `float32`. Note that this will require more memory. - If using `bfloat16`, switching to `float16` can also help. diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md similarity index 89% rename from docs/source/getting_started/installation/hpu-gaudi.md rename to docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 94de169f51a73..b4695d504b601 100644 --- a/docs/source/getting_started/installation/hpu-gaudi.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -1,10 +1,13 @@ -(installation-gaudi)= +# Installation -# Installation for Intel® Gaudi® +This tab provides instructions on running vLLM with Intel Gaudi devices. -This README provides instructions on running vLLM with Intel Gaudi devices. +## Requirements -## Requirements and Installation +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) @@ -12,42 +15,24 @@ to set up the execution environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). -### Requirements - -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 +## Configure a new environment -### Quick start using Dockerfile - -```console -$ docker build -f Dockerfile.hpu -t vllm-hpu-env . -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env -``` - -```{tip} -If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. 
-``` - -### Build from source - -#### Environment verification +### Environment verification To verify that the Intel Gaudi software was correctly installed, run: ```console -$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -$ pip list | grep neural # verify that neural_compressor is installed +hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +pip list | grep neural # verify that neural_compressor is installed ``` Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. -#### Run Docker Image +### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi @@ -57,33 +42,58 @@ for more details. Use the following commands to run a Docker image: ```console -$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -#### Build and Install vLLM +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built Intel Gaudi wheels. + +### Build wheel from source To build and install vLLM from source, run: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python setup.py develop +git clone https://github.com/vllm-project/vllm.git +cd vllm +python setup.py develop ``` Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: ```console -$ git clone https://github.com/HabanaAI/vllm-fork.git -$ cd vllm-fork -$ git checkout habana_main -$ python setup.py develop +git clone https://github.com/HabanaAI/vllm-fork.git +cd vllm-fork +git checkout habana_main +python setup.py develop ``` -## Supported Features +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built Intel Gaudi images. + +### Build image from source -- [Offline batched inference](#offline-batched-inference) -- Online inference via [OpenAI-Compatible Server](#openai-compatible-server) +```console +docker build -f Dockerfile.hpu -t vllm-hpu-env . 
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +```{tip} +If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. +``` + +## Extra information + +## Supported features + +- [Offline inference](#offline-inference) +- Online serving via [OpenAI-Compatible Server](#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, @@ -94,14 +104,14 @@ $ python setup.py develop for accelerating low-batch latency and throughput - Attention with Linear Biases (ALiBi) -## Unsupported Features +## Unsupported features - Beam search - LoRA adapters - Quantization - Prefill chunking (mixed-batch inferencing) -## Supported Configurations +## Supported configurations The following configurations have been validated to be function with Gaudi2 devices. Configurations that are not listed may or may not work. @@ -137,7 +147,7 @@ Gaudi2 devices. Configurations that are not listed may or may not work. - [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -## Performance Tuning +## Performance tuning ### Execution modes @@ -181,7 +191,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: -``` +```text INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -192,7 +202,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1 Example (with ramp-up) -``` +```text min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -201,7 +211,7 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -``` +```text min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) @@ -224,7 +234,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
Each warmup step is logged during vLLM startup: -``` +```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -273,7 +283,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): -``` +```text INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -349,26 +359,26 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - Default values: - Prompt: - : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` - Decode: - : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs -## Troubleshooting: Tweaking HPU Graphs +## Troubleshooting: tweaking HPU graphs If you experience device out-of-memory issues or want to 
attempt inference at higher batch sizes, try tweaking HPU Graphs by following @@ -385,5 +395,5 @@ the below: completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding `--enforce-eager` flag to - server (for online inference), or by passing `enforce_eager=True` + server (for online serving), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md new file mode 100644 index 0000000000000..a6c4c44305a4c --- /dev/null +++ b/docs/source/getting_started/installation/ai_accelerator/index.md @@ -0,0 +1,375 @@ +# Other AI accelerators + +vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions: + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: + +## Requirements + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Configure a new environment + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} ../python_env_setup.inc.md +``` + +::: + +:::: + +## Set up using Python + +### Pre-built wheels + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from 
source" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::: + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::: + +### Build image from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::: + +## Extra information + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "## Extra information" +``` + +::: + +:::: diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md similarity index 83% rename from docs/source/getting_started/installation/neuron.md rename to docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index 431f90537f543..575a9f9c2e2f0 100644 --- a/docs/source/getting_started/installation/neuron.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -1,6 +1,4 @@ -(installation-neuron)= - -# Installation for Neuron +# Installation vLLM 
0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. Paged Attention and Chunked Prefill are currently in development and will be available soon. @@ -14,28 +12,9 @@ Data types currently supported in Neuron SDK are FP16 and BF16. - Pytorch 2.0.1/2.1.1 - AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) -Installation steps: - -- [Build from source](#build-from-source-neuron) - - - [Step 0. Launch Trn1/Inf2 instances](#launch-instances) - - [Step 1. Install drivers and tools](#install-drivers) - - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx) - - [Step 3. Install vLLM from source](#install-vllm) - -(build-from-source-neuron)= - -```{note} -The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -``` - -## Build from source - -Following instructions are applicable to Neuron SDK 2.16 and beyond. - -(launch-instances)= +## Configure a new environment -### Step 0. Launch Trn1/Inf2 instances +### Launch Trn1/Inf2 instances Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). @@ -45,9 +24,7 @@ Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch N - When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance -(install-drivers)= - -### Step 1. Install drivers and tools +### Install drivers and tools The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: @@ -82,9 +59,21 @@ sudo apt-get install aws-neuronx-tools=2.* -y export PATH=/opt/aws/neuron/bin:$PATH ``` -(install-tnx)= +## Set up using Python + +### Pre-built wheels -### Step 2. Install transformers-neuronx and its dependencies +Currently, there are no pre-built Neuron wheels. + +### Build wheel from source + +```{note} +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +``` + +Following instructions are applicable to Neuron SDK 2.16 and beyond. + +#### Install transformers-neuronx and its dependencies [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances. Follow the steps below to install transformer-neuronx package and its dependencies. @@ -116,17 +105,31 @@ python -m pip install awscli python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx ``` -(install-vllm)= - -### Step 3. 
Install vLLM from source +#### Install vLLM from source Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -U -r requirements-neuron.txt -$ VLLM_TARGET_DEVICE="neuron" pip install . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -U -r requirements-neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install . ``` If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built Neuron images. + +### Build image from source + +See for instructions on building the Docker image. + +Make sure to use in place of the default Dockerfile. + +## Extra information + +There is no extra information for this device. diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md similarity index 67% rename from docs/source/getting_started/installation/openvino.md rename to docs/source/getting_started/installation/ai_accelerator/openvino.inc.md index 60f95fd1c4250..a7867472583d6 100644 --- a/docs/source/getting_started/installation/openvino.md +++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md @@ -1,63 +1,65 @@ -(installation-openvino)= +# Installation -# Installation for OpenVINO +vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). -vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: +## Requirements -- Prefix caching (`--enable-prefix-caching`) -- Chunked prefill (`--enable-chunked-prefill`) +- OS: Linux +- Instruction set architecture (ISA) requirement: at least AVX2. -**Table of contents**: +## Set up using Python -- [Requirements](#openvino-backend-requirements) -- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile) -- [Build from source](#install-openvino-backend-from-source) -- [Performance tips](#openvino-backend-performance-tips) -- [Limitations](#openvino-backend-limitations) +### Pre-built wheels -(openvino-backend-requirements)= +Currently, there are no pre-built OpenVINO wheels. -## Requirements +### Build wheel from source -- OS: Linux -- Instruction set architecture (ISA) requirement: at least AVX2. +First, install Python. For example, on Ubuntu 22.04, you can run: -(openvino-backend-quick-start-dockerfile)= +```console +sudo apt-get update -y +sudo apt-get install python3 +``` -## Quick start using Dockerfile +Second, install prerequisites vLLM OpenVINO backend installation: ```console -$ docker build -f Dockerfile.openvino -t vllm-openvino-env . 
-$ docker run -it --rm vllm-openvino-env +pip install --upgrade pip +pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` -(install-openvino-backend-from-source)= +Finally, install vLLM with OpenVINO backend: -## Install from source +```console +PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . +``` + +:::{tip} +To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). +::: -- First, install Python. For example, on Ubuntu 22.04, you can run: +## Set up using Docker - ```console - $ sudo apt-get update -y - $ sudo apt-get install python3 - ``` +### Pre-built images -- Second, install prerequisites vLLM OpenVINO backend installation: +Currently, there are no pre-built OpenVINO images. - ```console - $ pip install --upgrade pip - $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - ``` +### Build image from source -- Finally, install vLLM with OpenVINO backend: +```console +docker build -f Dockerfile.openvino -t vllm-openvino-env . +docker run -it --rm vllm-openvino-env +``` - ```console - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . - ``` +## Extra information -- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). +## Supported features -(openvino-backend-performance-tips)= +OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (`--enable-prefix-caching`) +- Chunked prefill (`--enable-chunked-prefill`) ## Performance tips @@ -95,8 +97,6 @@ $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json ``` -(openvino-backend-limitations)= - ## Limitations - LoRA serving is not supported. diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md similarity index 85% rename from docs/source/getting_started/installation/tpu.md rename to docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index bc93c44fead30..6a911cc6b9eba 100644 --- a/docs/source/getting_started/installation/tpu.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -1,6 +1,4 @@ -(installation-tpu)= - -# Installation for TPUs +# Installation Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs @@ -54,7 +52,16 @@ In all of the following commands, replace the ALL CAPS parameter names with appropriate values. See the parameter descriptions table for more information. 
``` -## Provision a Cloud TPU with the queued resource API +### Provision Cloud TPUs with GKE + +For more information about using TPUs with GKE, see: +- +- +- + +## Configure a new environment + +### Provision a Cloud TPU with the queued resource API Create a TPU v5e with 4 TPU chips: @@ -102,6 +109,14 @@ Connect to your TPU using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE ``` +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built TPU wheels. + +### Build wheel from source + Install Miniconda: ```bash @@ -142,28 +157,25 @@ Run the setup script: VLLM_TARGET_DEVICE="tpu" python setup.py develop ``` -## Provision Cloud TPUs with GKE +## Set up using Docker -For more information about using TPUs with GKE, see - - - +### Pre-built images -(build-docker-tpu)= +See for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. -## Build a docker image with {code}`Dockerfile.tpu` +### Build image from source You can use to build a Docker image with TPU support. ```console -$ docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f Dockerfile.tpu -t vllm-tpu . ``` Run the Docker image with the following command: ```console -$ # Make sure to add `--privileged --net host --shm-size=16G`. -$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +# Make sure to add `--privileged --net host --shm-size=16G`. +docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` ```{note} @@ -189,3 +201,7 @@ Install OpenBLAS with the following command: $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev ``` ```` + +## Extra information + +There is no extra information for this device. diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md deleted file mode 100644 index a46e2c010600d..0000000000000 --- a/docs/source/getting_started/installation/cpu-arm.md +++ /dev/null @@ -1,46 +0,0 @@ -(installation-arm)= - -# Installation for ARM CPUs - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: - -- CPU backend inference capabilities -- Relevant runtime environment variables -- Performance optimization tips - -ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. -Contents: - -1. [Requirements](#arm-backend-requirements) -2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile) -3. [Building from Source](#build-arm-backend-from-source) - -(arm-backend-requirements)= - -## Requirements - -- **Operating System**: Linux or macOS -- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) -- **Instruction Set Architecture (ISA)**: NEON support is required - -(arm-backend-quick-start-dockerfile)= - -## Quick Start with Dockerfile - -You can quickly set up vLLM on ARM using Docker: - -```console -$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env -``` - -(build-arm-backend-from-source)= - -## Building from Source - -To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. 
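Whichever backend you build from source (TPU above, or the ARM CPU backend described here), a short offline inference run is a convenient smoke test, along the lines of `examples/offline_inference/basic.py` referenced elsewhere in these docs. Below is a minimal sketch; the model choice and sampling settings are arbitrary assumptions, so pick whatever fits your hardware:

```python
# Minimal sanity check after building vLLM from source (any backend).
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

llm = LLM(model="facebook/opt-125m")  # small model, suitable for a quick test run
for output in llm.generate(prompts, sampling_params):
    print(output.prompt, "->", output.outputs[0].text)
```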
diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md new file mode 100644 index 0000000000000..56545253b1ef7 --- /dev/null +++ b/docs/source/getting_started/installation/cpu/apple.inc.md @@ -0,0 +1,56 @@ +# Installation + +vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS. + +Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. + +## Requirements + +- OS: `macOS Sonoma` or later +- SDK: `XCode 15.4` or later with Command Line Tools +- Compiler: `Apple Clang >= 15.0.0` + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. + +```console +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements-cpu.txt +pip install -e . +``` + +```{note} +On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. +``` + +#### Troubleshooting + +If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your +[Command Line Tools for Xcode](https://developer.apple.com/download/all/). + +```text +[...] fatal error: 'map' file not found + 1 | #include + | ^~~~~ + 1 error generated. + [2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o + +[...] fatal error: 'cstddef' file not found + 10 | #include + | ^~~~~~~~~ + 1 error generated. +``` + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md new file mode 100644 index 0000000000000..08a764e1a25f4 --- /dev/null +++ b/docs/source/getting_started/installation/cpu/arm.inc.md @@ -0,0 +1,30 @@ +# Installation + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. + +## Requirements + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): NEON support is required + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +:::{include} build.inc.md +::: + +Testing has been conducted on AWS Graviton3 instances for compatibility. + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md new file mode 100644 index 0000000000000..f8d1044a0d198 --- /dev/null +++ b/docs/source/getting_started/installation/cpu/build.inc.md @@ -0,0 +1,21 @@ +First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. 
For example, on Ubuntu 22.4, you can run: + +```console +sudo apt-get update -y +sudo apt-get install -y gcc-12 g++-12 libnuma-dev +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +Second, install Python packages for vLLM CPU backend building: + +```console +pip install --upgrade pip +pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy +pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +``` + +Finally, build and install vLLM CPU backend: + +```console +VLLM_TARGET_DEVICE=cpu python setup.py install +``` diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu/index.md similarity index 62% rename from docs/source/getting_started/installation/cpu-x86.md rename to docs/source/getting_started/installation/cpu/index.md index bbb2d1872ef39..4ec907c0e9fda 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu/index.md @@ -1,109 +1,183 @@ -(installation-x86)= +# CPU -# Installation for x86 CPUs +vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor specific instructions: -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: +::::{tab-set} +:sync-group: device -- Tensor Parallel -- Model Quantization (`INT8 W8A8, AWQ`) -- Chunked-prefill -- Prefix-caching -- FP8-E5M2 KV-Caching (TODO) +:::{tab-item} x86 +:sync: x86 -Table of contents: +```{include} x86.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` -1. [Requirements](#cpu-backend-requirements) -2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile) -3. [Build from source](#build-cpu-backend-from-source) -4. [Related runtime environment variables](#env-intro) -5. [Intel Extension for PyTorch](#ipex-guidance) -6. [Performance tips](#cpu-backend-performance-tips) +::: -(cpu-backend-requirements)= +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: ## Requirements -- OS: Linux -- Compiler: `gcc/g++>=12.3.0` (optional, recommended) -- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) +- Python: 3.9 -- 3.12 -(cpu-backend-quick-start-dockerfile)= +::::{tab-set} +:sync-group: device -## Quick start using Dockerfile +:::{tab-item} x86 +:sync: x86 -```console -$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env +```{include} x86.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" ``` -(build-cpu-backend-from-source)= +::: -## Build from source +:::{tab-item} ARM +:sync: arm -- First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. 
For example, on Ubuntu 22.4, you can run: +```{include} arm.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` -```console -$ sudo apt-get update -y -$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev -$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +::: + +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" ``` -- Second, install Python packages for vLLM CPU backend building: +::: -```console -$ pip install --upgrade pip -$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy -$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +:::: + +## Set up using Python + +### Create a new Python environment + +```{include} ../python_env_setup.inc.md ``` -- Finally, build and install vLLM CPU backend: +### Pre-built wheels -```console -$ VLLM_TARGET_DEVICE=cpu python setup.py install +Currently, there are no pre-built CPU wheels. + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} x86 +:sync: x86 + +```{include} x86.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" ``` -```{note} -- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. -- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. +::: + +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" ``` -(env-intro)= +::: -## Related runtime environment variables +:::{tab-item} Apple silicon +:sync: apple -- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. -- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. +```{include} apple.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-build CPU images. + +### Build image from source + +```console +$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env +``` + +:::{tip} +For ARM or Apple silicon, use `Dockerfile.arm` +::: -(ipex-guidance)= +## Supported features -## Intel Extension for PyTorch +vLLM CPU backend supports the following vLLM features: -- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. 
+- Tensor Parallel +- Model Quantization (`INT8 W8A8, AWQ, GPTQ`) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) -(cpu-backend-performance-tips)= +## Related runtime environment variables + +- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. +- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. ## Performance tips - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: ```console -$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library -$ find / -name *libtcmalloc* # find the dynamic link library path -$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference.py # run vLLM +sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +find / -name *libtcmalloc* # find the dynamic link library path +export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +python examples/offline_inference/basic.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: ```console -$ export VLLM_CPU_KVCACHE_SPACE=40 -$ export VLLM_CPU_OMP_THREADS_BIND=0-29 -$ vllm serve facebook/opt-125m +export VLLM_CPU_KVCACHE_SPACE=40 +export VLLM_CPU_OMP_THREADS_BIND=0-29 +vllm serve facebook/opt-125m ``` - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: @@ -132,23 +206,23 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference.py +$ python examples/offline_inference/basic.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. -## CPU Backend Considerations +## Other considerations - The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. 
Therefore, it is strongly recommended to segregate these two components for improved performance. -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. +- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: ```console - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). + - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.inc.md). diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md new file mode 100644 index 0000000000000..e4f99d3cebdf2 --- /dev/null +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -0,0 +1,35 @@ +# Installation + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. + +## Requirements + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): AVX512 (optional, recommended) + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +:::{include} build.inc.md +::: + +```{note} +- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. 
The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. +- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. +``` + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information + +## Intel Extension for PyTorch + +- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. diff --git a/docs/source/getting_started/installation/device.template.md b/docs/source/getting_started/installation/device.template.md new file mode 100644 index 0000000000000..44f538da93659 --- /dev/null +++ b/docs/source/getting_started/installation/device.template.md @@ -0,0 +1,17 @@ +# Installation + +## Requirements + +## Set up using Python + +### Pre-built wheels + +### Build wheel from source + +## Set up using Docker + +### Pre-built images + +### Build image from source + +## Extra information diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md deleted file mode 100644 index 796911d7305a6..0000000000000 --- a/docs/source/getting_started/installation/gpu-rocm.md +++ /dev/null @@ -1,163 +0,0 @@ -(installation-rocm)= - -# Installation for ROCm - -vLLM supports AMD GPUs with ROCm 6.2. - -## Requirements - -- OS: Linux -- Python: 3.9 -- 3.12 -- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) -- ROCm 6.2 - -Installation options: - -1. [Build from source with docker](#build-from-source-docker-rocm) -2. [Build from source](#build-from-source-rocm) - -(build-from-source-docker-rocm)= - -## Option 1: Build from source with docker (recommended) - -You can build and install vLLM from source. - -First, build a docker image from and launch a docker container from the image. -It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: - -```console -{ - "features": { - "buildkit": true - } -} -``` - - uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. -It provides flexibility to customize the build of docker image using the following arguments: - -- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. -- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. -- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` -- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. - -Their values can be passed in when running `docker build` with `--build-arg` options. 
- -To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: - -```console -$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . -``` - -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: - -```console -$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . -``` - -To run the above docker image `vllm-rocm`, use the below command: - -```console -$ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v :/app/model \ - vllm-rocm \ - bash -``` - -Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. - -(build-from-source-rocm)= - -## Option 2: Build from source - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) -- [PyTorch](https://pytorch.org/) - -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. - -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) - -1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) - -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) - -```console -$ python3 -m pip install ninja cmake wheel pybind11 -$ pip uninstall -y triton -$ git clone https://github.com/OpenAI/triton.git -$ cd triton -$ git checkout e192dba -$ cd python -$ pip3 install . -$ cd ../.. -``` - -```{note} -- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. -``` - -2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) - -Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) -Alternatively, wheels intended for vLLM use can be accessed under the releases. - -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. - -```console -$ git clone https://github.com/ROCm/flash-attention.git -$ cd flash-attention -$ git checkout 3cea2fb -$ git submodule update --init -$ GPU_ARCHS="gfx90a" python3 setup.py install -$ cd .. -``` - -```{note} -- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) -``` - -3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: - -```bash -$ pip install --upgrade pip - -# Install PyTorch -$ pip uninstall torch -y -$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 - -# Build & install AMD SMI -$ pip install /opt/rocm/share/amd_smi - -# Install dependencies -$ pip install --upgrade numba scipy huggingface-hub[cli] -$ pip install "numpy<2" -$ pip install -r requirements-rocm.txt - -# Build vLLM for MI210/MI250/MI300. 
-$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" -$ python3 setup.py develop -``` - -This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation. - -```{tip} -- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. -- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. -- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. -- The ROCm version of PyTorch, ideally, should match the ROCm driver version. -``` - -```{tip} -- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -``` diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu/cuda.inc.md similarity index 50% rename from docs/source/getting_started/installation/gpu-cuda.md rename to docs/source/getting_started/installation/gpu/cuda.inc.md index 7ea10bb8b59ff..4cce65278c069 100644 --- a/docs/source/getting_started/installation/gpu-cuda.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -1,118 +1,118 @@ -(installation-cuda)= +# Installation -# Installation for CUDA - -vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. +vLLM contains pre-compiled C++ and CUDA (12.1) binaries. ## Requirements -- OS: Linux -- Python: 3.9 -- 3.12 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -## Install released versions - -You can install vLLM using pip: +## Set up using Python -```console -$ # (Recommended) Create a new conda environment. -$ conda create -n myenv python=3.12 -y -$ conda activate myenv - -$ # Install vLLM with CUDA 12.1. -$ pip install vllm -``` +### Create a new Python environment ```{note} -Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See for more details. +PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. ``` -````{note} -As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. -We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: +In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. + +Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details. 
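If you are unsure whether the pre-built wheels match your environment, a quick optional check of the currently installed PyTorch build can help. This snippet assumes PyTorch is already installed and is only a diagnostic aid:

```python
# Optional check: which PyTorch/CUDA combination is currently installed?
# vLLM's pre-built wheels target CUDA 12.1 and the matching public PyTorch release.
import torch

print("torch version:", torch.__version__)
print("built against CUDA:", torch.version.cuda)      # None for CPU-only builds
print("CUDA device available:", torch.cuda.is_available())
```

If the reported CUDA build differs from what the wheels expect, or you need a custom PyTorch installation, building from source as described below is the safer route.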
+ +### Pre-built wheels + +You can install vLLM using either `pip` or `uv pip`: ```console -$ # Install vLLM with CUDA 11.8. -$ export VLLM_VERSION=0.6.1.post1 -$ export PYTHON_VERSION=310 -$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +# Install vLLM with CUDA 12.1. +pip install vllm # If you are using pip. +uv pip install vllm # If you are using uv. ``` -In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. +As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: -Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. -```` +```console +# Install vLLM with CUDA 11.8. +export VLLM_VERSION=0.6.1.post1 +export PYTHON_VERSION=310 +pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` (install-the-latest-code)= -## Install the latest code +#### Install the latest code + +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command: +##### Install the latest code using `pip` ```console -$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` -If you want to access the wheels for previous commits, you can specify the commit hash in the URL: +`--pre` is required for `pip` to consider pre-released versions. + +If you want to access the wheels for previous commits (e.g. 
to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` -Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. +Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. -Another way to access the latest code is to use the docker images: +##### Install the latest code using `uv` + +Another way to install the latest code is to use `uv`: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly ``` -These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. - -The latest code can contain bugs and may not be stable. Please use it with caution. +If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: -(build-from-source)= +```console +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +``` -## Build from source +The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. 
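Since the version string in the wheel file name is only a placeholder, you can confirm which development version was actually installed by reading the installed package metadata, for example with the standard library:

```python
# Check which vLLM development version was actually installed; the real version
# lives in the wheel metadata, not in the placeholder file name.
from importlib.metadata import version

print(version("vllm"))  # prints the full development version recorded in the metadata
```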
-(python-only-build)= +### Build wheel from source -### Python-only build (without compilation) +#### Set up using Python-only build (without compilation) If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_USE_PRECOMPILED=1 pip install --editable . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel and use the compiled libraries from there in the install. +This will download the [latest nightly wheel](https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl) and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): ```console -$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl -$ pip install --editable . +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +pip install --editable . ``` -You can find more information about vLLM's wheels [above](#install-the-latest-code). +You can find more information about vLLM's wheels in . ```{note} There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. -It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel. +It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. ``` -### Full build (with compilation) +#### Full build (with compilation) If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -e . ``` ```{tip} @@ -125,7 +125,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. 
``` -#### Use an existing PyTorch installation +##### Use an existing PyTorch installation There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: @@ -135,32 +135,32 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python use_existing_torch.py -$ pip install -r requirements-build.txt -$ pip install -e . --no-build-isolation +git clone https://github.com/vllm-project/vllm.git +cd vllm +python use_existing_torch.py +pip install -r requirements-build.txt +pip install -e . --no-build-isolation ``` -#### Use the local cutlass for compilation +##### Use the local cutlass for compilation Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` -#### Troubleshooting +##### Troubleshooting To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```console -$ export MAX_JOBS=6 -$ pip install -e . +export MAX_JOBS=6 +pip install -e . ``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. @@ -169,31 +169,56 @@ A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. ```console -$ # Use `--ipc=host` to make sure the shared memory is large enough. -$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +# Use `--ipc=host` to make sure the shared memory is large enough. +docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). 
After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: ```console -$ export CUDA_HOME=/usr/local/cuda -$ export PATH="${CUDA_HOME}/bin:$PATH" +export CUDA_HOME=/usr/local/cuda +export PATH="${CUDA_HOME}/bin:$PATH" ``` Here is a sanity check to verify that the CUDA Toolkit is correctly installed: ```console -$ nvcc --version # verify that nvcc is in your PATH -$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +nvcc --version # verify that nvcc is in your PATH +${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` -### Unsupported OS build +#### Unsupported OS build vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```console -$ export VLLM_TARGET_DEVICE=empty -$ pip install -e . +export VLLM_TARGET_DEVICE=empty +pip install -e . +``` + +## Set up using Docker + +### Pre-built images + +See for instructions on using the official Docker image. + +Another way to access the latest code is to use the docker images: + +```console +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} ``` + +These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. + +### Build image from source + +See for instructions on building the Docker image. + +## Supported features + +See compatibility matrix for feature support information. diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md new file mode 100644 index 0000000000000..6c007382b2c3d --- /dev/null +++ b/docs/source/getting_started/installation/gpu/index.md @@ -0,0 +1,300 @@ +# GPU + +vLLM is a Python library that supports the following GPU variants. 
Select your GPU type to see vendor specific instructions: + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Set up using Python + +### Create a new Python environment + +```{include} ../python_env_setup.inc.md +``` + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Create a new Python environment" +:end-before: "### Pre-built wheels" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +There is no extra information on creating a new Python environment for this device. + +::: + +:::{tab-item} XPU +:sync: xpu + +There is no extra information on creating a new Python environment for this device. + +::: + +:::: + +### Pre-built wheels + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::: + +(build-from-source)= + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::: + +### Build image from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + 
+:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::: + +## Supported features + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Supported features" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "## Supported features" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "## Supported features" +``` + +::: + +:::: diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md new file mode 100644 index 0000000000000..f6f9d3c303f89 --- /dev/null +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -0,0 +1,166 @@ +# Installation + +vLLM supports AMD GPUs with ROCm 6.2. + +## Requirements + +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- ROCm 6.2 + +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built ROCm wheels. + +### Build wheel from source + +0. Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) +- [PyTorch](https://pytorch.org/) + + For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + + Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + +1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) + + Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + + ```console + python3 -m pip install ninja cmake wheel pybind11 + pip uninstall -y triton + git clone https://github.com/OpenAI/triton.git + cd triton + git checkout e192dba + cd python + pip3 install . + cd ../.. + ``` + + ```{note} + - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + ``` + +2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) + + Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) + Alternatively, wheels intended for vLLM use can be accessed under the releases. + + For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. + + ```console + git clone https://github.com/ROCm/flash-attention.git + cd flash-attention + git checkout 3cea2fb + git submodule update --init + GPU_ARCHS="gfx90a" python3 setup.py install + cd .. + ``` + + ```{note} + - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + ``` + +3. Build vLLM. 
For example, vLLM on ROCM 6.2 can be built with the following steps: + + ```bash + $ pip install --upgrade pip + + # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + + # Build & install AMD SMI + $ pip install /opt/rocm/share/amd_smi + + # Install dependencies + $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install "numpy<2" + $ pip install -r requirements-rocm.txt + + # Build vLLM for MI210/MI250/MI300. + $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + $ python3 setup.py develop + ``` + + This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. + + ```{tip} + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. + - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + ``` + +```{tip} +- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). +``` + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built ROCm images. + +### Build image from source + +Building the Docker image from source is the recommended way to use vLLM with ROCm. + +First, build a docker image from and launch a docker container from the image. +It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: + +```console +{ + "features": { + "buildkit": true + } +} +``` + + uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. +It provides flexibility to customize the build of docker image using the following arguments: + +- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. +- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. +- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` +- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` +- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. + +Their values can be passed in when running `docker build` with `--build-arg` options. 
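As an illustrative sketch of passing several of these arguments at once (the values below simply restate the defaults and the example base image mentioned above; adjust them for your setup), a customized build might look like:

```console
DOCKER_BUILDKIT=1 docker build \
    --build-arg BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" \
    --build-arg FX_GFX_ARCHS="gfx90a;gfx942" \
    --build-arg FA_BRANCH="ae7928c" \
    -f Dockerfile.rocm -t vllm-rocm .
```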
+ +To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: + +```console +DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +``` + +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: + +```console +DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . +``` + +To run the above docker image `vllm-rocm`, use the below command: + +```console +docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + -v :/app/model \ + vllm-rocm \ + bash +``` + +Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. + +## Supported features + +See compatibility matrix for feature support information. diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/gpu/xpu.inc.md similarity index 57% rename from docs/source/getting_started/installation/xpu.md rename to docs/source/getting_started/installation/gpu/xpu.inc.md index be4e3b9bd1bc5..577986eba74fd 100644 --- a/docs/source/getting_started/installation/xpu.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -1,54 +1,33 @@ -(installation-xpu)= - -# Installation for XPUs +# Installation vLLM initially supports basic model inferencing and serving on Intel GPU platform. -Table of contents: - -1. [Requirements](#xpu-backend-requirements) -2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile) -3. [Build from source](#build-xpu-backend-from-source) - -(xpu-backend-requirements)= - ## Requirements -- OS: Linux - Supported Hardware: Intel Data Center GPU, Intel ARC GPU - OneAPI requirements: oneAPI 2024.2 -(xpu-backend-quick-start-dockerfile)= +## Set up using Python -## Quick start using Dockerfile - -```console -$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - vllm-xpu-env -``` +### Pre-built wheels -(build-xpu-backend-from-source)= +Currently, there are no pre-built XPU wheels. -## Build from source +### Build wheel from source - First, install required driver and intel OneAPI 2024.2 or later. - Second, install Python packages for vLLM XPU backend building: ```console -$ source /opt/intel/oneapi/setvars.sh -$ pip install --upgrade pip -$ pip install -v -r requirements-xpu.txt +source /opt/intel/oneapi/setvars.sh +pip install --upgrade pip +pip install -v -r requirements-xpu.txt ``` - Finally, build and install vLLM XPU backend: ```console -$ VLLM_TARGET_DEVICE=xpu python setup.py install +VLLM_TARGET_DEVICE=xpu python setup.py install ``` ```{note} @@ -56,19 +35,37 @@ $ VLLM_TARGET_DEVICE=xpu python setup.py install type will be supported in the future. ``` -## Distributed inference and serving +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built XPU images. + +### Build image from source + +```console +$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env +``` + +## Supported features XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. 
For example, a reference execution likes following: ```console -$ python -m vllm.entrypoints.openai.api_server \ -$ --model=facebook/opt-13b \ -$ --dtype=bfloat16 \ -$ --device=xpu \ -$ --max_model_len=1024 \ -$ --distributed-executor-backend=ray \ -$ --pipeline-parallel-size=2 \ -$ -tp=8 +python -m vllm.entrypoints.openai.api_server \ + --model=facebook/opt-13b \ + --dtype=bfloat16 \ + --device=xpu \ + --max_model_len=1024 \ + --distributed-executor-backend=ray \ + --pipeline-parallel-size=2 \ + -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md index 83de1aff409b2..bc1d268bf0c7e 100644 --- a/docs/source/getting_started/installation/index.md +++ b/docs/source/getting_started/installation/index.md @@ -7,13 +7,7 @@ vLLM supports the following hardware platforms: ```{toctree} :maxdepth: 1 -gpu-cuda -gpu-rocm -cpu-x86 -cpu-arm -hpu-gaudi -tpu -xpu -openvino -neuron +gpu/index +cpu/index +ai_accelerator/index ``` diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md new file mode 100644 index 0000000000000..25cfac5f58aa7 --- /dev/null +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -0,0 +1,19 @@ +You can create a new Python environment using `conda`: + +```console +# (Recommended) Create a new conda environment. +conda create -n myenv python=3.12 -y +conda activate myenv +``` + +```{note} +[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. +``` + +Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: + +```console +# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. +uv venv myenv --python 3.12 --seed +source myenv/bin/activate +``` diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index ff216f8af30f9..8ac80e5e5c553 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -2,34 +2,45 @@ # Quickstart -This guide will help you quickly get started with vLLM to: +This guide will help you quickly get started with vLLM to perform: -- [Run offline batched inference](#offline-batched-inference) -- [Run OpenAI-compatible inference](#openai-compatible-server) +- [Offline batched inference](#quickstart-offline) +- [Online serving using OpenAI-compatible server](#quickstart-online) ## Prerequisites - OS: Linux - Python: 3.9 -- 3.12 -- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) ## Installation -You can install vLLM using pip. 
It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. +If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly. + +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: ```console -$ conda create -n myenv python=3.10 -y -$ conda activate myenv -$ pip install vllm +uv venv myenv --python 3.12 --seed +source myenv/bin/activate +uv pip install vllm ``` -Please refer to the [installation documentation](#installation-index) for more details on installing vLLM. +You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. + +```console +conda create -n myenv python=3.12 -y +conda activate myenv +pip install vllm +``` + +```{note} +For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. +``` -(offline-batched-inference)= +(quickstart-offline)= ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -40,7 +51,7 @@ The first line of this example imports the classes {class}`~vllm.LLM` and {class from vllm import LLM, SamplingParams ``` -The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html). +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params). ```python prompts = [ @@ -73,7 +84,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -(openai-compatible-server)= +(quickstart-online)= ## OpenAI-Compatible Server @@ -83,7 +94,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: ```console -$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` ```{note} @@ -94,7 +105,7 @@ You can learn about overriding it [here](#chat-template). This server can be queried in the same format as OpenAI API. 
For example, to list the models: ```console -$ curl http://localhost:8000/v1/models +curl http://localhost:8000/v1/models ``` You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. @@ -104,14 +115,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` Once your server is started, you can query the model with input prompts: ```console -$ curl http://localhost:8000/v1/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "prompt": "San Francisco is a", -$ "max_tokens": 7, -$ "temperature": 0 -$ }' +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' ``` Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: @@ -131,7 +142,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` -A more detailed client example can be found here: +A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM @@ -140,15 +151,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: ```console -$ curl http://localhost:8000/v1/chat/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "messages": [ -$ {"role": "system", "content": "You are a helpful assistant."}, -$ {"role": "user", "content": "Who won the world series in 2020?"} -$ ] -$ }' +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ] + }' ``` Alternatively, you can use the `openai` Python package: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 5a0310da0f2cb..1e290d2b4c0bd 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Model is too large -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. 
In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging @@ -48,6 +48,7 @@ If vLLM crashes and the error trace captures it somewhere around `self.graph.rep To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. (troubleshooting-incorrect-hardware-driver)= + ## Incorrect hardware/driver If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. @@ -118,13 +119,13 @@ dist.destroy_process_group() If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: ```console -$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py +NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py ``` If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: ```console -$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py +NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py ``` If the script runs successfully, you should see the message `sanity check is successful!`. @@ -141,6 +142,7 @@ Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup ``` (troubleshooting-python-multiprocessing)= + ## Python multiprocessing ### `RuntimeError` Exception diff --git a/docs/source/index.md b/docs/source/index.md index 4bc40bf0f5e41..8f9493d77186e 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,4 +1,4 @@ -# Welcome to vLLM! +# Welcome to vLLM ```{figure} ./assets/logos/vllm-logo-text-light.png :align: center @@ -26,7 +26,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 @@ -54,6 +54,8 @@ For more information, check out the following: ## Documentation +% How to start using vLLM? 
+ ```{toctree} :caption: Getting Started :maxdepth: 1 @@ -65,41 +67,26 @@ getting_started/troubleshooting getting_started/faq ``` -```{toctree} -:caption: Serving -:maxdepth: 1 - -serving/openai_compatible_server -serving/deploying_with_docker -serving/deploying_with_k8s -serving/deploying_with_helm -serving/deploying_with_nginx -serving/distributed_serving -serving/metrics -serving/integrations -serving/tensorizer -serving/runai_model_streamer -serving/engine_args -serving/env_vars -serving/usage_stats -``` +% What does vLLM support? ```{toctree} :caption: Models :maxdepth: 1 -models/supported_models models/generative_models models/pooling_models +models/supported_models +models/extensions/index ``` +% Additional capabilities + ```{toctree} :caption: Features :maxdepth: 1 features/quantization/index features/lora -features/multimodal_inputs features/tool_calling features/structured_outputs features/automatic_prefix_caching @@ -108,35 +95,47 @@ features/spec_decode features/compatibility_matrix ``` +% Details about running vLLM + ```{toctree} -:caption: Performance +:caption: Inference and Serving :maxdepth: 1 -performance/optimization -performance/benchmarks +serving/offline_inference +serving/openai_compatible_server +serving/multimodal_inputs +serving/distributed_serving +serving/metrics +serving/engine_args +serving/env_vars +serving/usage_stats +serving/integrations/index ``` -% Community: User community resources +% Scaling up vLLM for production ```{toctree} -:caption: Community +:caption: Deployment :maxdepth: 1 -community/meetups -community/sponsors +deployment/docker +deployment/k8s +deployment/nginx +deployment/frameworks/index +deployment/integrations/index ``` +% Making the most out of vLLM + ```{toctree} -:caption: API Reference -:maxdepth: 2 +:caption: Performance +:maxdepth: 1 -dev/sampling_params -dev/pooling_params -dev/offline_inference/offline_index -dev/engine/engine_index +performance/optimization +performance/benchmarks ``` -% Design Documents: Details about vLLM internals +% Explanation of vLLM internals ```{toctree} :caption: Design Documents @@ -146,13 +145,12 @@ design/arch_overview design/huggingface_integration design/plugin_system design/kernel/paged_attention -design/input_processing/model_inputs_index -design/multimodal/multimodal_index +design/mm_processing design/automatic_prefix_caching design/multiprocessing ``` -% Developer Guide: How to contribute to the vLLM project +% How to contribute to the vLLM project ```{toctree} :caption: Developer Guide @@ -162,9 +160,33 @@ contributing/overview contributing/profiling/profiling_index contributing/dockerfile/dockerfile contributing/model/index +contributing/vulnerability_management +``` + +% Technical API specifications + +```{toctree} +:caption: API Reference +:maxdepth: 2 + +api/offline_inference/index +api/engine/index +api/inference_params +api/multimodal/index +api/model/index +``` + +% Latest news and acknowledgements + +```{toctree} +:caption: Community +:maxdepth: 1 + +community/meetups +community/sponsors ``` -# Indices and tables +## Indices and tables - {ref}`genindex` - {ref}`modindex` diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md new file mode 100644 index 0000000000000..cff09d12eba47 --- /dev/null +++ b/docs/source/models/extensions/index.md @@ -0,0 +1,8 @@ +# Built-in Extensions + +```{toctree} +:maxdepth: 1 + +runai_model_streamer +tensorizer +``` diff --git a/docs/source/serving/runai_model_streamer.md 
b/docs/source/models/extensions/runai_model_streamer.md similarity index 69% rename from docs/source/serving/runai_model_streamer.md rename to docs/source/models/extensions/runai_model_streamer.md index d4269050ff574..75f7a9fcad416 100644 --- a/docs/source/serving/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -1,6 +1,6 @@ (runai-model-streamer)= -# Loading Models with Run:ai Model Streamer +# Loading models with Run:ai Model Streamer Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md). @@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea You first need to install vLLM RunAI optional dependency: ```console -$ pip3 install vllm[runai] +pip3 install vllm[runai] ``` To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` ## Tunable parameters @@ -38,14 +38,14 @@ You can tune `concurrency` that controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` ```{note} diff --git a/docs/source/serving/tensorizer.md b/docs/source/models/extensions/tensorizer.md similarity index 89% rename from docs/source/serving/tensorizer.md rename to docs/source/models/extensions/tensorizer.md index d3dd29d48f730..ae17e3437bca6 100644 --- a/docs/source/serving/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -1,6 +1,6 @@ (tensorizer)= -# Loading Models with CoreWeave's Tensorizer +# Loading models with CoreWeave's Tensorizer vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). 
vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized @@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). ```{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 383299d61b5dd..e4b4cd03a90d2 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -8,14 +8,14 @@ In vLLM, generative models implement the {class}`~vllm.model_executor.models.Vll Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. +For generative models, the only supported `--task` option is `"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. See [Engine Arguments](#engine-args) for a list of options when initializing the model. -For generative models, the only supported {code}`task` option is {code}`"generate"`. -Usually, this is automatically inferred so you don't have to specify it. - ### `LLM.generate` The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. @@ -33,7 +33,7 @@ for output in outputs: ``` You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. -For example, you can use greedy sampling by setting {code}`temperature=0`: +For example, you can use greedy sampling by setting `temperature=0`: ```python llm = LLM(model="facebook/opt-125m") @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: @@ -118,7 +118,7 @@ print("Loaded chat template:", custom_template) outputs = llm.chat(conversation, chat_template=custom_template) ``` -## Online Inference +## Online Serving Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 12ded68eb30b5..91db694be29a4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -14,30 +14,53 @@ As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM feature pooling models as they only work on the generation or decode stage, so performance may not improve as much. 
``` -## Offline Inference - -The {class}`~vllm.LLM` class provides various methods for offline inference. -See [Engine Arguments](#engine-args) for a list of options when initializing the model. - -For pooling models, we support the following {code}`task` options: - -- Embedding ({code}`"embed"` / {code}`"embedding"`) -- Classification ({code}`"classify"`) -- Sentence Pair Scoring ({code}`"score"`) -- Reward Modeling ({code}`"reward"`) +For pooling models, we support the following `--task` options. +The selected option sets the default pooler used to extract the final hidden states: + +```{list-table} +:widths: 50 25 25 25 +:header-rows: 1 + +* - Task + - Pooling Type + - Normalization + - Softmax +* - Embedding (`embed`) + - `LAST` + - ✅︎ + - ✗ +* - Classification (`classify`) + - `LAST` + - ✗ + - ✅︎ +* - Sentence Pair Scoring (`score`) + - \* + - \* + - \* +* - Reward Modeling (`reward`) + - `ALL` + - ✗ + - ✗ +``` -The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used: +\*The default pooler is always defined by the model. -- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. -- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. -- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. -- Reward Modeling: Extract all of the hidden states and return them directly. +```{note} +If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. +``` When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`). +we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). -You can customize the model's pooling method via the {code}`override_pooler_config` option, +```{tip} +You can customize the model's pooling method via the `--override-pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. +``` + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. ### `LLM.encode` @@ -65,7 +88,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -80,7 +103,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -102,9 +125,9 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: -## Online Inference +## Online Serving Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 5a2778026192a..642ef3c9655b8 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -1,9 +1,9 @@ (supported-models)= -# Supported Models +# List of Supported Models vLLM supports generative and pooling models across various tasks. 
-If a model supports more than one task, you can set the task via the {code}`--task` argument. +If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. @@ -14,8 +14,8 @@ Alongside each architecture, we include some popular models that use it. By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models). -To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository. -If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory. +To determine whether a given model is supported, you can check the `config.json` file inside the HF repository. +If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory. ````{tip} The easiest way to check if your model is really supported at runtime is to run the program below: @@ -45,10 +45,10 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: ```shell -$ export VLLM_USE_MODELSCOPE=True +export VLLM_USE_MODELSCOPE=True ``` -And use with {code}`trust_remote_code=True`. +And use with `trust_remote_code=True`. ```python from vllm import LLM @@ -322,7 +322,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2ForCausalLM` - - Qwen2 + - QwQ, Qwen2 - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ @@ -420,20 +420,23 @@ you should explicitly specify the task type to ensure that the model is used in ``` ```{note} -{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. -You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`. +`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. +You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. ``` ```{note} -Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. -You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. +Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. +You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. -On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention +On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. + +Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be +loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. 
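For example, a minimal sketch of forcing this conversion for an unlisted generation checkpoint (using `Qwen/Qwen2-7B-Instruct` purely as an illustration) is to pass `--task embed` when serving, so the model should be loaded through the embedding adapter described above:

```console
vllm serve Qwen/Qwen2-7B-Instruct --task embed
```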
#### Reward Modeling (`--task reward`) @@ -465,11 +468,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. +{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. ```{important} -For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, -e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, +e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. ``` #### Classification (`--task classify`) @@ -496,7 +499,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) @@ -537,16 +540,38 @@ The following modalities are supported depending on the model: - **V**ideo - **A**udio -Any combination of modalities joined by {code}`+` are supported. +Any combination of modalities joined by `+` are supported. -- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs. +- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs. -On the other hand, modalities separated by {code}`/` are mutually exclusive. +On the other hand, modalities separated by `/` are mutually exclusive. -- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. +- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. +````{important} +To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) +or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: + +Offline inference: +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` + +Online serving: +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` + +```{note} +vLLM currently only supports adding LoRA to the language backbone of multimodal models. +``` + ### Generative Models See [this page](#generative-models) for more information on how to use generative models. @@ -585,6 +610,13 @@ See [this page](#generative-models) for more information on how to use generativ - - ✅︎ - ✅︎ +* - `DeepseekVLV2ForCausalLM` + - DeepSeek-VL2 + - T + I+ + - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. 
(see note) + - + - ✅︎ + - ✅︎ * - `FuyuForCausalLM` - Fuyu - T + I @@ -640,14 +672,14 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - ✅︎ - - + - ✅︎ * - `LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - T + I+ + V+ - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ - - + - ✅︎ * - `MiniCPMV` - MiniCPM-V - T + IE+ @@ -686,14 +718,14 @@ See [this page](#generative-models) for more information on how to use generativ * - `Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. + - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - - ✅︎ - ✅︎ * - `PixtralForConditionalGeneration` - Pixtral - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. + - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. - - ✅︎ - ✅︎ @@ -710,9 +742,9 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/Qwen2-Audio-7B-Instruct` - - ✅︎ - - + - ✅︎ * - `Qwen2VLForConditionalGeneration` - - Qwen2-VL + - QVQ, Qwen2-VL - T + IE+ + VE+ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ @@ -724,39 +756,35 @@ See [this page](#generative-models) for more information on how to use generativ - `fixie-ai/ultravox-v0_3` - - ✅︎ - - + - ✅︎ ``` E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -````{important} -To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) -or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: +````{note} +The `deepseek-ai/deepseek-vl2-tiny` is not supported yet. -```python -llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, -) +To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package: +```shell +pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git ``` -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 -``` +Besides, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. ```` ```{note} -vLLM currently only supports adding LoRA to the language backbone of multimodal models. +To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` ```{note} -To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. +For more details, please see: ``` ```{note} -The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now. -For more details, please see: +The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). +A corrected version is available at . 
``` ### Pooling Models @@ -770,7 +798,7 @@ you should explicitly specify the task type to ensure that the model is used in #### Text Embedding (`--task embed`) -Any text generation model can be converted into an embedding model by passing {code}`--task embed`. +Any text generation model can be converted into an embedding model by passing `--task embed`. ```{note} To get the best results, you should use pooling models that are specifically trained as such. @@ -810,19 +838,22 @@ The following table lists those that are tested in vLLM. _________________ -# Model Support Policy +## Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! + 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. -```{tip} -When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. -``` + ```{tip} + When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + ``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. + 4. 
**Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. + 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 4fcde9b03b887..4fbc376e1aa39 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: -``` +```text WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 ``` diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md deleted file mode 100644 index 5f9b0e4f55ecc..0000000000000 --- a/docs/source/serving/deploying_with_k8s.md +++ /dev/null @@ -1,248 +0,0 @@ -(deploying-with-k8s)= - -# Deploying with Kubernetes - -Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. - -## Prerequisites - -Before you begin, ensure that you have the following: - -- A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` -- Available GPU resources in your cluster - -## Deployment Steps - -1. **Create a PVC , Secret and Deployment for vLLM** - -PVC is used to store the model cache and it is optional, you can use hostPath or other storage options - -```yaml -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: mistral-7b - namespace: default -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - storageClassName: default - volumeMode: Filesystem -``` - -Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models - -```yaml -apiVersion: v1 -kind: Secret -metadata: - name: hf-token-secret - namespace: default -type: Opaque -stringData: - token: "REPLACE_WITH_TOKEN" -``` - -Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. - -Here are two examples for using NVIDIA GPU and AMD GPU. 
- -- NVIDIA GPU - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. - - name: shm - emptyDir: - medium: Memory - sizeLimit: "2Gi" - containers: - - name: mistral-7b - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: 6G - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /root/.cache/huggingface - name: cache-volume - - name: shm - mountPath: /dev/shm - livenessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 5 -``` - -- AMD GPU - -You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - # PVC - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. - - name: shm - emptyDir: - medium: Memory - sizeLimit: "8Gi" - hostNetwork: true - hostIPC: true - containers: - - name: mistral-7b - image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 - securityContext: - seccompProfile: - type: Unconfined - runAsGroup: 44 - capabilities: - add: - - SYS_PTRACE - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - amd.com/gpu: "1" - requests: - cpu: "6" - memory: 6G - amd.com/gpu: "1" - volumeMounts: - - name: cache-volume - mountPath: /root/.cache/huggingface - - name: shm - mountPath: /dev/shm -``` -You can get the full example with steps and sample yaml files from . - -2. **Create a Kubernetes Service for vLLM** - -Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: - -```yaml -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b - namespace: default -spec: - ports: - - name: http-mistral-7b - port: 80 - protocol: TCP - targetPort: 8000 - # The label selector should match the deployment labels & it is useful for prefix caching feature - selector: - app: mistral-7b - sessionAffinity: None - type: ClusterIP -``` - -3. 
**Deploy and Test** - -Apply the deployment and service configurations using `kubectl apply -f `: - -```console -kubectl apply -f deployment.yaml -kubectl apply -f service.yaml -``` - -To test the deployment, run the following `curl` command: - -```console -curl http://mistral-7b.default.svc.cluster.local/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "mistralai/Mistral-7B-Instruct-v0.3", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' -``` - -If the service is correctly deployed, you should receive a response from the vLLM model. - -## Conclusion - -Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 6fbc1ea104678..daf6e2f250416 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -18,13 +18,13 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. ``` -## Details for Distributed Inference and Serving +## Running vLLM on a single node vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. -Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. -To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: +To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. 
For example, to run inference on 4 GPUs: ```python from vllm import LLM @@ -32,45 +32,45 @@ llm = LLM("facebook/opt-13b", tensor_parallel_size=4) output = llm.generate("San Franciso is a") ``` -To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: +To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: ```console -$ vllm serve facebook/opt-13b \ -$ --tensor-parallel-size 4 + vllm serve facebook/opt-13b \ + --tensor-parallel-size 4 ``` -You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: +You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: ```console -$ vllm serve gpt2 \ -$ --tensor-parallel-size 4 \ -$ --pipeline-parallel-size 2 + vllm serve gpt2 \ + --tensor-parallel-size 4 \ + --pipeline-parallel-size 2 ``` -## Multi-Node Inference and Serving +## Running vLLM on multiple nodes If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. -The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. +The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. Pick a node as the head node, and run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --head \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ + vllm/vllm-openai \ + ip_of_head_node \ + --head \ + /path/to/the/huggingface/home/in/this/node ``` On the rest of the worker nodes, run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --worker \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ + vllm/vllm-openai \ + ip_of_head_node \ + --worker \ + /path/to/the/huggingface/home/in/this/node ``` Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. 
A common misunderstanding is to use the IP address of the worker node, which is not correct. @@ -80,16 +80,16 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container, After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 8 \ -$ --pipeline-parallel-size 2 + vllm serve /path/to/the/model/in/the/container \ + --tensor-parallel-size 8 \ + --pipeline-parallel-size 2 ``` You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 16 +vllm serve /path/to/the/model/in/the/container \ + --tensor-parallel-size 16 ``` To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md deleted file mode 100644 index d214c77254257..0000000000000 --- a/docs/source/serving/integrations.md +++ /dev/null @@ -1,17 +0,0 @@ -# Integrations - -```{toctree} -:maxdepth: 1 - -run_on_sky -deploying_with_kserve -deploying_with_kubeai -deploying_with_triton -deploying_with_bentoml -deploying_with_cerebrium -deploying_with_lws -deploying_with_dstack -serving_with_langchain -serving_with_llamaindex -serving_with_llamastack -``` diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md new file mode 100644 index 0000000000000..371c284981ce9 --- /dev/null +++ b/docs/source/serving/integrations/index.md @@ -0,0 +1,8 @@ +# External Integrations + +```{toctree} +:maxdepth: 1 + +langchain +llamaindex +``` diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/integrations/langchain.md similarity index 76% rename from docs/source/serving/serving_with_langchain.md rename to docs/source/serving/integrations/langchain.md index 96bd5943f3d64..03142d23b145a 100644 --- a/docs/source/serving/serving_with_langchain.md +++ b/docs/source/serving/integrations/langchain.md @@ -1,13 +1,13 @@ -(run-on-langchain)= +(serving-langchain)= -# Serving with Langchain +# LangChain -vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain) . 
+vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) . -To install langchain, run +To install LangChain, run ```console -$ pip install langchain langchain_community -q +pip install langchain langchain_community -q ``` To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/integrations/llamaindex.md similarity index 68% rename from docs/source/serving/serving_with_llamaindex.md rename to docs/source/serving/integrations/llamaindex.md index 98859d8e3f828..8c72605202cf5 100644 --- a/docs/source/serving/serving_with_llamaindex.md +++ b/docs/source/serving/integrations/llamaindex.md @@ -1,13 +1,13 @@ -(run-on-llamaindex)= +(serving-llamaindex)= -# Serving with llama_index +# LlamaIndex -vLLM is also available via [llama_index](https://github.com/run-llama/llama_index) . +vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) . -To install llamaindex, run +To install LlamaIndex, run ```console -$ pip install llama-index-llms-vllm -q +pip install llama-index-llms-vllm -q ``` To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 2dc78643f6d8f..6c84f6d1350a6 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -4,10 +4,10 @@ vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. -You can start the server using Python, or using [Docker](deploying_with_docker.md): +You can start the server using Python, or using [Docker](#deployment-docker): ```console -$ vllm serve unsloth/Llama-3.2-1B-Instruct +vllm serve unsloth/Llama-3.2-1B-Instruct ``` Then query the endpoint to get the latest metrics from the server: diff --git a/docs/source/features/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md similarity index 91% rename from docs/source/features/multimodal_inputs.md rename to docs/source/serving/multimodal_inputs.md index 4f45a9f448cf0..0213b0a3388ea 100644 --- a/docs/source/features/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -14,11 +14,11 @@ and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/ch To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: - `prompt`: The prompt should follow the format that is documented on HuggingFace. -- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`. +- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`. ### Image -You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples: +You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: ```python llm = LLM(model="llava-hf/llava-1.5-7b-hf") @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. 
We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -122,21 +122,21 @@ for o in outputs: ### Video -You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary +You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio -You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary. +You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. -Full example: +Full example: ### Embedding To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, -pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. +pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. ```python # Inference with image embeddings as input @@ -199,7 +199,7 @@ for o in outputs: print(generated_text) ``` -## Online Inference +## Online Serving Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). @@ -271,7 +271,7 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` -Full example: +Full example: ```{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, @@ -294,7 +294,7 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT= ### Video -Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf). +Instead of `image_url`, you can pass a video file via `video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf). First, launch the OpenAI-compatible server: @@ -303,6 +303,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ``` Then, you can use the OpenAI client as follows: + ```python from openai import OpenAI @@ -342,7 +343,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. @@ -418,7 +419,7 @@ result = chat_completion_from_base64.choices[0].message.content print("Chat completion output from input audio:", result) ``` -Alternatively, you can pass {code}`audio_url`, which is the audio counterpart of {code}`image_url` for image input: +Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: ```python chat_completion_from_url = client.chat.completions.create( @@ -445,7 +446,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. @@ -529,4 +530,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th example below for details. 
``` -Full example: +Full example: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md new file mode 100644 index 0000000000000..94703a1c32ade --- /dev/null +++ b/docs/source/serving/offline_inference.md @@ -0,0 +1,79 @@ +(offline-inference)= + +# Offline Inference + +You can run vLLM in your own code on a list of prompts. + +The offline API is based on the {class}`~vllm.LLM` class. +To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run. + +For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace +and runs it in vLLM using the default configuration. + +```python +llm = LLM(model="facebook/opt-125m") +``` + +After initializing the `LLM` instance, you can perform model inference using various APIs. +The available APIs depend on the type of model that is being run: + +- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text. +- [Pooling models](#pooling-models) output their hidden states directly. + +Please refer to the above pages for more details about each API. + +```{seealso} +[API Reference](/api/offline_inference/index) +``` + +## Configuration Options + +This section lists the most common options for running the vLLM engine. +For a full list, refer to the [Engine Arguments](#engine-args) page. + +### Reducing memory usage + +Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. + +#### Tensor Parallelism (TP) + +Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs. + +The following code splits the model across 2 GPUs. + +```python +llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", + tensor_parallel_size=2) +``` + +```{important} +To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`) +before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. + +To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. +``` + +#### Quantization + +Quantized models take less memory at the cost of lower precision. + +Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Neural Magic](https://huggingface.co/neuralmagic)) +and used directly without extra configuration. + +Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details. + +#### Context length and batch size + +You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) +and the maximum batch size (`max_num_seqs` option). + +```python +llm = LLM(model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2) +``` + +### Performance optimization and tuning + +You can potentially improve the performance of vLLM by finetuning various options. +Please refer to [this guide](#optimization-and-tuning) for more details. 
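Putting the options above together, here is a minimal end-to-end sketch of offline inference. The model name and parameter values are illustrative only and should be adjusted to your hardware:

```python
from vllm import LLM, SamplingParams

# Cap the context length and batch size to keep memory usage low
# (the values here are examples, not recommendations).
llm = LLM(
    model="facebook/opt-125m",
    max_model_len=2048,
    max_num_seqs=4,
)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

outputs = llm.generate(["The capital of France is"], sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```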
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 97e9879075570..e49bbb06695f8 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -1,13 +1,17 @@ -# OpenAI Compatible Server +(openai-compatible-server)= -vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more! +# OpenAI-Compatible Server + +vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! + +You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): -You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. + ```python from openai import OpenAI client = OpenAI( @@ -48,6 +52,7 @@ In addition, we have the following custom APIs: - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= + ## Chat Template In order for the language model to support chat protocol, vLLM requires the model to include @@ -69,6 +74,7 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -78,7 +84,7 @@ completion = client.chat.completions.create( ) ``` -Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like *"Detected the chat template content format to be..."*, and internally converts incoming requests to match @@ -113,12 +119,12 @@ completion = client.chat.completions.create( ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled -with `--enable-request-id-headers`. +with `--enable-request-id-headers`. > Note that enablement of the headers can impact performance significantly at high QPS > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio), > rather than within the vLLM layer for this reason. -> See https://github.com/vllm-project/vllm/pull/11529 for more details. +> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details. ```python completion = client.chat.completions.create( @@ -145,6 +151,7 @@ print(completion._request_id) ## CLI Reference (vllm-serve)= + ### `vllm serve` The `vllm serve` command is used to launch the OpenAI-compatible server. 
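As a sketch of a typical invocation (the flags shown are common options and the values are placeholders; run `vllm serve --help` for the authoritative list):

```bash
# Serve a model on all interfaces, port 8000, and require an API key from clients.
# Model name, port, and key below are examples only.
vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
    --host 0.0.0.0 \
    --port 8000 \
    --api-key token-abc123
```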
@@ -173,7 +180,7 @@ uvicorn-log-level: "info" To use the above config file: ```bash -$ vllm serve SOME_MODEL --config config.yaml +vllm serve SOME_MODEL --config config.yaml ``` ```{note} @@ -184,16 +191,17 @@ The order of priorities is `command line > config file values > defaults`. ## API Reference (completions-api)= + ### Completions API Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -Code example: +Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -210,6 +218,7 @@ The following extra parameters are supported: ``` (chat-api)= + ### Chat API Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat); @@ -220,11 +229,11 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - *Note: `image_url.detail` parameter is not supported.* -Code example: +Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -241,6 +250,7 @@ The following extra parameters are supported: ``` (embeddings-api)= + ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); @@ -253,11 +263,11 @@ which will be treated as a single prompt to the model. This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` -Code example: +Code example: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters](#pooling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -282,6 +292,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s ``` (tokenizer-api)= + ### Tokenizer API Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). @@ -291,15 +302,17 @@ It consists of two endpoints: - `/detokenize` corresponds to calling `tokenizer.decode()`. (pooling-api)= + ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: +Code example: (score-api)= + ### Score API Our Score API applies a cross-encoder model to predict scores for sentence pairs. @@ -307,7 +320,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). 
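As a quick sketch of calling this API with `curl` (the endpoint path, request fields, and model name below are assumptions based on this section; treat the linked code example as authoritative):

```bash
# Assumed request shape: a cross-encoder served with `--task score`,
# scoring a single sentence pair. Verify the exact fields against the code example.
curl -X POST http://localhost:8000/score \
  -H "Content-Type: application/json" \
  -d '{
    "model": "BAAI/bge-reranker-v2-m3",
    "text_1": "What is the capital of France?",
    "text_2": "The capital of France is Paris."
  }'
```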
-Code example: +Code example: #### Single inference @@ -445,7 +458,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters](#pooling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/docs/source/serving/usage_stats.md b/docs/source/serving/usage_stats.md index 3d02fbab9216e..cfc3cb2576873 100644 --- a/docs/source/serving/usage_stats.md +++ b/docs/source/serving/usage_stats.md @@ -45,7 +45,7 @@ You can preview the collected data by running the following command: tail ~/.config/vllm/usage_stats.json ``` -## Opt-out of Usage Stats Collection +## Opting out You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: diff --git a/examples/gguf_inference.py b/examples/gguf_inference.py deleted file mode 100644 index 09a5fcc22e553..0000000000000 --- a/examples/gguf_inference.py +++ /dev/null @@ -1,38 +0,0 @@ -from huggingface_hub import hf_hub_download - -from vllm import LLM, SamplingParams - - -def run_gguf_inference(model_path): - PROMPT_TEMPLATE = "<|system|>\n{system_message}\n<|user|>\n{prompt}\n<|assistant|>\n" # noqa: E501 - system_message = "You are a friendly chatbot who always responds in the style of a pirate." # noqa: E501 - # Sample prompts. - prompts = [ - "How many helicopters can a human eat in one sitting?", - "What's the future of AI?", - ] - prompts = [ - PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt) - for prompt in prompts - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0, max_tokens=128) - - # Create an LLM. - llm = LLM(model=model_path, - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - gpu_memory_utilization=0.95) - - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. 
- for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -if __name__ == "__main__": - repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" - filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" - model = hf_hub_download(repo_id, filename=filename) - run_gguf_inference(model) diff --git a/examples/aqlm_example.py b/examples/offline_inference/aqlm_example.py similarity index 100% rename from examples/aqlm_example.py rename to examples/offline_inference/aqlm_example.py diff --git a/examples/offline_inference_arctic.py b/examples/offline_inference/arctic.py similarity index 100% rename from examples/offline_inference_arctic.py rename to examples/offline_inference/arctic.py diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference/audio_language.py similarity index 100% rename from examples/offline_inference_audio_language.py rename to examples/offline_inference/audio_language.py diff --git a/examples/offline_inference.py b/examples/offline_inference/basic.py similarity index 100% rename from examples/offline_inference.py rename to examples/offline_inference/basic.py diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference/basic_with_model_default_sampling.py similarity index 100% rename from examples/offline_inference_with_default_generation_config.py rename to examples/offline_inference/basic_with_model_default_sampling.py diff --git a/examples/offline_inference_chat.py b/examples/offline_inference/chat.py similarity index 100% rename from examples/offline_inference_chat.py rename to examples/offline_inference/chat.py diff --git a/examples/offline_chat_with_tools.py b/examples/offline_inference/chat_with_tools.py similarity index 100% rename from examples/offline_chat_with_tools.py rename to examples/offline_inference/chat_with_tools.py diff --git a/examples/offline_inference_classification.py b/examples/offline_inference/classification.py similarity index 100% rename from examples/offline_inference_classification.py rename to examples/offline_inference/classification.py diff --git a/examples/offline_inference_cli.py b/examples/offline_inference/cli.py similarity index 100% rename from examples/offline_inference_cli.py rename to examples/offline_inference/cli.py diff --git a/examples/cpu_offload.py b/examples/offline_inference/cpu_offload.py similarity index 100% rename from examples/cpu_offload.py rename to examples/offline_inference/cpu_offload.py diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference/distributed.py similarity index 100% rename from examples/offline_inference_distributed.py rename to examples/offline_inference/distributed.py diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference/embedding.py similarity index 100% rename from examples/offline_inference_embedding.py rename to examples/offline_inference/embedding.py diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference/encoder_decoder.py similarity index 100% rename from examples/offline_inference_encoder_decoder.py rename to examples/offline_inference/encoder_decoder.py diff --git a/examples/florence2_inference.py b/examples/offline_inference/florence2_inference.py similarity index 93% rename from examples/florence2_inference.py rename to examples/offline_inference/florence2_inference.py index b58ac2e1f7ed4..c24096e90004b 100644 --- a/examples/florence2_inference.py 
+++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,8 @@ encoder/decoder models, specifically Florence-2 ''' # TODO(Isotr0py): -# Move to offline_inference_vision_language.py after porting vision backbone +# Move to offline_inference/vision_language.py +# after porting vision backbone from vllm import LLM, SamplingParams dtype = "float" diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py new file mode 100644 index 0000000000000..aa05c4c0bfaa5 --- /dev/null +++ b/examples/offline_inference/gguf_inference.py @@ -0,0 +1,32 @@ +from huggingface_hub import hf_hub_download + +from vllm import LLM, SamplingParams + + +def run_gguf_inference(model_path, tokenizer): + # Sample prompts. + prompts = [ + "How many helicopters can a human eat in one sitting?", + "What's the future of AI?", + ] + prompts = [[{"role": "user", "content": prompt}] for prompt in prompts] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0, max_tokens=128) + + # Create an LLM. + llm = LLM(model=model_path, tokenizer=tokenizer) + + outputs = llm.chat(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF" + filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf" + tokenizer = "microsoft/Phi-3-medium-4k-instruct" + model = hf_hub_download(repo_id, filename=filename) + run_gguf_inference(model, tokenizer) diff --git a/examples/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py similarity index 100% rename from examples/llm_engine_example.py rename to examples/offline_inference/llm_engine_example.py diff --git a/examples/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py similarity index 100% rename from examples/lora_with_quantization_inference.py rename to examples/offline_inference/lora_with_quantization_inference.py diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference/mlpspeculator.py similarity index 100% rename from examples/offline_inference_mlpspeculator.py rename to examples/offline_inference/mlpspeculator.py diff --git a/examples/multilora_inference.py b/examples/offline_inference/multilora_inference.py similarity index 100% rename from examples/multilora_inference.py rename to examples/offline_inference/multilora_inference.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference/neuron.py similarity index 81% rename from examples/offline_inference_neuron.py rename to examples/offline_inference/neuron.py index 2856be7c864ea..f098c8e5fed1e 100644 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference/neuron.py @@ -1,12 +1,5 @@ -import os - from vllm import LLM, SamplingParams -# creates XLA hlo graphs for all the context length buckets. -os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" -# creates XLA hlo graphs for all the token gen buckets. -os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" - # Sample prompts. prompts = [ "Hello, my name is", @@ -26,8 +19,8 @@ # Currently, this is a known limitation in continuous batching support # in transformers-neuronx. # TODO(liangfu): Support paged-attention in transformers-neuronx. 
- max_model_len=2048, - block_size=2048, + max_model_len=1024, + block_size=1024, # The device can be automatically detected when AWS Neuron SDK is installed. # The device argument can be either unspecified for automated detection, # or explicitly assigned. diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference_neuron_int8_quantization.py rename to examples/offline_inference/neuron_int8_quantization.py diff --git a/examples/offline_inference_openai.md b/examples/offline_inference/openai/openai_batch.md similarity index 92% rename from examples/offline_inference_openai.md rename to examples/offline_inference/openai/openai_batch.md index 2436417cb543a..a4774e57cd9a5 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
``` -aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/openai_example_batch.jsonl b/examples/offline_inference/openai/openai_example_batch.jsonl similarity index 100% rename from examples/openai_example_batch.jsonl rename to examples/offline_inference/openai/openai_example_batch.jsonl diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference/pixtral.py similarity index 100% rename from examples/offline_inference_pixtral.py rename to examples/offline_inference/pixtral.py diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference/prefix_caching.py similarity index 100% rename from examples/offline_inference_with_prefix.py rename to examples/offline_inference/prefix_caching.py diff --git a/examples/offline_profile.py b/examples/offline_inference/profiling.py similarity index 99% rename from examples/offline_profile.py rename to examples/offline_inference/profiling.py index 46afe8aa2604b..8a94b5c2a8623 100644 --- a/examples/offline_profile.py +++ b/examples/offline_inference/profiling.py @@ -363,7 +363,7 @@ def abort_requests(): example: ``` - python examples/offline_profile.py \\ + python examples/offline_inference/profiling.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ --enforce-eager run_num_steps -n 2 diff --git a/examples/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py similarity index 100% rename from examples/save_sharded_state.py rename to examples/offline_inference/save_sharded_state.py diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference/scoring.py similarity index 100% rename from examples/offline_inference_scoring.py rename to examples/offline_inference/scoring.py diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference/simple_profiling.py similarity index 100% rename from examples/offline_inference_with_profiler.py rename to examples/offline_inference/simple_profiling.py diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference/structured_outputs.py similarity index 100% rename from examples/offline_inference_structured_outputs.py rename to examples/offline_inference/structured_outputs.py diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference/tpu.py similarity index 100% rename from examples/offline_inference_tpu.py rename to examples/offline_inference/tpu.py diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference/vision_language.py similarity index 97% rename from examples/offline_inference_vision_language.py rename to examples/offline_inference/vision_language.py index b51bfae455267..ad32b9fe242e9 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -66,6 +66,23 @@ def run_chameleon(question: str, modality: str): return llm, prompt, stop_token_ids +# Deepseek-VL2 +def run_deepseek_vl2(question: str, modality: str): + assert modality == "image" + + model_name = "deepseek-ai/deepseek-vl2-small" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}) + + prompt = f"<|User|>: 
\n{question}\n\n<|Assistant|>:" + stop_token_ids = None + return llm, prompt, stop_token_ids + + # Fuyu def run_fuyu(question: str, modality: str): assert modality == "image" @@ -498,6 +515,7 @@ def run_qwen2_vl(question: str, modality: str): "aria": run_aria, "blip-2": run_blip2, "chameleon": run_chameleon, + "deepseek_vl_v2": run_deepseek_vl2, "fuyu": run_fuyu, "glm4v": run_glm4v, "h2ovl_chat": run_h2ovl, diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py similarity index 100% rename from examples/offline_inference_vision_language_embedding.py rename to examples/offline_inference/vision_language_embedding.py diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py similarity index 89% rename from examples/offline_inference_vision_language_multi_image.py rename to examples/offline_inference/vision_language_multi_image.py index 6af8d7768e75d..c6cf3f30c31cb 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -23,7 +23,7 @@ class ModelRequestData(NamedTuple): llm: LLM prompt: str - stop_token_ids: Optional[List[str]] + stop_token_ids: Optional[List[int]] image_data: List[Image] chat_template: Optional[str] @@ -44,12 +44,36 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" "<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return ModelRequestData( llm=llm, prompt=prompt, stop_token_ids=stop_token_ids, image_data=[fetch_image(url) for url in image_urls], - chat_template=None) + chat_template=None, + ) + + +def load_deepseek_vl2(question: str, image_urls: List[str]): + model_name = "deepseek-ai/deepseek-vl2-small" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=2, + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, + limit_mm_per_prompt={"image": len(image_urls)}) + + placeholder = "".join(f"image_{i}:\n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: @@ -166,7 +190,8 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, ) - prompt = f"<|image|><|image|><|begin_of_text|>{question}" + placeholders = "<|image|>" * len(image_urls) + prompt = f"{placeholders}<|begin_of_text|>{question}" return ModelRequestData( llm=llm, prompt=prompt, @@ -209,6 +234,31 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) +def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "mistral-community/pixtral-12b" + + # Adjust this as necessary to fit in GPU + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "[IMG]" * len(image_urls) + prompt = f"[INST]{question}\n{placeholders}[/INST]" + stop_token_ids = None + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_phi3v(question: 
str, image_urls: List[str]) -> ModelRequestData: # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended @@ -244,7 +294,8 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: +def load_qwen_vl_chat(question: str, + image_urls: List[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" llm = LLM( model=model_name, @@ -274,6 +325,7 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return ModelRequestData( llm=llm, prompt=prompt, @@ -342,13 +394,15 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, + "deepseek_vl2": load_deepseek_vl2, "h2ovl_chat": load_h2onvl, "idefics3": load_idefics3, "internvl_chat": load_internvl, "mllama": load_mllama, "NVLM_D": load_nvlm_d, "phi3_v": load_phi3v, - "qwen_vl_chat": load_qwenvl_chat, + "pixtral_hf": load_pixtral_hf, + "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, } diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference/whisper.py similarity index 100% rename from examples/offline_inference_whisper.py rename to examples/offline_inference/whisper.py diff --git a/examples/api_client.py b/examples/online_serving/api_client.py similarity index 100% rename from examples/api_client.py rename to examples/online_serving/api_client.py diff --git a/examples/chart-helm/.helmignore b/examples/online_serving/chart-helm/.helmignore similarity index 100% rename from examples/chart-helm/.helmignore rename to examples/online_serving/chart-helm/.helmignore diff --git a/examples/chart-helm/Chart.yaml b/examples/online_serving/chart-helm/Chart.yaml similarity index 100% rename from examples/chart-helm/Chart.yaml rename to examples/online_serving/chart-helm/Chart.yaml diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md new file mode 100644 index 0000000000000..6aa126d4fd22c --- /dev/null +++ b/examples/online_serving/chart-helm/README.md @@ -0,0 +1,21 @@ +# Helm Charts + +This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more. + +## Files + +- Chart.yaml: Defines the chart metadata including name, version, and maintainers. +- ct.yaml: Configuration for chart testing. +- lintconf.yaml: Linting rules for YAML files. +- values.schema.json: JSON schema for validating values.yaml. +- values.yaml: Default values for the Helm chart. +- templates/_helpers.tpl: Helper templates for defining common configurations. +- templates/configmap.yaml: Template for creating ConfigMaps. +- templates/custom-objects.yaml: Template for custom Kubernetes objects. +- templates/deployment.yaml: Template for creating Deployments. +- templates/hpa.yaml: Template for Horizontal Pod Autoscaler. +- templates/job.yaml: Template for Kubernetes Jobs. +- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget. +- templates/pvc.yaml: Template for Persistent Volume Claims. +- templates/secrets.yaml: Template for Kubernetes Secrets. +- templates/service.yaml: Template for creating Services. 
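The Helm chart documented above ultimately fronts vLLM's OpenAI-compatible server, so once it is installed the deployment can be exercised with any OpenAI client. A minimal sketch, assuming the chart's Service has been port-forwarded to `localhost:8000` and that `values.yaml` points at `facebook/opt-125m` (both are illustrative placeholders, not values defined by this chart):

```python
# Hypothetical smoke test against a vLLM deployment installed from this chart.
# Assumes `kubectl port-forward svc/<release-name> 8000:<service-port>` is
# running and the deployment serves "facebook/opt-125m"; adjust both to match
# your values.yaml.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="Kubernetes makes it easy to",
    max_tokens=32,
)
print(completion.choices[0].text)
```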
\ No newline at end of file diff --git a/examples/chart-helm/ct.yaml b/examples/online_serving/chart-helm/ct.yaml similarity index 100% rename from examples/chart-helm/ct.yaml rename to examples/online_serving/chart-helm/ct.yaml diff --git a/examples/chart-helm/lintconf.yaml b/examples/online_serving/chart-helm/lintconf.yaml similarity index 100% rename from examples/chart-helm/lintconf.yaml rename to examples/online_serving/chart-helm/lintconf.yaml diff --git a/examples/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl similarity index 100% rename from examples/chart-helm/templates/_helpers.tpl rename to examples/online_serving/chart-helm/templates/_helpers.tpl diff --git a/examples/chart-helm/templates/configmap.yaml b/examples/online_serving/chart-helm/templates/configmap.yaml similarity index 100% rename from examples/chart-helm/templates/configmap.yaml rename to examples/online_serving/chart-helm/templates/configmap.yaml diff --git a/examples/chart-helm/templates/custom-objects.yaml b/examples/online_serving/chart-helm/templates/custom-objects.yaml similarity index 100% rename from examples/chart-helm/templates/custom-objects.yaml rename to examples/online_serving/chart-helm/templates/custom-objects.yaml diff --git a/examples/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml similarity index 100% rename from examples/chart-helm/templates/deployment.yaml rename to examples/online_serving/chart-helm/templates/deployment.yaml diff --git a/examples/chart-helm/templates/hpa.yaml b/examples/online_serving/chart-helm/templates/hpa.yaml similarity index 100% rename from examples/chart-helm/templates/hpa.yaml rename to examples/online_serving/chart-helm/templates/hpa.yaml diff --git a/examples/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml similarity index 100% rename from examples/chart-helm/templates/job.yaml rename to examples/online_serving/chart-helm/templates/job.yaml diff --git a/examples/chart-helm/templates/poddisruptionbudget.yaml b/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml similarity index 100% rename from examples/chart-helm/templates/poddisruptionbudget.yaml rename to examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml diff --git a/examples/chart-helm/templates/pvc.yaml b/examples/online_serving/chart-helm/templates/pvc.yaml similarity index 100% rename from examples/chart-helm/templates/pvc.yaml rename to examples/online_serving/chart-helm/templates/pvc.yaml diff --git a/examples/chart-helm/templates/secrets.yaml b/examples/online_serving/chart-helm/templates/secrets.yaml similarity index 100% rename from examples/chart-helm/templates/secrets.yaml rename to examples/online_serving/chart-helm/templates/secrets.yaml diff --git a/examples/chart-helm/templates/service.yaml b/examples/online_serving/chart-helm/templates/service.yaml similarity index 100% rename from examples/chart-helm/templates/service.yaml rename to examples/online_serving/chart-helm/templates/service.yaml diff --git a/examples/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json similarity index 100% rename from examples/chart-helm/values.schema.json rename to examples/online_serving/chart-helm/values.schema.json diff --git a/examples/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml similarity index 100% rename from examples/chart-helm/values.yaml rename to 
examples/online_serving/chart-helm/values.yaml diff --git a/examples/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh similarity index 100% rename from examples/disaggregated_prefill.sh rename to examples/online_serving/disaggregated_prefill.sh diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py similarity index 100% rename from examples/gradio_openai_chatbot_webserver.py rename to examples/online_serving/gradio_openai_chatbot_webserver.py diff --git a/examples/gradio_webserver.py b/examples/online_serving/gradio_webserver.py similarity index 100% rename from examples/gradio_webserver.py rename to examples/online_serving/gradio_webserver.py diff --git a/examples/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py similarity index 100% rename from examples/openai_chat_completion_client.py rename to examples/online_serving/openai_chat_completion_client.py diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py similarity index 98% rename from examples/openai_chat_completion_client_for_multimodal.py rename to examples/online_serving/openai_chat_completion_client_for_multimodal.py index 213d075542e81..03cc037bb6779 100644 --- a/examples/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,5 +1,5 @@ """An example showing how to use vLLM to serve multimodal models -and run online inference with OpenAI client. +and run online serving with OpenAI client. Launch the vLLM server with the following command: @@ -309,7 +309,7 @@ def main(args) -> None: if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Demo on using OpenAI client for online inference with ' + description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') parser.add_argument('--chat-type', '-c', diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py similarity index 100% rename from examples/openai_chat_completion_client_with_tools.py rename to examples/online_serving/openai_chat_completion_client_with_tools.py diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py similarity index 100% rename from examples/openai_chat_completion_structured_outputs.py rename to examples/online_serving/openai_chat_completion_structured_outputs.py diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/openai_chat_embedding_client_for_multimodal.py diff --git a/examples/openai_completion_client.py b/examples/online_serving/openai_completion_client.py similarity index 100% rename from examples/openai_completion_client.py rename to examples/online_serving/openai_completion_client.py diff --git a/examples/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py similarity index 100% rename from examples/openai_cross_encoder_score.py rename to examples/online_serving/openai_cross_encoder_score.py diff --git a/examples/openai_embedding_client.py 
b/examples/online_serving/openai_embedding_client.py similarity index 100% rename from examples/openai_embedding_client.py rename to examples/online_serving/openai_embedding_client.py diff --git a/examples/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py similarity index 100% rename from examples/openai_pooling_client.py rename to examples/online_serving/openai_pooling_client.py diff --git a/examples/production_monitoring/Otel.md b/examples/online_serving/opentelemetry/Otel.md similarity index 100% rename from examples/production_monitoring/Otel.md rename to examples/online_serving/opentelemetry/Otel.md diff --git a/examples/production_monitoring/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py similarity index 100% rename from examples/production_monitoring/dummy_client.py rename to examples/online_serving/opentelemetry/dummy_client.py diff --git a/examples/production_monitoring/README.md b/examples/online_serving/prometheus_grafana/README.md similarity index 95% rename from examples/production_monitoring/README.md rename to examples/online_serving/prometheus_grafana/README.md index 807c0470e7b30..c49e5306a1cb4 100644 --- a/examples/production_monitoring/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -1,4 +1,4 @@ -# vLLM + Prometheus/Grafana +# Prometheus and Grafana This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. @@ -6,7 +6,7 @@ Install: - [`docker`](https://docs.docker.com/engine/install/) - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) -### Launch +## Launch Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint: ```bash @@ -35,11 +35,11 @@ python3 ../../benchmarks/benchmark_serving.py \ Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM. -### Grafana Dashboard +## Grafana Dashboard Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`). -#### Add Prometheus Data Source +### Add Prometheus Data Source Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. @@ -47,7 +47,7 @@ On Prometheus configuration page, we need to add the `Prometheus Server URL` in Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". -#### Import Dashboard +### Import Dashboard Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. 
You should see a screen that looks like the following: diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/online_serving/prometheus_grafana/docker-compose.yaml similarity index 100% rename from examples/production_monitoring/docker-compose.yaml rename to examples/online_serving/prometheus_grafana/docker-compose.yaml diff --git a/examples/production_monitoring/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json similarity index 100% rename from examples/production_monitoring/grafana.json rename to examples/online_serving/prometheus_grafana/grafana.json diff --git a/examples/production_monitoring/prometheus.yaml b/examples/online_serving/prometheus_grafana/prometheus.yaml similarity index 100% rename from examples/production_monitoring/prometheus.yaml rename to examples/online_serving/prometheus_grafana/prometheus.yaml diff --git a/examples/run_cluster.sh b/examples/online_serving/run_cluster.sh similarity index 100% rename from examples/run_cluster.sh rename to examples/online_serving/run_cluster.sh diff --git a/examples/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh similarity index 100% rename from examples/sagemaker-entrypoint.sh rename to examples/online_serving/sagemaker-entrypoint.sh diff --git a/examples/logging_configuration.md b/examples/other/logging_configuration.md similarity index 100% rename from examples/logging_configuration.md rename to examples/other/logging_configuration.md diff --git a/examples/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py similarity index 96% rename from examples/tensorize_vllm_model.py rename to examples/other/tensorize_vllm_model.py index dd77a4ad0c6b7..5fff1fdf502c9 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -25,7 +25,7 @@ To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.offline_inference.tensorize_vllm_model \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -45,7 +45,7 @@ To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.offline_inference.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -63,11 +63,11 @@ model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.tensorize_vllm_model serialize --help`. +`python -m examples.offline_inference.tensorize_vllm_model serialize --help`. Or for deserializing: -`python -m examples.tensorize_vllm_model deserialize --help`. +`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -88,7 +88,7 @@ In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.tensorize_vllm_model deserialize --help` +`python -m examples.offline_inference.tensorize_vllm_model deserialize --help` under the `tensorizer options` section. 
These can also be used for deserialization in this example script, although `--tensorizer-uri` and diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja new file mode 100644 index 0000000000000..e94661cb39071 --- /dev/null +++ b/examples/template_pixtral_hf.jinja @@ -0,0 +1,38 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif %} + {%- if message["role"] == "user" %} + {%- if loop.last and system_message is defined %} + {{- "[INST]" + system_message + "\n" }} + {%- else %} + {{- "[INST]" }} + {%- endif %} + {%- if message["content"] is not string %} + {%- for chunk in message["content"] %} + {%- if chunk["type"] == "text" %} + {{- chunk["text"] }} + {%- elif chunk["type"] == "image" %} + {{- "[IMG]" }} + {%- else %} + {{- raise_exception("Unrecognized content type!") }} + {%- endif %} + {%- endfor %} + {%- else %} + {{- message["content"] }} + {%- endif %} + {{- "[/INST]" }} + {%- elif message["role"] == "assistant" %} + {{- message["content"] + eos_token}} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/format.sh b/format.sh index 0b196de9d0773..2277eef93c745 100755 --- a/format.sh +++ b/format.sh @@ -41,7 +41,7 @@ MYPY_VERSION=$(mypy --version | awk '{print $2}') CODESPELL_VERSION=$(codespell --version) ISORT_VERSION=$(isort --vn) CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') +PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}') # # params: tool name, tool version, required version tool_version_check() { @@ -58,7 +58,7 @@ tool_version_check "mypy" "$MYPY_VERSION" tool_version_check "isort" "$ISORT_VERSION" tool_version_check "codespell" "$CODESPELL_VERSION" tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" +tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION" YAPF_FLAGS=( '--recursive' @@ -316,6 +316,6 @@ else echo "✨🎉 Format check passed! Congratulations! 
🎉✨" fi -echo 'vLLM sphinx-lint:' -tools/sphinx-lint.sh -echo 'vLLM sphinx-lint: Done' +echo 'vLLM doc-lint:' +tools/doc-lint.sh +echo 'vLLM doc-lint: Done' diff --git a/pyproject.toml b/pyproject.toml index 7c628e4721a30..2a777e6638d61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ build-backend = "setuptools.build_meta" line-length = 80 exclude = [ # External file, leaving license intact - "examples/fp8/quantizer/quantize.py" + "examples/other/fp8/quantizer/quantize.py" ] [tool.ruff.lint.per-file-ignores] @@ -101,3 +101,9 @@ markers = [ "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] + +[tool.pymarkdown] +plugins.md013.enabled = false # line-length +plugins.md041.enabled = false # first-line-h1 +plugins.md033.enabled = false # inline-html +plugins.md024.allow_different_nesting = true # no-duplicate-headers diff --git a/python_only_dev.py b/python_only_dev.py index f70b4984025b3..7d95ac96e6e4b 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -7,7 +7,7 @@ or export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install -e . """ # noqa diff --git a/requirements-cpu.txt b/requirements-cpu.txt index e62f313297762..056fbf5a7adec 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for CPUs -torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" -torch==2.5.1; platform_machine == "aarch64" +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" +torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -datasets # for benchmark scripts \ No newline at end of file +datasets # for benchmark scripts diff --git a/requirements-lint.txt b/requirements-lint.txt index 711bb50a0e936..ffc73f90a0d48 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -6,7 +6,7 @@ ruff==0.6.5 codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 -sphinx-lint==1.0.0 +pymarkdownlnt==0.9.26 # type checking mypy==1.11.1 diff --git a/requirements-test.in b/requirements-test.in index fb4179c3d8423..4b4dc376d1fa5 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests peft +pqdm ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests diff --git a/requirements-test.txt b/requirements-test.txt index 3771577fe8ed0..f576e42afcbbf 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -48,6 +48,8 @@ botocore==1.35.57 # awscli # boto3 # s3transfer +bounded-pool-executor==0.0.3 + # via pqdm buildkite-test-collector==0.1.9 # via -r requirements-test.in certifi==2024.8.30 @@ -342,6 +344,8 @@ pooch==1.8.2 # via librosa portalocker==2.10.1 # via sacrebleu +pqdm==0.2.0 + # via -r requirements-test.in propcache==0.2.0 # via yarl protobuf==5.28.3 diff --git a/setup.py b/setup.py index 02d84a15f26aa..b7bc64fd950e7 100644 --- a/setup.py +++ 
b/setup.py @@ -34,9 +34,14 @@ def load_module_from_path(module_name, path): VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE -if not sys.platform.startswith("linux"): +if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu": logger.warning( - "vLLM only supports Linux platform (including WSL). " + "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS") + VLLM_TARGET_DEVICE = "cpu" +elif not (sys.platform.startswith("linux") + or sys.platform.startswith("darwin")): + logger.warning( + "vLLM only supports Linux platform (including WSL) and MacOS." "Building on %s, " "so vLLM may not be able to run correctly", sys.platform) VLLM_TARGET_DEVICE = "empty" @@ -252,7 +257,7 @@ def run(self): class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" def run(self) -> None: wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 1c2193bb17a55..31a101e48e026 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -44,7 +44,6 @@ def test_vllm_gc_ed(): assert weak_llm() is None -@pytest.mark.skip_v1 @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/conftest.py b/tests/conftest.py index 917151ddcb8d4..95af4ac1eb17b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,12 +28,13 @@ init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) + TokensPrompt, to_enc_dec_tuple_list, + zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity) + identity, is_list_of) logger = init_logger(__name__) @@ -886,6 +887,12 @@ def generate_beam_search( beam_width: int, max_tokens: int, ) -> List[Tuple[List[List[int]], List[str]]]: + if is_list_of(prompts, str, check="all"): + prompts = [TextPrompt(prompt=prompt) for prompt in prompts] + else: + prompts = [ + TokensPrompt(prompt_token_ids=tokens) for tokens in prompts + ] outputs = self.model.beam_search( prompts, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py new file mode 100644 index 0000000000000..46a064f6d9e68 --- /dev/null +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -0,0 +1,269 @@ +import asyncio +import json +import shutil +from contextlib import suppress + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +# downloading lora to test lora requests +from huggingface_hub import snapshot_download + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" + + 
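Stepping back to the `tests/conftest.py` hunk above: `generate_beam_search` now normalizes its inputs before calling `LLM.beam_search`, using `is_list_of` to decide between `TextPrompt` and `TokensPrompt`. A self-contained sketch of that dispatch (the model name and prompts are placeholders, not taken from the test suite):

```python
# Illustrative only: mirrors the prompt wrapping added in tests/conftest.py.
# String prompts become TextPrompt, token-id prompts become TokensPrompt.
from vllm import LLM
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.sampling_params import BeamSearchParams
from vllm.utils import is_list_of


def beam_search_any(llm: LLM, prompts, beam_width: int, max_tokens: int):
    if is_list_of(prompts, str, check="all"):
        wrapped = [TextPrompt(prompt=p) for p in prompts]
    else:
        wrapped = [TokensPrompt(prompt_token_ids=toks) for toks in prompts]
    return llm.beam_search(
        wrapped, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))


llm = LLM(model="facebook/opt-125m")  # placeholder model
outputs = beam_search_any(llm, ["Hello, my name is"], beam_width=2, max_tokens=16)
```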
+@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def server_with_lora_modules_json(zephyr_lora_files): + # Define the json format LoRA module configurations + lora_module_1 = { + "name": "zephyr-lora", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + lora_module_2 = { + "name": "zephyr-lora2", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + json.dumps(lora_module_1), + json.dumps(lora_module_2), + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "64", + ] + + # Enable the /v1/load_lora_adapter endpoint + envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server_with_lora_modules_json): + async with server_with_lora_modules_json.get_async_client( + ) as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_static_lora_lineage(client: openai.AsyncOpenAI, + zephyr_lora_files): + models = await client.models.list() + models = models.data + served_model = models[0] + lora_models = models[1:] + assert served_model.id == MODEL_NAME + assert served_model.root == MODEL_NAME + assert served_model.parent is None + assert all(lora_model.root == zephyr_lora_files + for lora_model in lora_models) + assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) + assert lora_models[0].id == "zephyr-lora" + assert lora_models[1].id == "zephyr-lora2" + + +@pytest.mark.asyncio +async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, + zephyr_lora_files): + + response = await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "zephyr-lora-3", + "lora_path": zephyr_lora_files + }) + # Ensure adapter loads before querying /models + assert "success" in response + + models = await client.models.list() + models = models.data + dynamic_lora_model = models[-1] + assert dynamic_lora_model.root == zephyr_lora_files + assert dynamic_lora_model.parent == MODEL_NAME + assert dynamic_lora_model.id == "zephyr-lora-3" + + +@pytest.mark.asyncio +async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI): + with pytest.raises(openai.NotFoundError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "notfound", + "lora_path": "/not/an/adapter" + }) + + +@pytest.mark.asyncio +async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, + tmp_path): + invalid_files = tmp_path / "invalid_files" + invalid_files.mkdir() + (invalid_files / "adapter_config.json").write_text("this is not json") + + with pytest.raises(openai.BadRequestError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid-json", + "lora_path": str(invalid_files) + }) + + +@pytest.mark.asyncio +async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI, + tmp_path, zephyr_lora_files): + invalid_rank = tmp_path / "invalid_rank" + + # Copy adapter from zephyr_lora_files to invalid_rank + shutil.copytree(zephyr_lora_files, invalid_rank) + + with open(invalid_rank / "adapter_config.json") as f: + adapter_config = json.load(f) + + print(adapter_config) + + # assert False + + # Change rank 
to invalid value + adapter_config["r"] = 1024 + with open(invalid_rank / "adapter_config.json", "w") as f: + json.dump(adapter_config, f) + + with pytest.raises(openai.BadRequestError, + match="is greater than max_lora_rank"): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid-json", + "lora_path": str(invalid_rank) + }) + + +@pytest.mark.asyncio +async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path, + zephyr_lora_files): + """Validate that many loras can be dynamically registered and inferenced + with concurrently""" + + # This test file configures the server with --max-cpu-loras=2 and this test + # will concurrently load 10 adapters, so it should flex the LRU cache + async def load_and_run_adapter(adapter_name: str): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": adapter_name, + "lora_path": str(zephyr_lora_files) + }) + for _ in range(3): + await client.completions.create( + model=adapter_name, + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) + + lora_tasks = [] + for i in range(10): + lora_tasks.append( + asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) + + results, _ = await asyncio.wait(lora_tasks) + + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" + + +@pytest.mark.asyncio +async def test_loading_invalid_adapters_does_not_break_others( + client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files): + + invalid_files = tmp_path / "invalid_files" + invalid_files.mkdir() + (invalid_files / "adapter_config.json").write_text("this is not json") + + stop_good_requests_event = asyncio.Event() + + async def run_good_requests(client): + # Run chat completions requests until event set + + results = [] + + while not stop_good_requests_event.is_set(): + try: + batch = await client.completions.create( + model="zephyr-lora", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) + results.append(batch) + except Exception as e: + results.append(e) + + return results + + # Create task to run good requests + good_task = asyncio.create_task(run_good_requests(client)) + + # Run a bunch of bad adapter loads + for _ in range(25): + with suppress(openai.NotFoundError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "notfound", + "lora_path": "/not/an/adapter" + }) + for _ in range(25): + with suppress(openai.BadRequestError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid", + "lora_path": str(invalid_files) + }) + + # Ensure all the running requests with lora adapters succeeded + stop_good_requests_event.set() + results = await good_task + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" + + # Ensure we can load another adapter and run it + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "valid", + "lora_path": zephyr_lora_files + }) + await client.completions.create( + model="valid", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py deleted file mode 100644 index ce4f85c13fff9..0000000000000 --- a/tests/entrypoints/openai/test_lora_lineage.py +++ /dev/null @@ -1,109 +0,0 @@ -import json - -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -# downloading lora to test lora requests -from huggingface_hub import snapshot_download - -from ...utils 
import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" - - -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def server_with_lora_modules_json(zephyr_lora_files): - # Define the json format LoRA module configurations - lora_module_1 = { - "name": "zephyr-lora", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - lora_module_2 = { - "name": "zephyr-lora2", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - json.dumps(lora_module_1), - json.dumps(lora_module_2), - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "64", - ] - - # Enable the /v1/load_lora_adapter endpoint - envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} - - with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client_for_lora_lineage(server_with_lora_modules_json): - async with server_with_lora_modules_json.get_async_client( - ) as async_client: - yield async_client - - -@pytest.mark.asyncio -async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - zephyr_lora_files): - models = await client_for_lora_lineage.models.list() - models = models.data - served_model = models[0] - lora_models = models[1:] - assert served_model.id == MODEL_NAME - assert served_model.root == MODEL_NAME - assert served_model.parent is None - assert all(lora_model.root == zephyr_lora_files - for lora_model in lora_models) - assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) - assert lora_models[0].id == "zephyr-lora" - assert lora_models[1].id == "zephyr-lora2" - - -@pytest.mark.asyncio -async def test_dynamic_lora_lineage( - client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): - - response = await client_for_lora_lineage.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": - "zephyr-lora-3", - "lora_path": - zephyr_lora_files - }) - # Ensure adapter loads before querying /models - assert "success" in response - - models = await client_for_lora_lineage.models.list() - models = models.data - dynamic_lora_model = models[-1] - assert dynamic_lora_model.root == zephyr_lora_files - assert dynamic_lora_model.parent == MODEL_NAME - assert dynamic_lora_model.id == "zephyr-lora-3" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 97248f1150979..85f485364a411 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -35,6 +35,7 @@ class MockModelConfig: logits_processor_pattern = None diff_sampling_param: Optional[dict] = None allowed_local_media_path: str = "" + encoder_config = None def get_diff_sampling_param(self): return self.diff_sampling_param or {} @@ -51,7 +52,7 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() - models = OpenAIServingModels(model_config, BASE_MODEL_PATHS) + models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS) 
serving_completion = OpenAIServingChat(engine, model_config, models, @@ -72,7 +73,8 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, model_config=MockModelConfig()) serving_chat = OpenAIServingChat(mock_engine, MockModelConfig(), @@ -115,7 +117,8 @@ def test_serving_chat_could_load_correct_generation_config(): mock_engine.errored = False # Initialize the serving chat - models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, model_config=mock_model_config) serving_chat = OpenAIServingChat(mock_engine, mock_model_config, diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 96897dc730da2..657ea20213ec9 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -4,6 +4,7 @@ import pytest from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, UnloadLoraAdapterRequest) @@ -21,13 +22,16 @@ async def _async_serving_models_init() -> OpenAIServingModels: mock_model_config = MagicMock(spec=ModelConfig) + mock_engine_client = MagicMock(spec=EngineClient) # Set the max_model_len attribute to avoid missing attribute mock_model_config.max_model_len = 2048 - serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + serving_models = OpenAIServingModels(engine_client=mock_engine_client, + base_model_paths=BASE_MODEL_PATHS, model_config=mock_model_config, lora_modules=None, prompt_adapters=None) + await serving_models.init_static_loras() return serving_models @@ -113,5 +117,5 @@ async def test_unload_lora_adapter_not_found(): request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.type == "NotFoundError" + assert response.code == HTTPStatus.NOT_FOUND diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 6fcc92022855b..090523a836e12 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,6 +1,3 @@ -import json -import os - import openai import pytest @@ -10,16 +7,7 @@ @pytest.mark.asyncio -async def test_shutdown_on_engine_failure(tmp_path): - # Use a bad adapter to crash the engine - # (This test will fail when that bug is fixed) - adapter_path = tmp_path / "bad_adapter" - os.mkdir(adapter_path) - with open(adapter_path / "adapter_model_config.json", "w") as f: - json.dump({"not": "real"}, f) - with open(adapter_path / "adapter_model.safetensors", "wb") as f: - f.write(b"this is fake") - +async def test_shutdown_on_engine_failure(): # dtype, max-len etc set so that this can run in CI args = [ "--dtype", @@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path): "--enforce-eager", "--max-num-seqs", "128", - "--enable-lora", - "--lora-modules", - f"bad-adapter={tmp_path / 'bad_adapter'}", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -39,9 +24,13 
@@ async def test_shutdown_on_engine_failure(tmp_path): with pytest.raises( (openai.APIConnectionError, openai.InternalServerError)): - # This crashes the engine - await client.completions.create(model="bad-adapter", - prompt="Hello, my name is") + # Asking for lots of prompt logprobs will currently crash the + # engine. This may change in the future when that bug is fixed + prompt = "Hello " * 4000 + await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + extra_body={"prompt_logprobs": 10}) # Now the server should shut down return_code = remote_server.proc.wait(timeout=8) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index d63b963522e73..8f242df4a60e3 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -758,6 +758,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), + ("template_pixtral_hf.jinja", "openai"), ("template_vlm2vec.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 916cc2efa3895..a08c874407e3f 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -1,10 +1,10 @@ -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest import torch from tests.kernels.utils import override_backend_env_variable -from vllm.attention.selector import which_attn_to_use +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.openvino import OpenVinoPlatform @@ -12,6 +12,13 @@ from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. 
+ """ + _cached_get_attn_backend.cache_clear() + + @pytest.mark.parametrize( "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @@ -24,67 +31,70 @@ def test_env(name: str, device: str, monkeypatch): if device == "cpu": with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "TORCH_SDPA" + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "ROCM_FLASH" + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.current_platform", - OpenVinoPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == "OPENVINO" + OpenVinoPlatform()), patch.dict('sys.modules', + {'openvino': Mock()}): + backend = get_attn_backend(16, torch.float16, torch.float16, 16, + False) + assert backend.get_name() == "OPENVINO" else: - with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) - assert backend.name == name + if name in ["XFORMERS", "FLASHINFER"]: + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + assert backend.get_name() == name def test_flash_attn(monkeypatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to - # which_attn_to_use + # get_attn_backend override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(16, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(16, torch.float16, "fp8", 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported block size - backend = which_attn_to_use(16, torch.float16, None, 8, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(16, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(17, torch.float16, None, 16, False) - assert backend.name != STR_FLASH_ATTN_VAL + 
backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True) - assert backend.name != STR_FLASH_ATTN_VAL + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL def test_invalid_env(monkeypatch): """Throw an exception if the backend name is invalid.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with pytest.raises(ValueError): - which_attn_to_use(16, torch.float16, None, 16, False) + get_attn_backend(16, torch.float16, None, 16, False) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 6bc2e59ac6dff..48ca54a39cdb6 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -13,8 +13,7 @@ import torch from tests.kernels.utils import * -from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, - AttentionType) +from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) @@ -67,6 +66,7 @@ class TestPoint(NamedTuple): max_dec_seq_len: int max_enc_seq_len: int num_blocks: int + attn_type: AttentionType class TestResources(NamedTuple): @@ -99,7 +99,6 @@ class TestResources(NamedTuple): ''' scale: float - attn_backend: AttentionBackend attn: Attention kv_cache: torch.Tensor @@ -132,26 +131,33 @@ class that Attention will automatically select when it is constructed. ''' scale = float(1.0 / (test_pt.head_size**0.5)) - attn_backend = make_backend(test_pt.backend_name) attn = Attention( test_pt.num_heads, test_pt.head_size, scale=scale, + prefix=f"{test_pt.attn_type}", + attn_type=test_pt.attn_type, ) if test_pt.num_blocks is None or test_pt.num_heads is None: # Caller does not require a KV cache return TestResources( - scale, attn_backend, attn, + scale, attn, torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) # Construct KV cache - kv_cache = make_kv_cache(test_pt.num_blocks, - test_pt.num_heads, - test_pt.head_size, - test_pt.block_size, - device=CUDA_DEVICE, - backend=test_pt.backend_name) - return TestResources(scale, attn_backend, attn, kv_cache) + if test_pt.attn_type in (AttentionType.DECODER, + AttentionType.ENCODER_DECODER): + kv_cache = make_kv_cache(test_pt.num_blocks, + test_pt.num_heads, + test_pt.head_size, + test_pt.block_size, + device=CUDA_DEVICE, + backend=test_pt.backend_name) + else: + kv_cache = torch.tensor([]) + + attn.kv_cache = [kv_cache] + return TestResources(scale, attn, kv_cache) def _encoder_attn_setup( @@ -196,6 +202,7 @@ def _encoder_attn_setup( _, max_q_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -304,6 +311,7 @@ def _decoder_attn_setup( max_q_seq_len, _, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -491,6 +499,7 @@ def _enc_dec_cross_attn_setup_reuses_query( max_decoder_seq_len, max_encoder_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -625,7 +634,6 @@ def _run_encoder_attention_test( & attn_metadata ''' assert attn_metadata.num_decode_tokens == 0 - attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None with set_forward_context(attn_metadata, vllm_config): @@ -638,14 +646,11 @@ 
def _run_encoder_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + return attn.forward( + reshaped_query, packed_qkv.key, packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), attn_metadata) def _run_decoder_self_attention_test( @@ -678,7 +683,6 @@ def _run_decoder_self_attention_test( * Attention.forward() applied to packed_{query,key,value}, kv_cache & attn_metadata ''' - attn_type = AttentionType.DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv @@ -693,12 +697,8 @@ def _run_decoder_self_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value, + kv_cache, attn_metadata) def _run_encoder_decoder_cross_attention_test( @@ -745,7 +745,6 @@ def _run_encoder_decoder_cross_attention_test( ''' assert decoder_test_params.packed_qkvo.packed_qkv is not None - attn_type = AttentionType.ENCODER_DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache if cross_test_params is None: @@ -765,12 +764,8 @@ def _run_encoder_decoder_cross_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, key, value, kv_cache, + attn_metadata) @pytest.fixture(autouse=True) @@ -842,7 +837,7 @@ def test_encoder_only( # is not part of this test test_pt = TestPoint(num_heads, head_size, attn_backend.name, batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + max_enc_seq_len, 4096, AttentionType.ENCODER) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init @@ -858,7 +853,7 @@ def test_encoder_only( # Shared prefill metadata structure prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, None, decoder_test_params=None, @@ -962,20 +957,29 @@ def test_e2e_enc_dec_attn( # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test - test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.ENCODER) + enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, + AttentionType.ENCODER_DECODER) + dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.DECODER) # Attention scale factor, attention backend instance, attention wrapper # 
instance, KV cache init vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): - test_rsrcs = _make_test_resources(test_pt) + enc_test_rsrcs = _make_test_resources(enc_test_pt) + enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt) + dec_test_rsrcs = _make_test_resources(dec_test_pt) # Construct encoder attention test params (only used # during prefill) - enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs) + enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs) # Construct Decoder self-attention prefill-phase & decode-phase # test params, including query/key/value tensors, decoder self-attention @@ -988,7 +992,7 @@ def test_e2e_enc_dec_attn( prephase_dec_test_params, decphase_dec_test_params, cross_block_base_addr, - ) = _decoder_attn_setup(test_pt, test_rsrcs) + ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs) # Construct encoder/decoder cross-attention prefill-phase # & decode-phase test params, including key/value tensors, @@ -1001,14 +1005,14 @@ def test_e2e_enc_dec_attn( dec_qkv, enc_test_params, prephase_dec_test_params, - test_pt, - test_rsrcs, + enc_dec_test_pt, + enc_dec_test_rsrcs, block_base_addr=cross_block_base_addr) # Shared prefill metadata structure assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens, decoder_test_params=prephase_dec_test_params, @@ -1018,10 +1022,10 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder attention - enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, + enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn, enc_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_test_pt, vllm_config=vllm_config) # - Is encoder attention result correct? @@ -1031,10 +1035,10 @@ def test_e2e_enc_dec_attn( # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, prephase_dec_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is prefill decoder self-attention correct? @@ -1045,11 +1049,11 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is prefill encoder/decoder cross-attention correct? @@ -1060,7 +1064,7 @@ def test_e2e_enc_dec_attn( # DECODE: build decode-phase attention metadata decphase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, False, dec_qkv.q_seq_lens, decoder_test_params=decphase_dec_test_params, @@ -1071,10 +1075,10 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, decphase_dec_test_params, decphase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is decode-phase decoder self-attention correct? 
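The thread running through this test refactor is that the attention role is now fixed when the layer is constructed (via `attn_type=` and a distinct `prefix`) instead of being passed to every `forward()` call, which is why the single `test_pt`/`test_rsrcs` pair splits into encoder, decoder and encoder-decoder variants. A minimal sketch of that constructor-level wiring, mirroring `_make_test_resources` (head count and head size are arbitrary placeholders):

```python
# Sketch only: build one Attention layer per role, with the role fixed at
# construction time rather than supplied on each forward() call.
from vllm.attention import Attention, AttentionType
from vllm.config import VllmConfig, set_current_vllm_config

num_heads, head_size = 16, 64
scale = 1.0 / (head_size**0.5)

with set_current_vllm_config(VllmConfig()):
    encoder_attn = Attention(num_heads, head_size, scale=scale,
                             prefix=f"{AttentionType.ENCODER}",
                             attn_type=AttentionType.ENCODER)
    decoder_attn = Attention(num_heads, head_size, scale=scale,
                             prefix=f"{AttentionType.DECODER}",
                             attn_type=AttentionType.DECODER)
    cross_attn = Attention(num_heads, head_size, scale=scale,
                           prefix=f"{AttentionType.ENCODER_DECODER}",
                           attn_type=AttentionType.ENCODER_DECODER)
```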
@@ -1085,11 +1089,11 @@ def test_e2e_enc_dec_attn( # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is decode-phase encoder/decoder cross-attention correct? diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 30f6de14910cb..4d3568e09e40f 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -17,6 +17,8 @@ from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size) +from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( + fused_moe as iterative_moe) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE @@ -55,7 +57,13 @@ def test_fused_moe( w2 = F.pad(w2, (0, 128), "constant", 0) torch.cuda.empty_cache() triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) - torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0) + torch_output = torch_moe(a, w1, w2, score, topk) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) + iterative_output = iterative_moe(a, w1, w2, score, topk, renormalize=False) + torch.testing.assert_close(iterative_output, + torch_output, + atol=1e-2, + rtol=0) @pytest.mark.parametrize("dtype", diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 0fb22638374a4..c9f29014fddcf 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,6 +13,7 @@ from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul +from vllm.platforms.interface import _Backend from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_ROCM_FLASH_ATTN_VAL, STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) @@ -797,7 +798,7 @@ def make_block_tables_slot_mapping( def make_test_metadata( - attn_backend: AttentionBackend, + attn_backend: _Backend, is_prompt: bool, seq_lens: Optional[List[int]], decoder_test_params: Optional[PhaseTestParameters], @@ -822,7 +823,7 @@ def make_test_metadata( Arguments: - * attn_backend: Backend for sourcing attention kernels + * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence * decoder_test_params: decoder self-attention test params; @@ -889,6 +890,8 @@ def make_test_metadata( # (kv_mmap) cross_kv_mmap = cross_test_params.kv_mmap + attn_backend_obj = make_backend(attn_backend.name) + if is_prompt: # Prefill-phase scenario @@ -909,8 +912,7 @@ def make_test_metadata( context_lens, encoder_seq_lens, device=device) - - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), multi_modal_placeholder_index_maps=None, @@ -960,7 +962,7 @@ def make_test_metadata( encoder_seq_lens, device=device) - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, multi_modal_placeholder_index_maps=None, diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 4beba4dc05dde..1cc1ced9968d7 100644 --- 
a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -22,13 +22,13 @@ def test_run(my_rank, pipe): x2 = pipe.recv_tensor() print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) else: x2 = pipe.recv_tensor() print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) pipe.send_tensor(x) print(f"rank {my_rank} sent tensor x") pipe.send_tensor(y) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 57ebaa424fc59..e7378d00765f0 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model +from vllm.platforms import current_platform class ContextIDInfo(TypedDict): @@ -65,13 +66,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"file://{temp_file}", - local_rank=0, - backend="nccl", - ) + + backend = "nccl" + if current_platform.is_cpu(): + backend = "gloo" + + init_distributed_environment(world_size=1, + rank=0, + distributed_init_method=f"file://{temp_file}", + local_rank=0, + backend=backend) initialize_model_parallel(1, 1) yield cleanup_dist_env_and_memory(shutdown_ray=True) @@ -81,13 +85,15 @@ def dist_init(): def dist_init_torch_only(): if torch.distributed.is_initialized(): return + backend = "nccl" + if current_platform.is_cpu(): + backend = "gloo" + temp_file = tempfile.mkstemp()[1] - torch.distributed.init_process_group( - backend="nccl", - world_size=1, - rank=0, - init_method=f"file://{temp_file}", - ) + torch.distributed.init_process_group(world_size=1, + rank=0, + init_method=f"file://{temp_file}", + backend=backend) @pytest.fixture diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index fb8c0b2a7ba26..08a589d7ee29c 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -48,10 +48,14 @@ torch.float32: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -# TODO: Modify this based on platform -DEVICES = [ + +pytestmark = pytest.mark.skipif( + not (current_platform.is_cuda_alike() or current_platform.is_cpu()), + reason="Backend not supported") + +DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +] if current_platform.is_cuda_alike() else ["cpu"]) #For GPU, we will launch different triton kernels between the prefill and decode # stages, so we need to verify this. prefill stage(True) or decode stage(False) @@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool: from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU return type(punica_wrapper) is PunicaWrapperGPU + elif current_platform.is_cpu(): + from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU + + return type(punica_wrapper) is PunicaWrapperCPU else: return False @@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA # device, see: https://github.com/triton-lang/triton/issues/2925 # Same below. 
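The updated `dist_init` fixtures above pick the process-group backend from the platform: NCCL needs a GPU, so CPU-only runs fall back to Gloo. A stripped-down sketch of the same idea using plain `torch.distributed`; the real fixtures use vLLM's `current_platform.is_cpu()` and `init_distributed_environment`, so `torch.cuda.is_available()` here is only a stand-in check:

import tempfile

import torch
import torch.distributed as dist


def init_single_rank_process_group() -> None:
    # NCCL only works with CUDA devices; Gloo covers CPU-only environments.
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    init_file = tempfile.mkstemp()[1]
    dist.init_process_group(
        backend=backend,
        init_method=f"file://{init_file}",
        world_size=1,
        rank=0,
    )


if __name__ == "__main__":
    init_single_rank_process_group()
    print(f"rank {dist.get_rank()} initialized with backend {dist.get_backend()}")
    dist.destroy_process_group()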
- torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 @@ -313,7 +322,9 @@ def create_random_embedding_layer(): def test_embeddings_with_new_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device) @@ -450,7 +461,9 @@ def create_random_embedding_layer(): def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device) @@ -582,7 +595,9 @@ def _pretest(): def test_linear_replicated(dist_init, num_loras, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -695,7 +710,9 @@ def create_random_linear_replicated_layer(): def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -818,7 +835,9 @@ def create_random_linear_parallel_layer(): def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -971,6 +990,8 @@ class FakeConfig: @pytest.mark.parametrize("rotary_dim", [None, 32]) @pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("seq_len", [11, 1024]) +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Only CUDA backends are supported") def test_rotary_embedding_long_context(dist_init, num_loras, device, scaling_factors, max_position, is_neox_style, rotary_dim, head_size, diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index a099f36b0a465..ca523c66abe42 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -20,6 +20,7 @@ from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.platforms import current_platform EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -28,9 +29,9 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] -CUDA_DEVICES = [ +DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +] if current_platform.is_cuda_alike() else ["cpu"]) def test_peft_helper(sql_lora_files): @@ -83,7 +84,7 @@ def test_peft_helper(sql_lora_files): PEFTHelper.from_dict(config) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) @@ -171,7 +172,7 @@ def 
test_replace_submodules(dist_init, dummy_model): manager = LoRAModelManager( model, 1, 1, 1, LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), - torch.device("cuda")) + torch.device(DEVICES[0])) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -183,7 +184,7 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -244,7 +245,7 @@ def test_lora_model_manager(dist_init, dummy_model, device): assert manager.punica_wrapper.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -336,7 +337,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager @@ -466,7 +467,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) @@ -545,7 +546,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): # Should remove every LoRA not specified in the request. @@ -621,7 +622,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py deleted file mode 100644 index 78bf5a1617233..0000000000000 --- a/tests/lora/test_minicpmv.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import List - -import pytest - -import vllm -from vllm.assets.image import ImageAsset -from vllm.lora.request import LoRARequest -from vllm.platforms import current_platform - -MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" - -PROMPT_TEMPLATE = ( - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" - "(./)\nWhat is in the image?<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n") - -IMAGE_ASSETS = [ - ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), -] - -# After fine-tuning with LoRA, all generated content should start begin `A`. 
-EXPECTED_OUTPUT = [ - "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 - "A pink cherry blossom tree with a blue sky in the background.", -] - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - sampling_params = vllm.SamplingParams( - temperature=0, - max_tokens=5, - stop_token_ids=[128001, 128009], # eos_id, eot_id - ) - - inputs = [{ - "prompt": PROMPT_TEMPLATE, - "multi_modal_data": { - "image": asset.pil_image - }, - } for asset in IMAGE_ASSETS] - - outputs = llm.generate( - inputs, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, - ) - # Print the outputs. - generated_texts: List[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with ROCm") -def test_minicpmv_lora(minicpmv_lora_files): - llm = vllm.LLM( - MODEL_PATH, - max_num_seqs=2, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True, - enable_chunked_prefill=True, - ) - output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) - for i in range(len(EXPECTED_OUTPUT)): - assert EXPECTED_OUTPUT[i].startswith(output1[i]) - output2 = do_sample(llm, minicpmv_lora_files, lora_id=2) - for i in range(len(EXPECTED_OUTPUT)): - assert EXPECTED_OUTPUT[i].startswith(output2[i]) diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 930f177953a5f..3b0f18325a40b 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -3,10 +3,10 @@ import pytest import vllm +from tests.utils import fork_new_process_for_each_test from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest - -from ..utils import multi_gpu_test +from vllm.platforms import current_platform MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" @@ -17,13 +17,11 @@ IMAGE_ASSETS = [ ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), ] # After fine-tuning with LoRA, all generated content should start begin `A`. EXPECTED_OUTPUT = [ "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 - "A pink cherry blossom tree with a blue sky in the background.", ] @@ -50,48 +48,75 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: # Print the outputs. 
generated_texts: List[str] = [] for output in outputs: - prompt = output.prompt generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Generated text: {generated_text!r}") return generated_texts -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") +@fork_new_process_for_each_test +def test_minicpmv_lora(minicpmv_lora_files): + llm = vllm.LLM( + MODEL_PATH, + max_num_seqs=2, + enable_lora=True, + max_loras=2, + max_lora_rank=8, + enforce_eager=True, + trust_remote_code=True, + enable_chunked_prefill=True, + ) + output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output1[i]) + output2 = do_sample(llm, minicpmv_lora_files, lora_id=2) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output2[i]) + + +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") +@fork_new_process_for_each_test +def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, enable_lora=True, max_num_seqs=2, max_loras=4, max_lora_rank=64, - tensor_parallel_size=2, + tensor_parallel_size=4, trust_remote_code=True, - fully_sharded_loras=fully_sharded, + enforce_eager=True, enable_chunked_prefill=True, ) - output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) - for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) -@multi_gpu_test(num_gpus=4) -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") +@fork_new_process_for_each_test +def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, enable_lora=True, max_num_seqs=2, - max_loras=4, - max_lora_rank=64, + max_loras=2, + max_lora_rank=8, tensor_parallel_size=4, trust_remote_code=True, - fully_sharded_loras=fully_sharded, + fully_sharded_loras=True, enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 797a495201d33..940a865228806 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -5,6 +5,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" @@ -31,7 +32,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, @pytest.mark.parametrize("tp_size", [4]) def test_mixtral_lora(mixtral_lora_files, tp_size): """Original test, the LoRA model has the common target modules, not all""" - if torch.cuda.device_count() < tp_size: + if torch.cuda.device_count( + ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike(): pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") prompts = [ diff --git a/tests/lora/test_punica_sizes.py 
b/tests/lora/test_punica_ops_sizes.py similarity index 63% rename from tests/lora/test_punica_sizes.py rename to tests/lora/test_punica_ops_sizes.py index 66b5f82bbb97d..433ca7577d084 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_ops_sizes.py @@ -4,19 +4,21 @@ whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. """ +from threading import Lock + import pytest import torch -from vllm.lora.ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.sgmv_shrink import sgmv_shrink +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.platforms import current_platform -from .utils import (generate_data, generate_data_for_expand_nslices, - ref_torch_groupgemm) +from .utils import (assert_close, generate_data, + generate_data_for_expand_nslices, + generate_data_for_nslices) HIDDEN_SIZES = [ 128, @@ -110,16 +112,9 @@ MAX_RANKS = [32] SCALES = [0.5] SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] - +DEVICES = [f"cuda:{0}"] -def assert_close(a, b): - rtol, atol = { - torch.float16: (6e-2, 6e-2), - torch.bfloat16: (6e-2, 6e-2), - torch.float32: (1e-2, 1e-2), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) +_dict_lock = Lock() @pytest.mark.parametrize("batches", BATCHES) @@ -127,16 +122,18 @@ def assert_close(a, b): @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("nslices", [1, 2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_sgmv( batches: int, num_loras: int, rank: int, hidden_size: int, scaling: float, + nslices: int, dtype: torch.dtype, op_type: str, seed: int, @@ -148,19 +145,20 @@ def test_punica_sgmv( seq_length = 128 ( inputs_tensor, - lora_weights, + lora_weights_lst, our_out_tensor, ref_out_tensor, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = generate_data( + ) = generate_data_for_nslices( batches, hidden_size, num_loras, rank, seq_length, + nslices, dtype, op_type, device, @@ -172,43 +170,85 @@ def test_punica_sgmv( else: max_seq_length = max_seq_length.item() if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) + # Preventing cache error pointer. 
+ with _dict_lock: + _LORA_A_PTR_DICT.clear() + torch.ops.vllm.sgmv_shrink( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + for index in range(nslices): + sgmv_shrink( + inputs_tensor, + lora_weights_lst[index], + ref_out_tensor[index], + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) + with _dict_lock: + _LORA_B_PTR_DICT.clear() + torch.ops.vllm.sgmv_expand( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + offset_start=0, + add_inputs=True, + ) + if nslices == 1: + # Verify the torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + lora_weights_lst[0], + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + add_inputs=True, + ) + else: + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + slice_offset, + hidden_size, + add_inputs=True, + ) + slice_offset += hidden_size + assert_close(our_out_tensor, ref_out_tensor) @@ -220,7 +260,7 @@ def test_punica_sgmv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv( batches: int, num_loras: int, @@ -256,31 +296,38 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - bgmv_shrink( + torch.ops.vllm.bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, indices, scaling, ) + + bgmv_shrink( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + scaling, + ) + else: - bgmv_expand( + torch.ops.vllm.bgmv_expand( inputs_tensor, lora_weights, our_out_tensor, indices, add_inputs=True, ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + add_inputs=True, + ) + if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -292,25 +339,22 @@ def test_punica_bgmv( @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_punica_expand_nslices( +@pytest.mark.parametrize("device", DEVICES) +def test_punica_bgmv_expand_nslices( batches: int, num_loras: int, rank: int, hidden_size: int, nslices: int, dtype: 
torch.dtype, - op_type: str, seed: int, device: str, ): - torch.set_default_device(device) current_platform.seed_everything(seed) - seq_length = 128 if op_type == "sgmv" else 1 + seq_length = 1 ( inputs_tensor, lora_weights_lst, @@ -330,50 +374,26 @@ def test_punica_expand_nslices( nslices, device, ) - max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - if op_type == "sgmv": - sgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - else: - - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_outputs[:, slice_offset:slice_offset + hidden_size], + torch.ops.vllm.bgmv_expand_slice( inputs_tensor, lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - 1.0, - op_type="expand", + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + ref_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, ) slice_offset += hidden_size diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_ops_variation.py similarity index 58% rename from tests/lora/test_punica_variation.py rename to tests/lora/test_punica_ops_variation.py index 3b20033271d26..2bb84c1cf11e9 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_ops_variation.py @@ -3,22 +3,24 @@ under different conditions, including various batches, numbers of LoRA , and maximum ranks. """ +from threading import Lock + import pytest import torch # Enable custom op register -import vllm.lora.ops.bgmv_expand -import vllm.lora.ops.bgmv_expand_slice -import vllm.lora.ops.bgmv_shrink -import vllm.lora.ops.sgmv_expand -import vllm.lora.ops.sgmv_expand_slice -import vllm.lora.ops.sgmv_shrink # noqa: F401 +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.platforms import current_platform -from .utils import (generate_data, generate_data_for_expand_nslices, - ref_torch_groupgemm) +from .utils import (assert_close, generate_data, + generate_data_for_expand_nslices, + generate_data_for_nslices) -HIDDEN_SIZES = [4097] +HIDDEN_SIZES = [2049] BATCHES = [1, 4, 16, 32] NUM_LORA = [1, 8, 32, 128] @@ -26,26 +28,9 @@ MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256] SCALES = [0.5] SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (6e-2, 6e-2), - torch.bfloat16: (6e-2, 6e-2), - torch.float32: (1e-2, 1e-2), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - +DEVICES = [f"cuda:{0}"] -# Unlike test_punica_sizes.py, we directly utilize custom op for -# testing, which verifies the correct registration of these ops. 
-bgmv_expand = torch.ops.vllm.bgmv_expand -bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice -bgmv_shrink = torch.ops.vllm.bgmv_shrink -sgmv_expand = torch.ops.vllm.sgmv_expand -sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice -sgmv_shrink = torch.ops.vllm.sgmv_shrink +_dict_lock = Lock() @pytest.mark.parametrize("batches", BATCHES) @@ -53,16 +38,18 @@ def assert_close(a, b): @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("nslices", [1, 2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_sgmv( batches: int, num_loras: int, rank: int, hidden_size: int, scaling: float, + nslices: int, dtype: torch.dtype, op_type: str, seed: int, @@ -74,19 +61,20 @@ def test_punica_sgmv( seq_length = 128 ( inputs_tensor, - lora_weights, + lora_weights_lst, our_out_tensor, ref_out_tensor, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = generate_data( + ) = generate_data_for_nslices( batches, hidden_size, num_loras, rank, seq_length, + nslices, dtype, op_type, device, @@ -98,43 +86,85 @@ def test_punica_sgmv( else: max_seq_length = max_seq_length.item() if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) + # Preventing cache error pointer. + with _dict_lock: + _LORA_A_PTR_DICT.clear() + torch.ops.vllm.sgmv_shrink( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + for index in range(nslices): + sgmv_shrink( + inputs_tensor, + lora_weights_lst[index], + ref_out_tensor[index], + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + scaling, + ) + else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) + with _dict_lock: + _LORA_B_PTR_DICT.clear() + torch.ops.vllm.sgmv_expand( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + offset_start=0, + add_inputs=True, + ) + slice_offset = 0 + if nslices == 1: + # Verify the torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + lora_weights_lst[0], + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + add_inputs=True, + ) + else: + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + slice_offset, + hidden_size, + add_inputs=True, + ) + slice_offset += hidden_size + assert_close(our_out_tensor, ref_out_tensor) @@ -146,7 +176,7 
@@ def test_punica_sgmv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv( batches: int, num_loras: int, @@ -158,7 +188,6 @@ def test_punica_bgmv( seed: int, device: str, ): - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -183,32 +212,38 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - bgmv_shrink( + torch.ops.vllm.bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, indices, scaling, ) - else: - bgmv_expand( + bgmv_shrink( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + scaling, + ) + + else: + torch.ops.vllm.bgmv_expand( inputs_tensor, lora_weights, our_out_tensor, indices, add_inputs=True, ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + add_inputs=True, + ) + if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -220,24 +255,22 @@ def test_punica_bgmv( @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_punica_expand_nslices( +@pytest.mark.parametrize("device", DEVICES) +def test_punica_bgmv_expand_nslices( batches: int, num_loras: int, rank: int, hidden_size: int, nslices: int, dtype: torch.dtype, - op_type: str, seed: int, device: str, ): torch.set_default_device(device) current_platform.seed_everything(seed) - seq_length = 128 if op_type == "sgmv" else 1 + seq_length = 1 ( inputs_tensor, lora_weights_lst, @@ -257,49 +290,26 @@ def test_punica_expand_nslices( nslices, device, ) - max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - if op_type == "sgmv": - sgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - else: - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - ref_torch_groupgemm( - ref_outputs[:, slice_offset:slice_offset + hidden_size], + torch.ops.vllm.bgmv_expand_slice( inputs_tensor, lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - 1.0, - op_type="expand", + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + ref_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, ) slice_offset += hidden_size diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 026269667b473..26bf770cc0d4a 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -72,7 +72,8 @@ def format_prompt_tuples(prompt): 
@pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, tp_size): - if num_gpus_available < tp_size: + if num_gpus_available < tp_size and \ + tp_size > 1 and current_platform.is_cuda_alike(): pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") llm = vllm.LLM( diff --git a/tests/lora/utils.py b/tests/lora/utils.py index e394c33b3f9ea..ce47546f2154b 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -18,11 +18,13 @@ def set_module_lora(self, module_name: str, lora: LoRALayerWeights): def get_module_lora(self, module_name: str) -> LoRALayerWeights: return self._loras[module_name] - def init_random_lora(self, - module_name: str, - weight: torch.Tensor, - rank: int = 8, - generate_embeddings_tensor: int = 0): + def init_random_lora( + self, + module_name: str, + weight: torch.Tensor, + rank: int = 8, + generate_embeddings_tensor: int = 0, + ): lora = LoRALayerWeights( module_name, rank=rank, @@ -35,21 +37,25 @@ def init_random_lora(self, device=self._device), ) if generate_embeddings_tensor: - lora.embeddings_tensor = torch.rand(5, - generate_embeddings_tensor, - dtype=weight.dtype, - device=self._device) + lora.embeddings_tensor = torch.rand( + 5, + generate_embeddings_tensor, + dtype=weight.dtype, + device=self._device, + ) self.set_module_lora(module_name, lora) return lora - def init_lora(self, - module_name: str, - input_dim: int, - output_dim: int, - rank=8, - noop=False, - embeddings_tensor=None): + def init_lora( + self, + module_name: str, + input_dim: int, + output_dim: int, + rank=8, + noop=False, + embeddings_tensor=None, + ): lora = LoRALayerWeights( module_name, rank=rank, @@ -98,35 +104,16 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -def ref_torch_groupgemm( - out_tensor, - inputs, - lora_weights, - lora_indices_tensor, - seq_len_tensor, +def generate_data( batches, - scaling, + hidden_size, + lora_nums, + max_rank, + seq_length, + dtype, op_type, -) -> torch.Tensor: - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batches), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_weights[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - result *= scaling - out_list.append(result) - cat_result = torch.cat(out_list, dim=0) - if op_type == "expand": - out_tensor += cat_result - else: - out_tensor.copy_(cat_result) - return - - -def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype, - op_type, device): + device, +): seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -187,8 +174,16 @@ def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype, ) -def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank, - seq_length, dtype, nslices, device): +def generate_data_for_expand_nslices( + batches, + hidden_size, + lora_nums, + max_rank, + seq_length, + dtype, + nslices, + device, +): seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -221,7 +216,87 @@ def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank, for b_id in range(batches): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() + seq_len_tensor[b_id]] = 
(lora_index.item()) + current_offset += seq_len_tensor[b_id].item() + + lora_indices_tensor = lora_indices_tensor.to(device) + return ( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + + +def generate_data_for_nslices( + batches, + hidden_size, + lora_nums, + max_rank, + seq_length, + nslices, + dtype, + op_type, + device, +): + seq_len_tensor = torch.randint(seq_length, seq_length + 1, + (batches, )).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + + lora_weights_lst = [] + if op_type == "shrink": + + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) + + for _ in range(nslices): + if op_type == "shrink": + lora_weights_lst.append( + torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).to(device)) + # NOTE shrink kernel using torch.float32 as output type + # shrink op need atomic_add, so output is initinized by 0 + our_out_tensor = torch.zeros( + (nslices, total_tokens, max_rank), + dtype=torch.float32, + ).to(device) + else: + inputs_tensor = torch.rand( + (nslices, total_tokens, max_rank), + dtype=dtype, + ).to(device) + for _ in range(nslices): + lora_weights_lst.append( + torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device)) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + our_out_tensor = torch.rand((total_tokens, hidden_size * nslices), + dtype=dtype).to(device) + + # Ensure the same input. + ref_out_tensor = our_out_tensor.clone() + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batches, )) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batches): + lora_index = lora_indices_tensor[b_id] + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = (lora_index.item()) current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index ed321ba9f00c1..0609fd96825e3 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -2,7 +2,7 @@ import pytest -from vllm.model_executor.layers.pooler import PoolingType +from vllm.model_executor.layers.pooler import CLSPool, PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform @@ -92,3 +92,28 @@ def test_roberta_model_loading_with_params(vllm_runner): # assert output assert output + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_facebook_roberta_model_loading_with_params(vllm_runner): + """ + Test loading roberta-base model with no lm_head. 
+ """ + model_name = "FacebookAI/roberta-base" + with vllm_runner(model_name=model_name, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_tokenizer = model.model.llm_engine.tokenizer + assert model_tokenizer.tokenizer_id == model_name + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert not hasattr(model, "lm_head") + assert isinstance(model, RobertaEmbeddingModel) + assert isinstance(model._pooler, CLSPool) + + assert output diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 0bb98df1b58e6..1e329dc4cb22e 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -237,8 +237,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, @pytest.mark.asyncio -async def test_online_inference(client, audio_assets): - """Exercises online inference with/without chunked prefill enabled.""" +async def test_online_serving(client, audio_assets): + """Exercises online serving with/without chunked prefill enabled.""" messages = [{ "role": diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 2b8f5e2faa45e..81b93ebdf0fc0 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -4,6 +4,7 @@ """ import os +from typing import List, NamedTuple, Type import pytest from huggingface_hub import hf_hub_download @@ -11,6 +12,7 @@ from tests.quantization.utils import is_quant_method_supported +from ....conftest import VllmRunner from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -18,31 +20,74 @@ MAX_MODEL_LEN = 1024 +class GGUFTestConfig(NamedTuple): + original_model: str + gguf_repo: str + gguf_filename: str + + @property + def gguf_model(self): + return hf_hub_download(self.gguf_repo, filename=self.gguf_filename) + + +LLAMA_CONFIG = GGUFTestConfig( + original_model="meta-llama/Llama-3.2-1B-Instruct", + gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF", + gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf", +) + +QWEN2_CONFIG = GGUFTestConfig( + original_model="Qwen/Qwen2.5-1.5B-Instruct", + gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF", + gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf", +) + +PHI3_CONFIG = GGUFTestConfig( + original_model="microsoft/Phi-3.5-mini-instruct", + gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF", + gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf", +) + +GPT2_CONFIG = GGUFTestConfig( + original_model="openai-community/gpt2-large", + gguf_repo="QuantFactory/gpt2-large-GGUF", + gguf_filename="gpt2-large.Q4_K_M.gguf", +) + +STABLELM_CONFIG = GGUFTestConfig( + original_model="stabilityai/stablelm-3b-4e1t", + gguf_repo="afrideva/stablelm-3b-4e1t-GGUF", + gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf", +) + +STARCODER_CONFIG = GGUFTestConfig( + original_model="bigcode/starcoder2-3b", + gguf_repo="QuantFactory/starcoder2-3b-GGUF", + gguf_filename="starcoder2-3b.Q6_K.gguf", +) + +MODELS = [ + LLAMA_CONFIG, + QWEN2_CONFIG, + PHI3_CONFIG, + GPT2_CONFIG, + STABLELM_CONFIG, + # STARCODER_CONFIG, # broken +] + + @pytest.mark.skipif(not is_quant_method_supported("gguf"), reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [ - 
("meta-llama/Llama-3.2-1B-Instruct", - "bartowski/Llama-3.2-1B-Instruct-GGUF", - "Llama-3.2-1B-Instruct-Q4_K_M.gguf"), - ("meta-llama/Llama-3.2-1B-Instruct", - "bartowski/Llama-3.2-1B-Instruct-GGUF", - "Llama-3.2-1B-Instruct-IQ4_XS.gguf"), - ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF", - "qwen2-1_5b-instruct-q4_k_m.gguf"), - ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", - "Qwen2-1.5B-Instruct.IQ4_XS.gguf"), -]) +@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("tp_size", [1, 2]) def test_models( - num_gpus_available, - vllm_runner, - example_prompts, - original_model, - gguf_id, - gguf_path, + num_gpus_available: int, + vllm_runner: Type[VllmRunner], + example_prompts: List[str], + model: GGUFTestConfig, dtype: str, max_tokens: int, num_logprobs: int, @@ -51,28 +96,26 @@ def test_models( if num_gpus_available < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - gguf_model = hf_hub_download(gguf_id, filename=gguf_path) - - tokenizer = AutoTokenizer.from_pretrained(original_model) - messages = [[{ - 'role': 'user', - 'content': prompt - }] for prompt in example_prompts] - example_prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model.original_model) + if tokenizer.chat_template is not None: + messages = [[{ + 'role': 'user', + 'content': prompt + }] for prompt in example_prompts] + example_prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) # Run unquantized model. - with vllm_runner(model_name=original_model, + with vllm_runner(model_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as original_model: - original_outputs = original_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) # Run gguf model. 
- with vllm_runner(model_name=gguf_model, + with vllm_runner(model_name=model.gguf_model, + tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as gguf_model: diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 2a7ed8826d2f3..4e110366a09f3 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -48,6 +48,10 @@ ), pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm pytest.param("bigcode/starcoder2-3b"), # starcoder2 + pytest.param( + "ehristoforu/Falcon3-MoE-2x7B-Insruct", # mixtral + marks=[pytest.mark.cpu_model], + ) ]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py deleted file mode 100644 index 6c8d300717de4..0000000000000 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ /dev/null @@ -1,57 +0,0 @@ -import pytest -from PIL import Image -from transformers import AutoTokenizer - -from vllm.inputs import InputProcessingContext - -from ....utils import build_model_context - - -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_llava_next(): - from vllm.model_executor.models.llava_next import ( - LlavaNextMultiModalProcessor) - return LlavaNextMultiModalProcessor - - -@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) -@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198)]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_prompt_replacements( - processor_for_llava_next, - model_id: str, - image_size: tuple[int, int], - num_imgs: int, -): - """ - Ensure LlavaNextMultiModalProcessor handles prompt replacement properly. 
- """ - ctx = build_model_context( - model_name=model_id, - tokenizer_name=model_id, - mm_processor_kwargs=None, - limit_mm_per_prompt={"image": num_imgs}, - ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - - # Build the image str / prompt based on the number of images we pass - prompt = "" * num_imgs - mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} - - # The processor will throw an error if there is a mismatch - # in the prompt replacements - processor = processor_for_llava_next(ctx) - processed_inputs = processor.apply(prompt, mm_data, {}) - - image_placeholders = processed_inputs["mm_placeholders"]["image"] - assert len(image_placeholders) == num_imgs - - first_placeholder = image_placeholders[0] - - # NOTE: There is a BOS token - assert first_placeholder["offset"] == 1 - assert first_placeholder["length"] == ( - len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py deleted file mode 100644 index 71adde6568a17..0000000000000 --- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py +++ /dev/null @@ -1,59 +0,0 @@ -import pytest -from PIL import Image -from transformers import AutoTokenizer - -from vllm.inputs import InputProcessingContext - -from ....utils import build_model_context - - -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_llava_onevision(): - from vllm.model_executor.models.llava_onevision import ( - LlavaOnevisionMultiModalProcessor) - return LlavaOnevisionMultiModalProcessor - - -@pytest.mark.parametrize("model_id", - ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) -@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198)]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_prompt_replacements( - processor_for_llava_onevision, - model_id: str, - image_size: tuple[int, int], - num_imgs: int, -): - """ - Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement - properly. 
- """ - ctx = build_model_context( - model_name=model_id, - tokenizer_name=model_id, - mm_processor_kwargs=None, - limit_mm_per_prompt={"image": num_imgs}, - ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) - - # Build the image str / prompt based on the number of images we pass - prompt = "" * num_imgs - mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} - - # The processor will throw an error if there is a mismatch - # in the prompt replacements - processor = processor_for_llava_onevision(ctx) - processed_inputs = processor.apply(prompt, mm_data, {}) - - image_placeholders = processed_inputs["mm_placeholders"]["image"] - assert len(image_placeholders) == num_imgs - - first_placeholder = image_placeholders[0] - - # NOTE: There is a BOS token - assert first_placeholder["offset"] == 0 - assert first_placeholder["length"] == len( - processed_inputs["prompt_token_ids"]) // num_imgs diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index dc0b683c1f1cb..7620ed1107e8f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -188,6 +188,33 @@ max_tokens=8, dtype="bfloat16", ), + "deepseek_vl_v2": VLMTestInfo( + models=["deepseek-ai/deepseek-vl2-small"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + dtype="bfloat16", + prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "\nWhat's the color of the stop sign and car?", + "cherry_blossom": "\nWhat's the color of the tower?", + }), + multi_image_prompt="image_1:\nimage_2:\nDescribe the two images shortly.", # noqa: E501 + vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501 + image_size_factors=[(0.10, 0.15)], + patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, + postprocess_inputs=model_utils.cast_dtype_post_processor("images"), + hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, + stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 + num_logprobs=5, + marks=[ + pytest.mark.skipif( + not is_flash_attn_2_available(), + reason="Model needs flash-attn for numeric convergence.", + ), + large_gpu_mark(min_gb=48), + ], + ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], test_type=VLMTestType.IMAGE, @@ -341,6 +368,16 @@ ), hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), + "molmo": VLMTestInfo( + models=["allenai/Molmo-7B-D-0924"], + test_type=(VLMTestType.IMAGE), + prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + image_size_factors=[(),(1.0, 1.0, 1.0)], + patch_hf_runner=model_utils.mlomo_patch_hf_runner, + postprocess_inputs=model_utils.molmo_post_processor, + ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. 
# https://github.com/huggingface/transformers/issues/34307 diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 3eca8fb9dcb1a..1ca85c7bb2056 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -5,17 +5,20 @@ import re import types from pathlib import PosixPath -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch from PIL.Image import Image -from transformers import AutoConfig, AutoTokenizer, BatchEncoding +from transformers import (AutoConfig, AutoTokenizer, BatchEncoding, + GenerationConfig) from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from .....conftest import HfRunner, ImageAsset, _ImageAssets +from .....conftest import (HfRunner, ImageAsset, PromptAudioInput, + PromptImageInput, PromptVideoInput, _ImageAssets) +from ....utils import TokensTextLogprobs from .types import RunnerOutput @@ -180,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, ####### Post-processors for HF outputs +def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, + model: str) -> RunnerOutput: + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith("<|end▁of▁sentence|>"): + output_str = output_str.split("<|end▁of▁sentence|>")[0] + return output_ids, output_str, out_logprobs + + def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output @@ -222,6 +233,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str): return {"model_inputs": hf_inputs} +def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str): + hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype) + return {k: v.unsqueeze(0) for k, v in hf_inputs.items()} + + ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset], @@ -253,6 +269,34 @@ def qwen_prompt_path_encoder( ####### Model-specific HuggingFace runner patchers +def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for GLM4.""" + hf_processor = hf_model.processor + + def processor(*args, text="", images=None, **kwargs): + if isinstance(images, Image): + images = [images] + # inputs is a custom class instead of dict or BatchFeature + inputs = hf_processor( + *args, + prompt=text, + images=images, + **kwargs, + ) + inputs = { + k: inputs[k] + for k in inputs.keys() # noqa + if k not in ("seq_lens", "sft_format") + } + inputs = BatchEncoding(data=inputs, tensor_type="pt") + return inputs + + hf_model.processor = processor + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language.model.embed_tokens + return hf_model + + def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for GLM4.""" hf_processor = hf_model.processor @@ -451,3 +495,88 @@ def _generate(self, *args, **kwargs): hf_model.model.generate = types.MethodType(_generate, hf_model.model) return hf_model + + +def _generate_greedy_logprobs_limit( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + images: Optional[PromptImageInput] = 
None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + **kwargs: Any, +) -> List[TokensTextLogprobs]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + # Process in batches for inference. + if len(all_inputs): + input_ids_lst = [] + images_lst = [] + images_input_idx_lst = [] + imges_masks_lst = [] + for inputs in all_inputs: + input_ids_lst.append(inputs["input_ids"]) + images_lst.append(inputs["images"]) + images_input_idx_lst.append(inputs["image_input_idx"]) + imges_masks_lst.append(inputs["image_masks"]) + batch_inputs = {} + batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0) + batch_inputs['images'] = torch.cat(images_lst, dim=0) + batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst, + dim=0) + batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0) + + outputs = self.model.generate_from_batch( + batch=self.wrap_device(batch_inputs, + device=self.model.device.type), + generation_config=GenerationConfig( + max_new_tokens=max_tokens, + stop_strings="<|endoftext|>", + do_sample=False, + ), + tokenizer=self.tokenizer, + output_hidden_states=True, + return_dict_in_generate=True, + ) + + all_logprobs: List[List[Dict[int, float]]] = [] + all_output_ids: List[List[int]] = [] + all_output_strs: List[str] = [] + + for index in range(len(all_inputs)): + ( + seq_logprobs_lst, + output_len, + ) = self._hidden_states_to_logprobs(outputs.hidden_states, + num_logprobs) + all_logprobs.append(seq_logprobs_lst) + seq_ids = outputs.sequences[index] + output_ids = seq_ids[-output_len:] + all_output_ids.append(output_ids.tolist()) + all_output_strs.append(self.tokenizer.decode(output_ids)) + outputs = zip(all_output_ids, all_output_strs, all_logprobs) + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + + +####### Molmo-specific HuggingFace runner patchers +def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for Molmo.""" + hf_processor = hf_model.processor + + def _processor(*args, **kwargs): + return hf_processor.process(*args, **kwargs) + + hf_model.processor = _processor + + setattr( # noqa: B010 + hf_model, + "generate_greedy_logprobs_limit", + types.MethodType(_generate_greedy_logprobs_limit, hf_model), + ) + + return hf_model diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index f458ef5ef556d..04ab4dd7371a3 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -15,6 +15,7 @@ # [Encoder-only] pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), # [Encoder-decoder] pytest.param("intfloat/e5-mistral-7b-instruct", @@ -24,6 +25,7 @@ pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/decoder_only/vision_language/processing/__init__.py b/tests/models/multimodal/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/processing/__init__.py rename to tests/models/multimodal/__init__.py diff --git 
a/tests/models/multimodal/processing/__init__.py b/tests/models/multimodal/processing/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py new file mode 100644 index 0000000000000..0a38779e0e4f0 --- /dev/null +++ b/tests/models/multimodal/processing/test_common.py @@ -0,0 +1,201 @@ +from functools import partial + +import numpy as np +import pytest +from PIL import Image + +from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.processing import ProcessingCache +from vllm.multimodal.utils import cached_get_tokenizer + +from ....multimodal.utils import random_audio, random_image, random_video + + +def _test_processing_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": + hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} + else: + hf_overrides = {} + + limit_mm_per_prompt = { + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() + } + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=True, + seed=0, + dtype="float16", + revision=None, + hf_overrides=hf_overrides, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + # Ensure that it can fit all of the data + cache = ProcessingCache(capacity=1 << 30) + + baseline_processor = factories.build_processor(ctx, cache=None) + cached_processor = factories.build_processor(ctx, cache=cache) + dummy_inputs = baseline_processor.dummy_inputs + tokenizer = baseline_processor.info.get_tokenizer() + + rng = np.random.RandomState(0) + + input_to_hit = { + "image": Image.new("RGB", size=(128, 128)), + "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), + "audio": (np.zeros((512, )), 16000), + } + input_factory = { + "image": + partial(random_image, rng, min_wh=128, max_wh=256), + "video": + partial(random_video, + rng, + min_frames=2, + max_frames=8, + min_wh=128, + max_wh=256), + "audio": + partial(random_audio, rng, min_len=512, max_len=1024, sr=16000), + } + + for batch_idx in range(num_batches): + mm_data = { + k: + [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) + for _ in range(rng.randint(limit_mm_per_prompt[k]))] + for k in modalities + } + + mm_counts = {k: len(vs) for k, vs in mm_data.items()} + prompt = dummy_inputs.get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt_text + + # Drop unnecessary keys and test single -> multi conversion + if rng.rand() < simplify_rate: + for k in list(mm_data.keys()): + if not mm_data[k]: + del mm_data[k] + elif len(mm_data[k]) == 1: + mm_data[k] = mm_data[k][0] + + baseline_result = baseline_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_result = cached_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == cached_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + baseline_tokenized_result = baseline_processor.apply( + tokenizer.encode(prompt), + 
mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == baseline_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + cached_tokenized_result = cached_processor.apply( + tokenizer.encode(prompt), + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert cached_result == cached_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + +# yapf: disable +# True if the model supports multiple data items of the modality per request +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", {"image": False}), + ("facebook/chameleon-7b", {"image": False}), + ("adept/fuyu-8b", {"image": False}), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), + ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + _test_processing_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) + + +# yapf: disable +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_correctness_phi3v( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + # HACK - this is an attempted workaround for the following bug + # https://github.com/huggingface/transformers/issues/34307 + from transformers import AutoImageProcessor # noqa: F401 + from transformers import AutoProcessor # noqa: F401 + + AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) + + _test_processing_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) diff --git a/tests/models/decoder_only/vision_language/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py similarity index 98% rename from tests/models/decoder_only/vision_language/processing/test_idefics3.py rename to tests/models/multimodal/processing/test_idefics3.py index c71a2d359043d..69b91ad4a5df8 100644 --- a/tests/models/decoder_only/vision_language/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -8,8 +8,8 @@ from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalRegistry -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest import _ImageAssets +from ...utils import build_model_context models = ["HuggingFaceM4/Idefics3-8B-Llama3"] diff --git a/tests/models/decoder_only/vision_language/processing/test_internvl.py 
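For reference, the cache-hit sampling that `_test_processing_correctness` relies on reduces to the toy sketch below: with probability `hit_rate` the batch reuses one fixed input (which should hit the ProcessingCache), otherwise it draws a fresh random input (which should miss). Plain floats stand in for the real image/video/audio items:

import numpy as np

rng = np.random.RandomState(0)
hit_rate = 0.5

input_to_hit = 0.0  # identical every time -> expected cache hit

def input_factory() -> float:
    # New value every time -> expected cache miss.
    return float(rng.rand())

batch = [
    input_to_hit if rng.rand() < hit_rate else input_factory()
    for _ in range(8)
]
# Roughly half of `batch` repeats the fixed item, so both the cached and
# uncached code paths of the processor are exercised in the same run.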
b/tests/models/multimodal/processing/test_internvl.py similarity index 98% rename from tests/models/decoder_only/vision_language/processing/test_internvl.py rename to tests/models/multimodal/processing/test_internvl.py index af0c2aa211998..d6c60595ca5ea 100644 --- a/tests/models/decoder_only/vision_language/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -7,8 +7,8 @@ from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalRegistry -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest import _ImageAssets +from ...utils import build_model_context models = ["OpenGVLab/InternVL2-2B"] diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py new file mode 100644 index 0000000000000..1eec35d9c3c72 --- /dev/null +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -0,0 +1,132 @@ +import itertools +from functools import partial + +import pytest +from PIL import Image +from pqdm.threads import pqdm + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer + +from ...utils import build_model_context + + +def _validate_image_prompt_replacements_one( + processor: BaseMultiModalProcessor, + num_imgs: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + prompt = "<image>" * num_imgs + image = Image.new("RGB", size=image_size) + mm_data = {"image": [image] * num_imgs} + + try: + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + # NOTE: There is a BOS token + assert first_placeholder["offset"] == 1 + assert first_placeholder["length"] == ( + len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs + + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +def _test_image_prompt_replacements( + processor, + *, + num_imgs: int, + image_sizes: list[ImageSize], +) -> None: + """ + Ensure LlavaNextMultiModalProcessor + handles prompt replacement properly for input images. 
+ """ + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_prompt_replacements_one, + processor, + num_imgs, + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements_regression(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), + (488, 183), (2560, 1669)] + image_sizes = [ + size for w, h in image_ratios + for size in [ImageSize(w, h), ImageSize(h, w)] + ] + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) + + +@pytest.mark.skip("This test takes around 2 hours to run. " + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("num_imgs", [1]) +def test_processor_prompt_replacements_all(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 2 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(64, 1024), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py new file mode 100644 index 0000000000000..94ea604c58b43 --- /dev/null +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -0,0 +1,132 @@ +import itertools +from functools import partial + +import pytest +from PIL import Image +from pqdm.threads import pqdm + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer + +from ...utils import build_model_context + + +def _validate_image_prompt_replacements_one( + processor: BaseMultiModalProcessor, + num_imgs: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + prompt = "" * num_imgs + image = Image.new("RGB", size=image_size) + mm_data = {"image": [image] * num_imgs} + + try: + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = 
processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + assert first_placeholder["offset"] == 0 + assert first_placeholder["length"] == len( + processed_inputs["prompt_token_ids"]) // num_imgs + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +def _test_image_prompt_replacements( + processor, + *, + num_imgs: int, + image_sizes: list[ImageSize], +) -> None: + """ + Ensure LlavaOnevisionMultiModalProcessor + handles prompt replacement properly for input images. + """ + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_prompt_replacements_one, + processor, + num_imgs, + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements_regression(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), + (488, 183), (2560, 1669)] + image_sizes = [ + size for w, h in image_ratios + for size in [ImageSize(w, h), ImageSize(h, w)] + ] + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) + + +@pytest.mark.skip("This test takes around 2 hours to run. 
" + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("num_imgs", [1]) +def test_processor_prompt_replacements_all(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 6 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(64, 1024), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py similarity index 68% rename from tests/models/decoder_only/vision_language/processing/test_phi3v.py rename to tests/models/multimodal/processing/test_phi3v.py index 249045b3c04ce..7f82a8f18f0ca 100644 --- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -1,19 +1,11 @@ """Tests for phi3v's multimodal preprocessing kwargs.""" import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext -from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer -from .....conftest import _ImageAssets -from ....utils import build_model_context - - -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_phi3v(): - from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor - return Phi3VMultiModalProcessor +from ....conftest import _ImageAssets +from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) @@ -29,7 +21,6 @@ def processor_for_phi3v(): # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( - processor_for_phi3v, image_assets: _ImageAssets, model_id: str, mm_processor_kwargs: dict[str, int], @@ -37,21 +28,26 @@ def test_processor_override( num_imgs: int, ): """Ensure input_processor_for_phi3v handles num_crops properly.""" + # Avoid initializing CUDA early + from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID + ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, trust_remote_code=True, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) # Build the image str / prompt based on the number of images we pass img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - 
processor = processor_for_phi3v(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py similarity index 98% rename from tests/models/decoder_only/vision_language/processing/test_qwen.py rename to tests/models/multimodal/processing/test_qwen.py index 163220c91a27d..af0ace711ba3e 100644 --- a/tests/models/decoder_only/vision_language/processing/test_qwen.py +++ b/tests/models/multimodal/processing/test_qwen.py @@ -9,8 +9,8 @@ from vllm.multimodal import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer -from .....conftest import IMAGE_ASSETS -from ....utils import build_model_context +from ....conftest import IMAGE_ASSETS +from ...utils import build_model_context ### Multimodal preprocessing tests SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py similarity index 70% rename from tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py rename to tests/models/multimodal/processing/test_qwen2_vl.py index b9ac887edf90f..de14fbbffe5b7 100644 --- a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -1,17 +1,10 @@ import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer -from .....conftest import _ImageAssets -from ....utils import build_model_context - - -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_qwen2_vl(): - from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor - return Qwen2VLMultiModalProcessor +from ....conftest import _ImageAssets +from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) @@ -24,7 +17,6 @@ def processor_for_qwen2_vl(): # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( - processor_for_qwen2_vl, image_assets: _ImageAssets, model_id: str, mm_processor_kwargs: dict[str, object], @@ -39,18 +31,20 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) # Build the image str / prompt based on the number of images we pass prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - processor = processor_for_qwen2_vl(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size - hf_processor = processor._get_hf_processor(**mm_processor_kwargs) + hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) pixel_shape = 
processed_inputs["mm_kwargs"]["pixel_values"].shape diff --git a/tests/models/registry.py b/tests/models/registry.py index dcb8bfa0f9510..d079725b2f78d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -22,6 +22,11 @@ class _HfExamplesInfo: for speculative decoding. """ + min_transformers_version: Optional[str] = None + """ + The minimum version of HF Transformers that is required to run this model. + """ + is_available_online: bool = True """ Set this to ``False`` if the name of this architecture no longer exists on @@ -174,6 +179,8 @@ class _HfExamplesInfo: trust_remote_code=True), "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", is_available_online=False), + # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported + "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 3b728f2744fca..daece7c93c0ef 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,7 +1,9 @@ from unittest.mock import patch import pytest +from packaging.version import Version from transformers import PretrainedConfig +from transformers import __version__ as TRANSFORMERS_VERSION from vllm import LLM @@ -13,9 +15,20 @@ def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) if not model_info.is_available_online: pytest.skip("Model is not available online") + if model_info.min_transformers_version is not None: + current_version = TRANSFORMERS_VERSION + required_version = model_info.min_transformers_version + if Version(current_version) < Version(required_version): + pytest.skip( + f"You have `transformers=={current_version}` installed, but " + f"`transformers>={required_version}` is required to run this " + "model") # Avoid OOM def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: + if hf_config.model_type == "deepseek_vl_v2": + hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]}) + if hasattr(hf_config, "text_config"): text_config: PretrainedConfig = hf_config.text_config else: diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 7203d635c2fa8..8456a463adeeb 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,6 @@ NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--disable-log-requests", "--worker-use-ray", "--gpu-memory-utilization", "0.85", @@ -110,7 +109,7 @@ async def test_multi_step( # Spin up client/server & issue completion API requests. 
# Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to + # was raised 5x to 1200 *just for this test* due to # observed timeouts in GHA CI ref_completions = await completions_with_server_args( prompts, diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 75d878217b657..54269c3ef7ce0 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,25 +1,25 @@ from contextlib import nullcontext -from functools import partial from typing import cast from unittest.mock import MagicMock import numpy as np import pytest -from PIL import Image from vllm.config import ModelConfig -from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, - _PlaceholderInfo, find_mm_placeholders, +from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement, + find_mm_placeholders, find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) +from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby +from .utils import random_image + # yapf: disable @pytest.mark.parametrize( @@ -431,7 +431,7 @@ def test_find_replace_tokens( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=6, @@ -445,13 +445,13 @@ def test_find_replace_tokens( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=1, replacement=[32000, 32000], ), - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=1, start_idx=5, @@ -459,7 +459,7 @@ def test_find_replace_tokens( ), ], "pattern_3": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_3", item_idx=0, start_idx=7, @@ -472,13 +472,13 @@ def test_find_replace_tokens( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=1, replacement=[32000, 32000], ), - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=1, start_idx=3, @@ -486,7 +486,7 @@ def test_find_replace_tokens( ), ], "pattern_3": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_3", item_idx=0, start_idx=6, @@ -526,37 +526,6 @@ def test_find_mm_placeholders( assert result == expected -def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int): - w, h = rng.randint(min_wh, max_wh, size=(2, )) - arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) - return Image.fromarray(arr) - - -def _rand_video( - rng: np.random.RandomState, - min_frames: int, - max_frames: int, - min_wh: int, - max_wh: int, -): - # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 - num_frames = rng.randint(min_frames, max_frames) - num_frames = (num_frames // 2) * 2 - - w, h = rng.randint(min_wh, max_wh, size=(2, )) - return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) - - -def _rand_audio( - rng: np.random.RandomState, - min_len: int, - max_len: int, - sr: int, -): - audio_len = rng.randint(min_len, max_len) - return rng.rand(audio_len), sr - - @pytest.mark.parametrize("model_id", 
["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("limit", "num_supported", "is_valid"), @@ -577,19 +546,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - - processor = processor_factory(ctx, cache=None) - profiler = processor.profiling_info + profiler = MultiModalProfiler(processor) mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) - profiler.get_supported_mm_limits = mock_supported_mm_limits + processor.info.get_supported_mm_limits = mock_supported_mm_limits if is_valid: exc_ctx = nullcontext() @@ -597,7 +562,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): exc_ctx = pytest.raises(ValueError, match="this model only supports") with exc_ctx: - profiler.get_mm_limits() + profiler.get_dummy_data(model_config.max_model_len) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @@ -620,18 +585,14 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - processor = processor_factory(ctx, cache=None) - rng = np.random.RandomState(0) - image = _rand_img(rng, min_wh=128, max_wh=256) + image = random_image(rng, min_wh=128, max_wh=256) if num_images == 0: mm_data = {} elif num_images == 1: @@ -650,171 +611,3 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): mm_data=mm_data, hf_processor_mm_kwargs={}, ) - - -def _test_processing_cache_correctness( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": - hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} - else: - hf_overrides = {} - - limit_mm_per_prompt = { - modality: 3 if supports_multi else 1 - for modality, supports_multi in modalities.items() - } - - model_config = ModelConfig( - model_id, - task="auto", - tokenizer=model_id, - tokenizer_mode="auto", - trust_remote_code=True, - seed=0, - dtype="float16", - revision=None, - hf_overrides=hf_overrides, - limit_mm_per_prompt=limit_mm_per_prompt, - ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( - model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), - ) - # Ensure that it can fit all of the data - cache = ProcessingCache(capacity=1 << 30) - - baseline_processor = processor_factory(ctx, cache=None) - cached_processor = processor_factory(ctx, cache=cache) - - rng = np.random.RandomState(0) - - input_to_hit = { - "image": Image.new("RGB", size=(128, 128)), - "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), - "audio": (np.zeros((512, )), 16000), - } - input_factory = { - "image": - partial(_rand_img, rng, min_wh=128, max_wh=256), - "video": - 
partial(_rand_video, - rng, - min_frames=2, - max_frames=8, - min_wh=128, - max_wh=256), - "audio": - partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000), - } - - for batch_idx in range(num_batches): - mm_data = { - k: - [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(limit_mm_per_prompt[k]))] - for k in modalities - } - - mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor.profiling_info.get_dummy_processor_inputs( - model_config.max_model_len, - mm_counts, - ).prompt_text - - # Drop unnecessary keys and test single -> multi conversion - if rng.rand() < simplify_rate: - for k in list(mm_data.keys()): - if not mm_data[k]: - del mm_data[k] - elif len(mm_data[k]) == 1: - mm_data[k] = mm_data[k][0] - - baseline_result = baseline_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - cached_result = cached_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - assert baseline_result == cached_result, ( - f"Failed ({batch_idx=}, {mm_data=})") - - -# yapf: disable -# True if the model supports multiple data items of the modality per request -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", {"image": False}), - ("facebook/chameleon-7b", {"image": False}), - ("adept/fuyu-8b", {"image": False}), - ("llava-hf/llava-1.5-7b-hf", {"image": True}), - ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), - ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), - ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), - ("mistral-community/pixtral-12b", {"image": True}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), - ("fixie-ai/ultravox-v0_3", {"audio": True}), -]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_cache_correctness( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - _test_processing_cache_correctness( - model_id, - modalities, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ) - - -# yapf: disable -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), -]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_cache_correctness_phi3v( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - # HACK - this is an attempted workaround for the following bug - # https://github.com/huggingface/transformers/issues/34307 - from transformers import AutoImageProcessor # noqa: F401 - from transformers import AutoProcessor # noqa: F401 - - AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) - - _test_processing_cache_correctness( - model_id, - modalities, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 6029f2e514772..198344e5bd88c 100644 --- a/tests/multimodal/test_utils.py +++ 
b/tests/multimodal/test_utils.py @@ -2,16 +2,22 @@ import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import Dict, Tuple +from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple import numpy as np import pytest from PIL import Image, ImageChops from transformers import AutoConfig, AutoTokenizer +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (MediaConnector, + merge_and_sort_multimodal_metadata, repeat_and_pad_placeholder_tokens) +if TYPE_CHECKING: + from vllm.multimodal.hasher import MultiModalHashDict + from vllm.multimodal.inputs import MultiModalPlaceholderDict + # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", @@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model): assert new_prompt == expected_prompt assert new_token_ids == expected_token_ids assert ranges == expected_ranges + + +# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. +class TestCase(NamedTuple): + mm_positions: "MultiModalPlaceholderDict" + mm_hashes: Optional["MultiModalHashDict"] + expected_modalities: list[str] + expected_ranges: list[PlaceholderRange] + expected_hashes: Optional[list[str]] + + +def test_merge_and_sort_multimodal_metadata(): + + test_cases = [ + # Single modality should return result as is but flattened + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ] + }, + mm_hashes={"image": ["hash1", "hash2"]}, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ], + expected_hashes=["hash1", "hash2"], + ), + + # Single modality without hashes return None for mm hash. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ] + }, + mm_hashes=None, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ], + expected_hashes=None, + ), + + # Multiple modalities with hashes should return sorted modalities + # and flattened ranges and hashes. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1", "audio_hash2"], + }, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=[ + "audio_hash1", "audio_hash2", "image_hash1", "image_hash2" + ], + ), + + # Multiple modalities without hashes should return sorted modalities + # and flattened ranges and None. 
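The expectations encoded in these test cases amount to: order the modalities by the offset of their first placeholder, then flatten the ranges (and the hashes, when present) in that order. A toy sketch of that behavior, using plain (offset, length) tuples instead of PlaceholderRange and a hypothetical helper name (the real function is merge_and_sort_multimodal_metadata in vllm.multimodal.utils, and unlike this sketch it also handles the interleaved placeholders tested further below):

def merge_by_first_offset(mm_positions, mm_hashes=None):
    # Sort modalities by the offset of their first placeholder.
    modalities = sorted(mm_positions, key=lambda m: mm_positions[m][0][0])
    ranges = [r for m in modalities for r in mm_positions[m]]
    hashes = (None if mm_hashes is None
              else [h for m in modalities for h in mm_hashes[m]])
    return modalities, ranges, hashes

assert merge_by_first_offset(
    {"image": [(7, 4), (11, 5)], "audio": [(0, 2), (2, 3)]},
    {"image": ["image_hash1", "image_hash2"],
     "audio": ["audio_hash1", "audio_hash2"]},
) == (
    ["audio", "image"],
    [(0, 2), (2, 3), (7, 4), (11, 5)],
    ["audio_hash1", "audio_hash2", "image_hash1", "image_hash2"],
)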
+ TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes=None, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=None, + ), + + # Three modalities + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + ], + "video": [ + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1"], + "video": ["video_hash1", "video_hash2", "video_hash3"] + }, + expected_modalities=["audio", "video", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + expected_hashes=[ + "audio_hash1", "video_hash1", "video_hash2", "video_hash3", + "image_hash1", "image_hash2" + ], + ), + ] + + for (mm_positions, mm_hashes, expected_modalities, expected_ranges, + expected_hashes) in test_cases: + modalities, ranges, hashes = merge_and_sort_multimodal_metadata( + mm_positions, mm_hashes) + + assert modalities == expected_modalities + assert ranges == expected_ranges + assert hashes == expected_hashes + + +def test_merge_and_sort_multimodal_metadata_with_interleaving(): + + test_cases = [ + + #