# llava_next_example.py

from io import BytesIO

import requests
from PIL import Image

from vllm import LLM, SamplingParams
from vllm.multimodal.image import ImagePixelData

# Dynamic image input is currently not supported and therefore
# a fixed image input shape and its corresponding feature size is required.
# See https://github.com/vllm-project/vllm/pull/4199 for the complete
# configuration matrix.
llm = LLM(
    model="llava-hf/llava-v1.6-mistral-7b-hf",
    image_input_type="pixel_values",
    image_token_id=32000,
    image_input_shape="1,3,336,336",
    image_feature_size=1176,
)
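
# The prompt must contain one "<image>" placeholder per image feature, so the
# repetition count below has to equal image_feature_size (1176 for the fixed
# 1x3x336x336 input shape configured above).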
prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
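
# ImagePixelData wraps the raw PIL image; in this vLLM version the model's
# HF image processor is expected to handle resizing and normalization to the
# declared input shape, so no manual preprocessing is done here.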
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": ImagePixelData(image),
    },
    sampling_params=sampling_params,
)
generated_text = ""
for o in outputs:
    generated_text += o.outputs[0].text

print(f"LLM output: {generated_text}")
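
# A minimal sketch of batching several requests in one call, assuming this
# vLLM version accepts a sequence of request dicts like the one above
# (other_url is a hypothetical second image source, not part of the example):
#
# other_image = Image.open(BytesIO(requests.get(other_url, timeout=30).content))
# batch = [
#     {"prompt": prompt, "multi_modal_data": ImagePixelData(img)}
#     for img in (image, other_image)
# ]
# for out in llm.generate(batch, sampling_params=sampling_params):
#     print(out.outputs[0].text)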