diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 8f5aa58f9f2b9..45316fd34a5d2 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -135,6 +135,33 @@ Instead of passing in a single image, you can pass in a list of images.
 
 A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_.
 
+Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos:
+
+.. code-block:: python
+
+    # Specify the maximum number of frames per video to be 4. This can be changed.
+    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+
+    # Create the request payload.
+    video_frames = ...  # load your video making sure it only has the number of frames specified earlier.
+    message = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
+        ],
+    }
+    for i in range(len(video_frames)):
+        base64_image = encode_image(video_frames[i])  # base64 encoding.
+        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+        message["content"].append(new_image)
+
+    # Perform inference and log output.
+    outputs = llm.chat([message])
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
 Online Inference
 ----------------
 
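
Note on the snippet added above: ``encode_image`` is not defined in this patch and ``video_frames`` is left as a placeholder. A minimal sketch of both pieces, assuming frames are sampled with OpenCV into PIL images and base64-encoded as JPEG (hypothetical helpers, not part of vLLM):

.. code-block:: python

    import base64
    import io

    import cv2  # assumption: OpenCV is only used to sample frames from a local video file
    from PIL import Image


    def sample_frames(video_path: str, num_frames: int = 4) -> list[Image.Image]:
        """Evenly sample ``num_frames`` frames from a video file (hypothetical helper)."""
        cap = cv2.VideoCapture(video_path)
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        indices = [int(i * (total - 1) / max(num_frames - 1, 1)) for i in range(num_frames)]
        frames = []
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ok, frame = cap.read()
            if ok:
                # OpenCV returns BGR arrays; convert to RGB PIL images.
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        cap.release()
        return frames


    def encode_image(frame: Image.Image) -> str:
        """Base64-encode a single frame as a JPEG string (hypothetical helper)."""
        buffer = io.BytesIO()
        frame.convert("RGB").save(buffer, format="JPEG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")


    # Usage matching the example above: keep the number of sampled frames within
    # the ``limit_mm_per_prompt`` value passed to ``LLM``.
    video_frames = sample_frames("video.mp4", num_frames=4)

Any other frame source works the same way, as long as the frame count stays within the ``limit_mm_per_prompt`` limit configured on the ``LLM`` instance.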