[model] Support for Llava-Next-Video model (vllm-project#7559)
Co-authored-by: Roger Wang <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Signed-off-by: Alvant <[email protected]>
4 people authored and Alvant committed Oct 26, 2024
1 parent a96bf40 commit bc70773
Showing 21 changed files with 1,083 additions and 18 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
1 change: 1 addition & 0 deletions Dockerfile.cpu
@@ -5,6 +5,7 @@ FROM ubuntu:22.04 AS cpu-test-1
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
4 changes: 3 additions & 1 deletion Dockerfile.neuron
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
RUN apt-get update \
&& apt-get install python3 python3-pip -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
3 changes: 2 additions & 1 deletion Dockerfile.openvino
@@ -4,7 +4,8 @@
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
apt-get install -y python3-pip git && \
apt-get install -y ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
2 changes: 1 addition & 1 deletion Dockerfile.ppc64le
@@ -4,7 +4,7 @@ USER root

ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
3 changes: 3 additions & 0 deletions Dockerfile.tpu
@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
FROM $BASE_IMAGE
WORKDIR /workspace

# Install some basic utilities
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1

# Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
3 changes: 1 addition & 2 deletions Dockerfile.xpu
@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip

&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
COPY ./ /workspace/vllm

WORKDIR /workspace/vllm
1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -99,6 +99,7 @@ def setup(app):
"aiohttp",
"compressed_tensors",
"cpuinfo",
"cv2",
"torch",
"transformers",
"psutil",
14 changes: 14 additions & 0 deletions docs/source/models/supported_models.rst
@@ -227,6 +227,11 @@ Multimodal Language Models
- Image\ :sup:`E+`
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
* - :code:`LlavaNextVideoForConditionalGeneration`
- LLaVA-NeXT-Video
- Video
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
-
* - :code:`MiniCPMV`
- MiniCPM-V
- Image\ :sup:`+`
@@ -260,6 +265,15 @@ Multimodal Language Models
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

For :code:`LLaVA-NeXT-Video`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
This can be installed by running the following command:

.. code-block:: bash

    pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830

----

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
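
As a quick orientation for the video path added in this commit, here is a minimal offline-inference sketch condensed from the updated examples/offline_inference_vision_language.py shown below; the prompt template, max_model_len, and sampling values mirror that example and should be read as illustrative defaults, not requirements.

```python
# Minimal sketch of offline video inference with LLaVA-NeXT-Video, condensed
# from the examples/offline_inference_vision_language.py changes in this
# commit.  Prompt template, max_model_len, and sampling values mirror that
# example and are illustrative rather than required.
from vllm import LLM, SamplingParams
from vllm.assets.video import VideoAsset

# Decode 16 frames of the bundled sample clip into numpy arrays.
video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays

llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"

outputs = llm.generate(
    {
        "prompt": prompt,
        # The key is the modality name, exactly as in the example script.
        "multi_modal_data": {"video": video},
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
for o in outputs:
    print(o.outputs[0].text)
```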
70 changes: 61 additions & 9 deletions examples/offline_inference_vision_language.py
@@ -9,12 +9,9 @@

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser

# Input image and question
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
question = "What is the content of this image?"


# LLaVA-1.5
def run_llava(question):
@@ -30,7 +27,16 @@ def run_llava(question):
def run_llava_next(question):

    prompt = f"[INST] <image>\n{question} [/INST]"
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


# LLaVA-NeXT-Video
# Currently only video input is supported
def run_llava_next_video(question):
    prompt = f"USER: <video>\n{question} ASSISTANT:"
    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
    stop_token_ids = None
    return llm, prompt, stop_token_ids

@@ -176,6 +182,7 @@ def run_qwen_vl(question):
model_example_map = {
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"fuyu": run_fuyu,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
@@ -187,11 +194,49 @@ def run_qwen_vl(question):
}


def get_multi_modal_input(args):
    """
    return {
        "data": image or video,
        "question": question,
    }
    """
    if args.modality == "image":
        # Input image and question
        image = ImageAsset("cherry_blossom") \
            .pil_image.convert("RGB")
        img_question = "What is the content of this image?"

        return {
            "data": image,
            "question": img_question,
        }

    if args.modality == "video":
        # Input video and question
        video = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames).np_ndarrays
        vid_question = "Why is this video funny?"

        return {
            "data": video,
            "question": vid_question,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    question = mm_input["question"]

    llm, prompt, stop_token_ids = model_example_map[model](question)

    # We set temperature to 0.2 so that outputs can be different
@@ -206,7 +251,7 @@ def main(args):
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
                "image": image
                modality: data
            },
        }

@@ -215,7 +260,7 @@ def main(args):
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
                "image": image
                modality: data
            },
        } for _ in range(args.num_prompts)]

@@ -238,8 +283,15 @@ def main(args):
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=1,
                        default=4,
                        help='Number of prompts to run.')

    parser.add_argument('--modality',
                        type=str,
                        default="image",
                        help='Modality of the input.')
    parser.add_argument('--num-frames',
                        type=int,
                        default=16,
                        help='Number of frames to extract from the video.')
    args = parser.parse_args()
    main(args)
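
The example now selects its input through the new --modality and --num-frames flags (for instance, python examples/offline_inference_vision_language.py --model-type llava-next-video --modality video --num-frames 16), and VideoAsset decodes the clip into a stack of numpy frames. That decoding is the reason opencv-python (exposed as the optional "video" extra in setup.py below) and the ffmpeg/libGL system packages are added throughout this commit. The sketch below shows the general shape of such frame sampling; it is an illustration built around an assumed sample_frames helper, not the loader vLLM actually ships.

```python
# Rough sketch of the kind of frame extraction that opencv-python enables.
# Illustration only: vLLM's own VideoAsset/test utilities may sample frames
# differently.
import cv2
import numpy as np


def sample_frames(path: str, num_frames: int = 16) -> np.ndarray:
    """Decode a video and return `num_frames` evenly spaced RGB frames."""
    cap = cv2.VideoCapture(path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, max(total - 1, 0), num_frames).astype(int)

    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = cap.read()
        if not ok:
            break
        # OpenCV decodes to BGR; models generally expect RGB.
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return np.stack(frames)
```

An array of stacked RGB frames like this is the kind of object the example passes as the "video" entry of multi_modal_data.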
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -11,6 +11,7 @@ awscli
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio test
opencv-python # required for video test
peft
requests
ray[adag]>=2.35
1 change: 1 addition & 0 deletions setup.py
@@ -505,6 +505,7 @@ def _read_requirements(filename: str) -> List[str]:
    ext_modules=ext_modules,
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
        "video": ["opencv-python"], # Required for video processing
        "audio": ["librosa", "soundfile"] # Required for audio processing
    },
    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},