[Model] Add Qwen2-Audio model support (vllm-project#9248)

Co-authored-by: DarkLight1337 <[email protected]>
Signed-off-by: NickLucche <[email protected]>
NickLucche · Oct 31, 2024 · bf2f523
1 parent 4f87575
commit bf2f523
Show file tree

Hide file tree

Showing 7 changed files with 515 additions and 17 deletions.
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
@@ -459,6 +459,12 @@ Text Generation
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
     -
     - ✅︎
+  * - :code:`Qwen2AudioForConditionalGeneration`
+    - Qwen2-Audio
+    - T + A\ :sup:`+`
+    - :code:`Qwen/Qwen2-Audio-7B-Instruct`
+    -
+    - ✅︎
   * - :code:`Qwen2VLForConditionalGeneration`
     - Qwen2-VL
     - T + I\ :sup:`E+` + V\ :sup:`+`

diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py
@@ -12,14 +12,15 @@
 from vllm.utils import FlexibleArgumentParser
 
 audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
-question_per_audio_count = [
-    "What is recited in the audio?",
-    "What sport and what nursery rhyme are referenced?"
-]
+question_per_audio_count = {
+    0: "What is 1+1?",
+    1: "What is recited in the audio?",
+    2: "What sport and what nursery rhyme are referenced?"
+}
 
 
 # Ultravox 0.3
-def run_ultravox(question, audio_count):
+def run_ultravox(question: str, audio_count: int):
     model_name = "fixie-ai/ultravox-v0_3"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -42,9 +43,29 @@ def run_ultravox(question, audio_count):
     return llm, prompt, stop_token_ids
 
 
-model_example_map = {
-    "ultravox": run_ultravox,
-}
+# Qwen2-Audio
+def run_qwen2_audio(question: str, audio_count: int):
+    model_name = "Qwen/Qwen2-Audio-7B-Instruct"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
+
+    audio_in_prompt = "".join([
+        f"Audio {idx+1}: "
+        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
+    ])
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_in_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio}
 
 
 def main(args):
@@ -54,24 +75,25 @@ def main(args):
 
     audio_count = args.num_audios
     llm, prompt, stop_token_ids = model_example_map[model](
-        question_per_audio_count[audio_count - 1], audio_count)
+        question_per_audio_count[audio_count], audio_count)
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
     sampling_params = SamplingParams(temperature=0.2,
                                      max_tokens=64,
                                      stop_token_ids=stop_token_ids)
 
-    assert args.num_prompts > 0
-    inputs = {
-        "prompt": prompt,
-        "multi_modal_data": {
+    mm_data = {}
+    if audio_count > 0:
+        mm_data = {
             "audio": [
                 asset.audio_and_sample_rate
                 for asset in audio_assets[:audio_count]
             ]
-        },
-    }
+        }
+
+    assert args.num_prompts > 0
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
     if args.num_prompts > 1:
         # Batch inference
         inputs = [inputs] * args.num_prompts
@@ -100,7 +122,7 @@ def main(args):
     parser.add_argument("--num-audios",
                         type=int,
                         default=1,
-                        choices=[1, 2],
+                        choices=[0, 1, 2],
                         help="Number of audio items per prompt.")
 
     args = parser.parse_args()

diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
@@ -199,6 +199,7 @@ def iter_params(self, model_name: str):
     "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
     "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
     "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
     "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
     "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
 }

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
@@ -196,7 +196,10 @@ def _placeholder_str(self, modality: ModalityStr,
         elif modality == "audio":
             if model_type == "ultravox":
                 return "<|reserved_special_token_0|>"
-            raise TypeError(f"Unknown {modality} model type: {model_type}")
+            if model_type == "qwen2_audio":
+                return (f"Audio {current_count}: "
+                        f"<|audio_bos|><|AUDIO|><|audio_eos|>")
+            raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "video":
             if model_type == "qwen2_vl":
                 return "<|vision_start|><|video_pad|><|vision_end|>"