From 04a98d4105794a9f9053c5f86d3193d5cd9f56bd Mon Sep 17 00:00:00 2001 From: Jintao Date: Mon, 24 Jun 2024 15:16:39 +0800 Subject: [PATCH 01/15] update multi-modal docs (#1212) --- ...00\344\275\263\345\256\236\350\267\265.md" | 26 ++ ...00\344\275\263\345\256\236\350\267\265.md" | 9 + ...00\344\275\263\345\256\236\350\267\265.md" | 11 + ...00\344\275\263\345\256\236\350\267\265.md" | 26 ++ docs/source/Multi-Modal/index.md | 4 +- ...00\344\275\263\345\256\236\350\267\265.md" | 28 +- ...00\344\275\263\345\256\236\350\267\265.md" | 75 +++-- ...00\344\275\263\345\256\236\350\267\265.md" | 22 ++ ...00\344\275\263\345\256\236\350\267\265.md" | 18 + ...00\344\275\263\345\256\236\350\267\265.md" | 9 + ...00\344\275\263\345\256\236\350\267\265.md" | 9 + ...00\344\275\263\345\256\236\350\267\265.md" | 9 + ...00\344\275\263\345\256\236\350\267\265.md" | 7 + ...00\344\275\263\345\256\236\350\267\265.md" | 9 + ...00\344\275\263\345\256\236\350\267\265.md" | 13 + swift/llm/infer.py | 9 +- swift/llm/utils/dataset.py | 8 +- swift/llm/utils/media.py | 4 +- swift/llm/utils/model.py | 2 + swift/llm/utils/preprocess.py | 6 +- swift/llm/utils/template.py | 309 +++++++++--------- 21 files changed, 397 insertions(+), 216 deletions(-) diff --git "a/docs/source/Multi-Modal/cogvlm2\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/cogvlm2\346\234\200\344\275\263\345\256\236\350\267\265.md" index d81482315..d326b9e5e 100644 --- "a/docs/source/Multi-Modal/cogvlm2\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/cogvlm2\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -32,6 +32,11 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type cogvlm2-19b-chat 输出: (支持传入本地路径或URL) ```python """ +<<< 你好 +Input a media path or URL <<< +你好!我是一个人工智能助手,随时准备回答你的问题。有什么我可以帮助你的吗? +-------------------------------------------------- +<<< clear <<< 描述这张图片 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png 这是一张特写照片,展示了一只灰色和白色相间的猫。这只猫的眼睛是灰色的,鼻子是粉色的,嘴巴微微张开。它的毛发看起来柔软而蓬松,背景模糊,突出了猫的面部特征。 @@ -68,6 +73,23 @@ Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.co 但心中的美好永远留存。 这段旅程, 让他们更加珍惜生命中的每一刻。 +-------------------------------------------------- +<<< clear +<<< 对图片进行OCR +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png +图中内容是: + +简介 + +SWIFT支持250+LLM和35+ MLLM(多模态大模型)的训练、推理、评测和部署。开发者可以直接将我们的框架应用到自己的Research和生产环境中,实现模型训练评测到应用的完整链路。我们除支持了PEFT提供的轻量训练方案外,也提供了一个完整的Adapters库以支持最新的训练技术,如NEFTune、LoRA+、LLaMA-PRO等,这个适配器库可以脱离训练脚本直接使用在自己的自定流程中。 + +为方便不熟悉深度学习的用户使用,我们提供了一个Gradio的web-ui用于控制训练和推理,并提供了配套的深度学习课程和最佳实践供新手入门。 + +此外,我们也在拓展其他模态的能力,目前我们支持了AnimateDiff的全参数训练和LoRA训练。 + +SWIFT具有丰富的文档体系,如有使用问题请查看这里。 + +可以在Huggingface space和ModelScope创空间中体验SWIFTweb-ui功能了。 """ ``` @@ -89,6 +111,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/cogvlm\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/cogvlm\346\234\200\344\275\263\345\256\236\350\267\265.md" index c41103209..482a95a45 100644 --- "a/docs/source/Multi-Modal/cogvlm\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/cogvlm\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -48,6 +48,11 @@ In a world where night and day intertwine, A boat floats gently, reflecting the moon's shine. Fireflies dance, their glow a mesmerizing trance, As the boat sails through a tranquil, enchanted expanse. 
+-------------------------------------------------- +<<< clear +<<< Perform OCR on the image. +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr_en.png +The image contains textual content that describes the capabilities and features of the SWIFT framework. It mentions support for training, inference, and deployment of 250+ LLMs and 35+ MLMs, and how developers can apply this framework to their research and production environments. It also mentions lightweight training solutions provided by PEFT and an adapter library to support the latest training techniques. Additionally, the text highlights that SWIFT offers capabilities for other modalities and supports full-parameter training and LLaMA training for AnimateDiff. There's also a mention of rich documentation available on Huggingface space and ModelScope studio. """ ``` @@ -69,6 +74,10 @@ poem: +ocr_en: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/deepseek-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/deepseek-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" index f9c348eb0..2d1c7d7b7 100644 --- "a/docs/source/Multi-Modal/deepseek-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/deepseek-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -66,6 +66,14 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type deepseek-vl-1_3b-chat 舟儿前行不自知。 夜深人静思绪远, 孤舟独行心悠然。 +-------------------------------------------------- +<<< clear +<<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png对图片进行OCR +The image contains Chinese text and appears to be a screenshot of a document or webpage. The text is divided into several paragraphs, and there are several instances of URLs and Chinese characters. The text is not entirely clear due to the resolution, but some of the visible words and phrases include "SWIFT", "250+", "LLM35+", "MLM", "PEFT", "adapters", "GPT", "XNLI", "Tune", "LORA", "LAMA-PRO", "Gradio", "web.ui", "AnimateDiff", "HuggingFace", "space", "ModelScope", and "SWIFT web". + +The text seems to be discussing topics related to machine learning, specifically mentioning models like SWIFT, GPT, and LAMA-PRO, as well as tools and frameworks like HuggingFace and ModelScope. The URLs suggest that the text might be referencing online resources or repositories related to these topics. + +The text is not fully legible due to the low resolution and the angle at which the image was taken, which makes it difficult to provide a precise transcription. However, the presence of technical terms and URLs indicates that the content is likely from a technical or academic context, possibly a research paper, a technical report, or an article discussing advancements in machine learning and related technologies. 
""" ``` @@ -87,6 +95,9 @@ poem: +ocr: + + **单样本推理** diff --git "a/docs/source/Multi-Modal/glm4v\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/glm4v\346\234\200\344\275\263\345\256\236\350\267\265.md" index be5f284cb..ffe4189d4 100644 --- "a/docs/source/Multi-Modal/glm4v\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/glm4v\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -30,6 +30,11 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type glm4v-9b-chat 输出: (支持传入本地路径或URL) ```python """ +<<< 你好 +Input a media path or URL <<< +你好👋!很高兴见到你,欢迎问我任何问题。 +-------------------------------------------------- +<<< clear <<< 描述这张图片 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png 这是一张特写照片,展示了一只毛茸茸的小猫。小猫的眼睛大而圆,呈深蓝色,眼珠呈金黄色,非常明亮。它的鼻子短而小巧,是粉色的。小猫的嘴巴紧闭,胡须细长。它的耳朵竖立着,耳朵内侧是白色的,外侧是棕色的。小猫的毛发看起来柔软而浓密,主要是白色和棕色相间的条纹图案。背景模糊不清,但似乎是一个室内环境。 @@ -54,6 +59,23 @@ Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.co 人在画中寻诗意, 心随景迁忘忧愁。 +-------------------------------------------------- +<<< clear +<<< 对图片进行OCR +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png +图片中的OCR结果如下: + +简介 + +SWIFT支持250+LLM和35+MLLM(多模态大模型)的训练、推理、评测和部署。开发者可以直接将我们的框架应用到自己的Research和生产环境中,实现模型训练评测到应用的完整链路。我们除支持了PEFT提供的轻量训练方案外,也提供了一个完整的Adapters库以支持最新的训练技术,如NEFTune、LoRA+、LLaMA-PRO等,这个适配器库可以脱离训练脚本直接使用在自己的自定流程中。 + +为方便不熟悉深度学习的用户使用,我们提供了一个Gradio的web-ui用于控制训练和推理,并提供了配套的深度学习课程和最佳实践供新入门。 + +此外,我们也在拓展其他模态的能力,目前我们支持了AnimateDiff的全参数训练和LoRA训练。 + +SWIFT具有丰富的文档体系,如有使用问题请请查看这里。 + +可以在Huggingface space和ModelScope创空间中体验SWIFT web-ui功能了。 """ ``` @@ -75,6 +97,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git a/docs/source/Multi-Modal/index.md b/docs/source/Multi-Modal/index.md index 1e4798e68..b542e758b 100644 --- a/docs/source/Multi-Modal/index.md +++ b/docs/source/Multi-Modal/index.md @@ -13,13 +13,13 @@ 5. [Phi3-Vision最佳实践](phi3-vision最佳实践.md) -一轮对话只能包含一张图片: +一轮对话只能包含一张图片(可能可以不含图片): 1. [Llava最佳实践](llava最佳实践.md) 2. [Yi-VL最佳实践.md](yi-vl最佳实践.md) 3. [mPLUG-Owl2最佳实践](mplug-owl2最佳实践.md) -整个对话围绕一张图片: +整个对话围绕一张图片(可能可以不含图片): 1. [CogVLM最佳实践](cogvlm最佳实践.md), [CogVLM2最佳实践](cogvlm2最佳实践.md), [glm4v最佳实践](glm4v最佳实践.md) 2. [MiniCPM-V最佳实践](minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](minicpm-v-2最佳实践.md), [MiniCPM-V-2.5最佳实践](minicpm-v-2.5最佳实践.md) 3. [InternVL-Chat-V1.5最佳实践](internvl最佳实践.md) diff --git "a/docs/source/Multi-Modal/internlm-xcomposer2\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/internlm-xcomposer2\346\234\200\344\275\263\345\256\236\350\267\265.md" index 9d5bd8e37..469b5d2ba 100644 --- "a/docs/source/Multi-Modal/internlm-xcomposer2\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/internlm-xcomposer2\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -26,26 +26,27 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type internlm-xcomposer2-7b-chat ```python """ <<< 你是谁? 
- 我是你的助手,一个基于语言的人工智能模型,可以回答你的问题。 +我是浦语·灵笔,一个由上海人工智能实验室开发的语言模型。我能理解并流畅地使用英语和中文与你对话。 -------------------------------------------------- <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.pnghttp://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png这两张图片有什么区别 - 这两张图片是不同的, 第一张是羊的图片, 第二张是猫的图片 +这两张图片没有直接的关联,它们分别展示了两个不同的场景。第一幅图是一张卡通画,描绘了一群羊在草地上,背景是蓝天和山脉。第二幅图则是一张猫的照片,猫正看着镜头,背景模糊不清。 -------------------------------------------------- +<<< clear <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png图中有几只羊 - 图中有4只羊 +图中有4只羊 -------------------------------------------------- <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/math.png计算结果是多少 - 计算结果是1452+45304=46756 +1452 + 45304 = 46756 -------------------------------------------------- <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/poem.png根据图片中的内容写首诗 - 湖面波光粼粼,小舟独自飘荡。 -船上点灯,照亮夜色, -星星点点,倒映水中。 - -远处山峦,云雾缭绕, -天空繁星,闪烁不停。 -湖面如镜,倒影清晰, -小舟穿行,如诗如画。 +夜色苍茫月影斜, +湖面平静如明镜。 +小舟轻荡波光里, +灯火微摇映水乡。 +-------------------------------------------------- +<<< clear +<<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png对图片进行OCR +很抱歉,我无法对您提供的图片进行OCR。如果您需要文本识别服务,您可以上传图片到其他支持OCR服务的平台,或者您可以尝试使用一些在线OCR工具。 """ ``` @@ -67,6 +68,9 @@ poem: +ocr: + + **单样本推理** diff --git "a/docs/source/Multi-Modal/internvl\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/internvl\346\234\200\344\275\263\345\256\236\350\267\265.md" index 3a543321b..5c6715e72 100644 --- "a/docs/source/Multi-Modal/internvl\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/internvl\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -42,43 +42,58 @@ CUDA_VISIBLE_DEVICES=0,1 swift infer --model_type internvl-chat-v1_5 --dtype bf1 输出: (支持传入本地路径或URL) ```python """ -<<< Describe this image. -Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png -This is a high-resolution image of a kitten. The kitten has striking blue eyes and a fluffy white and grey coat. The fur pattern suggests that it may be a Maine Coon or a similar breed. The kitten's ears are perked up, and it has a curious and innocent expression. The background is blurred, which brings the focus to the kitten's face. +<<< 你是谁 +Input a media path or URL <<< +我是一个人工智能助手,旨在通过自然语言处理和机器学习技术来帮助用户解决问题和完成任务。 -------------------------------------------------- <<< clear -<<< How many sheep are in the picture? +<<< 描述这张图片 +Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png +这张图片是一只小猫咪的特写照片。这只小猫咪有着蓝灰色的眼睛和白色的毛发,上面有灰色和黑色的条纹。它的耳朵是尖的,眼睛睁得大大的,看起来非常可爱和好奇。背景是模糊的,无法分辨具体的环境,但看起来像是在室内,有柔和的光线。 +-------------------------------------------------- +<<< clear +<<< 图中有几只羊 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png -There are four sheep in the picture. +图中有四只羊。 -------------------------------------------------- <<< clear -<<< What is the calculation result? +<<< 计算结果是多少? Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/math.png -The calculation result is 59,856. +将两个数相加,得到: +1452 + 45304 = 46766 +因此,1452 + 45304 = 46766。 -------------------------------------------------- <<< clear -<<< Write a poem based on the content of the picture. +<<< 根据图片中的内容写首诗 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/poem.png -Token indices sequence length is longer than the specified maximum sequence length for this model (5142 > 4096). 
Running this sequence through the model will result in indexing errors -In the still of the night, -A lone boat sails on the light. -The stars above, a twinkling sight, -Reflecting in the water's might. - -The trees stand tall, a silent guard, -Their leaves rustling in the yard. -The boatman's lantern, a beacon bright, -Guiding him through the night. - -The river flows, a gentle stream, -Carrying the boatman's dream. -His journey long, his heart serene, -In the beauty of the scene. - -The stars above, a guiding light, -Leading him through the night. -The boatman's journey, a tale to tell, -Of courage, hope, and love as well. +夜色笼罩水面, +小舟轻摇入画帘。 +星辉闪烁如珠串, +月色朦胧似轻烟。 + +树影婆娑映水面, +静谧宁和心自安。 +夜深人静思无限, +唯有舟影伴我眠。 +-------------------------------------------------- +<<< clear +<<< 对图片进行OCR +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png +图中所有文字: +简介 +SWIFT支持250+LLM和35+MLLM(多模态大模型)的训练、推 +理、评测和部署。开发者可以直接将我们的框架应用到自己的Research和 +生产环境中,实现模型训练评测到应用的完整链路。我们除支持 +PEFT提供的轻量训练方案外,也提供了一个完整的Adapters库以支持 +最新的训练技术,如NEFTune、LoRA+、LLaMA-PRO等,这个适配 +器库可以脱离训练脚本直接使用在自已的自定义流程中。 +为了方便不熟悉深度学习的用户使用,我们提供了一个Gradio的web-ui +于控制训练和推理,并提供了配套的深度学习课程和最佳实践供新手入 +此外,我们也正在拓展其他模态的能力,目前我们支持了AnimateDiff的全参 +数训练和LoRA训练。 +SWIFT具有丰富的文档体系,如有使用问题请查看这里: +可以在Huggingface space和ModelScope创空间中体验SWIFT web- +ui功能了。 """ ``` @@ -100,6 +115,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/llava\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/llava\346\234\200\344\275\263\345\256\236\350\267\265.md" index 5e9f91561..9e5e9b7f0 100644 --- "a/docs/source/Multi-Modal/llava\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/llava\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -42,6 +42,10 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 swift infer --model_type llava1_6-yi-34b-instruct 输出: (支持传入本地路径或URL) ```python """ +<<< who are you +Input a media path or URL <<< +I am a language model, specifically a transformer model, trained to generate text based on the input it receives. I do not have personal experiences or emotions, and I do not have a physical form. I exist purely as a software program that can process and generate text. +-------------------------------------------------- <<< Describe this image. Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png The image shows a close-up of a kitten with a soft, blurred background that suggests a natural, outdoor setting. The kitten has a mix of white and gray fur with darker stripes, typical of a tabby pattern. Its eyes are wide open, with a striking blue color that contrasts with the kitten's fur. The kitten's nose is small and pink, and its whiskers are long and white, adding to the kitten's cute and innocent appearance. The lighting in the image is soft and diffused, creating a gentle and warm atmosphere. The focus is sharp on the kitten's face, while the rest of the image is slightly out of focus, which draws attention to the kitten's features. @@ -85,6 +89,20 @@ The boat, a symbol of solitude, In the vast expanse of the universe's beauty, A lone journey, a solitary quest, In the quiet of the night, it finds its rest. +-------------------------------------------------- +<<< Perform OCR on the image. 
+Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr_en.png +The text in the image is as follows: + +INTRODUCTION + +SWIFT supports training, inference, evaluation and deployment of 250+ LLMs (multimodal large models). Developers can directly apply our framework to their own research and production environments to realize the complete workflow from model training and evaluation to application. In addition, SWIFT provides a complete Adapters library to support the latest training techniques such as NLP, Vision, etc. This adapter library can be used directly in your own custom workflow without our training scripts. + +To facilitate use by users unfamiliar with deep learning, we provide a Grado web-ui for controlling training and inference, as well as accompanying deep learning courses and best practices for beginners. + +SWIFT has rich documentation for users, please check here. + +SWIFT is web-ui available both on Huggingface space and ModelScope studio, please feel free to try! """ ``` @@ -106,6 +124,10 @@ poem: +ocr_en: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" index 0d0d985c7..7678149ed 100644 --- "a/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/minicpm-v-2.5\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -59,6 +59,20 @@ Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.co 在这宁静的夜晚中,创造出一幅美丽的画面, 船只在水面上轻轻摇晃, 在这宁静的夜晚中,创造出一幅美丽的画面。 +-------------------------------------------------- +<<< clear +<<< 对图片进行OCR +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png +标题: SWIFT支持多模态大模型的训练、推理、评测和部署 + +正文: +开发者可以直接将我们的框架应用到自己的Research和生产环境中,实现模型训练评测到应用的完整链路。我们除支持了PEFT提供的轻量训练方案外,也提供了一个完整的Adapters库以支持最新的训练技术,如NEFTune、LoRA+、LLaMA-PRO等,这个适配器库可以脱离训练脚本直接使用在自己的自定流程中。 + +为方便不熟悉深度学习的用户使用,我们提供了一个Gradio的web-ui用于控制训练和推理,并提供了配套的深度学习课程和最佳实践供新手入门。此外,我们也在拓展其他模态的能力,目前我们支持了AnimateDiff的全参数训练和LoRA训练。 + +SWIFT具有丰富的文档体系,如有使用问题请请查看这里。 + +可以在Huggingface space和ModelScope创空间中体验SWIFT web-ui功能了。 """ ``` @@ -80,6 +94,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/minicpm-v-2\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/minicpm-v-2\346\234\200\344\275\263\345\256\236\350\267\265.md" index 1656e9b0a..bbac200fc 100644 --- "a/docs/source/Multi-Modal/minicpm-v-2\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/minicpm-v-2\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -45,6 +45,11 @@ Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.co <<< 根据图片中的内容写首诗 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/poem.png 这幅图片描绘了一个宁静的夜晚场景,一艘船漂浮在水面之上。船看起来是一艘小木船,船头有一个桅杆,上面挂着一个灯笼,发出温暖的光芒。船身涂成深棕色,与水面形成鲜明对比。水面反射着星星和船只的灯光,营造出一种宁静而梦幻的氛围。背景中,树木繁茂,树叶呈现出金色和绿色,暗示着可能是黄昏或黎明时分。天空布满星星,给整个场景增添了神秘感。整体氛围宁静而幽静,让人联想到一个童话般的场景。 +-------------------------------------------------- +<<< clear +<<< 对图片进行OCR +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png 
+图片中的文字内容为:“SWIFT支持250+LLM和35+LLM(多模态大模型)的训练、推理和评估部署。开发者可以直接将我们的Research和生产环境中应用的模型框架应用到自己的Research和生产环境中。我们提供了完整的Adapters库以支持最新的训练技术,如NEFtune、Lora、LMA-PRO等,这个适配器库可以脱壳脚本直接在自己的流程中使用。为方便不熟悉深度学习用户的使用,我们提供了配套的深度学习课程和最佳实践新手入门门。此外,我们还在拓展其他强大的能力,目前我们支持了AnimateDiff的全参数LORA训练。SWIFT有丰富的文档体系,如有使用问题请查看这里。” """ ``` @@ -66,6 +71,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/minicpm-v\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/minicpm-v\346\234\200\344\275\263\345\256\236\350\267\265.md" index 4d31330a0..c18de05db 100644 --- "a/docs/source/Multi-Modal/minicpm-v\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/minicpm-v\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -49,6 +49,11 @@ Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.co <<< 根据图片中的内容写首诗 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/poem.png 在宁静的夜晚,一艘船在平静的湖面上航行。 +-------------------------------------------------- +<<< clear +<<< 对图片进行OCR +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png +Swift 250+ LMM35+ MLLM """ ``` @@ -70,6 +75,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/mplug-owl2\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/mplug-owl2\346\234\200\344\275\263\345\256\236\350\267\265.md" index 1d38a638b..65d62c952 100644 --- "a/docs/source/Multi-Modal/mplug-owl2\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/mplug-owl2\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -48,6 +48,11 @@ The calculation result is 1452 + 45304 = 46756. <<< Write a poem based on the content of the picture. Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/poem.png In the stillness of the night, a boat glides across the water, its light shining bright. The stars twinkle above, casting a magical glow. A man and a dog are on board, enjoying the serene journey. The boat floats gently, as if it's floating on air. The calm waters reflect the stars, creating a breathtaking scene. The man and his dog are lost in their thoughts, taking in the beauty of nature. The boat seems to be floating in a dream, as if they are on a journey to find their way back home. +-------------------------------------------------- +<<< clear +<<< Perform OCR on the image. +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr_en.png +Text: Swift support training, inference and deployment of 250+ LLMs and 350+ MLMs (multimodal models). Developers can directly apply framework their own research and production environments to realize a complete workflow from model training and evaluation to application. In addition to supporting the lightweight training models provided by PEFT, we also provide a Complete Adapters library that can be adapted to various models such as NeTune, LoRaT, LLMA-PRO, etc. This adapter library can be used directly in your own custom workflow. The library is user-friendly with unfamiliar deep learning, Gradio UI for controlling training and inference, as well as accompanying learning courses and best practices for beginners. Additionally, we provide extra training and Lora LRN for AnimateDiff. Swift has rich documents for users on Huggingface and ModelScope, so please feel free to try it! 
""" ``` @@ -69,6 +74,10 @@ poem: +ocr_en: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/phi3-vision\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/phi3-vision\346\234\200\344\275\263\345\256\236\350\267\265.md" index e0b438222..703ac4df1 100644 --- "a/docs/source/Multi-Modal/phi3-vision\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/phi3-vision\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -66,6 +66,10 @@ In the stillness of the night, a sense of peace, The boat, the river, the trees, all in their place. A moment frozen in time, a scene so serene, A journey through the night, a dream so unseen. +-------------------------------------------------- +<<< clear +<<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr_en.pngPerform OCR on the image. +The image contains a text section with the heading 'Introduction'. It discusses the capabilities of SWIFT, which support training, inference, evaluation, and deployment of over 250 large language models (LLMs) and 35+ multimodal large models (MLLMs). It mentions that developers can apply this framework to their research and production environments, and that SWIFT supports lightweight training solutions provided by PEFT, as well as a complete Adapters library for various training techniques. It also highlights the availability of a Gradio web-ui for controlling training and inference, and the provision of deep learning courses and best practices for beginners. The text further states that SWIFT is expanding capabilities for other modalities, currently supporting full-parameter training and LoRA training for AnimateDiff. There are references to rich documentation and the availability of SWIFT web-ui on Huggingface space and ModelScope studio. The text is clear and fully visible in the image. 
""" ``` @@ -87,6 +91,9 @@ poem: +ocr_en: + + **单样本推理** diff --git "a/docs/source/Multi-Modal/qwen-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/qwen-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" index 4f15d0eb5..60f073ea2 100644 --- "a/docs/source/Multi-Modal/qwen-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/qwen-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -49,6 +49,11 @@ Picture 2:http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.pn <<<[M] Picture 1:http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/poem.png 根据图片中的内容写首诗# 月光如水船如星,独坐船头吹夜风。深林倒影照水面,萤火点点照船行。 +-------------------------------------------------- +<<< clear +<<<[M] Picture 1:http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png +对图片进行OCR# +SWIFT支持250+ LLM和35+ MLLM(多模态大模型)的训练、推理、评测和部署。开发者可以直接将我们的框架应用到自己的Research和生产环境中,实现模型训练评测到应用的完整链路。我们除了支持PEPT提供的轻量训练方案外,也提供了一个完整的Adapters库以支持最新的训练技术,如NEFTune、LoRA+、LLaMa-PRO等,这个适配器库可以脱离训练脚本直接使用在自己的自定流程中。 """ ``` @@ -70,6 +75,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git "a/docs/source/Multi-Modal/yi-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/yi-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" index c910bb891..0209cbb4c 100644 --- "a/docs/source/Multi-Modal/yi-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/Multi-Modal/yi-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -31,6 +31,10 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type yi-vl-6b-chat Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png 图片显示一只小猫坐在地板上,眼睛睁开,凝视着摄像机。小猫看起来很可爱,有灰色和白色的毛皮,以及蓝色的眼睛。它似乎正在看摄像机,可能对周围环境很好奇。 -------------------------------------------------- +<<< 你是谁? +Input a media path or URL <<< +我是人工智能助手,随时准备帮助你解答问题或提供信息。 +-------------------------------------------------- <<< 图中有几只羊 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png 图中有四只羊. 
@@ -62,6 +66,11 @@ Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.co 构成了一个美丽的画面, 它唤起一种宁静的感觉, 在喧嚣的城市生活之外。 +-------------------------------------------------- +<<< clear +<<< 对图片进行OCR +Input a media path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/ocr.png +这是一段关于SWIFT的文字,其中包括了它的版本、功能以及一些链接。 """ ``` @@ -83,6 +92,10 @@ poem: +ocr: + + + **单样本推理** ```python diff --git a/swift/llm/infer.py b/swift/llm/infer.py index c6eca1872..16dd435d4 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -356,8 +356,7 @@ def llm_infer(args: InferArguments) -> Dict[str, List[Dict[str, Any]]]: infer_kwargs = {} read_media_file(infer_kwargs, args.infer_media_type) - if args.truncation_strategy: - infer_kwargs['truncation_strategy'] = args.truncation_strategy + infer_kwargs['truncation_strategy'] = args.truncation_strategy if system is None and template.use_default_system: system = template.default_system if args.infer_backend == 'vllm': @@ -456,8 +455,7 @@ def llm_infer(args: InferArguments) -> Dict[str, List[Dict[str, Any]]]: request['system'] = system if images is not None: request['images'] = images - if args.truncation_strategy: - request['truncation_strategy'] = args.truncation_strategy + request['truncation_strategy'] = args.truncation_strategy request_list.append(request) resp_list = inference_vllm(llm_engine, template, request_list, use_tqdm=True) result = [] @@ -499,8 +497,7 @@ def llm_infer(args: InferArguments) -> Dict[str, List[Dict[str, Any]]]: kwargs['images'] = images if tools is not None: kwargs['tools'] = tools - if args.truncation_strategy: - kwargs['truncation_strategy'] = args.truncation_strategy + kwargs['truncation_strategy'] = args.truncation_strategy if args.infer_backend == 'vllm': assert args.stream is True if args.verbose: diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 249e43514..b1cf65eb9 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -300,10 +300,12 @@ def load_ms_dataset(dataset_id: str, if use_hf: try: dataset = load_hf_dataset(dataset_id, name=subset_name, split=split) - except Exception as e: + except ValueError as e: logger.error(f'Dataset {dataset_id} load failed: subset_name={subset_name},' f'split={split} with error: {e}') continue + except Exception: + raise else: if is_dist() and not is_local_master(): force_redownload = False @@ -312,10 +314,12 @@ def load_ms_dataset(dataset_id: str, download_mode = 'force_redownload' if force_redownload else 'reuse_dataset_if_exists' try: dataset = MsDataset.load(dataset_id, subset_name=subset_name, split=split, download_mode=download_mode) - except Exception as e: + except ValueError as e: logger.error(f'Dataset {dataset_id} load failed: subset_name={subset_name},' f'split={split} with error: {e}') continue + except Exception: + raise if hasattr(dataset, 'to_hf_dataset'): dataset = dataset.to_hf_dataset() dataset_list.append(dataset) diff --git a/swift/llm/utils/media.py b/swift/llm/utils/media.py index d9a2f831b..d30b14746 100644 --- a/swift/llm/utils/media.py +++ b/swift/llm/utils/media.py @@ -1,7 +1,7 @@ import os import shutil import time -from typing import List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import numpy as np @@ -63,7 +63,7 @@ def __init__(self, self.task_type = task_type self.media_tag = media_tag or '' - def __call__(self, d: dict, medias: Union[tuple, list], objects: List = None): + def __call__(self, d: Dict[str, Any], medias: Union[tuple, list], 
objects: List = None) -> None: """Format the query/response/history with medias Args: diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 6df263a8c..e7a67e260 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -2360,6 +2360,8 @@ def get_model_tokenizer_with_flash_attn(model_dir: str, if version.parse(transformers.__version__) >= version.parse('4.36'): if use_flash_attn: model_config._attn_implementation = 'flash_attention_2' + else: + model_config._attn_implementation = 'eager' else: model_config._flash_attn_2_enabled = use_flash_attn return get_model_tokenizer_from_repo( diff --git a/swift/llm/utils/preprocess.py b/swift/llm/utils/preprocess.py index 3713eabbe..a2af4021a 100644 --- a/swift/llm/utils/preprocess.py +++ b/swift/llm/utils/preprocess.py @@ -11,13 +11,13 @@ PreprocessFunc = Callable[[HfDataset], HfDataset] -def parse_medias(d, media_key=None): +def parse_medias(d: Dict[str, Any], media_key=None): if isinstance(media_key, str): if media_key in d: medias = d[media_key] else: medias = None - elif media_key: + elif media_key: # function medias = media_key(d) else: medias = None @@ -41,7 +41,7 @@ def media_name(self): return None return self.media_replacer.media_keys[self.media_type] - def parse_medias(self, d): + def parse_medias(self, d: Dict[str, Any]): return parse_medias(d, self.media_key) @property diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index ee6c52a60..7d1a988f9 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -18,7 +18,7 @@ DEFAULT_SYSTEM = 'You are a helpful assistant.' History = List[Union[Tuple[str, str], List[str]]] -Prompt = List[Union[str, List[str], List[int]]] +Prompt = List[Union[str, List[int], List[str]]] StopWords = Prompt Context = Union[str, List[int]] TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} @@ -250,14 +250,14 @@ def _init_template(self, value = self._preprocess_prompt(tokenizer, value) setattr(self, key, value) - def check_example(self, example): + def check_example(self, example: Dict[str, Any]) -> None: pass - def add_default_tags(self, example): - history: Optional[History] = deepcopy(example.get('history') or []) - query: Optional[str] = example.get('query') or '' + def add_default_tags(self, example: Dict[str, Any]) -> None: + history: History = deepcopy(example.get('history') or []) + query: str = example.get('query') or '' for media_key, media_tag in [('videos', ''), ('images', ''), ('audios', '')]: - if example.get(media_key) and media_tag not in ''.join([h[0] for h in history]) + query: + if example.get(media_key) and media_tag not in ('\n'.join([h[0] for h in history]) + f'\n{query}'): infer_media_type = TEMPLATE_MAPPING[self.template_type].get('infer_media_type') if infer_media_type == 'round': assert len(example[media_key]) == len(history) + 1 @@ -272,9 +272,9 @@ def add_default_tags(self, example): media_len = len(example[media_key]) if isinstance(example[media_key], (tuple, list)) else 1 if example[media_key] else 0 if history: - history[0][0] = ''.join([media_tag] * media_len) + history[0][0] + history[0][0] = media_tag * media_len + history[0][0] else: - query = ''.join([media_tag] * media_len) + query + query = media_tag * media_len + query example['query'] = query example['history'] = history @@ -284,7 +284,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if not self._is_init: raise ValueError( 'Template is not initialized, please use the `get_template` function to obtain the template.') - if 
'images' in example and not isinstance(example['images'], (tuple, list)): + if example.get('images') and not isinstance(example['images'], (tuple, list)): # change images field to list example['images'] = [example['images']] self.add_default_tags(example) @@ -292,17 +292,16 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if example.get('objects') and isinstance(example['objects'], str): # reload grounding from str example['objects'] = json.loads(example['objects']) - query: Optional[str] = example.get('query', None) - query_role: Optional[str] = example.get('query_role', None) - response: Optional[str] = example.get('response', None) - history: Optional[History] = example.get('history', None) - history_roles: Optional[History] = example.get('history_roles', None) + query: str = example.get('query') or '' + query_role: str = example.get('query_role') or 'user' + response: Optional[str] = example.get('response') + history: History = example.get('history') or [] + history_roles: Optional[History] = example.get('history_roles') system: Optional[str] = example.get('system', None) - template_type = getattr(self, 'template_type', None) - tools: Optional[list] = example.get('tools', None) - multi_modal: Optional[bool] = any([example.get(key) for key in Template.special_keys]) - if history is None: - history = [] + template_type: Optional[str] = getattr(self, 'template_type', None) + tools: List[Any] = example.get('tools') or [] + is_multi_modal: bool = any([example.get(key) for key in Template.special_keys]) + if len(history) > 0: assert self.support_multi_round, ( f'The template does not support multi-round chat, template_type: {template_type}') @@ -318,8 +317,9 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if system is None: system = '' system += get_tools_prompt(tools, self.tools_prompt) - if query is None: - query = '' + if history_roles is None: + history_roles = [['user', 'assistant'] for _ in range(len(history))] + inputs, tokenizer_kwargs = self._encode( query, query_role, @@ -330,7 +330,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any self.truncation_strategy, auto_add_bos=self.auto_add_bos, example=example, - multi_modal=multi_modal) + is_multi_modal=is_multi_modal) if inputs.get('labels') is None: inputs.pop('loss_scale', None) return inputs, tokenizer_kwargs @@ -367,14 +367,18 @@ def _concat_context_list( res_context_list.append(context) loss_scale_list.append(0.0 if context not in self.suffix else 1.0) - @staticmethod - def _simplify_context_list(context_list: List[Context], loss_scale_list: List[float], + def _simplify_context_list(self, context_list: List[Context], loss_scale_list: List[float], **kwargs) -> Tuple[List[Context], List[float]]: res: List[Context] = [] # result of context_list res_loss_scale: List[float] = [] # result of loss_scale_list temp: List[str] = [] temp_index: List[int] = [] - multi_modal: bool = kwargs.get('multi_modal', False) + is_multi_modal: bool = kwargs.pop('is_multi_modal', False) + + if is_multi_modal: + context_list, loss_scale_list = self.split_special_tokens(context_list, loss_scale_list) + context_list, loss_scale_list = self.pre_tokenize(context_list, loss_scale_list, **kwargs) + for i, (context, loss_scale) in enumerate(zip(context_list, loss_scale_list)): if isinstance(context, str) and loss_scale_list[i] == 0.0: temp.append(context) @@ -390,16 +394,17 @@ def _simplify_context_list(context_list: List[Context], loss_scale_list: List[fl 
res.append(''.join(temp)) res_loss_scale.append(0.0) - if multi_modal: + if is_multi_modal: return Template.split_special_tokens(res, res_loss_scale) else: return res, res_loss_scale @staticmethod - def split_special_tokens(context_list, loss_scale_list): + def split_special_tokens(context_list: List[Context], + loss_scale_list: List[float]) -> Tuple[List[Context], List[float]]: from swift.utils.utils import split_str_parts_by - res = [] - loss_scale_res = [] + res: List[Context] = [] + loss_scale_res: List[float] = [] from swift.llm.utils.utils import fetch_one for context, loss_scale in zip(context_list, loss_scale_list): contexts = [] @@ -418,65 +423,73 @@ def _tokenize(self, context, **tokenizer_kwargs): return self.tokenizer( context, return_attention_mask=False, add_special_tokens=False, **tokenizer_kwargs)['input_ids'] - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int, + example: Dict[str, Any]) -> List[Context]: if media_type == 'image': - return '' + return [''] if media_type == 'video': - return '' + return [''] if media_type == 'audio': - return '' + return [''] - def replace_object(self, index, example): + def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example.get('objects') if objects: - object = objects[index] - return object[0] + object_ = objects[index] + return [object_[0]] else: - return '' + return [''] - def replace_box(self, index, example): + def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example.get('objects') if objects: - object = objects[index] - return f'({object[1][0]},{object[1][1]}),({object[1][2]},{object[1][3]})' + object_ = objects[index] + return [f'({object_[1][0]},{object_[1][1]}),({object_[1][2]},{object_[1][3]})'] else: - return '' - - def pre_tokenize(self, prompt, **kwargs): - example = kwargs.get('example') - if prompt == '': - content = self.replace_tag('image', example.get('image_index', 0), example) - example['image_index'] = example.get('image_index', 0) + 1 - return content - if prompt == '': - content = self.replace_tag('video', example.get('video_index', 0), example) - example['video_index'] = example.get('video_index', 0) + 1 - return content - if prompt == '': - content = self.replace_tag('audio', example.get('audio_index', 0), example) - example['audio_index'] = example.get('audio_index', 0) + 1 - return content - if prompt == '': - content = self.replace_object(example.get('object_index', 0), example) - example['object_index'] = example.get('object_index', 0) + 1 - return content - if prompt == '': - content = self.replace_box(example.get('box_index', 0), example) - example['box_index'] = example.get('box_index', 0) + 1 - return content - return prompt - - def _encode_context_list(self, context_list: List[Context], loss_scale_list: List[float], - **kwargs) -> Tuple[List[int], List[int], List[float], Dict[str, Any]]: + return [''] + + def pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float], + **kwargs) -> Tuple[List[Context], List[float]]: + # replace tag/object/box + example = kwargs['example'] # get x_index + res: List[Context] = [] # result of context_list + res_loss_scale: List[float] = [] # result of loss_scale_list + + for context, loss_scale in zip(context_list, loss_scale_list): + if context == '': + c_list = self.replace_tag('image', example.get('image_index', 0), example) + example['image_index'] = 
example.get('image_index', 0) + 1 + elif context == '': + c_list = self.replace_tag('video', example.get('video_index', 0), example) + example['video_index'] = example.get('video_index', 0) + 1 + elif context == '': + c_list = self.replace_tag('audio', example.get('audio_index', 0), example) + example['audio_index'] = example.get('audio_index', 0) + 1 + elif context == '': + c_list = self.replace_object(example.get('object_index', 0), example) + example['object_index'] = example.get('object_index', 0) + 1 + elif context == '': + c_list = self.replace_box(example.get('box_index', 0), example) + example['box_index'] = example.get('box_index', 0) + 1 + else: + c_list = [context] + res += c_list + res_loss_scale += [loss_scale] * len(c_list) + return res, res_loss_scale + + def _encode_context_list(self, context_list: List[Context], + loss_scale_list: List[float]) -> Tuple[List[int], List[int], List[float], Dict[str, Any]]: """return: input_ids, labels, tokenizer_kwargs""" input_ids: List[int] = [] labels: List[int] = [] loss_scale: List[float] = [] tokenizer_kwargs = {} for i, (context, loss_weight) in enumerate(zip(context_list, loss_scale_list)): - context = self.pre_tokenize(context, **kwargs) if isinstance(context, str): - curr_tokenizer_kwargs = {**tokenizer_kwargs, **self._get_tokenizer_kwargs(context)} + # tokenizer_kwargs is the returned tokenizer_kwargs, + # while curr_tokenizer_kwargs is the tokenizer_kwargs for the current context. + curr_tokenizer_kwargs = self._get_tokenizer_kwargs(context) + self._concat_tokenizer_kwargs(tokenizer_kwargs, curr_tokenizer_kwargs) token_list = self._tokenize(context, **curr_tokenizer_kwargs) else: token_list = context @@ -501,14 +514,6 @@ def _encode(self, """ return: inputs, tokenizer_kwargs """ - if history_roles is None: - history_roles = [['user', 'assistant'] for _ in range(len(history))] - - if query is not None: - if query_role is None: - query_role = 'user' - history_roles.append([query_role, 'assistant']) - history = history.copy() res_context_list: List[Context] = [] @@ -525,6 +530,7 @@ def _encode(self, self._concat_context_list(prefix, res_context_list, loss_scale_list, system=system) history.append([query, response]) + history_roles.append([query_role, 'assistant']) for i, ((q, r), (qr, rr)) in enumerate(zip(history, history_roles)): context_list = self.tool_prompt.copy() if qr == 'tool' else self.prompt.copy() @@ -540,8 +546,7 @@ def _encode(self, self._concat_context_list( context_list, res_context_list, loss_scale_list, query=q, response=r, round0=i) res_context_list, loss_scale_list = self._simplify_context_list(res_context_list, loss_scale_list, **kwargs) - input_ids, labels, loss_scale, tokenizer_kwargs = self._encode_context_list(res_context_list, loss_scale_list, - **kwargs) + input_ids, labels, loss_scale, tokenizer_kwargs = self._encode_context_list(res_context_list, loss_scale_list) if response is None: labels = None @@ -566,6 +571,9 @@ def _get_tokenizer_kwargs(self, context: str) -> Dict[str, Any]: """return: curr_tokenizer_kwargs""" return {} + def _concat_tokenizer_kwargs(self, tokenizer_kwargs: Dict[str, Any], curr_tokenizer_kwargs: Dict[str, Any]) -> None: + assert len(tokenizer_kwargs) == 0 + def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: """ Args: @@ -748,26 +756,27 @@ def __init__(self, auto_add_bos: bool = False): class QwenVLTemplate(QwenTemplate): def check_example(self, example): - images = example.get('images') + images = example.get('images') or [] 
from swift.llm.utils.utils import fetch_one assert not images or isinstance(fetch_one(images), str), 'QwenVL only supports datasets with images paths!' - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int, + example: Dict[str, Any]) -> List[Context]: assert media_type == 'image' - images = example.get('images') + images = example.get('images') or [] image = images[index] assert isinstance(image, str) - return f'{image}' + return [f'{image}'] - def replace_object(self, index, example): + def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example['objects'] - object = objects[index] - return f'{object[0]}' + object_ = objects[index] + return [f'{object_[0]}'] - def replace_box(self, index, example): + def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example['objects'] - object = objects[index] - return f'({object[1][0]},{object[1][1]}),({object[1][2]},{object[1][3]})' + object_ = objects[index] + return [f'({object_[1][0]},{object_[1][1]}),({object_[1][2]},{object_[1][3]})'] register_template(TemplateType.qwen, QwenTemplate()) @@ -868,9 +877,9 @@ def _read_from_path(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image class YiVLTemplate(Template): - def replace_tag(self, media_type, index, example): + def replace_tag(self, media_type, index, example) -> List[Context]: assert media_type == 'image' - return [-200] + self.tokenizer.encode('\n', add_special_tokens=False) + return [[-200], '\n'] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) @@ -882,7 +891,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if not hasattr(model, 'vision_tower'): model = model.model image_processor = model.vision_tower.image_processor - images_path = example.get('images', []) + images_path = example.get('images') or [] images = [] for image_path in images_path: image = _read_from_path(image_path) @@ -925,31 +934,26 @@ def __init__(self): ['<|system|>\n{{SYSTEM}}']) def check_example(self, example): - images = example.get('images') - assert not isinstance(images, (list, tuple)) or len(images) <= 1 + images = example.get('images') or [] + assert len(images) <= 1 - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: assert media_type == 'image' - return [-100] + return [[-100]] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: from .utils import history_to_messages - example = example.copy() inputs, _ = super().encode(example) - images_path = example.pop('images', []) - image = _read_from_path(images_path[0]) if images_path else None if len(inputs) == 0: return inputs, {} input_ids = inputs['input_ids'] labels = inputs['labels'] idx_list = _findall(input_ids, -100) if idx_list: - if len(idx_list) >= 2: - input_ids = _remove_idx(input_ids, idx_list[1:]) - if labels is not None: - labels = _remove_idx(labels, idx_list[1:]) idx = idx_list[0] + images_path = example.get('images') or [] + image = _read_from_path(images_path[0]) placeholder = '<|begin_of_image|><|endoftext|><|end_of_image|>' placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False) input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:]) @@ -958,7 
+962,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any patch_size: int = self.model.config.vision_config['patch_size'] num_patches = (image_size // patch_size // 2)**2 labels = (labels[:idx] + [-100] * (len(placeholder_id) + num_patches - 1) + labels[idx + 1:]) - messages = history_to_messages(example.get('history', []), example['query'], example.get('system', None)) + messages = history_to_messages(example.get('history') or [], example['query'], example.get('system')) messages[0]['image'] = image inputs2: Dict[str, Any] = self.tokenizer.apply_chat_template(messages, return_dict=True) inputs['images'] = inputs2['images'] @@ -1118,7 +1122,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any inputs, _ = super().encode(example) images = [] dtype = self.model.dtype - images_path.extend(example.get('images', [])) + images_path.extend(example.get('images') or []) for image_path in images_path: image = _read_from_path(image_path) image = self.model.vis_processor(image) @@ -1202,12 +1206,12 @@ def __init__(self): ['<|im_end|>'], self.system, ['<|im_start|>system\n{{SYSTEM}}']) def check_example(self, example): - images = example.get('images') - assert images and (not isinstance(images, (list, tuple)) or len(images) <= 1) + images = example.get('images') or [] + assert len(images) <= 1 - def replace_tag(self, media_type, index, example): + def replace_tag(self, media_type, index, example) -> List[Context]: assert media_type == 'image' - return [-100] + return [[-100]] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) @@ -1216,14 +1220,10 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any input_ids = inputs['input_ids'] idx_list = _findall(input_ids, -100) labels = inputs.get('labels') - if example.get('images'): + images_path = example.get('images') or [] + if images_path: from .vision_utils import load_image - if len(idx_list) >= 2: - input_ids = _remove_idx(input_ids, idx_list[1:]) - if labels is not None: - labels = _remove_idx(labels, idx_list[1:]) - images_path = example['images'] pixel_values = [] if isinstance(images_path, str): images_path = [images_path] @@ -1233,7 +1233,8 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any image_bs = pixel_values.shape[0] idx = idx_list[0] - img_tokens = self.tokenizer.encode('' + '' * self.num_image_token * image_bs + '\n') + img_tokens: List[int] = self.tokenizer.encode('' + '' * self.num_image_token * image_bs + + '\n') input_ids = input_ids[:idx] + img_tokens + input_ids[idx + 1:] if labels is not None: labels = labels[:idx] + [-100] * len(img_tokens) + labels[idx + 1:] @@ -1242,10 +1243,6 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any inputs['pixel_values'] = pixel_values.to(self.model.dtype) inputs['image_flags'] = torch.ones(image_bs) - else: - inputs['input_ids'] = _remove_idx(input_ids, idx_list) - if labels is not None: - inputs['labels'] = _remove_idx(labels, idx_list) inputs.pop('loss_scale', None) return inputs, {} @@ -1331,15 +1328,15 @@ def __init__(self): None, [''], system_prefix=['<>\n{{system}}\n<>\n\n']) - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: assert media_type == 'image' - return [-200] + self.tokenizer.encode('\n', add_special_tokens=False) + 
return [[-200], '\n'] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - images_path = example.get('images', []) + images_path = example.get('images') or [] images = [] for image_path in images_path: image = _read_from_path(image_path) @@ -1394,7 +1391,7 @@ class LLavaLlamaTemplate(Template): '{{QUERY}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): - return self.tokenizer.encode('\n', add_special_tokens=False) + return ['\n'] def __init__(self): Template.__init__(self, [], [self.llavallama_query_template], ['<|eot_id|>'], ['<|eot_id|>']) @@ -1403,7 +1400,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - image_path = example.get('images', []) + image_path = example.get('images') or [] if image_path: raw_image = _read_from_path(image_path[0]) pixel_values = self.tokenizer.processor.image_processor(raw_image, return_tensors='pt')['pixel_values'] @@ -1432,21 +1429,18 @@ def __init__(self): Template.__init__(self, [''], ['{{QUERY}}\n'], None, ['']) def check_example(self, example): - images = example.get('images') - assert not isinstance(images, (list, tuple)) or len(images) <= 1 + images = example.get('images') or [] + assert len(images) <= 1 - def replace_tag(self, media_type, index, example): + def replace_tag(self, media_type, index, example) -> List[Context]: assert media_type == 'image' - image_token = self.tokenizer.encode('', add_special_tokens=False) - assert len(image_token) == 1 - processor = self.tokenizer.processor - return image_token * processor.image_seq_length + return ['' * self.tokenizer.processor.image_seq_length] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - image_path = example.get('images', []) + image_path = example.get('images') or [] processor = self.tokenizer.processor if inputs['labels'] is not None: n = upper_bound(0, len(inputs['labels']), lambda idx: inputs['labels'][idx] == -100) @@ -1481,8 +1475,8 @@ def __init__(self): Template.__init__(self, [''], ['<|user|>\n{{QUERY}}<|end|>\n<|assistant|>\n'], ['<|end|>\n'], ['<|end|>'], None, ['<|system|>\n{{SYSTEM}}<|end|>\n']) - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): - return '' + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: + return [''] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = example.copy() @@ -1490,7 +1484,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if history is None: history = [] example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '') - images_path.extend(example.get('images', [])) + images_path.extend(example.get('images') or []) images = [] for image_path in images_path: image = _read_from_path(image_path) @@ -1591,9 +1585,9 @@ def __init__(self): super().__init__(['<|begin▁of▁sentence|>{{SYSTEM}}\n\n'], ['User: {{QUERY}}\n\nAssistant:'], ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], self.DEEPSEEK_VL_SYSTEM) - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): + def replace_tag(self, media_type: 
Literal['image', 'video', 'audio'], index, example) -> List[Context]: assert media_type == 'image' - return '' + return [''] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = example.copy() @@ -1604,7 +1598,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '') inputs, _ = super().encode(example) - images_path.extend(example.get('images', [])) + images_path.extend(example.get('images') or []) if len(inputs) == 0: return inputs, {} images = [] @@ -1675,22 +1669,22 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: class CogTemplate(Template): def check_example(self, example): - images = example.get('images') - assert not isinstance(images, (list, tuple)) or len(images) <= 1 + images = example.get('images') or [] + assert len(images) <= 1 - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): - return '' + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: + return [] def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) - images_path = example['images'] - image = _read_from_path(images_path[0]) + images_path = example.get('images') or [] + image = _read_from_path(images_path[0]) if len(images_path) >= 1 else [] if len(inputs) == 0: return inputs, {} inputs.pop('loss_scale', None) model = self.model inputs2 = model.build_conversation_input_ids( - self.tokenizer, query=example['query'], history=example.get('history'), images=[image] if image else []) + self.tokenizer, query=example['query'], history=example.get('history'), images=[image]) image_token_len = inputs2['token_type_ids'].sum() input_ids = inputs['input_ids'] labels = inputs['labels'] @@ -1756,16 +1750,13 @@ def __init__(self, *args, **kwargs): self.is_v2_5 = kwargs.pop('is_v2_5', False) super().__init__(*args, **kwargs) - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: assert media_type == 'image' - return [-1] + return [[-1]] def check_example(self, example): - images = example.get('images') - if isinstance(images, (list, tuple)): - assert len(images) == 1 - else: - assert images + images = example.get('images') or [] + assert len(images) == 1 def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) @@ -1776,10 +1767,6 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any input_ids = inputs['input_ids'] labels = inputs['labels'] idx_list = _findall(input_ids, -1) - if len(idx_list) >= 2: - input_ids = _remove_idx(input_ids, idx_list[1:]) - if labels is not None: - labels = _remove_idx(labels, idx_list[1:]) idx = idx_list[0] config = self.model.config tgt_sizes = None @@ -1908,14 +1895,14 @@ class mPlugOwl2Template(Template): def __init__(self): super().__init__(['{{SYSTEM}}'], ['USER: ', '{{QUERY}}ASSISTANT:'], [''], [['eos_token_id']]) - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: assert media_type == 'image' - return [-200] + return [[-200]] def encode(self, example: Dict[str, Any]) -> 
Tuple[Dict[str, Any], Dict[str, Any]]: from mplug_owl2.mm_utils import process_images processor = self.tokenizer.processor - images_path = example.get('images', []) + images_path = example.get('images') or [] images = [] for image_path in images_path: image = _read_from_path(image_path) From 159dfacbe29ad5ba12063c3d3afd88de0b2afec5 Mon Sep 17 00:00:00 2001 From: Jintao Date: Tue, 25 Jun 2024 11:24:56 +0800 Subject: [PATCH 02/15] support llava 1.5 (#1217) --- ...14\346\225\260\346\215\256\351\233\206.md" | 5 +- .../LLM/Supported-models-datasets.md | 5 +- swift/llm/deploy.py | 4 + swift/llm/export.py | 8 +- swift/llm/utils/dataset.py | 2 +- swift/llm/utils/media.py | 1 - swift/llm/utils/model.py | 20 +++++ swift/llm/utils/protocol.py | 2 +- swift/llm/utils/template.py | 77 ++++++++++++------- swift/llm/utils/utils.py | 3 - swift/utils/utils.py | 2 + 11 files changed, 87 insertions(+), 42 deletions(-) diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 5a1c03257..b9cc49db5 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -204,8 +204,8 @@ |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| -|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| -|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| 
+|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| |deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -316,6 +316,7 @@ |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|self_attention.query_key_value|glm4v|✘|✘||vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| +|llava1_5-7b-chat|[huangjintao/llava-1.5-7b-hf](https://modelscope.cn/models/huangjintao/llava-1.5-7b-hf/summary)|q_proj, k_proj, v_proj|llava1_5|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| |llava1_6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)| |llava1_6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)| |llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|q_proj, k_proj, v_proj|llama-llava-next|✔|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)| diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 41fe33d34..6ffb58341 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -204,8 +204,8 @@ The table below introcudes all models supported by SWIFT: |deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| 
|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| |deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| -|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| -|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔||coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| +|deepseek-coder-v2-instruct|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)| +|deepseek-coder-v2-lite-instruct|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://modelscope.cn/models/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|✔|✔|transformers>=4.39.3|coding|[deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)| |deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| |deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| |deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| @@ -316,6 +316,7 @@ The table below introcudes all models supported by SWIFT: |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|self_attention.query_key_value|glm4v|✘|✘||vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| 
+|llava1_5-7b-chat|[huangjintao/llava-1.5-7b-hf](https://modelscope.cn/models/huangjintao/llava-1.5-7b-hf/summary)|q_proj, k_proj, v_proj|llava1_5|✔|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| |llava1_6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)| |llava1_6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)| |llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|q_proj, k_proj, v_proj|llama-llava-next|✔|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)| diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py index 590197184..bc7ed6902 100644 --- a/swift/llm/deploy.py +++ b/swift/llm/deploy.py @@ -510,6 +510,8 @@ def _generate_stream(): async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request) -> ChatCompletionResponse: global _args assert _args is not None + if request.stop is None: + request.stop = [] if _args.infer_backend == 'vllm': return await inference_vllm_async(request, raw_request) else: @@ -520,6 +522,8 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re async def create_completion(request: CompletionRequest, raw_request: Request) -> CompletionResponse: global _args assert _args is not None + if request.stop is None: + request.stop = [] if _args.infer_backend == 'vllm': return await inference_vllm_async(request, raw_request) else: diff --git a/swift/llm/export.py b/swift/llm/export.py index 5abbeadfb..5a4a901e3 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -94,13 +94,13 @@ def llm_export(args: ExportArguments) -> None: logger.info(f'args: {args}') seed_everything(args.seed) if args.to_peft_format: - assert args.sft_type == 'lora' + assert args.sft_type == 'lora', f'args.sft_type: {args.sft_type}' args.ckpt_dir = swift_to_peft_format(args.ckpt_dir) if args.merge_lora: merge_lora(args, device_map=args.merge_device_map) if args.quant_bits > 0: _args = args - assert args.quantization_bit == 0 + assert args.quantization_bit == 0, f'args.quantization_bit: {args.quantization_bit}' assert args.sft_type == 'full', 'you need to merge lora' if args.quant_method == 'awq': from awq import AutoAWQForCausalLM @@ -108,11 +108,13 @@ def llm_export(args: ExportArguments) -> None: args, device_map=args.quant_device_map, verbose=False, automodel_class=AutoAWQForCausalLM) awq_model_quantize(model, template.tokenizer) model.save_quantized(args.quant_output_dir) - else: # gptq + elif args.quant_method == 'gptq': model, template = prepare_model_template(args, device_map=args.quant_device_map, verbose=False) gptq_quantizer = gptq_model_quantize(model, template.tokenizer) model.config.quantization_config.pop('dataset', None) gptq_quantizer.save(model, args.quant_output_dir) + else: + raise ValueError(f'args.quant_method: {args.quant_method}') logger.info(get_model_info(model)) show_layers(model) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index b1cf65eb9..b54e432c6 100644 --- 
a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -2256,7 +2256,7 @@ def get_dataset( assert model_name is not None and model_author is not None dataset = _preprocess_self_cognition_dataset(dataset, model_name, model_author) - def _reduce_column(row): + def _reduce_column(row: Dict[str, Any]) -> Dict[str, Any]: res = {} if 'query' in row and isinstance(row['query'], (list, tuple)): res['query'] = np.random.choice(row['query']) diff --git a/swift/llm/utils/media.py b/swift/llm/utils/media.py index d30b14746..5b26da107 100644 --- a/swift/llm/utils/media.py +++ b/swift/llm/utils/media.py @@ -1,6 +1,5 @@ import os import shutil -import time from typing import Any, Dict, List, Literal, Optional, Union import numpy as np diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index e7a67e260..10d7de870 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -189,6 +189,7 @@ class ModelType: atom_7b = 'atom-7b' atom_7b_chat = 'atom-7b-chat' # llava + llava1_5_7b_chat = 'llava1_5-7b-chat' llava1_6_mistral_7b_instruct = 'llava1_6-mistral-7b-instruct' llava1_6_yi_34b_instruct = 'llava1_6-yi-34b-instruct' llama3_llava_next_8b = 'llama3-llava-next-8b' @@ -4645,6 +4646,25 @@ def _new_generate(inputs=None, *args, **kwargs): model.generate = _new_generate +@register_model( + ModelType.llava1_5_7b_chat, + 'huangjintao/llava-1.5-7b-hf', + LoRATM.llama, + TemplateType.llava1_5, + eos_token='', + support_flash_attn=True, + requires=['transformers>=4.36'], + tags=['multi-modal', 'vision'], + hf_model_id='llava-hf/llava-1.5-7b-hf') +def get_model_tokenizer_llava1_5(model_dir: str, *args, **kwargs): + from transformers import AutoProcessor, LlavaForConditionalGeneration + processor = AutoProcessor.from_pretrained(model_dir) + model, tokenizer = get_model_tokenizer_with_flash_attn( + model_dir, *args, automodel_class=LlavaForConditionalGeneration, **kwargs) + tokenizer.processor = processor + return model, tokenizer + + @register_model( ModelType.llava1_6_yi_34b_instruct, 'AI-ModelScope/llava-v1.6-34b', diff --git a/swift/llm/utils/protocol.py b/swift/llm/utils/protocol.py index 5e5aaef5b..1ba24e734 100644 --- a/swift/llm/utils/protocol.py +++ b/swift/llm/utils/protocol.py @@ -42,7 +42,7 @@ class XRequestConfig: n: int = 1 seed: Optional[int] = None - stop: List[str] = field(default_factory=list) + stop: Optional[List[str]] = None stream: bool = False best_of: Optional[int] = None diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 7d1a988f9..d3f5f8981 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -40,6 +40,7 @@ class TemplateType: chatglm3 = 'chatglm3' llama = 'llama' # llama2 llama3 = 'llama3' + llava1_5 = 'llava1_5' llava_mistral_instruct = 'llava-mistral-instruct' llava_yi_instruct = 'llava-yi-instruct' llava_llama_instruct = 'llava-llama-instruct' @@ -639,6 +640,11 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = res['inputs_embeds'] = inputs_embeds else: res['input_ids'] = input_ids + # multimodal + pixel_values = [b['pixel_values'] for b in batch if b.get('pixel_values') is not None] + if len(pixel_values) > 0: + res['pixel_values'] = torch.concat(pixel_values) + if loss_scale is not None: res['loss_scale'] = loss_scale return res @@ -726,7 +732,7 @@ def register_template(template_type: str, template: Template, *, exist_ok: bool register_template( TemplateType.default, - Template([], ['### Human:\n', '{{QUERY}}\n\n', '### Assistant:\n'], ['\n\n'], [['eos_token_id']], 
DEFAULT_SYSTEM, + Template([], ['### Human:\n{{QUERY}}\n\n### Assistant:\n'], ['\n\n'], [['eos_token_id']], DEFAULT_SYSTEM, ['{{SYSTEM}}\n\n'])) @@ -930,7 +936,7 @@ def _init_template(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs) -> class GLM4VTemplate(GLMTemplate): def __init__(self): - super().__init__([], ['<|user|>\n', '{{QUERY}}<|assistant|>'], [], ['<|endoftext|>'], None, + super().__init__([], ['<|user|>\n{{QUERY}}<|assistant|>'], [], ['<|endoftext|>'], None, ['<|system|>\n{{SYSTEM}}']) def check_example(self, example): @@ -982,7 +988,7 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = register_template( TemplateType.yi_vl, - YiVLTemplate([], ['### Human: ', '{{QUERY}}\n### Assistant:'], ['\n'], ['\n###'], yi_vl_default_system, + YiVLTemplate([], ['### Human: {{QUERY}}\n### Assistant:'], ['\n'], ['\n###'], yi_vl_default_system, ['{{SYSTEM}}\n\n']), use_model=True, infer_media_type='round', @@ -1202,8 +1208,8 @@ class InternvlTemplate(Template): num_image_token = 256 def __init__(self): - super().__init__([''], ['<|im_start|>user\n', '{{QUERY}}<|im_end|><|im_start|>assistant\n'], ['<|im_end|>'], - ['<|im_end|>'], self.system, ['<|im_start|>system\n{{SYSTEM}}']) + super().__init__([''], ['<|im_start|>user\n{{QUERY}}<|im_end|><|im_start|>assistant\n'], ['<|im_end|>'], + ['<|im_end|>'], self.system, ['<|im_start|>system\n{{SYSTEM}}<|im_end|>']) def check_example(self, example): images = example.get('images') or [] @@ -1250,10 +1256,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: res = super().data_collator(batch, padding_to) assert all('pixel_values' in b for b in batch), 'Temporarily, Interval only supports data with images' - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] image_flags = [b['image_flags'] for b in batch if 'image_flags' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) if image_flags: res['image_flags'] = torch.concat(image_flags) return res @@ -1267,8 +1270,8 @@ class InternvlPhi3Template(InternvlTemplate): system = 'You are an AI assistant whose name is Phi-3.' 
def __init__(self): - Template.__init__(self, [''], ['<|user|>\n', [-100], '{{QUERY}}<|end|>\n<|assistant|>\n'], ['<|end|>\n'], - ['<|end|>'], self.system, ['<|system|>\n{{SYSTEM}}<|end|>\n']) + Template.__init__(self, [''], ['<|user|>\n{{QUERY}}<|end|>\n<|assistant|>\n'], ['<|end|>\n'], ['<|end|>'], + self.system, ['<|system|>\n{{SYSTEM}}<|end|>\n']) register_template( @@ -1320,6 +1323,34 @@ def __init__(self): 'and other non-computer science questions, you will refuse to answer\n'))) +class Llava1_5Template(Template): + + def __init__(self): + super().__init__([''], ['USER: {{QUERY}}\nASSISTANT:'], ['\n'], ['']) + + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: + assert media_type == 'image' + return ['\n'] + + def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + inputs, _ = super().encode(example) + if len(inputs) == 0: + return inputs, {} + images_path = example.get('images') or [] + images = [] + for image_path in images_path: + image = _read_from_path(image_path) + images.append(image) + image_processor = self.tokenizer.processor.image_processor + if images: + inputs['pixel_values'] = image_processor(images, return_tensors='pt')['pixel_values'].to(self.model.dtype) + return inputs, {} + + +register_template( + TemplateType.llava1_5, Llava1_5Template(), use_model=True, infer_media_type='round', lazy_tokenize=True) + + class LLavaTemplate(Template): def __init__(self): @@ -1387,8 +1418,8 @@ def __init__(self): class LLavaLlamaTemplate(Template): - llavallama_query_template = '<|start_header_id|>user<|end_header_id|>\n\n' \ - '{{QUERY}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' + llavallama_query_template = ('<|start_header_id|>user<|end_header_id|>\n\n' + '{{QUERY}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n') def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example): return ['\n'] @@ -1407,13 +1438,6 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any inputs['pixel_values'] = pixel_values.to(self.model.dtype) return inputs, {} - def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: - res = super().data_collator(batch, padding_to) - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) - return res - register_template( TemplateType.llava_llama_instruct, @@ -1456,9 +1480,6 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: res = super().data_collator(batch, padding_to) - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) token_type_ids = [torch.tensor(b['token_type_ids']) for b in batch] token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0) res['token_type_ids'] = token_type_ids @@ -1519,9 +1540,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: res = super().data_collator(batch, padding_to) - pixel_values = [b['pixel_values'] for b in batch if 'pixel_values' in b] - if pixel_values: - res['pixel_values'] = torch.concat(pixel_values) + if 'pixel_values' in res: 
res['image_sizes'] = torch.concat([b['image_sizes'] for b in batch if 'image_sizes' in b]) return res @@ -1554,7 +1573,7 @@ class LLavaQwenTemplate(LLavaTemplate): llavayi_query_template = 'You are a helpful assistant' def __init__(self): - Template.__init__(self, [], ['<|im_start|>user\n', '{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'], + Template.__init__(self, [], ['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'], ['<|im_end|>\n'], ['<|im_end|>'], self.llavayi_query_template, ['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n']) @@ -1827,7 +1846,7 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: register_template( TemplateType.minicpm_v, - MiniCPMVTemplate(['{{SYSTEM}}'], ['<用户>', '{{QUERY}}'], [], ['']), + MiniCPMVTemplate(['{{SYSTEM}}'], ['<用户>{{QUERY}}'], [], ['']), use_model=True, lazy_tokenize=True, infer_media_type='dialogue', @@ -1837,7 +1856,7 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: register_template( TemplateType.minicpm_v_v2_5, MiniCPMVTemplate(['<|begin_of_text|>{{SYSTEM}}'], [ - '<|start_header_id|>user<|end_header_id|>\n\n', '{{QUERY}}<|eot_id|>' + '<|start_header_id|>user<|end_header_id|>\n\n{{QUERY}}<|eot_id|>' '<|start_header_id|>assistant<|end_header_id|>\n\n' ], ['<|eot_id|>'], ['<|eot_id|>'], is_v2_5=True), @@ -1893,7 +1912,7 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: class mPlugOwl2Template(Template): def __init__(self): - super().__init__(['{{SYSTEM}}'], ['USER: ', '{{QUERY}}ASSISTANT:'], [''], [['eos_token_id']]) + super().__init__(['{{SYSTEM}}'], ['USER: {{QUERY}}ASSISTANT:'], [''], [['eos_token_id']]) def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: assert media_type == 'image' diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 43029088a..a47e63a22 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -93,9 +93,6 @@ def download_dataset(model_id: str, files: List[str], force_download: bool = Fal def _msdataset_ddp_load(*args, **kwargs): with safe_ddp_context(): dataset = _old_msdataset_load(*args, **kwargs) - - if is_dist(): # sync - dist.barrier() return dataset # monkey patching diff --git a/swift/utils/utils.py b/swift/utils/utils.py index ad0cae475..519a5667b 100644 --- a/swift/utils/utils.py +++ b/swift/utils/utils.py @@ -27,6 +27,8 @@ def safe_ddp_context(): yield if is_dist() and is_local_master(): dist.barrier() + if is_dist(): # sync + dist.barrier() def check_json_format(obj: Any) -> Any: From d6f21c8386eade9574964411b518eeedd5b05e83 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:18:52 +0800 Subject: [PATCH 03/15] add in_browswer (#1220) --- swift/ui/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/swift/ui/app.py b/swift/ui/app.py index 25532f944..7281a5545 100644 --- a/swift/ui/app.py +++ b/swift/ui/app.py @@ -71,6 +71,7 @@ def run_ui(): concurrent = {'concurrency_count': 5} app.queue(**concurrent).launch( server_name=os.environ.get('WEBUI_SERVER', None), + inbrowser=True, server_port=port if port is None else int(port), height=800, share=bool(int(os.environ.get('WEBUI_SHARE', '0')))) From 489a859b70e16b769a0ecf569bc19e95e3f5de05 Mon Sep 17 00:00:00 2001 From: Ang Wang Date: Tue, 25 Jun 2024 16:42:15 +0800 Subject: [PATCH 04/15] [TorchAcc] Add USE_TORCH_XLA=0 flag for native swift scripts (#1221) * add USE_TORCH_XLA=0 flag * remove trim graph --- 
.../scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh | 3 ++- .../llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh | 4 ++-- .../llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh | 2 +- .../torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh | 1 - .../scripts/torchacc/llama3_8b_instruct/swift_lora_sft.sh | 2 +- .../scripts/torchacc/qwen1half_14b_chat/swift_lora_sft.sh | 2 +- .../scripts/torchacc/qwen1half_32b_chat/swift_lora_sft.sh | 2 +- .../llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh | 6 +++--- .../llm/scripts/torchacc/yi_34b_chat/swift_lora_sft.sh | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh index c72771dbb..72d54086e 100644 --- a/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh @@ -2,7 +2,8 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. -# MASTER_ADDR=127.0.0.1 \ + +export USE_TORCH_XLA=0 NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1 \ diff --git a/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh index 0fd4e5d4b..427ca158b 100644 --- a/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh @@ -2,8 +2,8 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. -# MASTER_ADDR=127.0.0.1 \ -# MASTER_PORT=12356 \ +export USE_TORCH_XLA=0 + NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1 \ swift sft \ diff --git a/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh index 3fc24e19e..ad0789a9c 100644 --- a/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh @@ -2,7 +2,7 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. -# MASTER_ADDR=127.0.0.1 \ +export USE_TORCH_XLA=0 NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1 \ diff --git a/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh b/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh index d1c983e1b..73bd3735d 100644 --- a/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh @@ -2,7 +2,6 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. export USE_TORCHACC=1 -export TORCHACC_TRIM_GRAPH=1 export XLA_IR_SHAPE_CACHE_SIZE=100000000 export XLA_ALLOCATOR_FRACTION=0.95 export XLA_EXPERIMENTAL=nonzero:masked_select diff --git a/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/swift_lora_sft.sh index 3454bdb26..bae2e2a8b 100644 --- a/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/swift_lora_sft.sh @@ -2,7 +2,7 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. 
-# MASTER_ADDR=127.0.0.1 \ +export USE_TORCH_XLA=0 NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1 \ diff --git a/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/swift_lora_sft.sh index da4d328e0..baef03bbc 100644 --- a/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/swift_lora_sft.sh @@ -2,7 +2,7 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. -# MASTER_ADDR=127.0.0.1 \ +export USE_TORCH_XLA=0 NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1 \ diff --git a/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/swift_lora_sft.sh index 25a1accc7..4d4ae9117 100644 --- a/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/qwen1half_32b_chat/swift_lora_sft.sh @@ -2,7 +2,7 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. -# MASTER_ADDR=127.0.0.1 \ +export USE_TORCH_XLA=0 NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1,2,3 \ diff --git a/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh index 30347822f..d13cd75da 100644 --- a/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh @@ -2,10 +2,10 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. -# MASTER_ADDR=127.0.0.1 \ +export USE_TORCH_XLA=0 -NPROC_PER_NODE=1 \ -CUDA_VISIBLE_DEVICES=7 \ +NPROC_PER_NODE=2 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ swift sft \ --model_id_or_path qwen/Qwen-72B-Chat \ --dataset codefuse-python-en \ diff --git a/examples/pytorch/llm/scripts/torchacc/yi_34b_chat/swift_lora_sft.sh b/examples/pytorch/llm/scripts/torchacc/yi_34b_chat/swift_lora_sft.sh index 623177e41..d9f9ef8d6 100644 --- a/examples/pytorch/llm/scripts/torchacc/yi_34b_chat/swift_lora_sft.sh +++ b/examples/pytorch/llm/scripts/torchacc/yi_34b_chat/swift_lora_sft.sh @@ -2,7 +2,7 @@ # 80GB GPU memory # Note: TorchAcc is currently only available internally. -# MASTER_ADDR=127.0.0.1 \ +export USE_TORCH_XLA=0 NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1,2,3 \ From d7411b7885130c6b7ea5b79dec1e19fd9cb73a68 Mon Sep 17 00:00:00 2001 From: Jintao Date: Tue, 25 Jun 2024 17:47:36 +0800 Subject: [PATCH 05/15] Fix glm4v batch size (#1223) --- README.md | 2 +- README_CN.md | 2 +- swift/llm/utils/template.py | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 553139bb8..084c7c660 100644 --- a/README.md +++ b/README.md @@ -548,7 +548,7 @@ The complete list of supported models and datasets can be found at [Supported Mo | DeepSeek-VL | [DeepSeek series vision models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-7B | chat model |
| MiniCPM-V<br>MiniCPM-V-2<br>MiniCPM-V-2_5 | [OpenBmB MiniCPM vision model](https://github.com/OpenBMB/MiniCPM) | Chinese<br>English | 3B-9B | chat model |
| CogVLM<br>CogVLM2<br>CogAgent<br>GLM4V | [Zhipu ChatGLM visual QA and Agent model](https://github.com/THUDM/) | Chinese<br>English | 9B-19B | chat model |
-| Llava | [Llava series models](https://github.com/haotian-liu/LLaVA) | English | 7B-34B | chat model |
+| Llava1.5<br>Llava1.6 | [Llava series models](https://github.com/haotian-liu/LLaVA) | English | 7B-34B | chat model |
| Llava-Next | [Llava-Next series models](https://github.com/LLaVA-VL/LLaVA-NeXT) | Chinese<br>English | 8B-110B | chat model |
| mPLUG-Owl | [mPLUG-Owl series models](https://github.com/X-PLUG/mPLUG-Owl) | English | 11B | chat model |
| InternVL | [InternVL](https://github.com/OpenGVLab/InternVL) | Chinese<br>English | 2B-25.5B<br>
including quantized version | chat model | diff --git a/README_CN.md b/README_CN.md index a605a56da..95c6e9905 100644 --- a/README_CN.md +++ b/README_CN.md @@ -545,7 +545,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | DeepSeek-VL | [幻方系列视觉模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-7B | chat模型 |
| MiniCPM-V<br>MiniCPM-V-2<br>MiniCPM-V-2_5 | [OpenBmB MiniCPM视觉模型](https://github.com/OpenBMB/MiniCPM) | 中文<br>英文 | 3B-9B | chat模型 |
| CogVLM<br>CogVLM2<br>CogAgent<br>GLM4V | [智谱ChatGLM视觉问答和Agent模型](https://github.com/THUDM/) | 中文<br>英文 | 9B-19B | chat模型 |
-| Llava | [Llava系列模型](https://github.com/haotian-liu/LLaVA) | 英文 | 7B-34B | chat模型 |
+| Llava1.5<br>Llava1.6 | [Llava系列模型](https://github.com/haotian-liu/LLaVA) | 英文 | 7B-34B | chat模型 |
| Llava-Next | [Llava-Next系列模型](https://github.com/LLaVA-VL/LLaVA-NeXT) | 中文<br>英文 | 8B-110B | chat模型 |
| mPLUG-Owl | [mPLUG-Owl系列模型](https://github.com/X-PLUG/mPLUG-Owl) | 英文 | 11B | chat模型 |
| InternVL | [InternVL](https://github.com/OpenGVLab/InternVL) | 中文<br>英文 | 2B-25.5B<br>
包含量化版本 | chat模型 | diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index d3f5f8981..8ad46403d 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -395,10 +395,7 @@ def _simplify_context_list(self, context_list: List[Context], loss_scale_list: L res.append(''.join(temp)) res_loss_scale.append(0.0) - if is_multi_modal: - return Template.split_special_tokens(res, res_loss_scale) - else: - return res, res_loss_scale + return res, res_loss_scale @staticmethod def split_special_tokens(context_list: List[Context], @@ -978,6 +975,8 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]: res = super().data_collator(batch, padding_to) + pad_len = res['labels'].shape[1] - res['input_ids'].shape[1] + res['attention_mask'] = F.pad(res['attention_mask'], (pad_len, 0), 'constant', 1) images = [b['images'] for b in batch if 'images' in b] if images: res['images'] = torch.concat(images) From 414b30877ce151e6fcba3a039f18533352f38b30 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Tue, 25 Jun 2024 22:33:42 +0800 Subject: [PATCH 06/15] Add debug log support (#1226) --- swift/llm/utils/model.py | 5 +++++ swift/llm/utils/utils.py | 5 ++--- swift/utils/logger.py | 15 ++++++--------- swift/utils/torch_utils.py | 7 ++++++- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 10d7de870..84f5fef4c 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -910,9 +910,14 @@ def get_model_tokenizer_from_repo(model_dir: str, trust_remote_code=True, ) else: + logger.info(f'Model loading with args: model_dir: {model_dir},' + f'torch_dtype: {torch_dtype},' + f'model_kwargs: {model_kwargs}') with context: model = automodel_class.from_pretrained( model_dir, config=model_config, torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs) + if hasattr(model, 'hf_device_map'): + logger.debug(f'Model hf_device_map: {model.hf_device_map}') return model, tokenizer diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index a47e63a22..65f0557a1 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -45,11 +45,10 @@ logger.handlers[0].setFormatter(logger_format) ms_logger.handlers[0].setFormatter(logger_format) +log_level = os.getenv('LOG_LEVEL', 'INFO').upper() if is_local_master(): - logger.setLevel(logging.INFO) - ms_logger.setLevel(logging.INFO) + ms_logger.setLevel(log_level) else: - logger.setLevel(logging.ERROR) ms_logger.setLevel(logging.ERROR) os.environ['TOKENIZERS_PARALLELISM'] = 'true' diff --git a/swift/utils/logger.py b/swift/utils/logger.py index 23157a3d7..523b08930 100644 --- a/swift/utils/logger.py +++ b/swift/utils/logger.py @@ -9,12 +9,7 @@ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') -def is_master(): - rank = int(os.getenv('RANK', -1)) - return rank in {-1, 0} - - -def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = 'w'): +def get_logger(log_file: Optional[str] = None, log_level: Optional[int] = None, file_mode: str = 'w'): """ Get logging logger Args: @@ -24,7 +19,9 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi file_mode: Specifies the mode to open the file, if filename is specified (if filemode is unspecified, it defaults to 'w'). 
""" - + if log_level is None: + log_level = os.getenv('LOG_LEVEL', 'INFO').upper() + log_level = getattr(logging, log_level, logging.INFO) logger_name = __name__.split('.')[0] logger = logging.getLogger(logger_name) logger.propagate = False @@ -47,7 +44,7 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi handlers = [stream_handler] if importlib.util.find_spec('torch') is not None: - is_worker0 = is_master() + is_worker0 = int(os.getenv('LOCAL_RANK', -1)) in {-1, 0} else: is_worker0 = True @@ -76,7 +73,7 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level): return if importlib.util.find_spec('torch') is not None: - is_worker0 = is_master() + is_worker0 = int(os.getenv('LOCAL_RANK', -1)) in {-1, 0} else: is_worker0 = True diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 4ed88b2b8..2eb02d954 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -13,7 +13,7 @@ from torch.nn import Module from transformers.utils import is_torch_npu_available, strtobool -from .logger import get_logger, is_master +from .logger import get_logger logger = get_logger() @@ -82,6 +82,11 @@ def is_local_master(): return local_rank in {-1, 0} +def is_master(): + rank = int(os.getenv('RANK', -1)) + return rank in {-1, 0} + + def use_torchacc() -> bool: return strtobool(os.getenv('USE_TORCHACC', '0')) From 0d8708e63804d7140c35649b35e8391958299995 Mon Sep 17 00:00:00 2001 From: Jintao Date: Wed, 26 Jun 2024 10:07:02 +0800 Subject: [PATCH 07/15] Fix qlora deploy (#1224) --- README.md | 2 +- swift/llm/deploy.py | 23 ++++++++++++----------- swift/llm/export.py | 10 +++++++++- swift/llm/infer.py | 27 ++++++++++++++++++--------- swift/llm/rlhf.py | 8 ++------ swift/llm/sft.py | 13 ++++++------- swift/llm/utils/__init__.py | 6 +++--- swift/llm/utils/argument.py | 27 ++++++++++++++------------- swift/llm/utils/model.py | 21 ++++++++++++++------- swift/llm/utils/utils.py | 15 ++++++++++++++- swift/llm/utils/vllm_utils.py | 4 ++-- swift/utils/torch_utils.py | 2 +- 12 files changed, 96 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index 084c7c660..8dca51442 100644 --- a/README.md +++ b/README.md @@ -502,7 +502,7 @@ The complete list of supported models and datasets can be found at [Supported Mo |------------------------------------------------|------------------------------------------------------------------------|--------------------|----------------------------------------|------------------------------------------- | | Qwen
Qwen1.5<br>Qwen2 | [Tongyi Qwen 1.0 and 1.5 series models](https://github.com/QwenLM) | Chinese<br>English | 0.5B-110B<br>including quantized versions | base model<br>chat model<br>MoE model<br>code model |
| ChatGLM2<br>ChatGLM3<br>Codegeex2<br>GLM4 | [Zhipu ChatGLM series models](https://github.com/THUDM) | Chinese<br>English | 6B-9B | base model<br>chat model<br>code model<br>long text model |
-| Baichuan/Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese<br>English | 7B-13B<br>including quantized versions | base model<br>chat model |
+| Baichuan<br>Baichuan2 | [Baichuan 1 and Baichuan 2](https://github.com/baichuan-inc) | Chinese<br>English | 7B-13B<br>including quantized versions | base model<br>chat model |
| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese<br>English | 2B-102B | instruct model |
| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese<br>English | 7B-65B | base model<br>chat model<br>long text model<br>MoE model |
| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B<br>including quantized versions | base model<br>
chat model | diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py index bc7ed6902..53ebf2f4f 100644 --- a/swift/llm/deploy.py +++ b/swift/llm/deploy.py @@ -157,7 +157,7 @@ async def inference_vllm_async(request: Union[ChatCompletionRequest, CompletionR kwargs[key] = new_value generation_config = VllmGenerationConfig(**kwargs) - if generation_config.use_beam_search is True and request.stream is True: + if generation_config.use_beam_search and request.stream: error_msg = 'Streaming generation does not support beam search.' raise ValueError(error_msg) tokenizer = template.tokenizer @@ -391,16 +391,17 @@ async def inference_pt_async(request: Union[ChatCompletionRequest, CompletionReq created_time = int(time.time()) adapter_kwargs = {} - if request.model != _args.model_type: - adapter_names = None - for lora_req in _args.lora_request_list: - if lora_req.lora_name == request.model: - adapter_names = request.model - break - assert adapter_names is not None - adapter_kwargs['adapter_names'] = [adapter_names] - elif isinstance(model, PeftModel): - adapter_kwargs['adapter_names'] = ['-'] + if _args.lora_request_list is not None: + if request.model != _args.model_type: + adapter_names = None + for lora_req in _args.lora_request_list: + if lora_req.lora_name == request.model: + adapter_names = request.model + break + assert adapter_names is not None + adapter_kwargs['adapter_names'] = [adapter_names] + elif isinstance(model, PeftModel): + adapter_kwargs['adapter_names'] = ['-'] # use base model async def _generate_full(): generation_info = {} diff --git a/swift/llm/export.py b/swift/llm/export.py index 5a4a901e3..3e4027386 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -121,7 +121,15 @@ def llm_export(args: ExportArguments) -> None: logger.info('Saving quantized weights...') model_cache_dir = model.model_dir save_checkpoint( - None, template.tokenizer, model_cache_dir, args.ckpt_dir, args.quant_output_dir, dtype=args.dtype) + None, + template.tokenizer, + model_cache_dir, + args.ckpt_dir, + args.quant_output_dir, + sft_args_kwargs={ + 'dtype': args.dtype, + 'quant_method': args.quant_method + }) logger.info(f'Successfully quantized the model and saved in {args.quant_output_dir}.') args.ckpt_dir = args.quant_output_dir diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 16dd435d4..0db622153 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -16,8 +16,8 @@ from swift.utils import (append_to_jsonl, get_logger, get_main, get_model_info, read_multi_line, seed_everything, show_layers) from .utils import (DeployArguments, InferArguments, Template, get_additional_saved_files, get_dataset, - get_model_tokenizer, get_template, inference, inference_stream, is_adapter, sample_dataset, - set_generation_config) + get_model_tokenizer, get_template, inference, inference_stream, is_adapter, is_quant_model, + sample_dataset, set_generation_config) logger = get_logger() @@ -29,6 +29,7 @@ def save_checkpoint(model: Optional[PreTrainedModel], target_dir: str, *, save_safetensors: bool = True, + sft_args_kwargs: Dict[str, Any], **kwargs) -> None: if model is not None: model.save_pretrained(target_dir, safe_serialization=save_safetensors) @@ -75,9 +76,10 @@ def save_checkpoint(model: Optional[PreTrainedModel], with open(old_sft_args_path, 'r', encoding='utf-8') as f: res = json.load(f) res['sft_type'] = 'full' - dtype = kwargs.get('dtype') - if dtype is not None: - res['dtype'] = dtype + for k in ['dtype', 'quant_method']: + v = sft_args_kwargs.get(k) + if v is not None: + res[k] = v with 
open(new_sft_args_path, 'w', encoding='utf-8') as f: json.dump(res, f, ensure_ascii=False, indent=2) @@ -89,8 +91,8 @@ def merge_lora(args: InferArguments, logger.info(f'replace_if_exists: {replace_if_exists}') assert args.ckpt_dir is not None, 'args.ckpt_dir is not specified.' assert args.sft_type in ('lora', 'adalora', 'longlora'), 'Only supports lora series models' - for s in ['int4', 'int8', 'awq']: - assert s not in args.model_type, f'{s} model is not supported' + assert not is_quant_model( + args.model_type), f'{args.model_type} is a quantized model and does not support merge-lora.' if args.quantization_bit != 0: logger.warning('It is not recommended to merge quantized models, ' 'as this can result in performance degradation') @@ -117,7 +119,7 @@ def merge_lora(args: InferArguments, args.ckpt_dir, merged_lora_path, save_safetensors=args.save_safetensors, - dtype=args.dtype) + sft_args_kwargs={'dtype': args.dtype}) logger.info(f'Successfully merged LoRA and saved in {merged_lora_path}.') logger.info("Setting args.sft_type: 'full'") logger.info(f'Setting args.ckpt_dir: {merged_lora_path}') @@ -180,6 +182,7 @@ def prepare_model_template(args: InferArguments, model_kwargs, model_id_or_path=model_id_or_path, revision=args.model_revision, + quant_method=args.quant_method, **kwargs) if verbose: logger.info(f'model_config: {model.config}') @@ -207,7 +210,13 @@ def prepare_model_template(args: InferArguments, f'args.max_model_len: {args.max_model_len}, model.max_model_len: {model.max_model_len}') # Preparing LoRA if is_adapter(args.sft_type) and args.ckpt_dir is not None: + if is_quant_model(args.model_type, model): + # gptq awq does not support lora switching + args.lora_request_list = None + logger.warning('The current model does not support LoRA switching. 
' + f'Setting args.lora_request_list: {args.lora_request_list}') if isinstance(args, DeployArguments) and args.lora_request_list is not None: + logger.info(f'args.lora_request_list: {args.lora_request_list}') for lora_request in args.lora_request_list: model = Swift.from_pretrained( model, lora_request.lora_local_path, lora_request.lora_name, inference_mode=True) @@ -499,7 +508,7 @@ def llm_infer(args: InferArguments) -> Dict[str, List[Dict[str, Any]]]: kwargs['tools'] = tools kwargs['truncation_strategy'] = args.truncation_strategy if args.infer_backend == 'vllm': - assert args.stream is True + assert args.stream if args.verbose: print(f"[QUERY]{data['query']}\n[RESPONSE]", end='') gen = inference_stream_vllm(llm_engine, template, [kwargs], lora_request=lora_request) diff --git a/swift/llm/rlhf.py b/swift/llm/rlhf.py index 9c58f43dc..598e5a744 100644 --- a/swift/llm/rlhf.py +++ b/swift/llm/rlhf.py @@ -94,12 +94,6 @@ def llm_rlhf(args: RLHFArguments) -> Dict[str, Any]: kwargs['use_flash_attn'] = args.use_flash_attn if args.local_repo_path: kwargs['local_repo_path'] = args.local_repo_path - if args.quant_method == 'awq': - kwargs['is_awq'] = True - elif args.quant_method == 'aqlm': - kwargs['is_aqlm'] = True - elif args.quant_method == 'gptq': - kwargs['is_gptq'] = True if args.rope_scaling: kwargs['rope_scaling'] = args.rope_scaling @@ -111,6 +105,7 @@ def llm_rlhf(args: RLHFArguments) -> Dict[str, Any]: model_kwargs, model_id_or_path=args.model_id_or_path, revision=args.model_revision, + quant_method=args.quant_method, is_training=True, **kwargs) logger.info(f'model_config: {model.config}') @@ -155,6 +150,7 @@ def llm_rlhf(args: RLHFArguments) -> Dict[str, Any]: model_kwargs, model_id_or_path=args.ref_model_id_or_path, revision=args.model_revision, + quant_method=args.quant_method, **kwargs) else: ref_model = None diff --git a/swift/llm/sft.py b/swift/llm/sft.py index b6fef5699..641bf3616 100644 --- a/swift/llm/sft.py +++ b/swift/llm/sft.py @@ -100,16 +100,9 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]: kwargs['use_flash_attn'] = args.use_flash_attn if args.local_repo_path: kwargs['local_repo_path'] = args.local_repo_path - if args.quant_method == 'awq': - kwargs['is_awq'] = True - elif args.quant_method == 'aqlm': - kwargs['is_aqlm'] = True - elif args.quant_method == 'gptq': - kwargs['is_gptq'] = True if args.rope_scaling: kwargs['rope_scaling'] = args.rope_scaling - kwargs['max_length'] = args.max_length model, tokenizer = get_model_tokenizer( args.model_type, @@ -117,8 +110,14 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]: model_kwargs, model_id_or_path=args.model_id_or_path, revision=args.model_revision, + quant_method=args.quant_method, is_training=True, **kwargs) + for k in ['gptq', 'awq', 'aqlm']: + if getattr(model, f'is_{k}', None): + args.quant_method = k + logger.info(f'Setting args.quant_method: {args.quant_method}') + break logger.info(f'model_config: {model.config}') generation_config = GenerationConfig( max_new_tokens=args.max_new_tokens, diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py index 23f71be5f..b9412f3d5 100644 --- a/swift/llm/utils/__init__.py +++ b/swift/llm/utils/__init__.py @@ -21,9 +21,9 @@ get_template, register_template) from .utils import (LazyLLMDataset, LLMDataset, dataset_map, download_dataset, find_all_linears, find_embedding, find_ln, get_max_model_len, get_time_info, history_to_messages, inference, inference_stream, - is_vllm_available, limit_history_length, messages_join_observation, 
messages_to_history, - print_example, safe_tokenizer_decode, set_generation_config, sort_by_max_length, stat_dataset, - to_device) + is_quant_model, is_vllm_available, limit_history_length, messages_join_observation, + messages_to_history, print_example, safe_tokenizer_decode, set_generation_config, + sort_by_max_length, stat_dataset, to_device) try: if is_vllm_available(): diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index c8176ad3c..2af040ea4 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -30,7 +30,7 @@ from .model import (MODEL_MAPPING, dtype_mapping, get_additional_saved_files, get_default_lora_target_modules, get_default_template_type) from .template import TEMPLATE_MAPPING -from .utils import is_vllm_available +from .utils import is_quant_model, is_vllm_available logger = get_logger() @@ -675,15 +675,15 @@ def load_from_checkpoint(self) -> None: with open(sft_args_path, 'r', encoding='utf-8') as f: sft_args = json.load(f) imported_keys = [ - 'model_type', 'model_revision', 'quantization_bit', 'dtype', 'bnb_4bit_comp_dtype', 'bnb_4bit_quant_type', - 'bnb_4bit_use_double_quant', 'model_id_or_path' + 'model_type', 'model_revision', 'quant_method', 'quantization_bit', 'dtype', 'bnb_4bit_comp_dtype', + 'bnb_4bit_quant_type', 'bnb_4bit_use_double_quant', 'model_id_or_path' ] for key in imported_keys: value = getattr(self, key) if key in {'dtype', 'bnb_4bit_comp_dtype'} and value != 'AUTO': continue - if key in {'model_type', 'model_revision', 'model_id_or_path'} and value is not None: + if key in {'model_type', 'model_revision', 'model_id_or_path', 'quant_method'} and value is not None: continue setattr(self, key, sft_args.get(key)) @@ -820,8 +820,9 @@ def __post_init__(self) -> None: 'lora does not support `freeze_parameters`, please set `--sft_type full`') assert len(self.additional_trainable_parameters) == 0, ( 'lora does not support `additional_trainable_parameters`, please set `--sft_type full`') - if 'int4' in self.model_type or 'int8' in self.model_type or 'awq' in self.model_type: - assert self.quantization_bit == 0, 'int4, int8 or awq models do not need to be quantized again.' + if is_quant_model(self.model_type): + assert self.quantization_bit == 0, ( + f'{self.model_type} is already a quantized model and does not need to be quantized again.') if self.learning_rate is None: self.learning_rate = 1e-4 if self.save_only_model is None: @@ -1026,7 +1027,7 @@ def _init_training_args(self) -> None: self.training_args = training_args def _handle_pai_compat(self) -> None: - assert is_pai_training_job() is True + assert is_pai_training_job() logger.info('Handle pai compat...') pai_tensorboard_dir = get_pai_tensorboard_dir() if self.logging_dir is None and pai_tensorboard_dir is not None: @@ -1075,7 +1076,8 @@ class InferArguments(ArgumentsBase): model_name: List[str] = field(default_factory=lambda: [None, None], metadata={'help': "e.g. ['小黄', 'Xiao Huang']"}) model_author: List[str] = field( default_factory=lambda: [None, None], metadata={'help': "e.g. ['魔搭', 'ModelScope']"}) - quant_method: Literal['bnb', 'hqq', 'eetq'] = None + # 'awq', 'gptq', 'aqlm' are used for inference on pre-quantized models. + quant_method: Literal['bnb', 'hqq', 'eetq', 'awq', 'gptq', 'aqlm'] = None quantization_bit: Literal[0, 1, 2, 3, 4, 8] = 0 # hqq: 1,2,3,4,8. 
bnb: 4,8 hqq_axis: Literal[0, 1] = 0 hqq_dynamic_config_path: Optional[str] = None @@ -1211,14 +1213,13 @@ def handle_infer_backend(self): if not support_vllm: logger.warning(f'vllm not support `{self.model_type}`') if self.sft_type == 'lora' and not self.vllm_enable_lora: - assert self.merge_lora is True, ('To use VLLM, you need to provide the complete weight parameters. ' - 'Please set `--merge_lora true`.') + assert self.merge_lora, ('To use VLLM, you need to provide the complete weight parameters. ' + 'Please set `--merge_lora true`.') if (self.infer_backend == 'vllm' and self.vllm_enable_lora or self.infer_backend == 'pt' and isinstance(self, DeployArguments) and self.sft_type == 'lora'): assert self.ckpt_dir is not None self.lora_modules.append(f'default-lora={self.ckpt_dir}') self.lora_request_list = _parse_lora_modules(self.lora_modules, self.infer_backend == 'vllm') - logger.info(f'args.lora_request_list: {self.lora_request_list}') template_info = TEMPLATE_MAPPING[self.template_type] if self.num_beams != 1: @@ -1236,7 +1237,7 @@ def load_from_ckpt_dir(self) -> None: with open(sft_args_path, 'r', encoding='utf-8') as f: sft_args = json.load(f) imported_keys = [ - 'model_type', 'model_revision', 'sft_type', 'template_type', 'system', 'quantization_bit', + 'model_type', 'model_revision', 'sft_type', 'template_type', 'system', 'quant_method', 'quantization_bit', 'bnb_4bit_comp_dtype', 'bnb_4bit_quant_type', 'bnb_4bit_use_double_quant', 'rope_scaling' ] if self.load_dataset_config: @@ -1248,7 +1249,7 @@ def load_from_ckpt_dir(self) -> None: value = getattr(self, key) if key in {'dataset', 'val_dataset'} and len(value) > 0: continue - if key in {'dataset_test_ratio', 'system'} and value is not None: + if key in {'dataset_test_ratio', 'system', 'quant_method'} and value is not None: continue setattr(self, key, sft_args.get(key)) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 84f5fef4c..0bee8cd5a 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -6,7 +6,7 @@ from contextlib import nullcontext from functools import partial, update_wrapper, wraps from types import MethodType -from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Literal, NamedTuple, Optional, Tuple, Type, Union import torch import torch.distributed as dist @@ -910,12 +910,13 @@ def get_model_tokenizer_from_repo(model_dir: str, trust_remote_code=True, ) else: - logger.info(f'Model loading with args: model_dir: {model_dir},' - f'torch_dtype: {torch_dtype},' - f'model_kwargs: {model_kwargs}') + logger.info(f'model_kwargs: {model_kwargs}') with context: model = automodel_class.from_pretrained( model_dir, config=model_config, torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs) + model.is_gptq = is_gptq + model.is_awq = is_awq + model.is_aqlm = is_aqlm if hasattr(model, 'hf_device_map'): logger.debug(f'Model hf_device_map: {model.hf_device_map}') return model, tokenizer @@ -1109,7 +1110,7 @@ def get_model_tokenizer_cogagent(model_dir: str, load_model: bool = True, **kwargs): tokenizer = AutoTokenizer.from_pretrained('AI-ModelScope/vicuna-7b-v1.5', revision='master', trust_remote_code=True) - if load_model is True: + if load_model: logger.warning('CogAgent with FusedLayerNorm will cause an training loss of NAN, ' 'to avoid this, please uninstall apex.') model, tokenizer = get_model_tokenizer_from_repo( @@ -4932,6 +4933,7 @@ def get_model_tokenizer(model_type: str, *, model_id_or_path: 
Optional[str] = None, revision: Optional[str] = None, + quant_method: Literal['gptq', 'awq', 'aqlm', None] = None, **kwargs) -> Tuple[Optional[PreTrainedModel], PreTrainedTokenizerBase]: """ torch_dtype: If you use None, it will retrieve the torch_dtype from the config.json file. @@ -4949,10 +4951,14 @@ def get_model_tokenizer(model_type: str, get_function = model_info['get_function'] if model_kwargs is None: model_kwargs = {} - if 'device_map' not in model_kwargs and not use_torchacc(): - model_kwargs['device_map'] = 'auto' if load_model: + if 'device_map' not in model_kwargs and not use_torchacc(): + model_kwargs['device_map'] = 'auto' + for k in ['gptq', 'awq', 'aqlm']: + if quant_method == k: + kwargs[f'is_{k}'] = True + break if model_info.get('torch_dtype') is not None: model_torch_dtype = model_info['torch_dtype'] if torch_dtype is None: @@ -4969,6 +4975,7 @@ def get_model_tokenizer(model_type: str, and quantization_config.bnb_4bit_compute_dtype is None): quantization_config.bnb_4bit_compute_dtype = torch_dtype logger.info(f'Setting quantization_config.bnb_4bit_compute_dtype: {torch_dtype}') + kwargs['eos_token'] = model_info['eos_token'] pad_token = model_info.get('pad_token') if pad_token is not None: diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 65f0557a1..b5a44c857 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -424,6 +424,19 @@ def find_embedding(model: Module) -> List[str]: return _find_layers(model, torch.nn.Embedding) +def is_quant_model(model_type: Optional[str] = None, model=None) -> bool: + # Check if the model is gptq, awq, aqlm model. Do not check for other quantization situations such as bnb. + if model_type is not None: + for k in ['int4', 'int8', 'awq', 'aqlm']: + if k in model_type: + return True + if model is not None: + for k in ['gptq', 'awq', 'aqlm']: + if getattr(model, f'is_{k}', None): + return True + return False + + def find_all_linears(model: Module, quantization_bit: int, model_type: str) -> List[str]: """ref: https://github.com/artidoro/qlora""" head_module_name = 'lm_head' @@ -734,7 +747,7 @@ def inference(model: PreTrainedModel, if generation_config is None: generation_config = getattr(model, 'generation_config', None) generation_config = deepcopy(generation_config) - if stream is True and verbose is False: + if stream and not verbose: logger.warning('Please set verbose to True to support TextStreamer, or use `inference_stream.`') stream = False streamer = None diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index 747fe7f31..a2820e9ef 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -227,7 +227,7 @@ def inference_stream_vllm(llm_engine: LLMEngine, assert isinstance(generation_config, VllmGenerationConfig) request_list = deepcopy(request_list) generation_config = deepcopy(generation_config) - if generation_config.use_beam_search is True: + if generation_config.use_beam_search: error_msg = 'Streaming generation does not support beam search.' 
raise ValueError(error_msg) @@ -368,7 +368,7 @@ def inference_vllm(llm_engine: LLMEngine, else: llm_engine.add_request(str(i), None, generation_config, input_ids, **add_request_kwargs) - if use_tqdm is True: + if use_tqdm: assert verbose is False prog_bar = tqdm(total=len(request_list), dynamic_ncols=True, disable=not use_tqdm) outputs = [] diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 2eb02d954..f10e1a086 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -83,7 +83,7 @@ def is_local_master(): def is_master(): - rank = int(os.getenv('RANK', -1)) + rank = get_dist_setting()[0] return rank in {-1, 0} From ab51afd1cae827f5c9303e73f811630d76b0f601 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 26 Jun 2024 10:39:45 +0800 Subject: [PATCH 08/15] fix readme --- README.md | 6 +++--- README_CN.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8dca51442..b746ce5be 100644 --- a/README.md +++ b/README.md @@ -388,7 +388,7 @@ NODE_RANK=0 \ MASTER_ADDR=127.0.0.1 \ NPROC_PER_NODE=8 \ swift sft \ - --model_id_or_path qwen1half-32b-chat \ + --model_type qwen1half-32b-chat \ --sft_type full \ --dataset blossom-math-zh \ --output_dir output \ @@ -401,7 +401,7 @@ NODE_RANK=1 \ MASTER_ADDR=xxx.xxx.xxx.xxx \ NPROC_PER_NODE=8 \ swift sft \ - --model_id_or_path qwen1half-32b-chat \ + --model_type qwen1half-32b-chat \ --sft_type full \ --dataset blossom-math-zh \ --output_dir output \ @@ -415,7 +415,7 @@ In DLC product, WORLD_SIZE is the node number, RANK is the node index, this is d NNODES=$WORLD_SIZE \ NODE_RANK=$RANK \ swift sft \ - --model_id_or_path qwen1half-32b-chat \ + --model_type qwen1half-32b-chat \ --sft_type full \ --dataset blossom-math-zh \ --output_dir output \ diff --git a/README_CN.md b/README_CN.md index 95c6e9905..fb0f48cc5 100644 --- a/README_CN.md +++ b/README_CN.md @@ -385,7 +385,7 @@ NODE_RANK=0 \ MASTER_ADDR=127.0.0.1 \ NPROC_PER_NODE=8 \ swift sft \ - --model_id_or_path qwen1half-32b-chat \ + --model_type qwen1half-32b-chat \ --sft_type full \ --dataset blossom-math-zh \ --output_dir output \ @@ -398,7 +398,7 @@ NODE_RANK=1 \ MASTER_ADDR=xxx.xxx.xxx.xxx \ NPROC_PER_NODE=8 \ swift sft \ - --model_id_or_path qwen1half-32b-chat \ + --model_type qwen1half-32b-chat \ --sft_type full \ --dataset blossom-math-zh \ --output_dir output \ @@ -411,7 +411,7 @@ DLC环境变量中,WORLD_SIZE指代node数量,RANK指代node序号,这一 NNODES=$WORLD_SIZE \ NODE_RANK=$RANK \ swift sft \ - --model_id_or_path qwen1half-32b-chat \ + --model_type qwen1half-32b-chat \ --sft_type full \ --dataset blossom-math-zh \ --output_dir output \ From a3a52d01f28cb2f79e211f95699b9436b0c78d98 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:58:46 +0800 Subject: [PATCH 09/15] Add new dataset (#1227) --- swift/llm/utils/dataset.py | 77 +++++++++++++++++++++++++++++++++++++- swift/llm/utils/media.py | 11 +++--- 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index b54e432c6..63bf5053b 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -21,7 +21,7 @@ from swift.utils import get_logger, get_seed, is_dist, is_local_master, read_from_jsonl, transform_jsonl_to_df from swift.utils.torch_utils import _find_local_mac -from .media import MediaCache +from .media import MediaCache, MediaTag from .preprocess import (AlpacaPreprocessor, ClsPreprocessor, ComposePreprocessor, 
ConversationsPreprocessor, ListPreprocessor, PreprocessFunc, RenameColumnsPreprocessor, SmartPreprocessor, TextGenerationPreprocessor, preprocess_sharegpt) @@ -162,6 +162,8 @@ class DatasetName: midefics = 'midefics' gqa = 'gqa' text_caps = 'text-caps' + refcoco_unofficial_caption = 'refcoco-unofficial-caption' + refcoco_unofficial_grounding = 'refcoco-unofficial-grounding' a_okvqa = 'a-okvqa' okvqa = 'okvqa' ocr_vqa = 'ocr-vqa' @@ -1112,6 +1114,79 @@ def preprocess(row): load_from_cache_file=False).filter(lambda row: row.get('response')).rename_columns({'image': 'images'}) +def preprocess_refcoco_unofficial_caption(dataset): + + cache_dir = MediaCache.download( + 'https://www.modelscope.cn/api/v1/datasets/we_dont_produce_water/' + 'coco_res/repo?Revision=master&FilePath=coco_2014.zip', 'coco2014') + + def preprocess(row): + caption = row['captions'][0] + bbox = row['bbox'] + image_path = os.path.join(cache_dir, row['image_path'].replace('coco/train2014', 'train2014')) + media_tag = MediaTag(media_type='image', task_type='grounding_caption') + for i in range(len(bbox)): + bbox[i] = round(float(bbox[i])) + res = {} + + objects = [[caption, bbox]] + media_tag(res, [image_path]) + res['images'] = [image_path] + res['objects'] = json.dumps(objects) + if not os.path.exists(image_path): + res['response'] = '' + return res + + return dataset.map(preprocess, load_from_cache_file=False).filter(lambda row: row.get('response')) + + +register_dataset( + DatasetName.refcoco_unofficial, + 'swift/refcoco', [], + preprocess_func=preprocess_refcoco_unofficial_caption, + get_function=get_dataset_from_repo, + split=['train', 'validation'], + hf_dataset_id='jxu124/refcoco', + huge_dataset=True, + tags=['multi-modal', 'en', 'caption']) + + +def preprocess_refcoco_unofficial_grounding(dataset): + + cache_dir = MediaCache.download( + 'https://www.modelscope.cn/api/v1/datasets/we_dont_produce_water/' + 'coco_res/repo?Revision=master&FilePath=coco_2014.zip', 'coco2014') + + def preprocess(row): + caption = row['captions'][0] + bbox = row['bbox'] + image_path = os.path.join(cache_dir, row['image_path'].replace('coco/train2014', 'train2014')) + media_tag = MediaTag(media_type='image', task_type='ref_grounding') + for i in range(len(bbox)): + bbox[i] = round(float(bbox[i])) + res = {} + + objects = [[caption, bbox]] + media_tag(res, [image_path]) + res['images'] = [image_path] + res['objects'] = json.dumps(objects) + if not os.path.exists(image_path): + res['response'] = '' + return res + + return dataset.map(preprocess, load_from_cache_file=False).filter(lambda row: row.get('response')) + + +register_dataset( + DatasetName.refcoco_unofficial_grounding, + 'swift/refcoco', [], + preprocess_func=preprocess_refcoco_unofficial_grounding, + get_function=get_dataset_from_repo, + split=['train', 'validation'], + hf_dataset_id='jxu124/refcoco', + huge_dataset=True, + tags=['multi-modal', 'en', 'grounding']) + register_dataset( DatasetName.text_caps, 'swift/TextCaps', [], diff --git a/swift/llm/utils/media.py b/swift/llm/utils/media.py index 5b26da107..bb1453291 100644 --- a/swift/llm/utils/media.py +++ b/swift/llm/utils/media.py @@ -24,7 +24,7 @@ class MediaTag: ('', ''), ('The object at position ', ''), ('This is', ''), - ('What is the thing at ', ''), + ('What is the object at ', ''), ('Describe ', ''), (' is', ''), ('The bounding box coordinate contains', ''), @@ -62,14 +62,13 @@ def __init__(self, self.task_type = task_type self.media_tag = media_tag or '' - def __call__(self, d: Dict[str, Any], medias: Union[tuple, 
list], objects: List = None) -> None: + def __call__(self, d: Dict[str, Any], medias: Union[tuple, list]) -> None: """Format the query/response/history with medias Args: d: A dict contains history/query/response medias: A list of medias(one round, multiple medias), a single media(one round, one media), or a tuple of media list(multiple rounds) - objects: A list of object-bbox pairs(one round), or a tuple of object-bbox lists(multiple rounds) """ if not self.media_type: return @@ -83,7 +82,8 @@ def __call__(self, d: Dict[str, Any], medias: Union[tuple, list], objects: List pass elif self.task_type in ('ref_grounding', 'grounding_caption'): lang = np.random.choice(['en', 'zh'], p=[0.8, 0.2]) - query, response = np.random.choice(self.task_prompts[self.task_type][lang]) + prompts = self.task_prompts[self.task_type][lang] + query, response = prompts[np.random.choice(range(len(prompts)))] elif self.task_type == 'ocr': raise NotImplementedError else: @@ -101,8 +101,7 @@ def __call__(self, d: Dict[str, Any], medias: Union[tuple, list], objects: List if 'history' in d: d['history'] = history d['query'] = query - if 'response' in d: - d['response'] = response + d['response'] = response class MediaCache: From ff24d84cb16b07b7527619214344ba0da738a8f0 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:37:44 +0800 Subject: [PATCH 10/15] fix (#1232) --- swift/llm/utils/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 63bf5053b..bd4716a79 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1141,7 +1141,7 @@ def preprocess(row): register_dataset( - DatasetName.refcoco_unofficial, + DatasetName.refcoco_unofficial_caption, 'swift/refcoco', [], preprocess_func=preprocess_refcoco_unofficial_caption, get_function=get_dataset_from_repo, From d8682d038d013b933c20ed4d0fda4abd3f42af56 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:20:00 +0800 Subject: [PATCH 11/15] Fix bugs (#1241) --- .../DPO\350\256\255\347\273\203\346\226\207\346\241\243.md" | 2 +- docs/source_en/LLM/DPO.md | 2 +- swift/llm/utils/utils.py | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git "a/docs/source/LLM/DPO\350\256\255\347\273\203\346\226\207\346\241\243.md" "b/docs/source/LLM/DPO\350\256\255\347\273\203\346\226\207\346\241\243.md" index fa5a95361..172fe650d 100644 --- "a/docs/source/LLM/DPO\350\256\255\347\273\203\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/DPO\350\256\255\347\273\203\346\226\207\346\241\243.md" @@ -80,7 +80,7 @@ cd examples/pytorch/llm - 如果用带有history的数据训练base模型,需要指定支持多轮对话的template(base模型往往不支持多轮对话),对于这种情况我们默认设置了`chatml`template,你也可以支持--model_type 来选择训练模型的template - 我们默认在训练时设置`--gradient_checkpointing true`来**节约显存**, 这会略微降低训练速度. - 如果你使用的是**V100**等较老的GPU, 你需要设置`--dtype AUTO`或者`--dtype fp16`, 因为其不支持bf16. -- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[**flash-attn**](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). 支持flash-attn的模型可以查看[LLM支持的模型](支持的模型和数据集.md#模型) +- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[**flash-attn**](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(3090, V100等显卡不支持flash-attn进行训练). 支持flash-attn的模型可以查看[LLM支持的模型](支持的模型和数据集.md#模型) - 如果你需要断网进行训练, 请使用`--model_id_or_path `和设置`--check_model_is_latest false`. 具体参数含义请查看[命令行参数](命令行参数.md). 
- 如果你想在训练时, 将权重push到ModelScope Hub中, 你需要设置`--push_to_hub true`. diff --git a/docs/source_en/LLM/DPO.md b/docs/source_en/LLM/DPO.md index 0bae5a075..ee8c013a5 100644 --- a/docs/source_en/LLM/DPO.md +++ b/docs/source_en/LLM/DPO.md @@ -78,7 +78,7 @@ cd examples/pytorch/llm - We default to setting `--gradient_checkpointing true` during training to **save memory**, which will slightly reduce training speed. - If you are using older GPUs such as **V100**, you need to set `--dtype AUTO` or `--dtype fp16`, because they do not support bf16. -- If your machine has high-performance graphics cards like A100 and you are using the qwen series models, we recommend installing [**flash-attn**](https://github.com/Dao-AILab/flash-attention), which will speed up training and inference as well as reduce memory usage (A10, 3090, V100, etc. graphics cards do not support training with flash-attn). Models that support flash-attn can be viewed in [LLM Supported Models](Supported-models-datasets.md#models) +- If your machine has high-performance graphics cards like A100 and you are using the qwen series models, we recommend installing [**flash-attn**](https://github.com/Dao-AILab/flash-attention), which will speed up training and inference as well as reduce memory usage (3090, V100, etc. graphics cards do not support training with flash-attn). Models that support flash-attn can be viewed in [LLM Supported Models](Supported-models-datasets.md#models) - If you need to train offline, please use `--model_id_or_path ` and set `--check_model_is_latest false`. For specific parameter meanings, please see [Command Line Arguments](Command-line-parameters.md). - If you want to push weights to the ModelScope Hub during training, you need to set `--push_to_hub true`. diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index b5a44c857..ab28d3c55 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -269,7 +269,10 @@ def _try_fetch(self, first_idx: int) -> Optional[Dict[str, Any]]: idx = np.random.permutation(len(self))[:self.try_fetch_time - 1] for i in [first_idx] + idx.tolist(): data = self.dataset[i] - res = self.template.encode(data) + try: + res = self.template.encode(data) + except OSError: + continue if len(res[0]) > 0: return res From 8728264522312d097c332631964e6bab53e2eb28 Mon Sep 17 00:00:00 2001 From: Jintao Date: Thu, 27 Jun 2024 16:48:13 +0800 Subject: [PATCH 12/15] fix bugs (#1242) --- ...14\346\225\260\346\215\256\351\233\206.md" | 8 +- .../LLM/Supported-models-datasets.md | 8 +- requirements/llm.txt | 1 + swift/llm/eval.py | 8 +- swift/llm/infer.py | 2 +- swift/llm/sft.py | 14 ++-- swift/llm/utils/client_utils.py | 2 +- swift/llm/utils/model.py | 9 ++- swift/llm/utils/template.py | 73 ++++++++----------- 9 files changed, 59 insertions(+), 66 deletions(-) diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index b9cc49db5..f5c20d029 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -162,7 +162,7 @@ |yi-1_5-6b-chat|[01ai/Yi-1.5-6B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-6B-Chat/summary)|q_proj, k_proj, 
v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat)| |yi-1_5-9b|[01ai/Yi-1.5-9B](https://modelscope.cn/models/01ai/Yi-1.5-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-9B](https://huggingface.co/01-ai/Yi-1.5-9B)| |yi-1_5-9b-chat|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)| -|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)| +|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)| |yi-1_5-34b|[01ai/Yi-1.5-34B](https://modelscope.cn/models/01ai/Yi-1.5-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-34B](https://huggingface.co/01-ai/Yi-1.5-34B)| |yi-1_5-34b-chat|[01ai/Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat)| |yi-1_5-34b-chat-16k|[01ai/Yi-1.5-34B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-34B-Chat-16K)| @@ -311,7 +311,7 @@ | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | |qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen-vl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| @@ -330,8 +330,8 @@ |internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)| |mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, 
timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)| |mini-internvl-chat-4b-v1_5|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5/summary)|qkv_proj|internvl-phi3|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)| -|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| -|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| +|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| +|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)| |paligemma-3b-pt-448|[AI-ModelScope/paligemma-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-448/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-448](https://huggingface.co/google/paligemma-3b-pt-448)| |paligemma-3b-pt-896|[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)| diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 6ffb58341..49ed03fb4 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -162,7 +162,7 @@ The table below introcudes all models supported by SWIFT: |yi-1_5-6b-chat|[01ai/Yi-1.5-6B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-6B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat)| |yi-1_5-9b|[01ai/Yi-1.5-9B](https://modelscope.cn/models/01ai/Yi-1.5-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-9B](https://huggingface.co/01-ai/Yi-1.5-9B)| |yi-1_5-9b-chat|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)| -|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)| 
+|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)| |yi-1_5-34b|[01ai/Yi-1.5-34B](https://modelscope.cn/models/01ai/Yi-1.5-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-34B](https://huggingface.co/01-ai/Yi-1.5-34B)| |yi-1_5-34b-chat|[01ai/Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat)| |yi-1_5-34b-chat-16k|[01ai/Yi-1.5-34B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-34B-Chat-16K)| @@ -311,7 +311,7 @@ The table below introcudes all models supported by SWIFT: | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | |qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen-vl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| @@ -330,8 +330,8 @@ The table below introcudes all models supported by SWIFT: |internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)| |mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)| |mini-internvl-chat-4b-v1_5|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5/summary)|qkv_proj|internvl-phi3|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)| -|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, 
v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| -|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| +|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| +|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)| |paligemma-3b-pt-448|[AI-ModelScope/paligemma-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-448/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-448](https://huggingface.co/google/paligemma-3b-pt-448)| |paligemma-3b-pt-896|[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)| diff --git a/requirements/llm.txt b/requirements/llm.txt index 773a104c8..c5e3f812a 100644 --- a/requirements/llm.txt +++ b/requirements/llm.txt @@ -1,3 +1,4 @@ +attrdict charset_normalizer cpm_kernels fastapi diff --git a/swift/llm/eval.py b/swift/llm/eval.py index 8d3c85406..f04f8b9d9 100644 --- a/swift/llm/eval.py +++ b/swift/llm/eval.py @@ -6,7 +6,11 @@ from typing import Any, Dict, List, Optional, Tuple import json +from llmuses.config import TaskConfig +from llmuses.constants import DEFAULT_ROOT_CACHE_DIR from llmuses.models.custom import CustomModel +from llmuses.run import run_task +from llmuses.summarizer import Summarizer from modelscope import GenerationConfig from tqdm import tqdm @@ -130,9 +134,6 @@ def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]: def llm_eval(args: EvalArguments) -> List[Dict[str, Any]]: - from llmuses.run import run_task - from llmuses.config import TaskConfig - from llmuses.summarizer import Summarizer logger.info(f'args: {args}') seed_everything(args.seed) model_name = args.model_type @@ -150,6 +151,7 @@ def llm_eval(args: EvalArguments) -> List[Dict[str, Any]]: task_configs = TaskConfig.load(custom_model=eval_model, tasks=args.eval_dataset + custom_names) for task_config in task_configs: + task_config.dataset_dir = DEFAULT_ROOT_CACHE_DIR task_config.use_cache = args.eval_use_cache if args.eval_limit is not None: task_config.limit = args.eval_limit diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 0db622153..4ba517d9b 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -369,7 +369,7 @@ def llm_infer(args: InferArguments) -> Dict[str, List[Dict[str, Any]]]: if system is None and template.use_default_system: system = template.default_system if args.infer_backend == 'vllm': - request_list = 
[{'query': query, 'history': history, 'system': system}] + request_list = [{'query': query, 'history': history, 'system': system, **infer_kwargs}] if args.stream: gen = inference_stream_vllm(llm_engine, template, request_list, lora_request=lora_request) print_idx = 0 diff --git a/swift/llm/sft.py b/swift/llm/sft.py index 641bf3616..bf58d596e 100644 --- a/swift/llm/sft.py +++ b/swift/llm/sft.py @@ -190,10 +190,6 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]: logger.info(f'train_dataset: {train_dataset}') logger.info(f'val_dataset: {val_dataset}') template_kwargs = {} - template_info = TEMPLATE_MAPPING[args.template_type] - use_model = template_info.get('use_model', False) - if use_model: - template_kwargs['model'] = model template_kwargs['use_loss_scale'] = args.use_loss_scale if args.loss_scale_config_path is not None: cwd = os.getcwd() @@ -204,8 +200,14 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]: template_kwargs['tools_prompt'] = args.tools_prompt if args.sequence_parallel_size and args.sequence_parallel_size > 1: template_kwargs['sequence_parallel_size'] = args.sequence_parallel_size - template: Template = get_template(args.template_type, tokenizer, args.system, args.max_length, - args.truncation_strategy, **template_kwargs) + template: Template = get_template( + args.template_type, + tokenizer, + args.system, + args.max_length, + args.truncation_strategy, + model=model, + **template_kwargs) args.system = template.default_system logger.info(f'system: {args.system}') logger.info(f'args.lazy_tokenize: {args.lazy_tokenize}') diff --git a/swift/llm/utils/client_utils.py b/swift/llm/utils/client_utils.py index 7ee3f3679..e80f6887b 100644 --- a/swift/llm/utils/client_utils.py +++ b/swift/llm/utils/client_utils.py @@ -33,7 +33,7 @@ def _parse_stream_data(data: bytes) -> Optional[str]: data = data.strip() if len(data) == 0: return - assert data.startswith('data:') + assert data.startswith('data:'), f'data: {data}' return data[5:].strip() diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 0bee8cd5a..8dfe89b56 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -914,6 +914,9 @@ def get_model_tokenizer_from_repo(model_dir: str, with context: model = automodel_class.from_pretrained( model_dir, config=model_config, torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs) + if is_training: + model.train() + model.requires_grad_(True) model.is_gptq = is_gptq model.is_awq = is_awq model.is_aqlm = is_aqlm @@ -2153,7 +2156,7 @@ def _output_device_map_hook(module, input, output): hf_model_id='01-ai/Yi-1.5-9B-Chat') @register_model( ModelType.yi_1_5_9b_chat_16k, - '01ai/Yi-1.5-9B-Chat', + '01ai/Yi-1.5-9B-Chat-16K', LoRATM.llama, TemplateType.yi1_5, support_flash_attn=True, @@ -3537,7 +3540,6 @@ def _new_forward(*args, **kwargs) -> Tensor: TemplateType.deepseek_vl, support_flash_attn=True, tags=['multi-modal', 'vision'], - requires=['attrdict'], hf_model_id='deepseek-ai/deepseek-vl-7b-chat') @register_model( ModelType.deepseek_vl_1_3b_chat, @@ -3546,7 +3548,6 @@ def _new_forward(*args, **kwargs) -> Tensor: TemplateType.deepseek_vl, support_flash_attn=True, tags=['multi-modal', 'vision'], - requires=['attrdict'], hf_model_id='deepseek-ai/deepseek-vl-1.3b-chat') def get_model_tokenizer_deepseek_vl(model_dir: str, torch_dtype: Dtype, @@ -4003,7 +4004,7 @@ def _qwen_vl_audio_decode(self, *args, skip_special_tokens=False, **kwargs) -> s ModelType.qwen_vl_chat, 'qwen/Qwen-VL-Chat', LoRATM.qwen, - TemplateType.qwenvl, + 
TemplateType.qwen_vl, support_flash_attn=True, tags=['multi-modal', 'vision'], hf_model_id='Qwen/Qwen-VL-Chat') diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 8ad46403d..718ce0a1d 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -32,7 +32,7 @@ class TemplateType: # chat default = 'default' qwen = 'qwen' - qwenvl = 'qwenvl' + qwen_vl = 'qwen-vl' qwen_audio = 'qwen-audio' modelscope_agent = 'modelscope-agent' baichuan = 'baichuan' @@ -267,11 +267,8 @@ def add_default_tags(self, example: Dict[str, Any]) -> None: h[0] = media_tag + h[0] if example[media_key][-1]: query = media_tag + query - example[media_key] = [m for m in example[media_key] if m] else: - example[media_key] = [m for m in example[media_key] if m] - media_len = len(example[media_key]) if isinstance(example[media_key], - (tuple, list)) else 1 if example[media_key] else 0 + media_len = len([m for m in example[media_key] if m]) if history: history[0][0] = media_tag * media_len + history[0][0] else: @@ -282,6 +279,7 @@ def add_default_tags(self, example: Dict[str, Any]) -> None: def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: """return: inputs, tokenizer_kwargs""" + example = example.copy() if not self._is_init: raise ValueError( 'Template is not initialized, please use the `get_template` function to obtain the template.') @@ -783,7 +781,7 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: register_template(TemplateType.qwen, QwenTemplate()) -register_template(TemplateType.qwenvl, QwenVLTemplate()) +register_template(TemplateType.qwen_vl, QwenVLTemplate()) register_template(TemplateType.chatml, QwenTemplate(auto_add_bos=True)) register_template( @@ -878,6 +876,15 @@ def _read_from_path(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image return image +def _read_batch(path_list: List[Union[str, 'PIL.Image.Image', None]]) -> List['PIL.Image.Image']: + res = [] + for path in path_list: + if path is None: # ignore None + continue + res.append(_read_from_path(path)) + return res + + class YiVLTemplate(Template): def replace_tag(self, media_type, index, example) -> List[Context]: @@ -895,12 +902,11 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any model = model.model image_processor = model.vision_tower.image_processor images_path = example.get('images') or [] - images = [] - for image_path in images_path: - image = _read_from_path(image_path) + images = _read_batch(images_path) + for i, image in enumerate(images): background_color = tuple(int(x * 255) for x in image_processor.image_mean) image = expand2square(image, background_color) - images.append(image) + images[i] = image if images: image_tensor = image_processor.preprocess(images, return_tensors='pt')['pixel_values'] inputs['images'] = image_tensor.to(model.dtype) @@ -1125,13 +1131,12 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any history = [] example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '') inputs, _ = super().encode(example) - images = [] dtype = self.model.dtype images_path.extend(example.get('images') or []) - for image_path in images_path: - image = _read_from_path(image_path) + images = _read_batch(images_path) + for i, image in enumerate(images): image = self.model.vis_processor(image) - images.append(image.to(dtype)) + images[i] = image.to(dtype) if len(inputs) == 0: return inputs, {} inputs.pop('loss_scale', None) @@ -1208,11 
+1213,7 @@ class InternvlTemplate(Template): def __init__(self): super().__init__([''], ['<|im_start|>user\n{{QUERY}}<|im_end|><|im_start|>assistant\n'], ['<|im_end|>'], - ['<|im_end|>'], self.system, ['<|im_start|>system\n{{SYSTEM}}<|im_end|>']) - - def check_example(self, example): - images = example.get('images') or [] - assert len(images) <= 1 + ['<|im_end|>'], self.system, ['<|im_start|>system\n{{SYSTEM}}<|im_end|>']) def replace_tag(self, media_type, index, example) -> List[Context]: assert media_type == 'image' @@ -1237,12 +1238,12 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any pixel_values = torch.cat(pixel_values, dim=0) image_bs = pixel_values.shape[0] - idx = idx_list[0] + idx, idx2 = idx_list[0], idx_list[-1] # remove [-100, -100] img_tokens: List[int] = self.tokenizer.encode('' + '' * self.num_image_token * image_bs + '\n') - input_ids = input_ids[:idx] + img_tokens + input_ids[idx + 1:] + input_ids = input_ids[:idx] + img_tokens + input_ids[idx2 + 1:] if labels is not None: - labels = labels[:idx] + [-100] * len(img_tokens) + labels[idx + 1:] + labels = labels[:idx] + [-100] * len(img_tokens) + labels[idx2 + 1:] inputs['input_ids'] = input_ids inputs['labels'] = labels @@ -1336,10 +1337,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if len(inputs) == 0: return inputs, {} images_path = example.get('images') or [] - images = [] - for image_path in images_path: - image = _read_from_path(image_path) - images.append(image) + images = _read_batch(images_path) image_processor = self.tokenizer.processor.image_processor if images: inputs['pixel_values'] = image_processor(images, return_tensors='pt')['pixel_values'].to(self.model.dtype) @@ -1367,10 +1365,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if len(inputs) == 0: return inputs, {} images_path = example.get('images') or [] - images = [] - for image_path in images_path: - image = _read_from_path(image_path) - images.append(image) + images = _read_batch(images_path) image_sizes = [x.size for x in images] from llava.mm_utils import process_images model = self.model.model @@ -1505,10 +1500,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any history = [] example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '') images_path.extend(example.get('images') or []) - images = [] - for image_path in images_path: - image = _read_from_path(image_path) - images.append(image) + images = _read_batch(images_path) inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -1619,11 +1611,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any images_path.extend(example.get('images') or []) if len(inputs) == 0: return inputs, {} - images = [] - for image_path in images_path: - image = _read_from_path(image_path) - images.append(image) - + images = _read_batch(images_path) processor = self.tokenizer.processor input_ids, labels = inputs['input_ids'], inputs['labels'] idx_list = _findall(input_ids, processor.image_id) @@ -1921,13 +1909,12 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any from mplug_owl2.mm_utils import process_images processor = self.tokenizer.processor images_path = example.get('images') or [] - images = [] - for image_path in images_path: - image = _read_from_path(image_path) + images = _read_batch(images_path) + for i, image in enumerate(images): # ref: 
https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary max_edge = max(image.size) image = image.resize((max_edge, max_edge)) - images.append(image) + images[i] = image inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} From da582550769f80dc95807f8fd65687e4be75999b Mon Sep 17 00:00:00 2001 From: Jintao Date: Fri, 28 Jun 2024 11:13:32 +0800 Subject: [PATCH 13/15] refactor inference (#1245) --- swift/llm/infer.py | 2 +- swift/llm/utils/argument.py | 2 +- swift/llm/utils/preprocess.py | 2 +- swift/llm/utils/utils.py | 205 ++++++++++++++++------------------ swift/llm/utils/vllm_utils.py | 126 +++++++++------------ swift/trainers/mixin.py | 2 +- 6 files changed, 152 insertions(+), 187 deletions(-) diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 4ba517d9b..25a9a2c13 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -243,7 +243,7 @@ def prepare_model_template(args: InferArguments, def read_media_file(infer_kwargs: Dict[str, Any], infer_media_type: Literal['none', 'round', 'dialogue']) -> None: text = 'Input a media path or URL <<< ' - images = infer_kwargs.get('images', []) + images = infer_kwargs.get('images') or [] if infer_media_type == 'none': return if infer_media_type == 'round' or len(images) == 0: diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 2af040ea4..066ae12bb 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -1299,7 +1299,7 @@ class DeployArguments(InferArguments): def __post_init__(self): super().__post_init__() model_info = MODEL_MAPPING[self.model_type] - tags = model_info.get('tags', []) + tags = model_info.get('tags') or [] self.is_multimodal = 'multi-modal' in tags diff --git a/swift/llm/utils/preprocess.py b/swift/llm/utils/preprocess.py index a2af4021a..2436b0e91 100644 --- a/swift/llm/utils/preprocess.py +++ b/swift/llm/utils/preprocess.py @@ -197,7 +197,7 @@ def preprocess(self, d: Dict[str, Any]) -> Dict[str, Any]: response = conversations[-1][self.value_key] system = sys history = h - tools = d.get('tools', []) + tools = d.get('tools') or [] row = {'system': system, 'history': history, 'history_roles': hr} row.update({ 'query': query, diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index ab28d3c55..86f5f557d 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -542,38 +542,22 @@ def __next__(self) -> List[int]: return value -@torch.inference_mode() -def inference_stream(model: PreTrainedModel, - template: Template, - query: str, - history: Optional[History] = None, - system: Optional[str] = None, - images: Optional[List[str]] = None, - *, - generation_config: Optional[GenerationConfig] = None, - stop_words: Optional[StopWords] = None, - generation_info: Optional[Dict[str, int]] = None, - adapter_names: Optional[List[str]] = None, - **kwargs) -> Iterator[Tuple[str, History]]: - """ - generation_config: Priority: generation_config > model.generation_config. 
- """ +def _prepare_inputs(model: PreTrainedModel, + template: Template, + query: str, + history: History, + system: Optional[str] = None, + images: Optional[List[str]] = None, + *, + generation_config: Optional[GenerationConfig] = None, + stop_words: Optional[StopWords] = None, + adapter_names: Optional[List[str]] = None, + **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any], int]: if stop_words is None: stop_words = [] - if history is None: - history = [] - else: - history = deepcopy(history) if images is None: images = [] - # agent support - is_observation = history[-1][-1].endswith('Observation:') if history and history[-1][-1] else False - if is_observation: - history[-1][-1] = history[-1][-1] + query - act_length = len(history[-1][-1]) - query = None - example = { 'query': query, 'history': history, @@ -587,7 +571,7 @@ def inference_stream(model: PreTrainedModel, truncation_strategy = kwargs.pop('truncation_strategy', 'delete') if len(inputs) == 0 and truncation_strategy == 'delete': # input_ids exceeds `max_length`. Please increase the value of `max_length`. - return '', history + return {}, tokenizer_kwargs, 0 inputs.pop('labels', None) tokenizer = template.tokenizer @@ -606,11 +590,8 @@ def inference_stream(model: PreTrainedModel, inputs['token_type_ids'] = torch.tensor(inputs['token_type_ids'])[None] model.eval() if generation_config is None: - generation_config = getattr(model, 'generation_config', None) + generation_config = getattr(model, 'generation_config') generation_config = deepcopy(generation_config) - if generation_config.num_beams != 1: - error_msg = 'Streaming generation does not support beam search.' - raise ValueError(error_msg) if tokenizer.eos_token_id is not None: generation_config.eos_token_id = tokenizer.eos_token_id @@ -627,21 +608,69 @@ def inference_stream(model: PreTrainedModel, raise AssertionError('Current sentence length exceeds' f'the model max_length: {max_length}') if template.suffix[-1] not in stop_words: stop_words.append(template.suffix[-1]) - stopping_criteria = StoppingCriteriaList([StopWordsCriteria(tokenizer, stop_words, **tokenizer_kwargs)]) inputs = to_device(inputs, device) - if generation_info is not None: - generation_info['num_prompt_tokens'] = token_len if 'inputs_embeds' in inputs: inputs.pop('input_ids', None) - streamer = TokenListIteratorStreamer() if adapter_names is not None: inputs['adapter_names'] = adapter_names - generation_kwargs = { - 'streamer': streamer, - 'generation_config': generation_config, - 'stopping_criteria': stopping_criteria, - **inputs - } + + stopping_criteria = StoppingCriteriaList([StopWordsCriteria(tokenizer, stop_words, **tokenizer_kwargs)]) + inputs['stopping_criteria'] = stopping_criteria + inputs['generation_config'] = generation_config + return inputs, tokenizer_kwargs, token_len + + +@torch.inference_mode() +def inference_stream(model: PreTrainedModel, + template: Template, + query: str, + history: Optional[History] = None, + system: Optional[str] = None, + images: Optional[List[str]] = None, + *, + generation_config: Optional[GenerationConfig] = None, + stop_words: Optional[StopWords] = None, + generation_info: Optional[Dict[str, int]] = None, + adapter_names: Optional[List[str]] = None, + **kwargs) -> Iterator[Tuple[str, History]]: + """ + generation_config: Priority: generation_config > model.generation_config. 
+ """ + if history is None: + history = [] + else: + history = deepcopy(history) + inputs, tokenizer_kwargs, token_len = _prepare_inputs( + model, + template, + query, + history, + system, + images, + generation_config=generation_config, + stop_words=stop_words, + adapter_names=adapter_names, + **kwargs) + if len(inputs) == 0: + return '', history + if generation_info is None: + generation_info = {} + generation_info['num_prompt_tokens'] = token_len + + # agent support + is_observation = history[-1][-1].endswith('Observation:') if history and history[-1][-1] else False + if is_observation: + history[-1][-1] = history[-1][-1] + query + act_length = len(history[-1][-1]) + query = None + + generation_config = inputs['generation_config'] + if generation_config.num_beams != 1: + error_msg = 'Streaming generation does not support beam search.' + raise ValueError(error_msg) + + streamer = TokenListIteratorStreamer() + generation_kwargs = {'streamer': streamer, **inputs} _model_generate = model.generate if is_torch_npu_available(): @@ -667,8 +696,7 @@ def _model_generate(*args, **kwargs): except StopIteration: is_finished = True generate_ids = template.get_generate_ids(torch.tensor(raw_generate_ids)[None], token_len) - if generation_info is not None: - generation_info['num_generated_tokens'] = len(generate_ids) + generation_info['num_generated_tokens'] = len(generate_ids) response = template.generate_ids_to_response( generate_ids, is_finished, @@ -702,58 +730,38 @@ def inference(model: PreTrainedModel, """ generation_config: Priority: generation_config > model.generation_config. """ - if stop_words is None: - stop_words = [] if history is None: history = [] else: history = deepcopy(history) - if images is None: - images = [] + inputs, tokenizer_kwargs, token_len = _prepare_inputs( + model, + template, + query, + history, + system, + images, + generation_config=generation_config, + stop_words=stop_words, + adapter_names=adapter_names, + **kwargs) + if len(inputs) == 0: + return '', history + if generation_info is None: + generation_info = {} + generation_info['num_prompt_tokens'] = token_len + # agent support is_observation = history[-1][-1].endswith('Observation:') if history and history[-1][-1] else False if is_observation: history[-1][-1] = history[-1][-1] + query query = None - example = { - 'query': query, - 'history': history, - 'system': system, - 'images': images, # for vl. str. - 'tools': kwargs.pop('tools', None) - } - template.model = model - inputs, tokenizer_kwargs = template.encode(example) - - truncation_strategy = kwargs.pop('truncation_strategy', 'delete') - if len(inputs) == 0 and truncation_strategy == 'delete': - # input_ids exceeds `max_length`. Please increase the value of `max_length`. 
- return '', history - - inputs.pop('labels', None) - tokenizer = template.tokenizer - device = next(model.parameters()).device - if 'input_ids' in inputs: - input_ids = torch.tensor(inputs['input_ids'])[None] - inputs['input_ids'] = input_ids - token_len = input_ids.shape[1] - if 'inputs_embeds' in inputs: - inputs_embeds = inputs['inputs_embeds'][None] - inputs['inputs_embeds'] = inputs_embeds - token_len = inputs_embeds.shape[1] - - inputs['attention_mask'] = torch.ones(token_len)[None] - if 'token_type_ids' in inputs: - inputs['token_type_ids'] = torch.tensor(inputs['token_type_ids'])[None] - model.eval() - if generation_config is None: - generation_config = getattr(model, 'generation_config', None) - generation_config = deepcopy(generation_config) if stream and not verbose: logger.warning('Please set verbose to True to support TextStreamer, or use `inference_stream.`') stream = False streamer = None + tokenizer = template.tokenizer if stream: streamer = TextStreamer(tokenizer, skip_prompt=True) if verbose: @@ -762,37 +770,12 @@ def inference(model: PreTrainedModel, print( f'{prompt_prefix}{safe_tokenizer_decode(tokenizer, input_ids[0], **tokenizer_kwargs)}{output_prefix}', end='') - elif 'query' in example: - query = example['query'] + else: print(f'[QUERY]{query}\n{output_prefix}', end='') - if tokenizer.eos_token_id is not None: - generation_config.eos_token_id = tokenizer.eos_token_id - if tokenizer.pad_token_id is not None: - generation_config.pad_token_id = tokenizer.pad_token_id - if tokenizer.bos_token_id is not None: - generation_config.bos_token_id = tokenizer.bos_token_id - if generation_config.max_new_tokens is not None: - generation_config.max_length = 20 # fix max_length, max_new_tokens warning - max_length = get_max_model_len(model.config) - if max_length and token_len + generation_config.max_new_tokens > max_length: - generation_config.max_new_tokens = max_length - token_len - if generation_config.max_new_tokens <= 0: - raise AssertionError('Current sentence length exceeds' f'the model max_length: {max_length}') - if template.suffix[-1] not in stop_words: - stop_words.append(template.suffix[-1]) - stopping_criteria = StoppingCriteriaList([StopWordsCriteria(tokenizer, stop_words, **tokenizer_kwargs)]) - inputs = to_device(inputs, device) - if generation_info is not None: - generation_info['num_prompt_tokens'] = token_len - if 'inputs_embeds' in inputs: - inputs.pop('input_ids', None) - if adapter_names is not None: - inputs['adapter_names'] = adapter_names - generate_ids = model.generate( - streamer=streamer, generation_config=generation_config, stopping_criteria=stopping_criteria, **inputs) + + generate_ids = model.generate(streamer=streamer, **inputs) generate_ids = template.get_generate_ids(generate_ids, token_len) - if generation_info is not None: - generation_info['num_generated_tokens'] = len(generate_ids) + generation_info['num_generated_tokens'] = len(generate_ids) if verbose and stream is False: response = tokenizer.decode(generate_ids, **tokenizer_kwargs) print(response) diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index a2820e9ef..bc476efac 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -206,31 +206,22 @@ def __setattr__(self, key: str, value: str) -> None: super().__setattr__(key, value) -@torch.inference_mode() -def inference_stream_vllm(llm_engine: LLMEngine, +def _add_vllm_request(llm_engine: LLMEngine, inputs: Dict[str, Any], *, request_id: str, + generation_config: VllmGenerationConfig, 
**kwargs) -> None: + input_ids = inputs['input_ids'] + if version.parse(vllm.__version__) >= version.parse('0.4.3'): + llm_engine.add_request(request_id, {'prompt_token_ids': input_ids}, generation_config, **kwargs) + else: + llm_engine.add_request(request_id, None, generation_config, input_ids, **kwargs) + + +def _prepare_vllm_request(llm_engine: LLMEngine, template: Template, request_list: List[Dict[str, Any]], *, - generation_config: Optional[VllmGenerationConfig] = None, + generation_config: VllmGenerationConfig, lora_request: Optional['LoRARequest'] = None, - use_tqdm: bool = False, - **kwargs) -> Iterator[List[Dict[str, Any]]]: - """ - request_list: e.g. [{'query': 'hello!'}]. - The keys that can be included are: 'query', 'history', 'system'. - generation_config: Priority: generation_config > model.generation_config. - return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. - The keys to be included will be: 'response', 'history'. - """ - if generation_config is None: - generation_config = getattr(llm_engine, 'generation_config', VllmGenerationConfig()) - assert isinstance(generation_config, VllmGenerationConfig) - request_list = deepcopy(request_list) - generation_config = deepcopy(generation_config) - if generation_config.use_beam_search: - error_msg = 'Streaming generation does not support beam search.' - raise ValueError(error_msg) - + **kwargs) -> Tuple[List[Optional[Dict[str, Any]]], List[Tuple[bool, int]]]: tokenizer = template.tokenizer if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: generation_config.stop.append(tokenizer.eos_token) @@ -248,8 +239,9 @@ def inference_stream_vllm(llm_engine: LLMEngine, else: assert lora_request is None, ( 'The current version of VLLM does not support `lora_request`. Please upgrade VLLM.') - request_temp = [] + resp_list: List[Optional[Dict[str, Any]]] = [None] * len(request_list) + agent_state = [] for i, request in enumerate(request_list): history = request.get('history', None) if history is None: @@ -262,7 +254,7 @@ def inference_stream_vllm(llm_engine: LLMEngine, history[-1][-1] = history[-1][-1] + request['query'] act_length = len(history[-1][-1]) request['query'] = None - request_temp.append((is_observation, act_length)) + agent_state.append((is_observation, act_length)) request['history'] = history inputs = template.encode(request)[0] @@ -271,11 +263,39 @@ def inference_stream_vllm(llm_engine: LLMEngine, # input_ids exceeds `max_length`. Please increase the value of `max_length`. resp_list[i] = {'response': '', 'history': history} continue - input_ids = inputs['input_ids'] - if version.parse(vllm.__version__) >= version.parse('0.4.3'): - llm_engine.add_request(str(i), {'prompt_token_ids': input_ids}, generation_config, **add_request_kwargs) - else: - llm_engine.add_request(str(i), None, generation_config, input_ids, **add_request_kwargs) + + _add_vllm_request( + llm_engine, inputs, request_id=str(i), generation_config=generation_config, **add_request_kwargs) + return resp_list, agent_state + + +@torch.inference_mode() +def inference_stream_vllm(llm_engine: LLMEngine, + template: Template, + request_list: List[Dict[str, Any]], + *, + generation_config: Optional[VllmGenerationConfig] = None, + lora_request: Optional['LoRARequest'] = None, + use_tqdm: bool = False, + **kwargs) -> Iterator[List[Dict[str, Any]]]: + """ + request_list: e.g. [{'query': 'hello!'}]. + The keys that can be included are: 'query', 'history', 'system'. 
+ generation_config: Priority: generation_config > model.generation_config. + return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. + The keys to be included will be: 'response', 'history'. + """ + if generation_config is None: + generation_config = getattr(llm_engine, 'generation_config', VllmGenerationConfig()) + assert isinstance(generation_config, VllmGenerationConfig) + request_list = deepcopy(request_list) + generation_config = deepcopy(generation_config) + resp_list, agent_state = _prepare_vllm_request( + llm_engine, template, request_list, generation_config=generation_config, lora_request=lora_request, **kwargs) + + if generation_config.use_beam_search: + error_msg = 'Streaming generation does not support beam search.' + raise ValueError(error_msg) print_idx_list = [[0] for _ in range(len(request_list))] prog_bar = tqdm(total=len(request_list), dynamic_ncols=True, disable=not use_tqdm) @@ -289,12 +309,12 @@ def inference_stream_vllm(llm_engine: LLMEngine, generate_ids, output.finished, print_idx=print_idx_list[i]) query = request['query'] history = request['history'] - if resp_list[i] is None and not request_temp[i][0]: + if resp_list[i] is None and not agent_state[i][0]: history.append(None) - if not request_temp[i][0]: + if not agent_state[i][0]: history[-1] = [query, safe_response] else: - history[-1][-1] = history[-1][-1][:request_temp[i][1]] + safe_response + history[-1][-1] = history[-1][-1][:agent_state[i][1]] + safe_response resp_list[i] = {'response': safe_response, 'history': history} if output.finished: prog_bar.update() @@ -326,48 +346,10 @@ def inference_vllm(llm_engine: LLMEngine, assert isinstance(generation_config, VllmGenerationConfig) request_list = deepcopy(request_list) generation_config = deepcopy(generation_config) + resp_list, agent_state = _prepare_vllm_request( + llm_engine, template, request_list, generation_config=generation_config, lora_request=lora_request, **kwargs) tokenizer = template.tokenizer - if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: - generation_config.stop.append(tokenizer.eos_token) - if isinstance(template.suffix[-1], str) and template.suffix[-1] not in generation_config.stop: - generation_config.stop.append(template.suffix[-1]) - if isinstance(template.suffix[-1], list): - token_str = tokenizer.decode(template.suffix[-1]) - if token_str not in generation_config.stop: - generation_config.stop.append(token_str) - - parameters = inspect.signature(llm_engine.add_request).parameters - add_request_kwargs = {} - if 'lora_request' in parameters: - add_request_kwargs['lora_request'] = lora_request - else: - assert lora_request is None, ( - 'The current version of VLLM does not support `lora_request`. Please upgrade VLLM.') - - resp_list: List[Optional[Dict[str, Any]]] = [None] * len(request_list) - for i, request in enumerate(request_list): - history = request.get('history', None) - if history is None: - history = [] - - is_observation = history[-1][-1].endswith('Observation:') if history and history[-1][-1] else False - if is_observation: - history[-1][-1] = history[-1][-1] + request['query'] - request['query'] = None - request['history'] = history - inputs = template.encode(request)[0] - truncation_strategy = kwargs.pop('truncation_strategy', 'delete') - if len(inputs) == 0 and truncation_strategy == 'delete': - # input_ids exceeds `max_length`. Please increase the value of `max_length`. 
- resp_list[i] = {'response': '', 'history': history} - continue - input_ids = inputs['input_ids'] - if version.parse(vllm.__version__) >= version.parse('0.4.3'): - llm_engine.add_request(str(i), {'prompt_token_ids': input_ids}, generation_config, **add_request_kwargs) - else: - llm_engine.add_request(str(i), None, generation_config, input_ids, **add_request_kwargs) - if use_tqdm: assert verbose is False prog_bar = tqdm(total=len(request_list), dynamic_ncols=True, disable=not use_tqdm) @@ -386,7 +368,7 @@ def inference_vllm(llm_engine: LLMEngine, response = template.generate_ids_to_response(generate_ids) query = request['query'] history = request['history'] - if not is_observation: + if not agent_state[i][0]: history.append([query, response]) else: history[-1][-1] = history[-1][-1] + response diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index e87e4923f..4d1b46d52 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -403,7 +403,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): torch.save(self.args, os.path.join(output_dir, 'training_args.bin')) # additional files if sft_args is not None and sft_args.sft_type == 'full': - additional_files = getattr(self.args, 'additional_saved_files', []) + ['preprocessor_config.json'] + additional_files = getattr(self.args, 'additional_saved_files', None) or [] + ['preprocessor_config.json'] if model_dir is not None: for file in additional_files: src_path = os.path.join(model_dir, file) From 4e650e2fb68f0ba5f34d584d84f05672b7b9a9af Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:50:24 +0800 Subject: [PATCH 14/15] Add more datasets (#1246) --- ...14\346\225\260\346\215\256\351\233\206.md" | 12 +++- .../LLM/Supported-models-datasets.md | 12 +++- swift/llm/data/dataset_info.json | 29 ++++++++++ swift/llm/utils/dataset.py | 55 ++++++++++++++++++- swift/llm/utils/template.py | 4 +- swift/llm/utils/utils.py | 3 +- 6 files changed, 109 insertions(+), 6 deletions(-) diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index f5c20d029..cc350cbfe 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -378,6 +378,7 @@ |🔥coig-cqia|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|chinese_traditional
coig_pc
exam
finance
douban
human_value
logi_qa
ruozhiba
segmentfault
wiki
wikihow
xhs
zhihu|44694|703.8±654.2, min=33, max=19288|general|-| |🔥ruozhiba|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|post-annual
title-good
title-norm|85658|39.9±13.1, min=21, max=559|pretrain|-|
|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)||11998|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)||26336|650.9±217.2, min=209, max=2740|chat, agent, multi-round|-|
|🔥ms-agent-for-agentfabric|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|default
addition|30000|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-| |ms-agent-multirole|[iic/MSAgent-MultiRole](https://modelscope.cn/datasets/iic/MSAgent-MultiRole/summary)||9500|447.6±84.9, min=145, max=1101|chat, agent, multi-round, role-play, multi-agent|-| @@ -385,6 +386,8 @@ |damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||386984|956.5±407.3, min=326, max=19001|chat, agent, multi-round|-| |damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-| |agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld
db
kg
mind2web
os
webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-| +|🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-| +|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||124345|3669.5±1600.9, min=1047, max=22581|chat, agent, multi-round|-| |code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)| |🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-| |🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-| @@ -427,12 +430,17 @@ |orpo-dpo-mix-40k|[AI-ModelScope/orpo-dpo-mix-40k](https://modelscope.cn/datasets/AI-ModelScope/orpo-dpo-mix-40k/summary)|default|43666|548.3±397.4, min=28, max=8483|dpo, orpo, en, quality|[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)| |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)| |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-| +|ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-| |pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)| |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words
chartqa
coinstruct
contrastive_caption
docvqa
dreamsim
dvqa
iconqa
imagecode
llava_665k_multi
lrv_multi
multi_vqa
nextqa
nlvr2
spot-the-diff
star
visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)| |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)| |midefics|[swift/MideficsDataset](https://modelscope.cn/datasets/swift/MideficsDataset/summary)||3800|201.3±70.2, min=60, max=454|medical, en, vqa|[WinterSchool/MideficsDataset](https://huggingface.co/datasets/WinterSchool/MideficsDataset)| |gqa|[None](https://modelscope.cn/datasets/None/summary)|train_all_instructions|-|Dataset is too huge, please click the original link to view the dataset stat.|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)| |text-caps|[swift/TextCaps](https://modelscope.cn/datasets/swift/TextCaps/summary)||18145|38.2±4.4, min=31, max=73|multi-modal, en, caption, quality|[HuggingFaceM4/TextCaps](https://huggingface.co/datasets/HuggingFaceM4/TextCaps)| +|refcoco-unofficial-caption|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|44.7±3.2, min=36, max=71|multi-modal, en, caption|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)| +|refcoco-unofficial-grounding|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|45.2±3.1, min=37, max=69|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)| +|refcocog-unofficial-caption|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|49.7±4.7, min=37, max=88|multi-modal, en, caption|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)| +|refcocog-unofficial-grounding|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|50.1±4.7, min=37, max=90|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)| |a-okvqa|[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA/summary)||18201|45.8±7.9, min=32, max=100|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)| |okvqa|[swift/OK-VQA_train](https://modelscope.cn/datasets/swift/OK-VQA_train/summary)||9009|34.4±3.3, min=28, max=59|multi-modal, en, vqa, quality|[Multimodal-Fatima/OK-VQA_train](https://huggingface.co/datasets/Multimodal-Fatima/OK-VQA_train)| |ocr-vqa|[swift/OCR-VQA](https://modelscope.cn/datasets/swift/OCR-VQA/summary)||186753|35.6±6.6, min=29, max=193|multi-modal, en, ocr-vqa|[howard-hou/OCR-VQA](https://huggingface.co/datasets/howard-hou/OCR-VQA)| @@ -443,6 +451,7 @@ |guanaco|[AI-ModelScope/GuanacoDataset](https://modelscope.cn/datasets/AI-ModelScope/GuanacoDataset/summary)|default|31561|250.1±70.3, min=89, max=1436|chat, zh|[JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)| |mind2web|[swift/Multimodal-Mind2Web](https://modelscope.cn/datasets/swift/Multimodal-Mind2Web/summary)||1009|297522.4±325496.2, min=8592, max=3499715|agent, multi-modal|[osunlp/Multimodal-Mind2Web](https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web)| |sharegpt-4o-image|[AI-ModelScope/ShareGPT-4o](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT-4o/summary)|image_caption|57289|638.7±157.9, min=47, max=4640|vqa, multi-modal|[OpenGVLab/ShareGPT-4o](https://huggingface.co/datasets/OpenGVLab/ShareGPT-4o)| 
+|pixelprose|[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)| |m3it|[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT/summary)|coco
vqa-v2
shapes
shapes-rephrased
coco-goi-rephrased
snli-ve
snli-ve-rephrased
okvqa
a-okvqa
viquae
textcap
docvqa
science-qa
imagenet
imagenet-open-ended
imagenet-rephrased
coco-goi
clevr
clevr-rephrased
nlvr
coco-itm
coco-itm-rephrased
vsr
vsr-rephrased
mocheg
mocheg-rephrased
coco-text
fm-iqa
activitynet-qa
msrvtt
ss
coco-cn
refcoco
refcoco-rephrased
multi30k
image-paragraph-captioning
visual-dialog
visual-dialog-rephrased
iqa
vcr
visual-mrc
ivqa
msrvtt-qa
msvd-qa
gqa
text-vqa
ocr-vqa
st-vqa
flickr8k-cn|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-| |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V
ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-| |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-| @@ -467,11 +476,12 @@ |dolphin|[swift/dolphin](https://modelscope.cn/datasets/swift/dolphin/summary)|flan1m-alpaca-uncensored
flan5m-alpaca-uncensored|-|Dataset is too huge, please click the original link to view the dataset stat.|en|[cognitivecomputations/dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin)| |evol-instruct-v2|[AI-ModelScope/WizardLM_evol_instruct_V2_196k](https://modelscope.cn/datasets/AI-ModelScope/WizardLM_evol_instruct_V2_196k/summary)||109184|480.9±333.1, min=26, max=4942|chat, en|[WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)| |fineweb|[None](https://modelscope.cn/datasets/None/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)| +|gen-qa|[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)| |github-code|[swift/github-code](https://modelscope.cn/datasets/swift/github-code/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[codeparrot/github-code](https://huggingface.co/datasets/codeparrot/github-code)| |gpt4v-dataset|[swift/gpt4v-dataset](https://modelscope.cn/datasets/swift/gpt4v-dataset/summary)||12356|217.9±68.3, min=35, max=596|en, caption, multi-modal, quality|[laion/gpt4v-dataset](https://huggingface.co/datasets/laion/gpt4v-dataset)| |guanaco-belle-merge|[AI-ModelScope/guanaco_belle_merge_v1.0](https://modelscope.cn/datasets/AI-ModelScope/guanaco_belle_merge_v1.0/summary)||693987|134.2±92.0, min=24, max=6507|QA, zh|[Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)| +|infinity-instruct|[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)| |llava-med-zh-instruct|[swift/llava-med-zh-instruct-60k](https://modelscope.cn/datasets/swift/llava-med-zh-instruct-60k/summary)||56649|207.7±67.6, min=37, max=657|zh, medical, vqa|[BUAADreamer/llava-med-zh-instruct-60k](https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k)| -|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)| |math-instruct|[AI-ModelScope/MathInstruct](https://modelscope.cn/datasets/AI-ModelScope/MathInstruct/summary)||262283|254.4±183.5, min=11, max=4383|math, cot, en, quality|[TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)| |math-plus|[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus/summary)|train|893929|287.1±158.7, min=24, max=2919|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)| |moondream2-coyo-5M|[swift/moondream2-coyo-5M-captions](https://modelscope.cn/datasets/swift/moondream2-coyo-5M-captions/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, pretrain, quality|[isidentical/moondream2-coyo-5M-captions](https://huggingface.co/datasets/isidentical/moondream2-coyo-5M-captions)| diff --git 
a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 49ed03fb4..2b7b4eae0 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -378,6 +378,7 @@ The table below introduces the datasets supported by SWIFT: |🔥coig-cqia|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|chinese_traditional
coig_pc
exam
finance
douban
human_value
logi_qa
ruozhiba
segmentfault
wiki
wikihow
xhs
zhihu|44694|703.8±654.2, min=33, max=19288|general|-| |🔥ruozhiba|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|post-annual
title-good
title-norm|85658|39.9±13.1, min=21, max=559|pretrain|-|
|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)||11998|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)||26336|650.9±217.2, min=209, max=2740|chat, agent, multi-round|-|
|🔥ms-agent-for-agentfabric|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|default
addition|30000|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-| |ms-agent-multirole|[iic/MSAgent-MultiRole](https://modelscope.cn/datasets/iic/MSAgent-MultiRole/summary)||9500|447.6±84.9, min=145, max=1101|chat, agent, multi-round, role-play, multi-agent|-| @@ -385,6 +386,8 @@ The table below introduces the datasets supported by SWIFT: |damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||386984|956.5±407.3, min=326, max=19001|chat, agent, multi-round|-| |damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-| |agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld
db
kg
mind2web
os
webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-| +|🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-| +|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||124345|3669.5±1600.9, min=1047, max=22581|chat, agent, multi-round|-| |code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)| |🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-| |🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-| @@ -427,12 +430,17 @@ The table below introduces the datasets supported by SWIFT: |orpo-dpo-mix-40k|[AI-ModelScope/orpo-dpo-mix-40k](https://modelscope.cn/datasets/AI-ModelScope/orpo-dpo-mix-40k/summary)|default|43666|548.3±397.4, min=28, max=8483|dpo, orpo, en, quality|[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)| |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)| |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-| +|ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-| |pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)| |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words
chartqa
coinstruct
contrastive_caption
docvqa
dreamsim
dvqa
iconqa
imagecode
llava_665k_multi
lrv_multi
multi_vqa
nextqa
nlvr2
spot-the-diff
star
visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)| |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)| |midefics|[swift/MideficsDataset](https://modelscope.cn/datasets/swift/MideficsDataset/summary)||3800|201.3±70.2, min=60, max=454|medical, en, vqa|[WinterSchool/MideficsDataset](https://huggingface.co/datasets/WinterSchool/MideficsDataset)| |gqa|[None](https://modelscope.cn/datasets/None/summary)|train_all_instructions|-|Dataset is too huge, please click the original link to view the dataset stat.|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)| |text-caps|[swift/TextCaps](https://modelscope.cn/datasets/swift/TextCaps/summary)||18145|38.2±4.4, min=31, max=73|multi-modal, en, caption, quality|[HuggingFaceM4/TextCaps](https://huggingface.co/datasets/HuggingFaceM4/TextCaps)| +|refcoco-unofficial-caption|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|44.7±3.2, min=36, max=71|multi-modal, en, caption|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)| +|refcoco-unofficial-grounding|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|45.2±3.1, min=37, max=69|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)| +|refcocog-unofficial-caption|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|49.7±4.7, min=37, max=88|multi-modal, en, caption|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)| +|refcocog-unofficial-grounding|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|50.1±4.7, min=37, max=90|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)| |a-okvqa|[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA/summary)||18201|45.8±7.9, min=32, max=100|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)| |okvqa|[swift/OK-VQA_train](https://modelscope.cn/datasets/swift/OK-VQA_train/summary)||9009|34.4±3.3, min=28, max=59|multi-modal, en, vqa, quality|[Multimodal-Fatima/OK-VQA_train](https://huggingface.co/datasets/Multimodal-Fatima/OK-VQA_train)| |ocr-vqa|[swift/OCR-VQA](https://modelscope.cn/datasets/swift/OCR-VQA/summary)||186753|35.6±6.6, min=29, max=193|multi-modal, en, ocr-vqa|[howard-hou/OCR-VQA](https://huggingface.co/datasets/howard-hou/OCR-VQA)| @@ -443,6 +451,7 @@ The table below introduces the datasets supported by SWIFT: |guanaco|[AI-ModelScope/GuanacoDataset](https://modelscope.cn/datasets/AI-ModelScope/GuanacoDataset/summary)|default|31561|250.1±70.3, min=89, max=1436|chat, zh|[JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)| |mind2web|[swift/Multimodal-Mind2Web](https://modelscope.cn/datasets/swift/Multimodal-Mind2Web/summary)||1009|297522.4±325496.2, min=8592, max=3499715|agent, multi-modal|[osunlp/Multimodal-Mind2Web](https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web)| |sharegpt-4o-image|[AI-ModelScope/ShareGPT-4o](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT-4o/summary)|image_caption|57289|638.7±157.9, min=47, max=4640|vqa, 
multi-modal|[OpenGVLab/ShareGPT-4o](https://huggingface.co/datasets/OpenGVLab/ShareGPT-4o)| +|pixelprose|[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)| |m3it|[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT/summary)|coco
vqa-v2
shapes
shapes-rephrased
coco-goi-rephrased
snli-ve
snli-ve-rephrased
okvqa
a-okvqa
viquae
textcap
docvqa
science-qa
imagenet
imagenet-open-ended
imagenet-rephrased
coco-goi
clevr
clevr-rephrased
nlvr
coco-itm
coco-itm-rephrased
vsr
vsr-rephrased
mocheg
mocheg-rephrased
coco-text
fm-iqa
activitynet-qa
msrvtt
ss
coco-cn
refcoco
refcoco-rephrased
multi30k
image-paragraph-captioning
visual-dialog
visual-dialog-rephrased
iqa
vcr
visual-mrc
ivqa
msrvtt-qa
msvd-qa
gqa
text-vqa
ocr-vqa
st-vqa
flickr8k-cn|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-| |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V
ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-| |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-| @@ -467,11 +476,12 @@ The table below introduces the datasets supported by SWIFT: |dolphin|[swift/dolphin](https://modelscope.cn/datasets/swift/dolphin/summary)|flan1m-alpaca-uncensored
flan5m-alpaca-uncensored|-|Dataset is too huge, please click the original link to view the dataset stat.|en|[cognitivecomputations/dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin)| |evol-instruct-v2|[AI-ModelScope/WizardLM_evol_instruct_V2_196k](https://modelscope.cn/datasets/AI-ModelScope/WizardLM_evol_instruct_V2_196k/summary)||109184|480.9±333.1, min=26, max=4942|chat, en|[WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)| |fineweb|[None](https://modelscope.cn/datasets/None/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)| +|gen-qa|[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)| |github-code|[swift/github-code](https://modelscope.cn/datasets/swift/github-code/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[codeparrot/github-code](https://huggingface.co/datasets/codeparrot/github-code)| |gpt4v-dataset|[swift/gpt4v-dataset](https://modelscope.cn/datasets/swift/gpt4v-dataset/summary)||12356|217.9±68.3, min=35, max=596|en, caption, multi-modal, quality|[laion/gpt4v-dataset](https://huggingface.co/datasets/laion/gpt4v-dataset)| |guanaco-belle-merge|[AI-ModelScope/guanaco_belle_merge_v1.0](https://modelscope.cn/datasets/AI-ModelScope/guanaco_belle_merge_v1.0/summary)||693987|134.2±92.0, min=24, max=6507|QA, zh|[Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)| +|infinity-instruct|[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)| |llava-med-zh-instruct|[swift/llava-med-zh-instruct-60k](https://modelscope.cn/datasets/swift/llava-med-zh-instruct-60k/summary)||56649|207.7±67.6, min=37, max=657|zh, medical, vqa|[BUAADreamer/llava-med-zh-instruct-60k](https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k)| -|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)| |math-instruct|[AI-ModelScope/MathInstruct](https://modelscope.cn/datasets/AI-ModelScope/MathInstruct/summary)||262283|254.4±183.5, min=11, max=4383|math, cot, en, quality|[TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)| |math-plus|[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus/summary)|train|893929|287.1±158.7, min=24, max=2919|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)| |moondream2-coyo-5M|[swift/moondream2-coyo-5M-captions](https://modelscope.cn/datasets/swift/moondream2-coyo-5M-captions/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, pretrain, quality|[isidentical/moondream2-coyo-5M-captions](https://huggingface.co/datasets/isidentical/moondream2-coyo-5M-captions)| diff --git 
a/swift/llm/data/dataset_info.json b/swift/llm/data/dataset_info.json index 8ed68e7c6..702bf7f93 100644 --- a/swift/llm/data/dataset_info.json +++ b/swift/llm/data/dataset_info.json @@ -376,6 +376,35 @@ "tags": ["pretrain", "quality"], "huge_dataset": true }, + "gen-qa": { + "dataset_id": "swift/GenQA", + "hf_dataset_id": "tomg-group-umd/GenQA", + "conversations": { + "user_role": "user", + "assistant_role": "assistant", + "conversations_key": "text", + "from_key": "role", + "value_key": "content", + "error_strategy": "delete" + }, + "split": ["code", "dialog", "general", "math", "mmlu", "multiple_choice", "writing", "academic", "task"], + "tags": ["qa", "quality", "multi-task"], + "huge_dataset": true + }, + "infinity-instruct": { + "dataset_id": "swift/Infinity-Instruct", + "hf_dataset_id": "BAAI/Infinity-Instruct", + "conversations": { + "user_role": "human", + "assistant_role": "gpt", + "conversations_key": "conversations", + "from_key": "from", + "value_key": "value", + "error_strategy": "delete" + }, + "tags": ["qa", "quality", "multi-task"], + "huge_dataset": true + }, "wikipedia": { "dataset_id": "swift/wikipedia", "hf_dataset_id": "wikipedia", diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index bd4716a79..ba8e79396 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -164,6 +164,8 @@ class DatasetName: text_caps = 'text-caps' refcoco_unofficial_caption = 'refcoco-unofficial-caption' refcoco_unofficial_grounding = 'refcoco-unofficial-grounding' + refcocog_unofficial_caption = 'refcocog-unofficial-caption' + refcocog_unofficial_grounding = 'refcocog-unofficial-grounding' a_okvqa = 'a-okvqa' okvqa = 'okvqa' ocr_vqa = 'ocr-vqa' @@ -174,6 +176,7 @@ class DatasetName: guanaco = 'guanaco' mind2web = 'mind2web' sharegpt_4o_image = 'sharegpt-4o-image' + pixelprose = 'pixelprose' m3it = 'm3it' # additional images @@ -643,6 +646,38 @@ def _preprocess_vision_dataset2(dataset: HfDataset) -> HfDataset: is_main=False) +def _preprocess_pixelprose(dataset: HfDataset): + + caption_prompt = [ + 'Give the description of this image.', 'Describe this picture', 'What is the proper title of this image?' 
+ ] + + def preprocess(row): + vlm_caption = row['vlm_caption'] + if vlm_caption.startswith('This image displays:'): + vlm_caption = vlm_caption[len('This image displays:'):].strip() + return { + 'response': vlm_caption, + 'images': row['url'], + 'query': np.random.choice(caption_prompt), + } + + return dataset.map(preprocess, load_from_cache_file=False) + + +register_dataset( + DatasetName.pixelprose, + 'swift/pixelprose', + None, + _preprocess_pixelprose, + get_dataset_from_repo, + split=['train', 'cc12m', 'commonpool', 'redcaps'], + hf_dataset_id='tomg-group-umd/pixelprose', + tags=['caption', 'multi-modal', 'vision'], + huge_dataset=True, + is_main=False) + + def _preprocess_aishell1_dataset(dataset: HfDataset) -> HfDataset: prompt = '语音转文本' audio_key = 'Audio:FILE' @@ -1147,7 +1182,15 @@ def preprocess(row): get_function=get_dataset_from_repo, split=['train', 'validation'], hf_dataset_id='jxu124/refcoco', - huge_dataset=True, + tags=['multi-modal', 'en', 'caption']) + +register_dataset( + DatasetName.refcocog_unofficial_caption, + 'swift/refcocog', [], + preprocess_func=preprocess_refcoco_unofficial_caption, + get_function=get_dataset_from_repo, + split=['train', 'validation'], + hf_dataset_id='jxu124/refcocog', tags=['multi-modal', 'en', 'caption']) @@ -1184,7 +1227,15 @@ def preprocess(row): get_function=get_dataset_from_repo, split=['train', 'validation'], hf_dataset_id='jxu124/refcoco', - huge_dataset=True, + tags=['multi-modal', 'en', 'grounding']) + +register_dataset( + DatasetName.refcocog_unofficial_grounding, + 'swift/refcocog', [], + preprocess_func=preprocess_refcoco_unofficial_grounding, + get_function=get_dataset_from_repo, + split=['train', 'validation'], + hf_dataset_id='jxu124/refcocog', tags=['multi-modal', 'en', 'grounding']) register_dataset( diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 718ce0a1d..8b7df2812 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -298,7 +298,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any history_roles: Optional[History] = example.get('history_roles') system: Optional[str] = example.get('system', None) template_type: Optional[str] = getattr(self, 'template_type', None) - tools: List[Any] = example.get('tools') or [] + tools: Union[List[Any], str] = example.get('tools') or [] is_multi_modal: bool = any([example.get(key) for key in Template.special_keys]) if len(history) > 0: @@ -313,6 +313,8 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any assert self.system_prefix is not None, ( f'The template does not support `system`, template_type: {template_type}') if tools: + if isinstance(tools, str): + tools = json.loads(tools) if system is None: system = '' system += get_tools_prompt(tools, self.tools_prompt) diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 86f5f557d..1d056ca99 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -271,7 +271,8 @@ def _try_fetch(self, first_idx: int) -> Optional[Dict[str, Any]]: data = self.dataset[i] try: res = self.template.encode(data) - except OSError: + except OSError as e: + logger.error('Error occurs in lazy tokenize:', e) continue if len(res[0]) > 0: return res From bd420f2e9ef6d04a8b4c62c1ab27d3b8b7fb9b67 Mon Sep 17 00:00:00 2001 From: Jintao Date: Fri, 28 Jun 2024 12:53:56 +0800 Subject: [PATCH 15/15] Support gemma2 (#1247) --- README.md | 3 +- README_CN.md | 3 +- ...14\346\225\260\346\215\256\351\233\206.md" | 5 +++ 
.../LLM/Supported-models-datasets.md | 5 +++ swift/llm/data/dataset_info.json | 6 +++ swift/llm/utils/dataset.py | 1 + swift/llm/utils/model.py | 40 +++++++++++++++++++ 7 files changed, 61 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b746ce5be..58932b957 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ SWIFT has rich documentations for users, please check [here](https://github.com/ SWIFT web-ui is available both on [Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) and [ModelScope studio](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary), please feel free to try! ## 🎉 News +- 🔥2024.06.28: Support for Gemma2 series models: gemma2-9b, gemma2-9b-instruct, gemma2-27b, gemma2-27b-instruct. - 🔥2024.06.18: Supports **DeepSeek-Coder-v2** series model! Use model_type `deepseek-coder-v2-instruct` and `deepseek-coder-v2-lite-instruct` to begin. - 🔥2024.06.16: Supports **KTO** and **CPO** training! See [document](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Human-Preference-Alignment-Training-Documentation.md) to start training! - 2024.06.11: Support for tool-calling agent deployment that conform to the OpenAI interface.You can refer to [Agent deployment best practice](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Agent-deployment-best-practice.md) @@ -512,7 +513,7 @@ The complete list of supported models and datasets can be found at [Supported Mo | InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | | DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-236B | base model
chat model
MoE model
code model
math model | | MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | -| Gemma | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-7B | base model
instruct model | +| Gemma
Gemma2 | [Google Gemma series models](https://github.com/google/gemma_pytorch) | English | 2B-27B | base model
instruct model | | MiniCPM | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM) | Chinese
English | 2B-3B | chat model
MoE model | | OpenBuddy | [OpenBuddy series models](https://github.com/OpenBuddy/OpenBuddy) | Chinese
English | 7B-70B | base model
chat model | | Orion | [OrionStar AI series models](https://github.com/OrionStarAI) | Chinese
English | 14B | base model
chat model | diff --git a/README_CN.md b/README_CN.md index fb0f48cc5..6091f3886 100644 --- a/README_CN.md +++ b/README_CN.md @@ -48,6 +48,7 @@ SWIFT具有丰富的文档体系,如有使用问题请请查看[这里](https: 可以在[Huggingface space](https://huggingface.co/spaces/tastelikefeet/swift) 和 [ModelScope创空间](https://www.modelscope.cn/studios/iic/Scalable-lightWeight-Infrastructure-for-Fine-Tuning/summary) 中体验SWIFT web-ui功能了。 ## 🎉 新闻 +- 🔥2024.06.28: 支持**Gemma2**系列模型: gemma2-9b, gemma2-9b-instruct, gemma2-27b, gemma2-27b-instruct. - 🔥2024.06.18: 支持**DeepSeek-Coder-v2**系列模型! 使用model_type`deepseek-coder-v2-instruct`和`deepseek-coder-v2-lite-instruct`来开启训练和推理. - 🔥2024.06.16: 支持**KTO**和**CPO**训练,使用`swift rlhf --rlhf_type kto`和`swift rlhf --rlhf_type cpo`来开始训练,可以参考[文档](./docs/source/LLM/人类偏好对齐训练文档.md). - 2024.06.11: 支持符合OpenAI接口的工具调用Agent部署, 可以查看[Agent部署最佳实践](docs/source/LLM/Agent部署最佳实践.md). @@ -508,7 +509,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | InternLM
InternLM2
InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文
英文 | 1.8B-20B | base模型
chat模型
数学模型 | | DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math
DeepSeek-V2
DeepSeek-Coder-V2 | [幻方系列模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-236B | base模型
chat模型
MoE模型
代码模型
数学模型 | | MAMBA | [MAMBA时序卷积模型](https://github.com/state-spaces/mamba) | 英文 | 130M-2.8B | base模型 | -| Gemma | [Google Gemma系列模型](https://github.com/google/gemma_pytorch) | 英文 | 2B-7B | base模型
instruct模型 | +| Gemma
Gemma2 | [Google Gemma系列模型](https://github.com/google/gemma_pytorch) | 英文 | 2B-27B | base模型
instruct模型 | | MiniCPM | [OpenBmB MiniCPM系列模型](https://github.com/OpenBMB/MiniCPM) | 中文
英文 | 2B-3B | chat模型
MoE模型 | | OpenBuddy | [OpenBuddy系列模型](https://github.com/OpenBuddy/OpenBuddy) | 中文
英文 | 7B-70B | base模型
chat模型 | | Orion | [猎户星空系列模型](https://github.com/OrionStarAI) | 中文
英文 | 14B | base模型
chat模型 | diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index cc350cbfe..c776a2922 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -216,6 +216,10 @@ |gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)| |gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)| |gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)| +|gemma2-9b|[LLM-Research/gemma-2-9b](https://modelscope.cn/models/LLM-Research/gemma-2-9b/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.42|-|[google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b)| +|gemma2-27b|[LLM-Research/gemma-2-27b](https://modelscope.cn/models/LLM-Research/gemma-2-27b/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.42|-|[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)| +|gemma2-9b-instruct|[LLM-Research/gemma-2-9b-it](https://modelscope.cn/models/LLM-Research/gemma-2-9b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.42|-|[google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)| +|gemma2-27b-instruct|[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.42|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)| |minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)| |minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)| |minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)| @@ -416,6 +420,7 @@ |webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)||50000|1478.9±11526.1, min=100, max=490484|chat, novel|[zxbsmk/webnovel_cn](https://huggingface.co/datasets/zxbsmk/webnovel_cn)| |generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)||396004|273.3±52.0, min=32, max=873|chat, character-dialogue|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)| 
|🔥self-cognition|[swift/self-cognition](https://modelscope.cn/datasets/swift/self-cognition/summary)||134|53.6±18.6, min=29, max=121|chat, self-cognition|[modelscope/self-cognition](https://huggingface.co/datasets/modelscope/self-cognition)| +|🔥swift-mix|[swift/swift-sft-mixture](https://modelscope.cn/datasets/swift/swift-sft-mixture/summary)|sharegpt
firefly
codefuse
metamathqa|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, sft, general|-| |cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)||4959|3234.4±2547.5, min=91, max=19548|chat, classification|-| |ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)||1266|118.3±45.5, min=44, max=223|chat, ner|-| |coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|coco_2014_caption|454617|299.8±2.8, min=295, max=352|chat, multi-modal, vision|-| diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 2b7b4eae0..86637ed85 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -216,6 +216,10 @@ The table below introcudes all models supported by SWIFT: |gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)| |gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)| |gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)| +|gemma2-9b|[LLM-Research/gemma-2-9b](https://modelscope.cn/models/LLM-Research/gemma-2-9b/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.42|-|[google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b)| +|gemma2-27b|[LLM-Research/gemma-2-27b](https://modelscope.cn/models/LLM-Research/gemma-2-27b/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.42|-|[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)| +|gemma2-9b-instruct|[LLM-Research/gemma-2-9b-it](https://modelscope.cn/models/LLM-Research/gemma-2-9b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.42|-|[google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)| +|gemma2-27b-instruct|[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.42|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)| |minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)| |minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)| |minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)| @@ -416,6 +420,7 @@ The table below introduces the datasets supported by SWIFT: |webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)||50000|1478.9±11526.1, min=100, max=490484|chat, 
novel|[zxbsmk/webnovel_cn](https://huggingface.co/datasets/zxbsmk/webnovel_cn)| |generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)||396004|273.3±52.0, min=32, max=873|chat, character-dialogue|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)| |🔥self-cognition|[swift/self-cognition](https://modelscope.cn/datasets/swift/self-cognition/summary)||134|53.6±18.6, min=29, max=121|chat, self-cognition|[modelscope/self-cognition](https://huggingface.co/datasets/modelscope/self-cognition)| +|🔥swift-mix|[swift/swift-sft-mixture](https://modelscope.cn/datasets/swift/swift-sft-mixture/summary)|sharegpt
firefly
codefuse
metamathqa|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, sft, general|-| |cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)||4959|3234.4±2547.5, min=91, max=19548|chat, classification|-| |ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)||1266|118.3±45.5, min=44, max=223|chat, ner|-| |coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|coco_2014_caption|454617|299.8±2.8, min=295, max=352|chat, multi-modal, vision|-| diff --git a/swift/llm/data/dataset_info.json b/swift/llm/data/dataset_info.json index 702bf7f93..df33f3f52 100644 --- a/swift/llm/data/dataset_info.json +++ b/swift/llm/data/dataset_info.json @@ -726,5 +726,11 @@ "hf_dataset_id": "modelscope/self-cognition", "remove_useless_columns": false, "tags": ["chat", "self-cognition", "🔥"] + }, + "swift-mix": { + "dataset_id": "swift/swift-sft-mixture", + "subsets": ["sharegpt", "firefly", "codefuse", "metamathqa"], + "tags": ["chat", "sft", "general", "🔥"], + "huge_dataset": true } } diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index ba8e79396..4637b1105 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -129,6 +129,7 @@ class DatasetName: webnovel_zh = 'webnovel-zh' generated_chat_zh = 'generated-chat-zh' self_cognition = 'self-cognition' + swift_mix = 'swift-mix' # example dataset for specific model cls_fudan_news_zh = 'cls-fudan-news-zh' # seqgpt-560m diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 8dfe89b56..e02b65aef 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -291,6 +291,10 @@ class ModelType: gemma_7b = 'gemma-7b' gemma_2b_instruct = 'gemma-2b-instruct' gemma_7b_instruct = 'gemma-7b-instruct' + gemma2_9b = 'gemma2-9b' + gemma2_27b = 'gemma2-27b' + gemma2_9b_instruct = 'gemma2-9b-instruct' + gemma2_27b_instruct = 'gemma2-27b-instruct' # paligemma paligemma_3b_pt_224 = 'paligemma-3b-pt-224' paligemma_3b_pt_448 = 'paligemma-3b-pt-448' @@ -1532,6 +1536,42 @@ def _output_device_map_hook(module, input, output): return model, tokenizer +@register_model( + ModelType.gemma2_9b, + 'LLM-Research/gemma-2-9b', + LoRATM.llama, + TemplateType.default_generation, + requires=['transformers>=4.42'], + support_flash_attn=True, + support_vllm=True, + hf_model_id='google/gemma-2-9b') +@register_model( + ModelType.gemma2_27b, + 'LLM-Research/gemma-2-27b', + LoRATM.llama, + TemplateType.default_generation, + requires=['transformers>=4.42'], + support_flash_attn=True, + support_vllm=True, + hf_model_id='google/gemma-2-27b') +@register_model( + ModelType.gemma2_9b_instruct, + 'LLM-Research/gemma-2-9b-it', + LoRATM.llama, + TemplateType.gemma, + requires=['transformers>=4.42'], + support_flash_attn=True, + support_vllm=True, + hf_model_id='google/gemma-2-9b-it') +@register_model( + ModelType.gemma2_27b_instruct, + 'LLM-Research/gemma-2-27b-it', + LoRATM.llama, + TemplateType.gemma, + requires=['transformers>=4.42'], + support_flash_attn=True, + support_vllm=True, + hf_model_id='google/gemma-2-27b-it') @register_model( ModelType.qwen2_57b_a14b, 'qwen/Qwen2-57B-A14B',
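To exercise the additions from these last two commits end to end, the sketch below assumes the standard `swift sft` / `swift infer` CLI used throughout the docs; the `--model_type` and `--dataset` values come from the registrations introduced in this patch series, while every other flag is an illustrative default rather than something this patch changes.

```shell
# Requires transformers>=4.42, per the gemma2 model registrations above.
# LoRA fine-tuning of gemma2-9b-instruct on one of the newly registered agent datasets:
CUDA_VISIBLE_DEVICES=0 swift sft \
    --model_type gemma2-9b-instruct \
    --dataset msagent-pro \
    --sft_type lora

# Chat with the newly registered instruct checkpoint:
CUDA_VISIBLE_DEVICES=0 swift infer --model_type gemma2-9b-instruct
```

The same pattern applies to the other new entries (gemma2-27b-instruct, toolbench, refcocog-unofficial-grounding, pixelprose, swift-mix, and so on), since after registration they are referenced purely by `model_type` and dataset name.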