diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index b9cc49db5..92b12a081 100644
--- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -378,6 +378,7 @@
|🔥coig-cqia|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|chinese_traditional<br>coig_pc<br>exam<br>finance<br>douban<br>human_value<br>logi_qa<br>ruozhiba<br>segmentfault<br>wiki<br>wikihow<br>xhs<br>zhihu|44694|703.8±654.2, min=33, max=19288|general|-|
|🔥ruozhiba|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|post-annual<br>title-good<br>title-norm|85658|39.9±13.1, min=21, max=559|pretrain|-|
|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)||11998|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)||26336|650.9±217.2, min=209, max=2740|chat, agent, multi-round|-|
|🔥ms-agent-for-agentfabric|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|default<br>addition|30000|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-|
|ms-agent-multirole|[iic/MSAgent-MultiRole](https://modelscope.cn/datasets/iic/MSAgent-MultiRole/summary)||9500|447.6±84.9, min=145, max=1101|chat, agent, multi-round, role-play, multi-agent|-|
@@ -385,6 +386,8 @@
|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||386984|956.5±407.3, min=326, max=19001|chat, agent, multi-round|-|
|damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-|
|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld<br>db<br>kg<br>mind2web<br>os<br>webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
+|🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-|
+|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||124345|3669.5±1600.9, min=1047, max=22581|chat, agent, multi-round|-|
|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-|
|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-|
@@ -427,12 +430,17 @@
|orpo-dpo-mix-40k|[AI-ModelScope/orpo-dpo-mix-40k](https://modelscope.cn/datasets/AI-ModelScope/orpo-dpo-mix-40k/summary)|default|43666|548.3±397.4, min=28, max=8483|dpo, orpo, en, quality|[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)|
|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
|shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
+|ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
|mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
|llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
|midefics|[swift/MideficsDataset](https://modelscope.cn/datasets/swift/MideficsDataset/summary)||3800|201.3±70.2, min=60, max=454|medical, en, vqa|[WinterSchool/MideficsDataset](https://huggingface.co/datasets/WinterSchool/MideficsDataset)|
|gqa|[None](https://modelscope.cn/datasets/None/summary)|train_all_instructions|-|Dataset is too huge, please click the original link to view the dataset stat.|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)|
|text-caps|[swift/TextCaps](https://modelscope.cn/datasets/swift/TextCaps/summary)||18145|38.2±4.4, min=31, max=73|multi-modal, en, caption, quality|[HuggingFaceM4/TextCaps](https://huggingface.co/datasets/HuggingFaceM4/TextCaps)|
+|refcoco-unofficial-caption|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|44.7±3.2, min=36, max=71|multi-modal, en, caption|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcoco-unofficial-grounding|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|45.2±3.1, min=37, max=69|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcocog-unofficial-caption|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|49.7±4.7, min=37, max=88|multi-modal, en, caption|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
+|refcocog-unofficial-grounding|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|50.1±4.7, min=37, max=90|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
|a-okvqa|[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA/summary)||18201|45.8±7.9, min=32, max=100|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)|
|okvqa|[swift/OK-VQA_train](https://modelscope.cn/datasets/swift/OK-VQA_train/summary)||9009|34.4±3.3, min=28, max=59|multi-modal, en, vqa, quality|[Multimodal-Fatima/OK-VQA_train](https://huggingface.co/datasets/Multimodal-Fatima/OK-VQA_train)|
|ocr-vqa|[swift/OCR-VQA](https://modelscope.cn/datasets/swift/OCR-VQA/summary)||186753|35.6±6.6, min=29, max=193|multi-modal, en, ocr-vqa|[howard-hou/OCR-VQA](https://huggingface.co/datasets/howard-hou/OCR-VQA)|
@@ -443,6 +451,7 @@
|guanaco|[AI-ModelScope/GuanacoDataset](https://modelscope.cn/datasets/AI-ModelScope/GuanacoDataset/summary)|default|31561|250.1±70.3, min=89, max=1436|chat, zh|[JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)|
|mind2web|[swift/Multimodal-Mind2Web](https://modelscope.cn/datasets/swift/Multimodal-Mind2Web/summary)||1009|297522.4±325496.2, min=8592, max=3499715|agent, multi-modal|[osunlp/Multimodal-Mind2Web](https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web)|
|sharegpt-4o-image|[AI-ModelScope/ShareGPT-4o](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT-4o/summary)|image_caption|57289|638.7±157.9, min=47, max=4640|vqa, multi-modal|[OpenGVLab/ShareGPT-4o](https://huggingface.co/datasets/OpenGVLab/ShareGPT-4o)|
+|pixelprose|[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)|
|m3it|[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT/summary)|coco<br>vqa-v2<br>shapes<br>shapes-rephrased<br>coco-goi-rephrased<br>snli-ve<br>snli-ve-rephrased<br>okvqa<br>a-okvqa<br>viquae<br>textcap<br>docvqa<br>science-qa<br>imagenet<br>imagenet-open-ended<br>imagenet-rephrased<br>coco-goi<br>clevr<br>clevr-rephrased<br>nlvr<br>coco-itm<br>coco-itm-rephrased<br>vsr<br>vsr-rephrased<br>mocheg<br>mocheg-rephrased<br>coco-text<br>fm-iqa<br>activitynet-qa<br>msrvtt<br>ss<br>coco-cn<br>refcoco<br>refcoco-rephrased<br>multi30k<br>image-paragraph-captioning<br>visual-dialog<br>visual-dialog-rephrased<br>iqa<br>vcr<br>visual-mrc<br>ivqa<br>msrvtt-qa<br>msvd-qa<br>gqa<br>text-vqa<br>ocr-vqa<br>st-vqa<br>flickr8k-cn|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
|sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
|llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
@@ -467,11 +476,12 @@
|dolphin|[swift/dolphin](https://modelscope.cn/datasets/swift/dolphin/summary)|flan1m-alpaca-uncensored<br>flan5m-alpaca-uncensored|-|Dataset is too huge, please click the original link to view the dataset stat.|en|[cognitivecomputations/dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin)|
|evol-instruct-v2|[AI-ModelScope/WizardLM_evol_instruct_V2_196k](https://modelscope.cn/datasets/AI-ModelScope/WizardLM_evol_instruct_V2_196k/summary)||109184|480.9±333.1, min=26, max=4942|chat, en|[WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)|
|fineweb|[None](https://modelscope.cn/datasets/None/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)|
+|gen-qa|[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)|
|github-code|[swift/github-code](https://modelscope.cn/datasets/swift/github-code/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[codeparrot/github-code](https://huggingface.co/datasets/codeparrot/github-code)|
|gpt4v-dataset|[swift/gpt4v-dataset](https://modelscope.cn/datasets/swift/gpt4v-dataset/summary)||12356|217.9±68.3, min=35, max=596|en, caption, multi-modal, quality|[laion/gpt4v-dataset](https://huggingface.co/datasets/laion/gpt4v-dataset)|
|guanaco-belle-merge|[AI-ModelScope/guanaco_belle_merge_v1.0](https://modelscope.cn/datasets/AI-ModelScope/guanaco_belle_merge_v1.0/summary)||693987|134.2±92.0, min=24, max=6507|QA, zh|[Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)|
+|infinity-instruct|[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)|
|llava-med-zh-instruct|[swift/llava-med-zh-instruct-60k](https://modelscope.cn/datasets/swift/llava-med-zh-instruct-60k/summary)||56649|207.7±67.6, min=37, max=657|zh, medical, vqa|[BUAADreamer/llava-med-zh-instruct-60k](https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k)|
-|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
|math-instruct|[AI-ModelScope/MathInstruct](https://modelscope.cn/datasets/AI-ModelScope/MathInstruct/summary)||262283|254.4±183.5, min=11, max=4383|math, cot, en, quality|[TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)|
|math-plus|[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus/summary)|train|893929|287.1±158.7, min=24, max=2919|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)|
|moondream2-coyo-5M|[swift/moondream2-coyo-5M-captions](https://modelscope.cn/datasets/swift/moondream2-coyo-5M-captions/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, pretrain, quality|[isidentical/moondream2-coyo-5M-captions](https://huggingface.co/datasets/isidentical/moondream2-coyo-5M-captions)|
diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md
index 6ffb58341..a2d64a65a 100644
--- a/docs/source_en/LLM/Supported-models-datasets.md
+++ b/docs/source_en/LLM/Supported-models-datasets.md
@@ -378,6 +378,7 @@ The table below introduces the datasets supported by SWIFT:
|🔥coig-cqia|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|chinese_traditional<br>coig_pc<br>exam<br>finance<br>douban<br>human_value<br>logi_qa<br>ruozhiba<br>segmentfault<br>wiki<br>wikihow<br>xhs<br>zhihu|44694|703.8±654.2, min=33, max=19288|general|-|
|🔥ruozhiba|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|post-annual<br>title-good<br>title-norm|85658|39.9±13.1, min=21, max=559|pretrain|-|
|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)||11998|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)||26336|650.9±217.2, min=209, max=2740|chat, agent, multi-round|-|
|🔥ms-agent-for-agentfabric|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|default<br>addition|30000|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-|
|ms-agent-multirole|[iic/MSAgent-MultiRole](https://modelscope.cn/datasets/iic/MSAgent-MultiRole/summary)||9500|447.6±84.9, min=145, max=1101|chat, agent, multi-round, role-play, multi-agent|-|
@@ -385,6 +386,8 @@ The table below introduces the datasets supported by SWIFT:
|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||386984|956.5±407.3, min=326, max=19001|chat, agent, multi-round|-|
|damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-|
|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld<br>db<br>kg<br>mind2web<br>os<br>webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
+|🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-|
+|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||124345|3669.5±1600.9, min=1047, max=22581|chat, agent, multi-round|-|
|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-|
|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-|
@@ -427,12 +430,17 @@ The table below introduces the datasets supported by SWIFT:
|orpo-dpo-mix-40k|[AI-ModelScope/orpo-dpo-mix-40k](https://modelscope.cn/datasets/AI-ModelScope/orpo-dpo-mix-40k/summary)|default|43666|548.3±397.4, min=28, max=8483|dpo, orpo, en, quality|[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)|
|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
|shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
+|ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
|mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
|llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
|midefics|[swift/MideficsDataset](https://modelscope.cn/datasets/swift/MideficsDataset/summary)||3800|201.3±70.2, min=60, max=454|medical, en, vqa|[WinterSchool/MideficsDataset](https://huggingface.co/datasets/WinterSchool/MideficsDataset)|
|gqa|[None](https://modelscope.cn/datasets/None/summary)|train_all_instructions|-|Dataset is too huge, please click the original link to view the dataset stat.|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)|
|text-caps|[swift/TextCaps](https://modelscope.cn/datasets/swift/TextCaps/summary)||18145|38.2±4.4, min=31, max=73|multi-modal, en, caption, quality|[HuggingFaceM4/TextCaps](https://huggingface.co/datasets/HuggingFaceM4/TextCaps)|
+|refcoco-unofficial-caption|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|44.7±3.2, min=36, max=71|multi-modal, en, caption|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcoco-unofficial-grounding|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|45.2±3.1, min=37, max=69|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcocog-unofficial-caption|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|49.7±4.7, min=37, max=88|multi-modal, en, caption|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
+|refcocog-unofficial-grounding|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|50.1±4.7, min=37, max=90|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
|a-okvqa|[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA/summary)||18201|45.8±7.9, min=32, max=100|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)|
|okvqa|[swift/OK-VQA_train](https://modelscope.cn/datasets/swift/OK-VQA_train/summary)||9009|34.4±3.3, min=28, max=59|multi-modal, en, vqa, quality|[Multimodal-Fatima/OK-VQA_train](https://huggingface.co/datasets/Multimodal-Fatima/OK-VQA_train)|
|ocr-vqa|[swift/OCR-VQA](https://modelscope.cn/datasets/swift/OCR-VQA/summary)||186753|35.6±6.6, min=29, max=193|multi-modal, en, ocr-vqa|[howard-hou/OCR-VQA](https://huggingface.co/datasets/howard-hou/OCR-VQA)|
@@ -443,6 +451,7 @@ The table below introduces the datasets supported by SWIFT:
|guanaco|[AI-ModelScope/GuanacoDataset](https://modelscope.cn/datasets/AI-ModelScope/GuanacoDataset/summary)|default|31561|250.1±70.3, min=89, max=1436|chat, zh|[JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)|
|mind2web|[swift/Multimodal-Mind2Web](https://modelscope.cn/datasets/swift/Multimodal-Mind2Web/summary)||1009|297522.4±325496.2, min=8592, max=3499715|agent, multi-modal|[osunlp/Multimodal-Mind2Web](https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web)|
|sharegpt-4o-image|[AI-ModelScope/ShareGPT-4o](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT-4o/summary)|image_caption|57289|638.7±157.9, min=47, max=4640|vqa, multi-modal|[OpenGVLab/ShareGPT-4o](https://huggingface.co/datasets/OpenGVLab/ShareGPT-4o)|
+|pixelprose|[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)|
|m3it|[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT/summary)|coco<br>vqa-v2<br>shapes<br>shapes-rephrased<br>coco-goi-rephrased<br>snli-ve<br>snli-ve-rephrased<br>okvqa<br>a-okvqa<br>viquae<br>textcap<br>docvqa<br>science-qa<br>imagenet<br>imagenet-open-ended<br>imagenet-rephrased<br>coco-goi<br>clevr<br>clevr-rephrased<br>nlvr<br>coco-itm<br>coco-itm-rephrased<br>vsr<br>vsr-rephrased<br>mocheg<br>mocheg-rephrased<br>coco-text<br>fm-iqa<br>activitynet-qa<br>msrvtt<br>ss<br>coco-cn<br>refcoco<br>refcoco-rephrased<br>multi30k<br>image-paragraph-captioning<br>visual-dialog<br>visual-dialog-rephrased<br>iqa<br>vcr<br>visual-mrc<br>ivqa<br>msrvtt-qa<br>msvd-qa<br>gqa<br>text-vqa<br>ocr-vqa<br>st-vqa<br>flickr8k-cn|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
|sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
|llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
@@ -467,11 +476,12 @@ The table below introduces the datasets supported by SWIFT:
|dolphin|[swift/dolphin](https://modelscope.cn/datasets/swift/dolphin/summary)|flan1m-alpaca-uncensored<br>flan5m-alpaca-uncensored|-|Dataset is too huge, please click the original link to view the dataset stat.|en|[cognitivecomputations/dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin)|
|evol-instruct-v2|[AI-ModelScope/WizardLM_evol_instruct_V2_196k](https://modelscope.cn/datasets/AI-ModelScope/WizardLM_evol_instruct_V2_196k/summary)||109184|480.9±333.1, min=26, max=4942|chat, en|[WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)|
|fineweb|[None](https://modelscope.cn/datasets/None/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)|
+|gen-qa|[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)|
|github-code|[swift/github-code](https://modelscope.cn/datasets/swift/github-code/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[codeparrot/github-code](https://huggingface.co/datasets/codeparrot/github-code)|
|gpt4v-dataset|[swift/gpt4v-dataset](https://modelscope.cn/datasets/swift/gpt4v-dataset/summary)||12356|217.9±68.3, min=35, max=596|en, caption, multi-modal, quality|[laion/gpt4v-dataset](https://huggingface.co/datasets/laion/gpt4v-dataset)|
|guanaco-belle-merge|[AI-ModelScope/guanaco_belle_merge_v1.0](https://modelscope.cn/datasets/AI-ModelScope/guanaco_belle_merge_v1.0/summary)||693987|134.2±92.0, min=24, max=6507|QA, zh|[Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)|
+|infinity-instruct|[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)|
|llava-med-zh-instruct|[swift/llava-med-zh-instruct-60k](https://modelscope.cn/datasets/swift/llava-med-zh-instruct-60k/summary)||56649|207.7±67.6, min=37, max=657|zh, medical, vqa|[BUAADreamer/llava-med-zh-instruct-60k](https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k)|
-|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
|math-instruct|[AI-ModelScope/MathInstruct](https://modelscope.cn/datasets/AI-ModelScope/MathInstruct/summary)||262283|254.4±183.5, min=11, max=4383|math, cot, en, quality|[TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)|
|math-plus|[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus/summary)|train|893929|287.1±158.7, min=24, max=2919|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)|
|moondream2-coyo-5M|[swift/moondream2-coyo-5M-captions](https://modelscope.cn/datasets/swift/moondream2-coyo-5M-captions/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, pretrain, quality|[isidentical/moondream2-coyo-5M-captions](https://huggingface.co/datasets/isidentical/moondream2-coyo-5M-captions)|
diff --git a/swift/llm/data/dataset_info.json b/swift/llm/data/dataset_info.json
index 8ed68e7c6..702bf7f93 100644
--- a/swift/llm/data/dataset_info.json
+++ b/swift/llm/data/dataset_info.json
@@ -376,6 +376,35 @@
"tags": ["pretrain", "quality"],
"huge_dataset": true
},
+ "gen-qa": {
+ "dataset_id": "swift/GenQA",
+ "hf_dataset_id": "tomg-group-umd/GenQA",
+ "conversations": {
+ "user_role": "user",
+ "assistant_role": "assistant",
+ "conversations_key": "text",
+ "from_key": "role",
+ "value_key": "content",
+ "error_strategy": "delete"
+ },
+ "split": ["code", "dialog", "general", "math", "mmlu", "multiple_choice", "writing", "academic", "task"],
+ "tags": ["qa", "quality", "multi-task"],
+ "huge_dataset": true
+ },
+ "infinity-instruct": {
+ "dataset_id": "swift/Infinity-Instruct",
+ "hf_dataset_id": "BAAI/Infinity-Instruct",
+ "conversations": {
+ "user_role": "human",
+ "assistant_role": "gpt",
+ "conversations_key": "conversations",
+ "from_key": "from",
+ "value_key": "value",
+ "error_strategy": "delete"
+ },
+ "tags": ["qa", "quality", "multi-task"],
+ "huge_dataset": true
+ },
"wikipedia": {
"dataset_id": "swift/wikipedia",
"hf_dataset_id": "wikipedia",
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index bd4716a79..ba8e79396 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -164,6 +164,8 @@ class DatasetName:
text_caps = 'text-caps'
refcoco_unofficial_caption = 'refcoco-unofficial-caption'
refcoco_unofficial_grounding = 'refcoco-unofficial-grounding'
+ refcocog_unofficial_caption = 'refcocog-unofficial-caption'
+ refcocog_unofficial_grounding = 'refcocog-unofficial-grounding'
a_okvqa = 'a-okvqa'
okvqa = 'okvqa'
ocr_vqa = 'ocr-vqa'
@@ -174,6 +176,7 @@ class DatasetName:
guanaco = 'guanaco'
mind2web = 'mind2web'
sharegpt_4o_image = 'sharegpt-4o-image'
+ pixelprose = 'pixelprose'
m3it = 'm3it'
# additional images
@@ -643,6 +646,38 @@ def _preprocess_vision_dataset2(dataset: HfDataset) -> HfDataset:
is_main=False)
+def _preprocess_pixelprose(dataset: HfDataset) -> HfDataset:
+
+ caption_prompt = [
+        'Give the description of this image.', 'Describe this picture.', 'What is the proper title of this image?'
+ ]
+
+ def preprocess(row):
+ vlm_caption = row['vlm_caption']
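+        # Some vlm_caption entries begin with a fixed 'This image displays:' prefix; strip it so only the caption remains.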
+ if vlm_caption.startswith('This image displays:'):
+ vlm_caption = vlm_caption[len('This image displays:'):].strip()
+ return {
+ 'response': vlm_caption,
+ 'images': row['url'],
+ 'query': np.random.choice(caption_prompt),
+ }
+
+ return dataset.map(preprocess, load_from_cache_file=False)
+
+
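+# Register PixelProse with its train, cc12m, commonpool and redcaps splits; huge_dataset=True skips per-sample stats in the docs table.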
+register_dataset(
+ DatasetName.pixelprose,
+ 'swift/pixelprose',
+ None,
+ _preprocess_pixelprose,
+ get_dataset_from_repo,
+ split=['train', 'cc12m', 'commonpool', 'redcaps'],
+ hf_dataset_id='tomg-group-umd/pixelprose',
+ tags=['caption', 'multi-modal', 'vision'],
+ huge_dataset=True,
+ is_main=False)
+
+
def _preprocess_aishell1_dataset(dataset: HfDataset) -> HfDataset:
    prompt = '语音转文本'
audio_key = 'Audio:FILE'
@@ -1147,7 +1182,15 @@ def preprocess(row):
get_function=get_dataset_from_repo,
split=['train', 'validation'],
hf_dataset_id='jxu124/refcoco',
- huge_dataset=True,
+ tags=['multi-modal', 'en', 'caption'])
+
+register_dataset(
+ DatasetName.refcocog_unofficial_caption,
+ 'swift/refcocog', [],
+ preprocess_func=preprocess_refcoco_unofficial_caption,
+ get_function=get_dataset_from_repo,
+ split=['train', 'validation'],
+ hf_dataset_id='jxu124/refcocog',
tags=['multi-modal', 'en', 'caption'])
@@ -1184,7 +1227,15 @@ def preprocess(row):
get_function=get_dataset_from_repo,
split=['train', 'validation'],
hf_dataset_id='jxu124/refcoco',
- huge_dataset=True,
+ tags=['multi-modal', 'en', 'grounding'])
+
+register_dataset(
+ DatasetName.refcocog_unofficial_grounding,
+ 'swift/refcocog', [],
+ preprocess_func=preprocess_refcoco_unofficial_grounding,
+ get_function=get_dataset_from_repo,
+ split=['train', 'validation'],
+ hf_dataset_id='jxu124/refcocog',
tags=['multi-modal', 'en', 'grounding'])
register_dataset(
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 8ad46403d..88bc27c93 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -300,7 +300,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
history_roles: Optional[History] = example.get('history_roles')
system: Optional[str] = example.get('system', None)
template_type: Optional[str] = getattr(self, 'template_type', None)
- tools: List[Any] = example.get('tools') or []
+ tools: Union[List[Any], str] = example.get('tools') or []
is_multi_modal: bool = any([example.get(key) for key in Template.special_keys])
if len(history) > 0:
@@ -315,6 +315,8 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
assert self.system_prefix is not None, (
f'The template does not support `system`, template_type: {template_type}')
if tools:
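+            # Tools may arrive as a JSON-encoded string; decode it to a list before building the tools prompt.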
+ if isinstance(tools, str):
+ tools = json.loads(tools)
if system is None:
system = ''
system += get_tools_prompt(tools, self.tools_prompt)
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index ab28d3c55..a56508697 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -271,7 +271,8 @@ def _try_fetch(self, first_idx: int) -> Optional[Dict[str, Any]]:
data = self.dataset[i]
try:
res = self.template.encode(data)
- except OSError:
+ except OSError as e:
+                logger.error(f'Error occurred in lazy tokenize: {e}')
continue
if len(res[0]) > 0:
return res