From 26547584ca3c89116ec45c2139284089e874def0 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Thu, 27 Jun 2024 15:46:40 +0800
Subject: [PATCH 01/10] add more datasets

---
 swift/llm/data/dataset_info.json | 29 ++++++++++++++++
 swift/llm/utils/dataset.py       | 58 ++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/swift/llm/data/dataset_info.json b/swift/llm/data/dataset_info.json
index 8ed68e7c6..702bf7f93 100644
--- a/swift/llm/data/dataset_info.json
+++ b/swift/llm/data/dataset_info.json
@@ -376,6 +376,35 @@
         "tags": ["pretrain", "quality"],
         "huge_dataset": true
     },
+    "gen-qa": {
+        "dataset_id": "swift/GenQA",
+        "hf_dataset_id": "tomg-group-umd/GenQA",
+        "conversations": {
+            "user_role": "user",
+            "assistant_role": "assistant",
+            "conversations_key": "text",
+            "from_key": "role",
+            "value_key": "content",
+            "error_strategy": "delete"
+        },
+        "split": ["code", "dialog", "general", "math", "mmlu", "multiple_choice", "writing", "academic", "task"],
+        "tags": ["qa", "quality", "multi-task"],
+        "huge_dataset": true
+    },
+    "infinity-instruct": {
+        "dataset_id": "swift/Infinity-Instruct",
+        "hf_dataset_id": "BAAI/Infinity-Instruct",
+        "conversations": {
+            "user_role": "human",
+            "assistant_role": "gpt",
+            "conversations_key": "conversations",
+            "from_key": "from",
+            "value_key": "value",
+            "error_strategy": "delete"
+        },
+        "tags": ["qa", "quality", "multi-task"],
+        "huge_dataset": true
+    },
     "wikipedia": {
         "dataset_id": "swift/wikipedia",
         "hf_dataset_id": "wikipedia",
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index bd4716a79..ed377b884 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -164,6 +164,8 @@ class DatasetName:
     text_caps = 'text-caps'
     refcoco_unofficial_caption = 'refcoco-unofficial-caption'
     refcoco_unofficial_grounding = 'refcoco-unofficial-grounding'
+    refcocog_unofficial_caption = 'refcocog-unofficial-caption'
+    refcocog_unofficial_grounding = 'refcocog-unofficial-grounding'
     a_okvqa = 'a-okvqa'
     okvqa = 'okvqa'
     ocr_vqa = 'ocr-vqa'
@@ -174,6 +176,7 @@ class DatasetName:
     guanaco = 'guanaco'
     mind2web = 'mind2web'
     sharegpt_4o_image = 'sharegpt-4o-image'
+    pixelprose = 'pixelprose'
     m3it = 'm3it'
 
     # additional images
@@ -643,6 +646,38 @@ def _preprocess_vision_dataset2(dataset: HfDataset) -> HfDataset:
     is_main=False)
 
 
+def _preprocess_pixelprose(dataset: HfDataset):
+
+    caption_prompt = [
+        'Give the description of this image.',
+        "Describe this picture",
+        'What is the proper title of this image?'
+    ]
+
+    def preprocess(row):
+        vlm_caption = row['vlm_caption']
+        if vlm_caption.startswith('This image displays:'):
+            vlm_caption = vlm_caption[len('This image displays:'):].strip()
+        return {
+            'response': vlm_caption,
+            'images': row['url'],
+            'request': np.random.choice(caption_prompt),
+        }
+
+    return dataset.map(preprocess, load_from_cache_file=False)
+
+
+register_dataset(
+    DatasetName.pixelprose,
+    "swift/pixelprose", None,
+    _preprocess_pixelprose,
+    get_dataset_from_repo,
+    split=['train', 'cc12m', 'commonpool', 'redcaps'],
+    hf_dataset_id="tomg-group-umd/pixelprose",
+    tags=['caption', 'multi-modal', 'vision'],
+    is_main=False)
+
+
 def _preprocess_aishell1_dataset(dataset: HfDataset) -> HfDataset:
     prompt = '语音转文本'
     audio_key = 'Audio:FILE'
@@ -1151,6 +1186,17 @@ def preprocess(row):
     tags=['multi-modal', 'en', 'caption'])
 
 
+register_dataset(
+    DatasetName.refcocog_unofficial_caption,
+    'swift/refcocog', [],
+    preprocess_func=preprocess_refcoco_unofficial_caption,
+    get_function=get_dataset_from_repo,
+    split=['train', 'validation'],
+    hf_dataset_id='jxu124/refcocog',
+    huge_dataset=True,
+    tags=['multi-modal', 'en', 'caption'])
+
+
 def preprocess_refcoco_unofficial_grounding(dataset):
 
     cache_dir = MediaCache.download(
@@ -1187,6 +1233,18 @@ def preprocess(row):
     huge_dataset=True,
     tags=['multi-modal', 'en', 'grounding'])
 
+
+register_dataset(
+    DatasetName.refcocog_unofficial_grounding,
+    'swift/refcocog', [],
+    preprocess_func=preprocess_refcoco_unofficial_grounding,
+    get_function=get_dataset_from_repo,
+    split=['train', 'validation'],
+    hf_dataset_id='jxu124/refcocog',
+    huge_dataset=True,
+    tags=['multi-modal', 'en', 'grounding'])
+
+
 register_dataset(
     DatasetName.text_caps,
     'swift/TextCaps', [],

From 2f831747098e80b469206e2b8a8ffd63e9701e1a Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Thu, 27 Jun 2024 17:16:12 +0800
Subject: [PATCH 02/10] add log

---
 swift/llm/utils/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index ab28d3c55..a56508697 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -271,7 +271,8 @@ def _try_fetch(self, first_idx: int) -> Optional[Dict[str, Any]]:
             data = self.dataset[i]
             try:
                 res = self.template.encode(data)
-            except OSError:
+            except OSError as e:
+                logger.error('Error occurs in lazy tokenize:', e)
                 continue
             if len(res[0]) > 0:
                 return res

From 9c51850c903f5d30c074cbb8364f6dab60e12cc6 Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Thu, 27 Jun 2024 22:26:34 +0800
Subject: [PATCH 03/10] fix

---
 swift/llm/utils/dataset.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index ed377b884..e1ff34cb1 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -661,7 +661,7 @@ def preprocess(row):
         return {
             'response': vlm_caption,
             'images': row['url'],
-            'request': np.random.choice(caption_prompt),
+            'query': np.random.choice(caption_prompt),
         }
 
     return dataset.map(preprocess, load_from_cache_file=False)
@@ -1182,7 +1182,6 @@ def preprocess(row):
     get_function=get_dataset_from_repo,
     split=['train', 'validation'],
     hf_dataset_id='jxu124/refcoco',
-    huge_dataset=True,
     tags=['multi-modal', 'en', 'caption'])
 
 
@@ -1193,7 +1192,6 @@ def preprocess(row):
     get_function=get_dataset_from_repo,
     split=['train', 'validation'],
     hf_dataset_id='jxu124/refcocog',
-    huge_dataset=True,
     tags=['multi-modal', 'en', 'caption'])
 
 
@@ -1230,7 +1228,6 @@ def preprocess(row):
     get_function=get_dataset_from_repo,
     split=['train', 'validation'],
     hf_dataset_id='jxu124/refcoco',
-    huge_dataset=True,
     tags=['multi-modal', 'en', 'grounding'])
 
 
@@ -1241,7 +1238,6 @@ def preprocess(row):
     get_function=get_dataset_from_repo,
     split=['train', 'validation'],
     hf_dataset_id='jxu124/refcocog',
-    huge_dataset=True,
     tags=['multi-modal', 'en', 'grounding'])
 
 

From fd00e2041ad16c08bf0551c885c68a63929b062d Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Thu, 27 Jun 2024 22:53:10 +0800
Subject: [PATCH 04/10] fix

---
 ...5\222\214\346\225\260\346\215\256\351\233\206.md" | 12 +++++++++++-
 docs/source_en/LLM/Supported-models-datasets.md      | 12 +++++++++++-
 swift/llm/utils/dataset.py                           |  2 ++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index b9cc49db5..cb9d32a8d 100644
--- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -378,6 +378,7 @@
 |🔥coig-cqia|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|chinese_traditional<br>coig_pc<br>exam<br>finance<br>douban<br>human_value<br>logi_qa<br>ruozhiba<br>segmentfault<br>wiki<br>wikihow<br>xhs<br>zhihu|44694|703.8±654.2, min=33, max=19288|general|-|
 |🔥ruozhiba|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|post-annual<br>title-good<br>title-norm|85658|39.9±13.1, min=21, max=559|pretrain|-|
 |long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)||11998|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
 |🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)||26336|650.9±217.2, min=209, max=2740|chat, agent, multi-round|-|
 |🔥ms-agent-for-agentfabric|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|default<br>addition|30000|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-|
 |ms-agent-multirole|[iic/MSAgent-MultiRole](https://modelscope.cn/datasets/iic/MSAgent-MultiRole/summary)||9500|447.6±84.9, min=145, max=1101|chat, agent, multi-round, role-play, multi-agent|-|
@@ -385,6 +386,8 @@
 |damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||386984|956.5±407.3, min=326, max=19001|chat, agent, multi-round|-|
 |damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-|
 |agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld<br>db<br>kg<br>mind2web<br>os<br>webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
+|🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-|
+|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, agent, multi-round|-|
 |code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
 |🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-|
 |🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-|
@@ -427,12 +430,17 @@
 |orpo-dpo-mix-40k|[AI-ModelScope/orpo-dpo-mix-40k](https://modelscope.cn/datasets/AI-ModelScope/orpo-dpo-mix-40k/summary)|default|43666|548.3±397.4, min=28, max=8483|dpo, orpo, en, quality|[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)|
 |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
 |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
+|ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
 |pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
 |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
 |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
 |midefics|[swift/MideficsDataset](https://modelscope.cn/datasets/swift/MideficsDataset/summary)||3800|201.3±70.2, min=60, max=454|medical, en, vqa|[WinterSchool/MideficsDataset](https://huggingface.co/datasets/WinterSchool/MideficsDataset)|
 |gqa|[None](https://modelscope.cn/datasets/None/summary)|train_all_instructions|-|Dataset is too huge, please click the original link to view the dataset stat.|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)|
 |text-caps|[swift/TextCaps](https://modelscope.cn/datasets/swift/TextCaps/summary)||18145|38.2±4.4, min=31, max=73|multi-modal, en, caption, quality|[HuggingFaceM4/TextCaps](https://huggingface.co/datasets/HuggingFaceM4/TextCaps)|
+|refcoco-unofficial-caption|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|44.7±3.2, min=36, max=71|multi-modal, en, caption|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcoco-unofficial-grounding|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|45.2±3.1, min=37, max=69|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcocog-unofficial-caption|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|49.7±4.7, min=37, max=88|multi-modal, en, caption|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
+|refcocog-unofficial-grounding|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|50.1±4.7, min=37, max=90|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
 |a-okvqa|[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA/summary)||18201|45.8±7.9, min=32, max=100|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)|
 |okvqa|[swift/OK-VQA_train](https://modelscope.cn/datasets/swift/OK-VQA_train/summary)||9009|34.4±3.3, min=28, max=59|multi-modal, en, vqa, quality|[Multimodal-Fatima/OK-VQA_train](https://huggingface.co/datasets/Multimodal-Fatima/OK-VQA_train)|
 |ocr-vqa|[swift/OCR-VQA](https://modelscope.cn/datasets/swift/OCR-VQA/summary)||186753|35.6±6.6, min=29, max=193|multi-modal, en, ocr-vqa|[howard-hou/OCR-VQA](https://huggingface.co/datasets/howard-hou/OCR-VQA)|
@@ -443,6 +451,7 @@
 |guanaco|[AI-ModelScope/GuanacoDataset](https://modelscope.cn/datasets/AI-ModelScope/GuanacoDataset/summary)|default|31561|250.1±70.3, min=89, max=1436|chat, zh|[JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)|
 |mind2web|[swift/Multimodal-Mind2Web](https://modelscope.cn/datasets/swift/Multimodal-Mind2Web/summary)||1009|297522.4±325496.2, min=8592, max=3499715|agent, multi-modal|[osunlp/Multimodal-Mind2Web](https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web)|
 |sharegpt-4o-image|[AI-ModelScope/ShareGPT-4o](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT-4o/summary)|image_caption|57289|638.7±157.9, min=47, max=4640|vqa, multi-modal|[OpenGVLab/ShareGPT-4o](https://huggingface.co/datasets/OpenGVLab/ShareGPT-4o)|
+|pixelprose|[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)|
 |m3it|[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT/summary)|coco<br>vqa-v2<br>shapes<br>shapes-rephrased<br>coco-goi-rephrased<br>snli-ve<br>snli-ve-rephrased<br>okvqa<br>a-okvqa<br>viquae<br>textcap<br>docvqa<br>science-qa<br>imagenet<br>imagenet-open-ended<br>imagenet-rephrased<br>coco-goi<br>clevr<br>clevr-rephrased<br>nlvr<br>coco-itm<br>coco-itm-rephrased<br>vsr<br>vsr-rephrased<br>mocheg<br>mocheg-rephrased<br>coco-text<br>fm-iqa<br>activitynet-qa<br>msrvtt<br>ss<br>coco-cn<br>refcoco<br>refcoco-rephrased<br>multi30k<br>image-paragraph-captioning<br>visual-dialog<br>visual-dialog-rephrased<br>iqa<br>vcr<br>visual-mrc<br>ivqa<br>msrvtt-qa<br>msvd-qa<br>gqa<br>text-vqa<br>ocr-vqa<br>st-vqa<br>flickr8k-cn|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
@@ -467,11 +476,12 @@
 |dolphin|[swift/dolphin](https://modelscope.cn/datasets/swift/dolphin/summary)|flan1m-alpaca-uncensored<br>flan5m-alpaca-uncensored|-|Dataset is too huge, please click the original link to view the dataset stat.|en|[cognitivecomputations/dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin)|
 |evol-instruct-v2|[AI-ModelScope/WizardLM_evol_instruct_V2_196k](https://modelscope.cn/datasets/AI-ModelScope/WizardLM_evol_instruct_V2_196k/summary)||109184|480.9±333.1, min=26, max=4942|chat, en|[WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)|
 |fineweb|[None](https://modelscope.cn/datasets/None/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)|
+|gen-qa|[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)|
 |github-code|[swift/github-code](https://modelscope.cn/datasets/swift/github-code/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[codeparrot/github-code](https://huggingface.co/datasets/codeparrot/github-code)|
 |gpt4v-dataset|[swift/gpt4v-dataset](https://modelscope.cn/datasets/swift/gpt4v-dataset/summary)||12356|217.9±68.3, min=35, max=596|en, caption, multi-modal, quality|[laion/gpt4v-dataset](https://huggingface.co/datasets/laion/gpt4v-dataset)|
 |guanaco-belle-merge|[AI-ModelScope/guanaco_belle_merge_v1.0](https://modelscope.cn/datasets/AI-ModelScope/guanaco_belle_merge_v1.0/summary)||693987|134.2±92.0, min=24, max=6507|QA, zh|[Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)|
+|infinity-instruct|[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)|
 |llava-med-zh-instruct|[swift/llava-med-zh-instruct-60k](https://modelscope.cn/datasets/swift/llava-med-zh-instruct-60k/summary)||56649|207.7±67.6, min=37, max=657|zh, medical, vqa|[BUAADreamer/llava-med-zh-instruct-60k](https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k)|
-|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
 |math-instruct|[AI-ModelScope/MathInstruct](https://modelscope.cn/datasets/AI-ModelScope/MathInstruct/summary)||262283|254.4±183.5, min=11, max=4383|math, cot, en, quality|[TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)|
 |math-plus|[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus/summary)|train|893929|287.1±158.7, min=24, max=2919|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)|
 |moondream2-coyo-5M|[swift/moondream2-coyo-5M-captions](https://modelscope.cn/datasets/swift/moondream2-coyo-5M-captions/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, pretrain, quality|[isidentical/moondream2-coyo-5M-captions](https://huggingface.co/datasets/isidentical/moondream2-coyo-5M-captions)|
diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md
index 6ffb58341..f2f57294c 100644
--- a/docs/source_en/LLM/Supported-models-datasets.md
+++ b/docs/source_en/LLM/Supported-models-datasets.md
@@ -378,6 +378,7 @@ The table below introduces the datasets supported by SWIFT:
 |🔥coig-cqia|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|chinese_traditional<br>coig_pc<br>exam<br>finance<br>douban<br>human_value<br>logi_qa<br>ruozhiba<br>segmentfault<br>wiki<br>wikihow<br>xhs<br>zhihu|44694|703.8±654.2, min=33, max=19288|general|-|
 |🔥ruozhiba|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|post-annual<br>title-good<br>title-norm|85658|39.9±13.1, min=21, max=559|pretrain|-|
 |long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)||11998|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
 |🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)||26336|650.9±217.2, min=209, max=2740|chat, agent, multi-round|-|
 |🔥ms-agent-for-agentfabric|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|default<br>addition|30000|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-|
 |ms-agent-multirole|[iic/MSAgent-MultiRole](https://modelscope.cn/datasets/iic/MSAgent-MultiRole/summary)||9500|447.6±84.9, min=145, max=1101|chat, agent, multi-round, role-play, multi-agent|-|
@@ -385,6 +386,8 @@ The table below introduces the datasets supported by SWIFT:
 |damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||386984|956.5±407.3, min=326, max=19001|chat, agent, multi-round|-|
 |damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-|
 |agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld<br>db<br>kg<br>mind2web<br>os<br>webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
+|🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-|
+|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, agent, multi-round|-|
 |code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
 |🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-|
 |🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-|
@@ -427,12 +430,17 @@ The table below introduces the datasets supported by SWIFT:
 |orpo-dpo-mix-40k|[AI-ModelScope/orpo-dpo-mix-40k](https://modelscope.cn/datasets/AI-ModelScope/orpo-dpo-mix-40k/summary)|default|43666|548.3±397.4, min=28, max=8483|dpo, orpo, en, quality|[mlabonne/orpo-dpo-mix-40k](https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k)|
 |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
 |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
+|ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
 |pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
 |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
 |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
 |midefics|[swift/MideficsDataset](https://modelscope.cn/datasets/swift/MideficsDataset/summary)||3800|201.3±70.2, min=60, max=454|medical, en, vqa|[WinterSchool/MideficsDataset](https://huggingface.co/datasets/WinterSchool/MideficsDataset)|
 |gqa|[None](https://modelscope.cn/datasets/None/summary)|train_all_instructions|-|Dataset is too huge, please click the original link to view the dataset stat.|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)|
 |text-caps|[swift/TextCaps](https://modelscope.cn/datasets/swift/TextCaps/summary)||18145|38.2±4.4, min=31, max=73|multi-modal, en, caption, quality|[HuggingFaceM4/TextCaps](https://huggingface.co/datasets/HuggingFaceM4/TextCaps)|
+|refcoco-unofficial-caption|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|44.7±3.2, min=36, max=71|multi-modal, en, caption|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcoco-unofficial-grounding|[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco/summary)||46215|45.2±3.1, min=37, max=69|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
+|refcocog-unofficial-caption|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|49.7±4.7, min=37, max=88|multi-modal, en, caption|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
+|refcocog-unofficial-grounding|[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog/summary)||44799|50.1±4.7, min=37, max=90|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
 |a-okvqa|[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA/summary)||18201|45.8±7.9, min=32, max=100|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)|
 |okvqa|[swift/OK-VQA_train](https://modelscope.cn/datasets/swift/OK-VQA_train/summary)||9009|34.4±3.3, min=28, max=59|multi-modal, en, vqa, quality|[Multimodal-Fatima/OK-VQA_train](https://huggingface.co/datasets/Multimodal-Fatima/OK-VQA_train)|
 |ocr-vqa|[swift/OCR-VQA](https://modelscope.cn/datasets/swift/OCR-VQA/summary)||186753|35.6±6.6, min=29, max=193|multi-modal, en, ocr-vqa|[howard-hou/OCR-VQA](https://huggingface.co/datasets/howard-hou/OCR-VQA)|
@@ -443,6 +451,7 @@ The table below introduces the datasets supported by SWIFT:
 |guanaco|[AI-ModelScope/GuanacoDataset](https://modelscope.cn/datasets/AI-ModelScope/GuanacoDataset/summary)|default|31561|250.1±70.3, min=89, max=1436|chat, zh|[JosephusCheung/GuanacoDataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)|
 |mind2web|[swift/Multimodal-Mind2Web](https://modelscope.cn/datasets/swift/Multimodal-Mind2Web/summary)||1009|297522.4±325496.2, min=8592, max=3499715|agent, multi-modal|[osunlp/Multimodal-Mind2Web](https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web)|
 |sharegpt-4o-image|[AI-ModelScope/ShareGPT-4o](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT-4o/summary)|image_caption|57289|638.7±157.9, min=47, max=4640|vqa, multi-modal|[OpenGVLab/ShareGPT-4o](https://huggingface.co/datasets/OpenGVLab/ShareGPT-4o)|
+|pixelprose|[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)|
 |m3it|[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT/summary)|coco<br>vqa-v2<br>shapes<br>shapes-rephrased<br>coco-goi-rephrased<br>snli-ve<br>snli-ve-rephrased<br>okvqa<br>a-okvqa<br>viquae<br>textcap<br>docvqa<br>science-qa<br>imagenet<br>imagenet-open-ended<br>imagenet-rephrased<br>coco-goi<br>clevr<br>clevr-rephrased<br>nlvr<br>coco-itm<br>coco-itm-rephrased<br>vsr<br>vsr-rephrased<br>mocheg<br>mocheg-rephrased<br>coco-text<br>fm-iqa<br>activitynet-qa<br>msrvtt<br>ss<br>coco-cn<br>refcoco<br>refcoco-rephrased<br>multi30k<br>image-paragraph-captioning<br>visual-dialog<br>visual-dialog-rephrased<br>iqa<br>vcr<br>visual-mrc<br>ivqa<br>msrvtt-qa<br>msvd-qa<br>gqa<br>text-vqa<br>ocr-vqa<br>st-vqa<br>flickr8k-cn|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
@@ -467,11 +476,12 @@ The table below introduces the datasets supported by SWIFT:
 |dolphin|[swift/dolphin](https://modelscope.cn/datasets/swift/dolphin/summary)|flan1m-alpaca-uncensored<br>flan5m-alpaca-uncensored|-|Dataset is too huge, please click the original link to view the dataset stat.|en|[cognitivecomputations/dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin)|
 |evol-instruct-v2|[AI-ModelScope/WizardLM_evol_instruct_V2_196k](https://modelscope.cn/datasets/AI-ModelScope/WizardLM_evol_instruct_V2_196k/summary)||109184|480.9±333.1, min=26, max=4942|chat, en|[WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)|
 |fineweb|[None](https://modelscope.cn/datasets/None/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)|
+|gen-qa|[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)|
 |github-code|[swift/github-code](https://modelscope.cn/datasets/swift/github-code/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|pretrain, quality|[codeparrot/github-code](https://huggingface.co/datasets/codeparrot/github-code)|
 |gpt4v-dataset|[swift/gpt4v-dataset](https://modelscope.cn/datasets/swift/gpt4v-dataset/summary)||12356|217.9±68.3, min=35, max=596|en, caption, multi-modal, quality|[laion/gpt4v-dataset](https://huggingface.co/datasets/laion/gpt4v-dataset)|
 |guanaco-belle-merge|[AI-ModelScope/guanaco_belle_merge_v1.0](https://modelscope.cn/datasets/AI-ModelScope/guanaco_belle_merge_v1.0/summary)||693987|134.2±92.0, min=24, max=6507|QA, zh|[Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)|
+|infinity-instruct|[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)|
 |llava-med-zh-instruct|[swift/llava-med-zh-instruct-60k](https://modelscope.cn/datasets/swift/llava-med-zh-instruct-60k/summary)||56649|207.7±67.6, min=37, max=657|zh, medical, vqa|[BUAADreamer/llava-med-zh-instruct-60k](https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k)|
-|lmsys-chat-1m|[AI-ModelScope/lmsys-chat-1m](https://modelscope.cn/datasets/AI-ModelScope/lmsys-chat-1m/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, en|[lmsys/lmsys-chat-1m](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)|
 |math-instruct|[AI-ModelScope/MathInstruct](https://modelscope.cn/datasets/AI-ModelScope/MathInstruct/summary)||262283|254.4±183.5, min=11, max=4383|math, cot, en, quality|[TIGER-Lab/MathInstruct](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)|
 |math-plus|[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus/summary)|train|893929|287.1±158.7, min=24, max=2919|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)|
 |moondream2-coyo-5M|[swift/moondream2-coyo-5M-captions](https://modelscope.cn/datasets/swift/moondream2-coyo-5M-captions/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|caption, pretrain, quality|[isidentical/moondream2-coyo-5M-captions](https://huggingface.co/datasets/isidentical/moondream2-coyo-5M-captions)|
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index e1ff34cb1..4c1439cf7 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -675,6 +675,7 @@ def preprocess(row):
     split=['train', 'cc12m', 'commonpool', 'redcaps'],
     hf_dataset_id="tomg-group-umd/pixelprose",
     tags=['caption', 'multi-modal', 'vision'],
+    huge_dataset=True,
     is_main=False)
 
 
@@ -2092,6 +2093,7 @@ def reorganize_row(row):
     _preprocess_toolbench,
     get_dataset_from_repo,
     remove_useless_columns=False,
+    huge_dataset=True,
     tags=['chat', 'agent', 'multi-round'])
 
 

From b492ea9fee9c07de568c8eb658b9893cfde41ae7 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Thu, 27 Jun 2024 22:55:57 +0800
Subject: [PATCH 05/10] lint

---
 swift/llm/utils/dataset.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index 4c1439cf7..6c97e3309 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -649,9 +649,7 @@ def _preprocess_vision_dataset2(dataset: HfDataset) -> HfDataset:
 def _preprocess_pixelprose(dataset: HfDataset):
 
     caption_prompt = [
-        'Give the description of this image.',
-        "Describe this picture",
-        'What is the proper title of this image?'
+        'Give the description of this image.', 'Describe this picture', 'What is the proper title of this image?'
     ]
 
     def preprocess(row):
@@ -669,11 +667,12 @@ def preprocess(row):
 
 register_dataset(
     DatasetName.pixelprose,
-    "swift/pixelprose", None,
+    'swift/pixelprose',
+    None,
     _preprocess_pixelprose,
     get_dataset_from_repo,
     split=['train', 'cc12m', 'commonpool', 'redcaps'],
-    hf_dataset_id="tomg-group-umd/pixelprose",
+    hf_dataset_id='tomg-group-umd/pixelprose',
     tags=['caption', 'multi-modal', 'vision'],
     huge_dataset=True,
     is_main=False)
@@ -1185,7 +1184,6 @@ def preprocess(row):
     hf_dataset_id='jxu124/refcoco',
     tags=['multi-modal', 'en', 'caption'])
 
-
 register_dataset(
     DatasetName.refcocog_unofficial_caption,
     'swift/refcocog', [],
@@ -1231,7 +1229,6 @@ def preprocess(row):
     hf_dataset_id='jxu124/refcoco',
     tags=['multi-modal', 'en', 'grounding'])
 
-
 register_dataset(
     DatasetName.refcocog_unofficial_grounding,
     'swift/refcocog', [],
@@ -1241,7 +1238,6 @@ def preprocess(row):
     hf_dataset_id='jxu124/refcocog',
     tags=['multi-modal', 'en', 'grounding'])
 
-
 register_dataset(
     DatasetName.text_caps,
     'swift/TextCaps', [],

From 39ed551b3abce8c36457cde2c409b31690c06252 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Jun 2024 10:45:11 +0800
Subject: [PATCH 06/10] fix tools

---
 swift/llm/utils/template.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 8ad46403d..88bc27c93 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -300,7 +300,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
         history_roles: Optional[History] = example.get('history_roles')
         system: Optional[str] = example.get('system', None)
         template_type: Optional[str] = getattr(self, 'template_type', None)
-        tools: List[Any] = example.get('tools') or []
+        tools: Union[List[Any], str] = example.get('tools') or []
         is_multi_modal: bool = any([example.get(key) for key in Template.special_keys])
 
         if len(history) > 0:
@@ -315,6 +315,8 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             assert self.system_prefix is not None, (
                 f'The template does not support `system`, template_type: {template_type}')
         if tools:
+            if isinstance(tools, str):
+                tools = json.loads(tools)
             if system is None:
                 system = ''
             system += get_tools_prompt(tools, self.tools_prompt)

From 35133718a443c6e2ba0437ea225447e6bdbee6f2 Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Fri, 28 Jun 2024 10:54:38 +0800
Subject: [PATCH 07/10] fix

---
 swift/llm/utils/template.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 88bc27c93..3c605acce 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -3,7 +3,7 @@
 from copy import deepcopy
 from io import BytesIO
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
-
+import ast
 import json
 import requests
 import torch
@@ -316,7 +316,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             f'The template does not support `system`, template_type: {template_type}')
         if tools:
             if isinstance(tools, str):
-                tools = json.loads(tools)
+                tools = ast.literal_eval(tools)
             if system is None:
                 system = ''
             system += get_tools_prompt(tools, self.tools_prompt)

From f1117bba13a0faf3b71ac2ff21e37de7b30e1f8d Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Fri, 28 Jun 2024 11:20:14 +0800
Subject: [PATCH 08/10] fix

---
 swift/llm/utils/template.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 3c605acce..9a7e91da7 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -3,7 +3,6 @@
 from copy import deepcopy
 from io import BytesIO
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
-import ast
 import json
 import requests
 import torch
@@ -316,7 +316,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             f'The template does not support `system`, template_type: {template_type}')
         if tools:
             if isinstance(tools, str):
-                tools = ast.literal_eval(tools)
+                tools = json.loads(tools)
             if system is None:
                 system = ''
             system += get_tools_prompt(tools, self.tools_prompt)

From d3f73a936e74dd9ed3d8fb478d52023c33427555 Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Fri, 28 Jun 2024 11:22:59 +0800
Subject: [PATCH 09/10] fix

---
 swift/llm/utils/template.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 9a7e91da7..88bc27c93 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -3,6 +3,7 @@
 from copy import deepcopy
 from io import BytesIO
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
 import json
 import requests
 import torch

From 32d7079fae9026c4a3cb336fec66bc9dd1b479eb Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Fri, 28 Jun 2024 11:49:50 +0800
Subject: [PATCH 10/10] fix

---
 ...\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" | 2 +-
 docs/source_en/LLM/Supported-models-datasets.md                 | 2 +-
 swift/llm/utils/dataset.py                                      | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index cb9d32a8d..92b12a081 100644
--- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -387,7 +387,7 @@
 |damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-|
 |agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld<br>db<br>kg<br>mind2web<br>os<br>webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
 |🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-|
-|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, agent, multi-round|-|
+|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||124345|3669.5±1600.9, min=1047, max=22581|chat, agent, multi-round|-|
 |code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
 |🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-|
 |🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-|
diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md
index f2f57294c..a2d64a65a 100644
--- a/docs/source_en/LLM/Supported-models-datasets.md
+++ b/docs/source_en/LLM/Supported-models-datasets.md
@@ -387,7 +387,7 @@ The table below introduces the datasets supported by SWIFT:
 |damo-agent-zh-mini|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)||20845|1326.4±329.6, min=571, max=4304|chat, agent, multi-round|-|
 |agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|alfworld<br>db<br>kg<br>mind2web<br>os<br>webshop|1866|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
 |🔥msagent-pro|[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro/summary)||21905|1524.5±921.3, min=64, max=16770|chat, agent, multi-round|-|
-|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|chat, agent, multi-round|-|
+|toolbench|[swift/ToolBench](https://modelscope.cn/datasets/swift/ToolBench/summary)||124345|3669.5±1600.9, min=1047, max=22581|chat, agent, multi-round|-|
 |code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)||20016|100.2±60.1, min=29, max=1776|-|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
 |🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)||2359|727.1±235.9, min=259, max=2146|chat, coding|-|
 |🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)||27224|483.6±193.9, min=45, max=3082|chat, coding|-|
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index 6c97e3309..ba8e79396 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -2089,7 +2089,6 @@ def reorganize_row(row):
     _preprocess_toolbench,
     get_dataset_from_repo,
    remove_useless_columns=False,
-    huge_dataset=True,
     tags=['chat', 'agent', 'multi-round'])
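
Taken together, patches 06-09 settle on one small behavioral change: `Template.encode` now accepts a `tools` value that is either a list or a string, decoding strings with `json.loads` (patch 07 briefly switched to `ast.literal_eval`; patch 08 reverted it). Below is a minimal standalone sketch of the final behavior; the `normalize_tools` helper name and the sample tool spec are illustrative, not part of the patches:

```python
import json


def normalize_tools(tools):
    """Mirror the `tools` handling that patches 06-09 converge on:
    a string value is assumed to be JSON-encoded and is decoded before
    being passed on (in swift, to get_tools_prompt())."""
    if isinstance(tools, str):
        # json.loads requires strict JSON, e.g. '[{"name": "f"}]'.
        # ast.literal_eval (patch 07) would also have accepted
        # Python-literal strings like "[{'name': 'f'}]", but patch 08
        # reverted to strict JSON parsing.
        tools = json.loads(tools)
    return tools


print(normalize_tools('[{"name": "get_weather"}]'))  # [{'name': 'get_weather'}]
print(normalize_tools([{'name': 'get_weather'}]))    # passed through unchanged
```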