From e881e09a5cd11826c81c2e2b597d39eab26dc50a Mon Sep 17 00:00:00 2001 From: LZHgrla <36994684+LZHgrla@users.noreply.github.com> Date: Thu, 31 Aug 2023 21:28:59 +0800 Subject: [PATCH] [Doc] Add data_prepare.md docs (#82) * add prepare * Update dataset_prepare.md * Update dataset_prepare.md * modify default data path * Update dataset_prepare.md * fix pre-commit * move docs to user_guide * move zh docs to user_guide * add zh docs * fix typo * Update dataset_prepare.md --- README.md | 6 +-- README_zh-CN.md | 6 +-- docs/en/{ => user_guides}/chat.md | 0 .../dataset_format.md | 0 docs/en/user_guides/dataset_prepare.md | 51 +++++++++++++++++++ docs/en/{ => user_guides}/finetune.md | 0 .../incremental_pretraining.md | 0 .../multi_turn_conversation.md | 0 .../single_turn_conversation.md | 0 docs/zh_cn/{ => user_guides}/chat.md | 0 .../dataset_format.md | 0 docs/zh_cn/user_guides/dataset_prepare.md | 51 +++++++++++++++++++ docs/zh_cn/{ => user_guides}/finetune.md | 0 .../incremental_pretraining.md | 0 .../multi_turn_conversation.md | 0 .../single_turn_conversation.md | 0 ...ichuan_13b_base_qlora_arxiv_gentitle_e3.py | 4 +- .../baichuan_13b_base_qlora_lawyer_e3.py | 4 +- ...ichuan_13b_chat_qlora_arxiv_gentitle_e3.py | 4 +- .../baichuan_13b_chat_qlora_lawyer_e3.py | 4 +- .../baichuan_7b_qlora_arxiv_gentitle_e3.py | 4 +- .../baichuan_7b_qlora_lawyer_e3.py | 4 +- .../chatglm2_6b_qlora_arxiv_gentitle_e3.py | 4 +- .../chatglm2_6b_qlora_lawyer_e3.py | 4 +- .../internlm_7b_qlora_arxiv_gentitle_e3.py | 4 +- .../internlm_7b_qlora_lawyer_e3.py | 4 +- ...nternlm_chat_7b_qlora_arxiv_gentitle_e3.py | 4 +- .../internlm_chat_7b_qlora_lawyer_e3.py | 4 +- .../llama2_7b_qlora_arxiv_gentitle_e3.py | 4 +- .../llama2_7b/llama2_7b_qlora_lawyer_e3.py | 4 +- .../llama2_7b_chat_qlora_arxiv_gentitle_e3.py | 4 +- .../llama2_7b_chat_qlora_lawyer_e3.py | 4 +- .../llama_7b_qlora_arxiv_gentitle_e3.py | 4 +- .../llama_7b/llama_7b_qlora_lawyer_e3.py | 4 +- .../qwen_7b_qlora_arxiv_gentitle_e3.py | 4 +- .../qwen/qwen_7b/qwen_7b_qlora_lawyer_e3.py | 4 +- .../qwen_7b_chat_qlora_arxiv_gentitle_e3.py | 4 +- .../qwen_7b_chat_qlora_lawyer_e3.py | 4 +- 38 files changed, 152 insertions(+), 50 deletions(-) rename docs/en/{ => user_guides}/chat.md (100%) rename docs/en/{dataset => user_guides}/dataset_format.md (100%) create mode 100644 docs/en/user_guides/dataset_prepare.md rename docs/en/{ => user_guides}/finetune.md (100%) rename docs/en/{dataset => user_guides}/incremental_pretraining.md (100%) rename docs/en/{dataset => user_guides}/multi_turn_conversation.md (100%) rename docs/en/{dataset => user_guides}/single_turn_conversation.md (100%) rename docs/zh_cn/{ => user_guides}/chat.md (100%) rename docs/zh_cn/{dataset => user_guides}/dataset_format.md (100%) create mode 100644 docs/zh_cn/user_guides/dataset_prepare.md rename docs/zh_cn/{ => user_guides}/finetune.md (100%) rename docs/zh_cn/{dataset => user_guides}/incremental_pretraining.md (100%) rename docs/zh_cn/{dataset => user_guides}/multi_turn_conversation.md (100%) rename docs/zh_cn/{dataset => user_guides}/single_turn_conversation.md (100%) diff --git a/README.md b/README.md index 2a854b765..7879ea4d4 100644 --- a/README.md +++ b/README.md @@ -150,11 +150,11 @@ XTuner provides tools to chat with pretrained / fine-tuned LLMs. 
xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "&lt;eoc&gt;" --answer-stop-word "&lt;eom&gt;" --no-streamer ``` -For more examples, please see [chat.md](./docs/en/chat.md). +For more examples, please see [chat.md](./docs/en/user_guides/chat.md). ### Fine-tune [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QAEZVBfQ7LZURkMUtaq0b-5nEQII9G9Z?usp=sharing) -XTuner supports the efficient fine-tune (*e.g.*, QLoRA) for LLMs. +XTuner supports efficient fine-tuning (*e.g.*, QLoRA) of LLMs. Dataset preparation guides can be found in [dataset_prepare.md](./docs/en/user_guides/dataset_prepare.md). - **Step 0**, prepare the config. XTuner provides many ready-to-use configs and we can view all configs by @@ -178,7 +178,7 @@ XTuner supports the efficient fine-tune (*e.g.*, QLoRA) for LLMs. (SLURM) srun ${SRUN_ARGS} xtuner train internlm_7b_qlora_oasst1_e3 --launcher slurm ``` - For more examples, please see [finetune.md](./docs/en/finetune.md). + For more examples, please see [finetune.md](./docs/en/user_guides/finetune.md). ### Deployment diff --git a/README_zh-CN.md b/README_zh-CN.md index 435bad86d..bce94d4e2 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -150,11 +150,11 @@ XTuner 提供与大语言模型对话的工具。 xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "&lt;eoc&gt;" --answer-stop-word "&lt;eom&gt;" --no-streamer ``` -更多示例,请查阅[文档](./docs/zh_cn/chat.md)。 +更多示例,请查阅[文档](./docs/zh_cn/user_guides/chat.md)。 ### 微调 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QAEZVBfQ7LZURkMUtaq0b-5nEQII9G9Z?usp=sharing) -XTuner 支持微调大语言模型。 +XTuner 支持微调大语言模型。数据集准备指南请查阅[文档](./docs/zh_cn/user_guides/dataset_prepare.md)。 - **步骤 0**,准备配置文件。XTuner 提供多个开箱即用的配置文件,用户可以通过下列命令查看: @@ -177,7 +177,7 @@ XTuner 支持微调大语言模型。 NPROC_PER_NODE=${GPU_NUM} xtuner train internlm_7b_qlora_oasst1_e3 ``` - 更多示例,请查阅[文档](./docs/zh_cn/finetune.md). + 更多示例,请查阅[文档](./docs/zh_cn/user_guides/finetune.md)。 ### 部署 diff --git a/docs/en/chat.md b/docs/en/user_guides/chat.md similarity index 100% rename from docs/en/chat.md rename to docs/en/user_guides/chat.md diff --git a/docs/en/dataset/dataset_format.md b/docs/en/user_guides/dataset_format.md similarity index 100% rename from docs/en/dataset/dataset_format.md rename to docs/en/user_guides/dataset_format.md diff --git a/docs/en/user_guides/dataset_prepare.md b/docs/en/user_guides/dataset_prepare.md new file mode 100644 index 000000000..64113bf09 --- /dev/null +++ b/docs/en/user_guides/dataset_prepare.md @@ -0,0 +1,51 @@ +# Dataset Preparation + +## HuggingFace datasets + +Datasets hosted on the HuggingFace Hub, such as [alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca), can be used directly. For more details, please refer to [single_turn_conversation.md](./single_turn_conversation.md) and [multi_turn_conversation.md](./multi_turn_conversation.md).
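+As a quick sanity check, you can preview one sample before training. This is a minimal sketch, not part of the XTuner workflow itself; it assumes the HuggingFace `datasets` library is installed:
+
+```shell
+# Print the first alpaca sample; expected fields: instruction, input, output, text
+python -c "from datasets import load_dataset; print(load_dataset('tatsu-lab/alpaca', split='train')[0])"
+```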
+ +## Others + +### Arxiv Gentitle + +The Arxiv dataset is not released on the HuggingFace Hub, but it can be downloaded from Kaggle. + +**Step 0**, download the raw data from https://kaggle.com/datasets/Cornell-University/arxiv. + +**Step 1**, process the data with `xtuner preprocess arxiv ${DOWNLOADED_DATA} ${SAVE_DATA_PATH} [optional arguments]`. + +For example, extract all `cs.AI`, `cs.CL` and `cs.CV` papers submitted since `2020-01-01`: + +```shell +xtuner preprocess arxiv ${DOWNLOADED_DATA} ${SAVE_DATA_PATH} --categories cs.AI cs.CL cs.CV --start-date 2020-01-01 +``` + +**Step 2**, all Arxiv Gentitle configs assume the dataset path to be `./data/arxiv_data.json`. You can move and rename your data, or make changes to these configs.
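+A minimal sketch of the move (assuming `${SAVE_DATA_PATH}` is the output file from Step 1 and that training is launched from the repository root):
+
+```shell
+# Place the processed file at the default path expected by the configs
+mkdir -p ./data
+mv ${SAVE_DATA_PATH} ./data/arxiv_data.json
+```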
+ +### MOSS-003-SFT + +The MOSS-003-SFT dataset can be downloaded from https://huggingface.co/datasets/fnlp/moss-003-sft-data. + +**Step 0**, download the data. + +```shell +# Make sure you have git-lfs installed (https://git-lfs.com) +git lfs install +git clone https://huggingface.co/datasets/fnlp/moss-003-sft-data +``` + +**Step 1**, unzip the archives. + +```shell +cd moss-003-sft-data +unzip moss-003-sft-no-tools.jsonl.zip +unzip moss-003-sft-with-tools-no-text2image.zip +``` + +**Step 2**, all moss-003-sft configs assume the dataset paths to be `./data/moss-003-sft-no-tools.jsonl` and `./data/conversations_with_tools_with_inner_instruction_no_text2image_train_all_random_meta0.5_0.1_0.01_moss_0709.jsonl`. You can move and rename your data, or make changes to these configs.
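+A minimal sketch of the move (assuming the two `.jsonl` files were extracted into `moss-003-sft-data` and that `./data` is resolved from the repository root):
+
+```shell
+# Place both files at the default paths expected by the configs
+mkdir -p ./data
+mv moss-003-sft-data/moss-003-sft-no-tools.jsonl ./data/
+mv moss-003-sft-data/conversations_with_tools_with_inner_instruction_no_text2image_train_all_random_meta0.5_0.1_0.01_moss_0709.jsonl ./data/
+```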
+ +### Chinese Lawyer + +The Chinese Lawyer dataset consists of two sub-datasets, which can be downloaded from https://github.com/LiuHC0428/LAW-GPT. + +All lawyer configs assume the dataset paths to be `./data/CrimeKgAssitant清洗后_52k.json` and `./data/训练数据_带法律依据_92k.json`. You can move and rename your data, or make changes to these configs. diff --git a/docs/en/finetune.md b/docs/en/user_guides/finetune.md similarity index 100% rename from docs/en/finetune.md rename to docs/en/user_guides/finetune.md diff --git a/docs/en/dataset/incremental_pretraining.md b/docs/en/user_guides/incremental_pretraining.md similarity index 100% rename from docs/en/dataset/incremental_pretraining.md rename to docs/en/user_guides/incremental_pretraining.md diff --git a/docs/en/dataset/multi_turn_conversation.md b/docs/en/user_guides/multi_turn_conversation.md similarity index 100% rename from docs/en/dataset/multi_turn_conversation.md rename to docs/en/user_guides/multi_turn_conversation.md diff --git a/docs/en/dataset/single_turn_conversation.md b/docs/en/user_guides/single_turn_conversation.md similarity index 100% rename from docs/en/dataset/single_turn_conversation.md rename to docs/en/user_guides/single_turn_conversation.md diff --git a/docs/zh_cn/chat.md b/docs/zh_cn/user_guides/chat.md similarity index 100% rename from docs/zh_cn/chat.md rename to docs/zh_cn/user_guides/chat.md diff --git a/docs/zh_cn/dataset/dataset_format.md b/docs/zh_cn/user_guides/dataset_format.md similarity index 100% rename from docs/zh_cn/dataset/dataset_format.md rename to docs/zh_cn/user_guides/dataset_format.md diff --git a/docs/zh_cn/user_guides/dataset_prepare.md b/docs/zh_cn/user_guides/dataset_prepare.md new file mode 100644 index 000000000..7009e0b10 --- /dev/null +++ b/docs/zh_cn/user_guides/dataset_prepare.md @@ -0,0 +1,51 @@ +# 数据集准备 + +## HuggingFace 数据集 + +针对 HuggingFace Hub 中的数据集,比如 [alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca),用户可以快速使用它们。更多使用指南请参照[单轮对话文档](./single_turn_conversation.md)和[多轮对话文档](./multi_turn_conversation.md)。 + +## 其他 + +### Arxiv Gentitle 生成题目 + +Arxiv 数据集并未在 HuggingFace Hub 上发布,但是可以在 Kaggle 上下载。 + +**步骤 0**,从 https://kaggle.com/datasets/Cornell-University/arxiv 下载原始数据。 + +**步骤 1**,使用 `xtuner preprocess arxiv ${DOWNLOADED_DATA} ${SAVE_DATA_PATH} [optional arguments]` 命令处理数据。 + +例如,提取从 `2020-01-01` 起的所有 `cs.AI`、`cs.CL`、`cs.CV` 论文: + +```shell +xtuner preprocess arxiv ${DOWNLOADED_DATA} ${SAVE_DATA_PATH} --categories cs.AI cs.CL cs.CV --start-date 2020-01-01 +``` + +**步骤 2**,所有的 Arxiv Gentitle 配置文件都假设数据集路径为 `./data/arxiv_data.json`。用户可以移动并重命名数据,或者在配置文件中重新设置数据路径。 + +### MOSS-003-SFT + +MOSS-003-SFT 数据集可以在 https://huggingface.co/datasets/fnlp/moss-003-sft-data 下载。 + +**步骤 0**,下载数据。 + +```shell +# 确保已经安装 git-lfs (https://git-lfs.com) +git lfs install +git clone https://huggingface.co/datasets/fnlp/moss-003-sft-data +``` + +**步骤 1**,解压缩。 + +```shell +cd moss-003-sft-data +unzip moss-003-sft-no-tools.jsonl.zip +unzip moss-003-sft-with-tools-no-text2image.zip +``` + +**步骤 2**,所有的 moss-003-sft 配置文件都假设数据集路径为 `./data/moss-003-sft-no-tools.jsonl` 和 `./data/conversations_with_tools_with_inner_instruction_no_text2image_train_all_random_meta0.5_0.1_0.01_moss_0709.jsonl`。用户可以移动并重命名数据,或者在配置文件中重新设置数据路径。 + +### Chinese Lawyer + +Chinese Lawyer 数据集有两个子数据集,它们可以在 https://github.com/LiuHC0428/LAW-GPT 下载。 + +所有的 Chinese Lawyer 配置文件都假设数据集路径为 `./data/CrimeKgAssitant清洗后_52k.json` 和 `./data/训练数据_带法律依据_92k.json`。用户可以移动并重命名数据,或者在配置文件中重新设置数据路径。 diff --git a/docs/zh_cn/finetune.md b/docs/zh_cn/user_guides/finetune.md similarity index 100% rename from docs/zh_cn/finetune.md rename to docs/zh_cn/user_guides/finetune.md diff --git a/docs/zh_cn/dataset/incremental_pretraining.md b/docs/zh_cn/user_guides/incremental_pretraining.md similarity index 100% rename from docs/zh_cn/dataset/incremental_pretraining.md rename to docs/zh_cn/user_guides/incremental_pretraining.md diff --git a/docs/zh_cn/dataset/multi_turn_conversation.md b/docs/zh_cn/user_guides/multi_turn_conversation.md similarity index 100% rename from docs/zh_cn/dataset/multi_turn_conversation.md rename to docs/zh_cn/user_guides/multi_turn_conversation.md diff --git a/docs/zh_cn/dataset/single_turn_conversation.md b/docs/zh_cn/user_guides/single_turn_conversation.md similarity index 100% rename from docs/zh_cn/dataset/single_turn_conversation.md rename to docs/zh_cn/user_guides/single_turn_conversation.md diff --git a/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_arxiv_gentitle_e3.py b/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_arxiv_gentitle_e3.py index 093e0915f..ef55c81e3 100644 --- a/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.title max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_lawyer_e3.py b/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_lawyer_e3.py index 4cfd32f82..08c958f1f 100644 --- a/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_lawyer_e3.py +++ b/xtuner/configs/baichuan/baichuan_13b_base/baichuan_13b_base_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.lawyer max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_arxiv_gentitle_e3.py b/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_arxiv_gentitle_e3.py index 77fdde70c..fc7430c8f 100644 --- a/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.baichuan_chat max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_lawyer_e3.py b/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_lawyer_e3.py index 741e90b65..9c99cdb5c 100644 --- a/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_lawyer_e3.py +++ b/xtuner/configs/baichuan/baichuan_13b_chat/baichuan_13b_chat_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.baichuan_chat max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_arxiv_gentitle_e3.py b/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_arxiv_gentitle_e3.py index 3c8b3edbe..b8f229978 100644 --- a/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2.
Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.title max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_lawyer_e3.py b/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_lawyer_e3.py index bdd6f2d68..91b44c27f 100644 --- a/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_lawyer_e3.py +++ b/xtuner/configs/baichuan/baichuan_7b/baichuan_7b_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.lawyer max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_arxiv_gentitle_e3.py b/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_arxiv_gentitle_e3.py index 519299c8b..b77fd4ee8 100644 --- a/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.chatglm max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_lawyer_e3.py b/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_lawyer_e3.py index 85dbc2718..620055786 100644 --- a/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_lawyer_e3.py +++ b/xtuner/configs/chatglm2/chatglm2_6b/chatglm2_6b_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.chatglm max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_arxiv_gentitle_e3.py b/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_arxiv_gentitle_e3.py index b9cfb864a..ac7c6f709 100644 --- a/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. 
Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.title max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_lawyer_e3.py b/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_lawyer_e3.py index a45e71fcd..cddde9779 100644 --- a/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_lawyer_e3.py +++ b/xtuner/configs/internlm/internlm_7b/internlm_7b_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.lawyer max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_arxiv_gentitle_e3.py b/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_arxiv_gentitle_e3.py index cd0aa71f8..51f2407b0 100644 --- a/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.internlm_chat max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_lawyer_e3.py b/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_lawyer_e3.py index eaf301d34..c1bfb69f5 100644 --- a/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_lawyer_e3.py +++ b/xtuner/configs/internlm/internlm_chat_7b/internlm_chat_7b_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.internlm_chat max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_arxiv_gentitle_e3.py b/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_arxiv_gentitle_e3.py index 35b3e1086..e089b01e9 100644 --- a/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. 
Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.title max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_lawyer_e3.py b/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_lawyer_e3.py index f604e2bb0..2d1f8a2f3 100644 --- a/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_lawyer_e3.py +++ b/xtuner/configs/llama/llama2_7b/llama2_7b_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.lawyer max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_arxiv_gentitle_e3.py b/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_arxiv_gentitle_e3.py index 1439f41d7..2688dd320 100644 --- a/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.llama_2_chat max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_lawyer_e3.py b/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_lawyer_e3.py index 70a8262db..2c694e87a 100644 --- a/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_lawyer_e3.py +++ b/xtuner/configs/llama/llama2_7b_chat/llama2_7b_chat_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.llama_2_chat max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/llama/llama_7b/llama_7b_qlora_arxiv_gentitle_e3.py b/xtuner/configs/llama/llama_7b/llama_7b_qlora_arxiv_gentitle_e3.py index f505db457..459ae06f0 100644 --- a/xtuner/configs/llama/llama_7b/llama_7b_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/llama/llama_7b/llama_7b_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. 
Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.title max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/llama/llama_7b/llama_7b_qlora_lawyer_e3.py b/xtuner/configs/llama/llama_7b/llama_7b_qlora_lawyer_e3.py index fb1d7dab2..5420e5774 100644 --- a/xtuner/configs/llama/llama_7b/llama_7b_qlora_lawyer_e3.py +++ b/xtuner/configs/llama/llama_7b/llama_7b_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.lawyer max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_arxiv_gentitle_e3.py b/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_arxiv_gentitle_e3.py index a6d3cdf8c..68cfd48e7 100644 --- a/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.title max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_lawyer_e3.py b/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_lawyer_e3.py index ade478543..e847e0995 100644 --- a/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_lawyer_e3.py +++ b/xtuner/configs/qwen/qwen_7b/qwen_7b_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.lawyer max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_arxiv_gentitle_e3.py b/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_arxiv_gentitle_e3.py index d662b888e..3d819b50b 100644 --- a/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_arxiv_gentitle_e3.py +++ b/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_arxiv_gentitle_e3.py @@ -25,8 +25,8 @@ # Data # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv -# 2. Process data with `./tools/data_preprocess/arxiv.py` -data_path = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json' +# 2. 
Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' prompt_template = PROMPT_TEMPLATE.qwen_chat max_length = 2048 pack_to_max_length = True diff --git a/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_lawyer_e3.py b/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_lawyer_e3.py index d4d0d8e42..701887f46 100644 --- a/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_lawyer_e3.py +++ b/xtuner/configs/qwen/qwen_7b_chat/qwen_7b_chat_qlora_lawyer_e3.py @@ -27,8 +27,8 @@ # Data # download data from https://github.com/LiuHC0428/LAW-GPT -crime_kg_assitant_path = './data/law/CrimeKgAssitant清洗后_52k.json' -law_reference_data_path = './data/law/训练数据_带法律依据_92k.json' +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' prompt_template = PROMPT_TEMPLATE.qwen_chat max_length = 2048 pack_to_max_length = True