diff --git a/.gitignore b/.gitignore index f922610..5d8287c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ _unsloth_sentencepiece_temp _* llama.cpp !dummy.txt -logs \ No newline at end of file +logs +model diff --git a/ChatTemplate.py b/ChatTemplate.py new file mode 100644 index 0000000..feba3a5 --- /dev/null +++ b/ChatTemplate.py @@ -0,0 +1,43 @@ +from datasets import load_dataset, Dataset +import pandas as pd +from unsloth.chat_templates import get_chat_template + +class ChatTemplate(): + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def formating_messages(self,example): + user_chat = {"role": example["user"]["role"], "content": example["user"]["content"]} + assistant_chat = {"role": example["assistant"]["role"], "content": example["assistant"]["content"]} + return {"messages": [user_chat, assistant_chat]} + + def formatting_prompts_func(self,examples): + convos = examples["messages"] + texts = [self.tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos] + return { "text" : texts, } + + def load_data(self): + self.tokenizer = get_chat_template( + self.tokenizer, + chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth + mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"}, # ShareGPT style + map_eos_token = True, # Maps <|im_end|> to instead + ) + dataset_train = load_dataset("Labagaite/fr-summarizer-dataset", split = "train") + dataset_val = load_dataset("Labagaite/fr-summarizer-dataset", split = "validation") + # Group the data + grouped_data_train = [{"user": dataset_train[i], "assistant": dataset_train[i+1]} for i in range(0, len(dataset_train), 2)] + grouped_data_val = [{"user": dataset_val[i], "assistant": dataset_val[i+1]} for i in range(0, len(dataset_val), 2)] + # Convert the list of dictionaries to a DataFrame + df_train = pd.DataFrame(grouped_data_train) + df_val = pd.DataFrame(grouped_data_val) + # Create a new Dataset object + dataset_train = Dataset.from_pandas(df_train) + dataset_val = Dataset.from_pandas(df_val) + + dataset_train = dataset_train.map(self.formating_messages, batched = False) + dataset_train = dataset_train.map(self.formatting_prompts_func, batched = True) + dataset_val = dataset_val.map(self.formating_messages, batched = False) + dataset_val = dataset_val.map(self.formatting_prompts_func, batched = True) + + return dataset_train, dataset_val \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..43b6688 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [2024] [DERUE-William] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/SampleGenerationCallback.py b/SampleGenerationCallback.py new file mode 100644 index 0000000..4b79bdf --- /dev/null +++ b/SampleGenerationCallback.py @@ -0,0 +1,62 @@ +# Description: This file contains the SampleGenerationCallback class which is used to generate and score summaries during training. +from transformers import TrainerCallback +import wandb +import random +from rouge_score import rouge_scorer + +class SampleGenerationCallback(TrainerCallback): + def __init__(self, every_x_steps=5, dataset_val=None, generate_summary=None, score_threshold = 0.2, patience=5, min_delta=0.01, warmup_steps=10): + self.every_x_steps = every_x_steps + self.dataset_val = dataset_val + self.generate_summary = generate_summary + self.score_threshold = score_threshold + self.patience = patience + self.min_delta = min_delta + self.warmup_steps = warmup_steps + self.rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True) + self.summary_table_data = [] + self.best_score = None + self.patience_counter = 0 + + def generate_and_score_summary(self): + # Get the length of the list + length = len(self.dataset_val["messages"]) + # Generate a random index + index = random.randrange(0, length) + messages_chat = self.dataset_val[index]["messages"] + # Remove content from dictionaries with role 'assistant' + for message in messages_chat: + if message['role'] == 'assistant': + message['content'] = '' + messages_text = self.dataset_val[index]["text"] + messages_str = "".join(messages_text) + Reference_summary = messages_str.split('assistant', 1)[1] + summary_text = self.generate_summary(messages_chat) + scores = self.rouge.score(Reference_summary, summary_text) + rouge1 = scores['rouge1'].fmeasure + rouge2 = scores['rouge2'].fmeasure + rougeL = scores['rougeL'].fmeasure + return summary_text, Reference_summary, rouge1, rouge2, rougeL + + def on_step_end(self, args, state, control, model, **kwargs): + if state.global_step % self.every_x_steps == 0: + summary_text, Reference_summary, rouge1, rouge2, rougeL = self.generate_and_score_summary() + self.summary_table_data.append([Reference_summary, summary_text, f"Rouge-1: {rouge1},\n Rouge-2: {rouge2},\n Rouge-L: {rougeL}"]) + my_table = wandb.Table(columns=["Reference_summary", "Generated_summary", "Rouge-Score"], data=self.summary_table_data) + wandb.log({"summary_table": my_table}) + + if state.global_step % args.eval_steps == 0 and state.global_step > self.warmup_steps: + _, _, rouge1, rouge2, rougeL = self.generate_and_score_summary() + wandb.log({"ROUGE-1": rouge1, "ROUGE-2": rouge2, "ROUGE-L": rougeL}) + + # Check if the performance has improved + if self.best_score is None or rouge2 > self.best_score + self.min_delta: + self.best_score = rouge2 + self.patience_counter = 0 + else: + self.patience_counter += 1 + + # Check if the patience has been exceeded + if self.patience_counter >= self.patience: + control.should_training_stop = True + print(f"\033[91m\nEarly stopping at step {state.global_step}, rouge2 score did not improve: {rouge2}\n\033[0m") \ No newline at end of file diff --git a/docs/ChatTemplate.md b/docs/ChatTemplate.md new file mode 100644 index 0000000..d7d22bb --- /dev/null +++ b/docs/ChatTemplate.md @@ -0,0 +1,45 @@ +# 
ChatTemplate

The `ChatTemplate.py` script is a key part of the LLM Summarizer Trainer project. It defines a `ChatTemplate` class that preprocesses conversational chat datasets into the format the trainer expects.

## Key Points

The `ChatTemplate` class is initialized with a tokenizer and exposes several methods:

- `formating_messages`: Takes one example from the dataset and formats it into a dictionary with a "messages" key. The value is a list of dictionaries, each representing a chat message with "role" and "content" keys.

- `formatting_prompts_func`: Takes a batch of examples and applies the chat template to the "messages" of each example. It returns a dictionary with a "text" key whose value is the list of formatted texts.

- `load_data`: Loads the training and validation datasets, groups the entries into user/assistant pairs, converts the grouped data into a DataFrame, creates a new Dataset object from the DataFrame, and applies `formating_messages` and `formatting_prompts_func` to both datasets.

## Configuration

In the `load_data` method, you can configure the chat template and the mapping by changing the arguments passed to the `get_chat_template` function. The `chat_template` argument selects the template to apply; the available templates are listed in the [unsloth source](https://github.com/unslothai/unsloth/blob/4606443b77f98a624896d4ca50710255d8436d86/unsloth/chat_templates.py#L258). For example, change `chat_template = "chatml"` to `chat_template = "zephyr"` to use the zephyr chat template.

The `mapping` argument specifies how the roles and contents of the chat messages map onto the keys in the dataset. The related formatting code in `formating_messages` is:

```python
user_chat = {"role": example["user"]["role"], "content": example["user"]["content"]}
assistant_chat = {"role": example["assistant"]["role"], "content": example["assistant"]["content"]}
```

Here, `"user"` and `"assistant"` are the keys created by the grouping step in `load_data`. If your dataset uses different names, for example `"human"` and `"gpt"`, update the grouping code and this method consistently:

```python
user_chat = {"role": example["human"]["role"], "content": example["human"]["content"]}
assistant_chat = {"role": example["gpt"]["role"], "content": example["gpt"]["content"]}
```

To use a different dataset, change the arguments passed to the `load_dataset` function. The first argument is the name of the dataset to load; for example, replace `"Labagaite/fr-summarizer-dataset"` with `"your_dataset_name"`. The `split` argument selects which split to load. Note that this script is designed for datasets in chat format where each entry is an instruction and the following entry is the response: the script groups the entries two by two so that each resulting example is one conversation.

## Usage

To preprocess a dataset with the `ChatTemplate` class, call the `load_data` method. It returns the preprocessed training and validation datasets:

```python
dataset_train, dataset_val = chat_template.load_data()
```

Here, `dataset_train` and `dataset_val` are the preprocessed training and validation datasets, which you can pass directly to the trainer.
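For a fuller picture, here is a minimal sketch of how the class is typically wired up end to end, mirroring what `snippet.py` and `trainer.py` do; the base model name is only an example and any Unsloth-compatible checkpoint can be substituted:

```python
from unsloth import FastLanguageModel
from ChatTemplate import ChatTemplate

# Load a base model and its tokenizer (the model name here is just an example).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2b-it-bnb-4bit",
    max_seq_length=1024,
    dtype=None,          # let unsloth pick the dtype
    load_in_4bit=True,
)

# Some tokenizers ship without a pad token; fall back to the EOS token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# ChatTemplate wraps the tokenizer with the chatml template and builds the datasets.
chat_template = ChatTemplate(tokenizer)
dataset_train, dataset_val = chat_template.load_data()

# Each example now carries both the structured "messages" and the rendered "text".
print(dataset_train[0]["messages"][0]["role"])  # e.g. "user"
print(dataset_train[0]["text"][:200])           # start of the formatted conversation
```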
+ +For more details on how the `ChatTemplate` class works and how to use it, please refer to the [ChatTemplate Documentation](docs/ChatTemplate.md). \ No newline at end of file diff --git a/docs/dummy.txt b/docs/dummy.txt new file mode 100644 index 0000000..e69de29 diff --git a/docs/model_selection.md b/docs/model_selection.md new file mode 100644 index 0000000..747fa09 --- /dev/null +++ b/docs/model_selection.md @@ -0,0 +1,40 @@ +# Model Selection + +The `model_selector.py` script provides functionality to select a model from a list of models compatible with Unsloth, search for a model, or select a model from a local folder. It contains two main functions: `select_model` and `get_model_list`. + +## select_model Function + +The `select_model` function allows the user to select a model in one of three ways: + +1. **Search for a model:** The user can enter a search term, and the function will print out all models that contain this term in their name. The user can then enter the name of the model they want to select. + +2. **Select a model from a list:** The function prints out a list of standard models and 4x faster models. The user can then enter the name of the model they want to select. + +3. **Select a model from a local folder:** The user can enter the path of a local folder containing the model. + +The function returns the selected model and a boolean value indicating whether the selected model is a 4x faster model. + +## get_model_list Function + +The `get_model_list` function retrieves a list of models from the Hugging Face Model Hub. It sends a GET request to the Hugging Face API and parses the response to separate the models into standard models and 4x faster models. + +The function returns these two lists of models. + +## Usage + +To use the `select_model` function, you first need to get a list of models using the `get_model_list` function. Then, you can pass these lists to the `select_model` function. Here's an example: + +```python +standard_models, faster_models = get_model_list() +selected_model, is_4bit = select_model(standard_models, faster_models) +``` + +This will prompt the user to select a model as described above. + +Please note that you need to have the `requests` library installed to use the `get_model_list` function. You can install it with the following command: + +```bash +pip install requests +``` + +This document provides a detailed description of the `model_selector.py` script. For more information on how to use this script in the context of the LLM Summarizer Trainer project, please refer to the main README file. \ No newline at end of file diff --git a/docs/save_model.md b/docs/save_model.md new file mode 100644 index 0000000..9f7e7b1 --- /dev/null +++ b/docs/save_model.md @@ -0,0 +1,42 @@ +How to use the `ModelSaver` class to save the model. + + +## Saving the Model + +The `ModelSaver` class in `modelSaver.py` is used to save the model. Here's how you can use it: + +1. Initialize the `ModelSaver` class with your model, tokenizer, and other necessary parameters: + +```python +model_saver = ModelSaver(model, tokenizer, fine_tuned_model_dir, out_model_name, wandb_run_url, wandb_run_path) +``` + +2. Call the `save_model` method of the `ModelSaver` class: + +```python +model_saver.save_model() +``` + +When you run `save_model`, you will be prompted to enter the types of models you want to save. Options are: '16bit', '4bit', 'lora', 'gguf_q8_0', 'gguf_f16', 'gguf_q4_k_m'. You can enter 'all' to save all types. 
If you want to save multiple types, separate them with commas. + +The `ModelSaver` class will then save your model in the specified formats and update the model card with the training details and performance metrics. + +Please replace `model`, `tokenizer`, `fine_tuned_model_dir`, `out_model_name`, `wandb_run_url`, and `wandb_run_path` with your actual parameters. They are automatically retrieve by `trainer.py` during training. + + +## Updating the Model Card + +The `ModelSaver` class in `modelSaver.py` also includes functionality to update the model card on Hugging Face Model Hub. The model card provides important information about the model, including its base model, the method used for training, the ROUGE scores achieved, and a link to the training logs on Weights & Biases. + +The `UpdateModelCard` method is used to update the model card. It first retrieves the ROUGE scores from the Weights & Biases run using the `get_wandb_run` method. It then formats the model card content using these scores and other information about the model. Finally, it pushes the updated model card to the Hugging Face Model Hub. + +Here's how you can use the `UpdateModelCard` method: + +```python +model_saver.UpdateModelCard(save_directory, token) +``` + +Please replace `save_directory` and `token` with your actual parameters. The `save_directory` is the directory where the model is saved, and `token` is your Hugging Face API token. + +The model card is formatted using the `CUSTOM_MODEL_CARD` string, which is a template for the model card content. You can modify this template to include any additional information you want to display on the model card. + diff --git a/docs/setup.md b/docs/setup.md new file mode 100644 index 0000000..0b15a23 --- /dev/null +++ b/docs/setup.md @@ -0,0 +1,78 @@ +## Setup + +### Automated Setup (Linux) + +If you are using a Linux system, you can simply run the `setup.sh` script to set up your environment. This script creates a virtual environment, installs the necessary requirements, and configures the environment based on your GPU architecture and PyTorch version. + +To run the script, open a terminal and navigate to the directory containing the `setup.sh` file. Then, run the following command: + +```bash +./setup.sh +``` + +### Manual Setup + +If you prefer to set up your environment manually or are using a different operating system, follow these steps: + +1. **Create a virtual environment**: You can use `venv` to create a virtual environment. Open a terminal and run the following command: + + ```bash + python3 -m venv env + ``` + +2. **Activate the virtual environment**: The command to activate the environment depends on your operating system: + + - On Linux or MacOS, run: + + ```bash + source env/bin/activate + ``` + +3. **Install the requirements**: The `requirements.txt` file lists the Python packages that your project depends on. You can install these using `pip`: + + ```bash + pip install -r requirements.txt + ``` + + This will install the following packages: + + - `wandb`: Weights & Biases, a tool for tracking and visualizing machine learning experiments. + - `rouge_score`: A Python package for calculating the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score, a common metric for evaluating text summaries. + - `evaluate`: A package for evaluating machine learning models. + +4. 
**Install additional packages based on your GPU architecture and PyTorch version**: Refer to the `setup.sh` script for the specific packages to install based on your GPU architecture (Ampere or older) and PyTorch version (2.1.0, 2.1.1, 2.2.0, or 2.2.1). You can check your GPU architecture and PyTorch version using the following Python commands: + + ```python + import torch + print(torch.version.cuda) # prints the CUDA version + print(torch.version.__version__) # prints the PyTorch version + print(torch.cuda.get_device_capability()[0]) # prints the GPU architecture + ``` + + Then, install the appropriate packages using `pip`. For example, if your GPU architecture is Ampere or newer and your PyTorch version is 2.2.1, you would run: + + ```bash + pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" + pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes + ``` + +Remember to replace the commands with the appropriate ones for your GPU architecture and PyTorch version. Find more at [unsloth Github](https://github.com/unslothai/unsloth) + +## Setting Up Environment Variables + +For the application to function correctly, you need to set up several environment variables. These variables hold the API keys for various services that the application uses. + +Create a `.env` file in the root directory of the project and add the following lines to it: + +```properties +HUGGING_FACE=your_hugging_face_token +WANDB_API_KEY=your_wandb_api_key +# Add more tokens as needed +``` + +Replace `your_hugging_face_token`, `your_openai_key`, and `your_wandb_api_key` with your actual API keys. + +- `HUGGING_FACE`: Your Hugging Face API token. You can find this on your Hugging Face account page. +- `WANDB_API_KEY`: Your Weights & Biases API key. You can find this on your Weights & Biases account page. + +Remember not to share these tokens with anyone or publish them online. They provide access to your accounts on these services, and anyone with these tokens can use these services as if they were you. \ No newline at end of file diff --git a/docs/training_metrics.md b/docs/training_metrics.md new file mode 100644 index 0000000..4303ae8 --- /dev/null +++ b/docs/training_metrics.md @@ -0,0 +1,41 @@ +# Training Metrics + +The `SampleGenerationCallback.py` script is part of the LLM Summarizer Trainer project. It defines a custom callback class `SampleGenerationCallback` that is used during the training process to generate and score summaries. This class is a subclass of the `TrainerCallback` class from the Hugging Face Transformers library. + +## How it Works + +The `SampleGenerationCallback` class is initialized with several parameters: + +- `every_x_steps`: The number of steps after which a summary is generated and scored. +- `dataset_val`: The validation dataset used to generate the summaries. +- `generate_summary`: A function that generates a summary given a list of messages. +- `score_threshold`: The threshold below which training should stop. + +The class has two main methods: + +- `generate_and_score_summary`: This method generates a random summary and scores it using the ROUGE metric. It first selects a random message from the validation dataset, generates a summary for it, and then scores the summary against the reference summary using the ROUGE-1, ROUGE-2, and ROUGE-L metrics. + +- `on_step_end`: This method is called at the end of each training step. 
If the current step is a multiple of `every_x_steps`, it generates and scores a summary and logs the results to Weights & Biases. If the current step is a multiple of `args.eval_steps`, it generates and scores a summary and logs the ROUGE scores to Weights & Biases. If the ROUGE-2 score is below the `score_threshold`, it stops the training process. + +## Why it's Used + +The `SampleGenerationCallback` class is used to monitor the performance of the model during training. By generating and scoring summaries at regular intervals, it provides a way to track how well the model is learning to generate summaries. The ROUGE scores give a quantitative measure of the quality of the summaries, and logging these scores to Weights & Biases allows for easy tracking and visualization of the training progress. + +The `SampleGenerationCallback` class also provides a form of early stopping. If the ROUGE-2 score falls below a certain threshold, it stops the training process. This can prevent overfitting and save computational resources by stopping the training process when the model is no longer improving. + +## How to Use + +To use the `SampleGenerationCallback` class, you need to initialize it with the appropriate parameters and pass it to the `Trainer` class when initializing the trainer. Here's an example: + +```python +sample_generation_callback = SampleGenerationCallback(every_x_steps=5, dataset_val=dataset_val, generate_summary=generate_summary, score_threshold=0.2) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + callbacks=[sample_generation_callback] +) +``` + +In this example, `generate_summary` is a function that generates a summary given a list of messages. This function is defined in [Trainer script](../trainer.py) \ No newline at end of file diff --git a/docs/training_model.md b/docs/training_model.md new file mode 100644 index 0000000..25bba64 --- /dev/null +++ b/docs/training_model.md @@ -0,0 +1,27 @@ +# Training the Model + +## trainer Method + +The `trainer` method initializes the model and the trainer. It first patches the model and adds fast LoRA weights using the `get_peft_model` method of the `FastLanguageModel` class. It then initializes a Weights & Biases run and sets up the trainer with the model, the training and evaluation datasets, the tokenizer, and the training arguments. + +The training arguments include settings for the batch size, gradient accumulation steps, warmup steps, maximum steps, learning rate scheduler, and more. The trainer also includes several callbacks for generating samples, early stopping, and logging to Weights & Biases. + +## Start_Training Method + +The `Start_Training` method starts the training process. It checks if there are any saved checkpoints in the `outputs` folder. If there are, it resumes training from the latest checkpoint. If there are no saved checkpoints, it starts training from scratch. + +After training is completed, the method saves the model to the specified directory and logs the end of training. + +The trainer saves checkpoints after every few steps as specified in the training arguments. This allows you to resume training from the latest checkpoint if the training process is interrupted for any reason. + +## Usage + +To use the `trainer` and `Start_Training` methods, you first need to initialize the `TrainerClass` with your model, tokenizer, training and evaluation datasets, and other necessary parameters. 
Then, you can call the `trainer` method to initialize the trainer and the `Start_Training` method to start the training process. Here's an example: + +```python +trainer_class = TrainerClass(model, tokenizer, dataset_train, dataset_val, max_seq_length, out_model_name, fine_tuned_model_dir) +trainer = trainer_class.trainer() +trainer_class.Start_Training(trainer) +``` + +This document provides a detailed description of the `trainer.py` script. For more information on how to use this script in the context of the LLM Summarizer Trainer project, please refer to the main README file. In the next section, we will look in more detail at the metric and sample generation callback, a custom callback function.0 \ No newline at end of file diff --git a/images/Llm Summarizer trainer icon.jpg b/images/Llm Summarizer trainer icon.jpg new file mode 100644 index 0000000..f556fcf Binary files /dev/null and b/images/Llm Summarizer trainer icon.jpg differ diff --git a/images/Llm_Summarizer_trainer_icon-removebg.png b/images/Llm_Summarizer_trainer_icon-removebg.png new file mode 100644 index 0000000..38adeb7 Binary files /dev/null and b/images/Llm_Summarizer_trainer_icon-removebg.png differ diff --git a/modelSaver.py b/modelSaver.py new file mode 100644 index 0000000..094d183 --- /dev/null +++ b/modelSaver.py @@ -0,0 +1,175 @@ + +from huggingface_hub import ModelCard +from dotenv import load_dotenv +import os +import wandb +load_dotenv() +HUGGING_FACE = os.getenv('HUGGING_FACE') +WANDB_API_KEY = os.getenv('WANDB_API_KEY') +wandb.login(key=WANDB_API_KEY) +api = wandb.Api() + +class ModelSaver: + def __init__(self, model, tokenizer, fine_tuned_model_dir, out_model_name, wandb_run_url, wandb_run_path): + self.model = model + self.tokenizer = tokenizer + self.fine_tuned_model_dir = fine_tuned_model_dir + self.out_model_name = out_model_name + self.method = "" + self.wandb_run_url = wandb_run_url + self.wandb_run_path = wandb_run_path + def save_model(self): + print("\nEnter the types of models you want to save. Options are: '16bit', '4bit', 'lora', 'gguf_q8_0', 'gguf_f16', 'gguf_q4_k_m'. Enter 'all' to save all types. 
Separate multiple options with commas.\n") + user_input = [x.strip() for x in input().split(',')] + + if '16bit' in user_input or 'all' in user_input: + self.method = "16bit" + temp_model_name = self.out_model_name + if(temp_model_name.endswith("-bnb-4bit")): + temp_model_name = temp_model_name.replace("-bnb-4bit", "") + print(f"\033[32m\nSaving 16bit model as \033[34m{temp_model_name}\033[32m\n\033[0m") + self.model.save_pretrained_merged(f"{self.fine_tuned_model_dir}/{temp_model_name}", self.tokenizer, save_method = "merged_16bit",) + self.model.push_to_hub_merged(f"Labagaite/{temp_model_name}", self.tokenizer, save_method = "merged_16bit", token = HUGGING_FACE) + self.UpdateModelCard(f"Labagaite/{temp_model_name}", HUGGING_FACE) + + if '4bit' in user_input or 'all' in user_input: + self.method = "4bit" + temp_model_name = self.out_model_name + if(temp_model_name.endswith("-bnb-4bit")): + temp_model_name = temp_model_name.replace("-bnb-4bit", "-bnb-4bit") + print(f"\033[32m\nSaving 16bit model as \033[34m{temp_model_name}\033[32m\n\033[0m") + self.model.save_pretrained_merged(f"{self.fine_tuned_model_dir}/{temp_model_name}", self.tokenizer, save_method = "merged_4bit_forced",) + self.model.push_to_hub_merged(f"Labagaite/{temp_model_name}", self.tokenizer, save_method = "merged_4bit_forced", token = HUGGING_FACE) + self.UpdateModelCard(f"Labagaite/{temp_model_name}", HUGGING_FACE) + + if 'lora' in user_input or 'all' in user_input: + self.method = "lora" + temp_model_name = self.out_model_name + if(temp_model_name.endswith("-bnb-4bit")): + temp_model_name = temp_model_name.replace("-bnb-4bit", "-LORA-bnb-4bit") + else: + temp_model_name = temp_model_name + "-LORA" + print(f"\033[32m\nSaving 16bit model as \033[34m{temp_model_name}\033[32m\n\033[0m") + self.model.save_pretrained_merged(f"{self.fine_tuned_model_dir}/{temp_model_name}", self.tokenizer, save_method = "lora",) + self.model.push_to_hub_merged(f"Labagaite/{temp_model_name}", self.tokenizer, save_method = "lora", token = HUGGING_FACE) + self.UpdateModelCard(f"Labagaite/{temp_model_name}", HUGGING_FACE) + + if 'gguf_q8_0' in user_input or 'all' in user_input: + self.method = "q8_0" + temp_model_name = self.out_model_name + if(temp_model_name.endswith("-bnb-4bit")): + temp_model_name = temp_model_name.replace("-bnb-4bit", "-GGUF-Q8-0") + else: + temp_model_name = temp_model_name + "-GGUF-Q8-0" + print(f"\033[32m\nSaving 16bit model as \033[34m{temp_model_name}\033[32m\n\033[0m") + self.model.save_pretrained_gguf(f"{self.fine_tuned_model_dir}/{temp_model_name}", self.tokenizer,) + self.model.push_to_hub_gguf(f"Labagaite/{temp_model_name}", self.tokenizer, token = HUGGING_FACE) + self.UpdateModelCard(f"Labagaite/{temp_model_name}", HUGGING_FACE) + + if 'gguf_f16' in user_input or 'all' in user_input: + self.method = "f16" + temp_model_name = self.out_model_name + if(temp_model_name.endswith("-bnb-4bit")): + temp_model_name = temp_model_name.replace("-bnb-4bit", "-GGUF") + else: + temp_model_name = temp_model_name + "-GGUF" + print(f"\033[32m\nSaving 16bit model as \033[34m{temp_model_name}\033[32m\n\033[0m") + self.model.save_pretrained_gguf(f"{self.fine_tuned_model_dir}/{temp_model_name}", self.tokenizer, quantization_method = "f16") + self.model.push_to_hub_gguf(f"Labagaite/{temp_model_name}", self.tokenizer, quantization_method = "f16", token = HUGGING_FACE) + self.UpdateModelCard(f"Labagaite/{temp_model_name}", HUGGING_FACE) + + if 'gguf_q4_k_m' in user_input or 'all' in user_input: + self.method = "q4_k_m" + temp_model_name = 
self.out_model_name + if(temp_model_name.endswith("-bnb-4bit")): + temp_model_name = temp_model_name.replace("-bnb-4bit", "-GGUF-q4-k-m") + else: + temp_model_name = temp_model_name + "-GGUF-q4-k-m" + print(f"\033[32m\nSaving 16bit model as \033[34m{temp_model_name}\033[32m\n\033[0m") + self.model.save_pretrained_gguf(f"{self.fine_tuned_model_dir}/{temp_model_name}", self.tokenizer, quantization_method = "q4_k_m") + self.model.push_to_hub_gguf(f"Labagaite/{temp_model_name}", self.tokenizer, quantization_method = "q4_k_m", token = HUGGING_FACE) + self.UpdateModelCard(f"Labagaite/{temp_model_name}", HUGGING_FACE) + + def UpdateModelCard(self, save_directory, token): + rouge_1_score, rouge_2_score, rougeL_score = self.get_wandb_run() + content = CUSTOM_MODEL_CARD.format( + username="Labagaite", + base_model=self.model.config._name_or_path, + model_type=self.model.config.model_type, + method=self.method, + extra="", + wandb_run_url=self.wandb_run_url, + rouge_1_score=rouge_1_score, + rouge_2_score=rouge_2_score, + rougeL_score=rougeL_score, + ) + card = ModelCard(content) + card.push_to_hub(save_directory, token = token) + + def get_wandb_run(self): + run = api.run(self.wandb_run_path) + # Access the summary metrics + rouge_1_score = run.summary.get("ROUGE-1") + rouge_2_score = run.summary.get("ROUGE-2") + rougeL_score = run.summary.get("ROUGE-L") + return rouge_1_score, rouge_2_score, rougeL_score + + + +# Define new custom Model Card +CUSTOM_MODEL_CARD = """ +--- +base_model: {base_model} +tags: +- text-generation-inference +- transformers +- unsloth +- {model_type} +- {extra} +- summarizer +- {method} +license: apache-2.0 +language: +- fr +--- + +# Uploaded as {method} model + +- **Developed by:** {username} +- **License:** apache-2.0 +- **Finetuned from model :** {base_model} + +# Training Logs + +## Summary metrics +### Best ROUGE-1 score : **{rouge_1_score}** +### Best ROUGE-2 score : **{rouge_2_score}** +### Best ROUGE-L score : **{rougeL_score}** + +## Wandb logs +You can view the training logs []({wandb_run_url}). + +## Training details + +### training data +- Dataset : [fr-summarizer-dataset](https://huggingface.co/datasets/Labagaite/fr-summarizer-dataset) +- Data-size : 7.65 MB +- train : 1.97k rows +- validation : 440 rows +- roles : user , assistant +- Format chatml "role": "role", "content": "content", "user": "user", "assistant": "assistant" +
+*French audio podcast transcription* + +# Project details +[](https://github.com/WillIsback/Report_Maker) +Fine-tuned on French audio podcast transcription data for summarization task. As a result, the model is able to summarize French audio podcast transcription data. +The model will be used for an AI application: [Report Maker](https://github.com/WillIsback/Report_Maker) wich is a powerful tool designed to automate the process of transcribing and summarizing meetings. +It leverages state-of-the-art machine learning models to provide detailed and accurate reports. + +This {model_type} model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library. +This {model_type} was trained with [LLM summarizer trainer](images/Llm_Summarizer_trainer_icon-removebg.png) +[](https://github.com/unslothai/unsloth) +**LLM summarizer trainer** +[](https://github.com/WillIsback/LLM_Summarizer_Trainer) +""" diff --git a/model_selector.py b/model_selector.py new file mode 100644 index 0000000..6730a91 --- /dev/null +++ b/model_selector.py @@ -0,0 +1,54 @@ +# Description: This script contains the function select_model, which allows the user to select a model from a list of models, search for a model, or select a model from a local folder. The function also returns a boolean value indicating whether the selected model is a 4x faster model. +import requests + +def select_model(standard_models, faster_models): + print("Enter '1' to search for a model, '2' to select a model from a list, or '3' to select a model from a local folder.") + choice = input() + + if choice == '1': + print("Enter the name of the model you want to search for:") + search_term = input() + for model in standard_models + faster_models: + if search_term in model: + print(model) + print("Enter the name of the model you want to select:") + selected_model = input() + is_4bit = selected_model.endswith('4bit') + return selected_model, is_4bit + + elif choice == '2': + print("Standard models:") + for model in standard_models: + print(model) + print("\n4x faster models:") + for model in faster_models: + print(model) + print("Enter the name of the model you want to select:") + selected_model = input() + is_4bit = selected_model.endswith('4bit') + return selected_model, is_4bit + + elif choice == '3': + print("Enter the path of the local folder containing the model:") + folder_path = input() + is_4bit = '4bit' in folder_path + return folder_path, is_4bit + + else: + print("Invalid choice. Please enter '1', '2', or '3'.") + return select_model(standard_models, faster_models) + +def get_model_list(): + response = requests.get('https://huggingface.co/api/models?search=unsloth') + data = response.json() + + standard_models = [] + faster_models = [] + + for model in data: + if model['modelId'].endswith('4bit'): + faster_models.append(model['modelId']) + else: + standard_models.append(model['modelId']) + + return standard_models, faster_models diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..b247c49 --- /dev/null +++ b/readme.md @@ -0,0 +1,201 @@ +

# LLM Summarizer Trainer

*Fine-tuning Large Language Models for text summarization*

[Report Maker](https://github.com/WillIsback/Report_Maker) · Model & Data · [unsloth](https://github.com/unslothai/unsloth)
+ +## Table of Contents +- [Summary](#summary) +- [Motivation](#motivation) +- [Setup](#setup) + - [Automated Setup (Linux)](#automated-setup-linux) + - [Manual Setup](#manual-setup) +- [Setting Up Environment Variables](#setting-up-environment-variables) +- [How to Use](#how-to-use) +- [Model Selection](#model-selection) +- [Data Preprocessing](#data-preprocessing) +- [Training the Model](#training-the-model) + - [Training Metrics](#training-metrics) +- [Saving and Updating the Model](#saving-and-updating-the-model) +- [Conclusion](#conclusion) +- [License](#license) + +## Summary +The `trainer.py` script is part of the LLM Summarizer Trainer project. It is used to fine-tune a Large Language Model (LLM) for the task of summarizing text using QLora as the fine-tuning method. The script leverages the power of the Hugging Face Transformers library, the Weights & Biases tool for experiment tracking, and the TRL library for training. All in the fast tool [unsloth](https://github.com/unslothai/unsloth). +[](https://github.com/unslothai/unsloth) + +## Motivation +[](https://github.com/WillIsback/Report_Maker) +To fine-tuned on French audio podcast transcription data for summarization task. The model will be used for an AI application: [Report Maker](https://github.com/WillIsback/Report_Maker) wich is a powerful tool designed to automate the process of transcribing and summarizing meetings. It leverages state-of-the-art machine learning models to provide detailed and accurate reports. + +## Setup + +### Automated Setup (Linux) + +If you are using a Linux system, you can simply run the `setup.sh` script to set up your environment. This script creates a virtual environment, installs the necessary requirements, and configures the environment based on your GPU architecture and PyTorch version. + +To run the script, open a terminal and navigate to the directory containing the `setup.sh` file. Then, run the following command: + +```bash +./setup.sh +``` + +### Manual Setup + +If you prefer to set up your environment manually or are using a different operating system, follow these steps: + +1. **Create a virtual environment**: You can use `venv` to create a virtual environment. Open a terminal and run the following command: + + ```bash + python3 -m venv env + ``` + +2. **Activate the virtual environment**: The command to activate the environment depends on your operating system: + + - On Linux or MacOS, run: + + ```bash + source env/bin/activate + ``` + +3. **Install the requirements**: The `requirements.txt` file lists the Python packages that your project depends on. You can install these using `pip`: + + ```bash + pip install -r requirements.txt + ``` + + This will install the following packages: + + - `wandb`: Weights & Biases, a tool for tracking and visualizing machine learning experiments. + - `rouge_score`: A Python package for calculating the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score, a common metric for evaluating text summaries. + - `evaluate`: A package for evaluating machine learning models. + +4. **Install additional packages based on your GPU architecture and PyTorch version**: Refer to the `setup.sh` script for the specific packages to install based on your GPU architecture (Ampere or older) and PyTorch version (2.1.0, 2.1.1, 2.2.0, or 2.2.1). 
You can check your GPU architecture and PyTorch version using the following Python commands:

   ```python
   import torch
   print(torch.version.cuda)                     # prints the CUDA version
   print(torch.version.__version__)              # prints the PyTorch version
   print(torch.cuda.get_device_capability()[0])  # prints the GPU compute capability (8 or higher = Ampere or newer)
   ```

   Then, install the appropriate packages using `pip`. For example, if your GPU architecture is Ampere or newer and your PyTorch version is 2.2.1, you would run:

   ```bash
   pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
   pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
   ```

Remember to replace the commands with the appropriate ones for your GPU architecture and PyTorch version. Find more at the [unsloth GitHub repository](https://github.com/unslothai/unsloth).

## Setting Up Environment Variables

For the application to function correctly, you need to set up several environment variables. These variables hold the API keys for the services that the application uses.

Create a `.env` file in the root directory of the project and add the following lines to it:

```properties
HUGGING_FACE=your_hugging_face_token
WANDB_API_KEY=your_wandb_api_key
# Add more tokens as needed
```

Replace `your_hugging_face_token` and `your_wandb_api_key` with your actual API keys.

- `HUGGING_FACE`: Your Hugging Face API token. You can find this on your Hugging Face account page.
- `WANDB_API_KEY`: Your Weights & Biases API key. You can find this on your Weights & Biases account page.

Do not share these tokens with anyone or publish them online. They provide access to your accounts on these services, and anyone who has them can use these services as if they were you.

## How to Use

To use the script, you need to have the necessary libraries installed, including `torch`, `transformers`, `trl`, `wandb`, and others. You also need a Hugging Face account and a Weights & Biases account, with the API keys stored in environment variables as described above.

To run the script, navigate to the directory containing it and run:

```bash
python trainer.py
```

During the testing phase, you will be prompted to enter 'r' to retry with a different message, 's' to save the model, or 'q' to quit.

## Model Selection

The LLM Summarizer Trainer includes functionality to select a model for fine-tuning. The `model_selector.py` script is used for this purpose. It allows you to search for a model, select a model from a list, or select a model from a local folder.

For more details on how to use the `model_selector.py` script to select a model, please refer to the [Model Selection Documentation](docs/model_selection.md).

## Data Preprocessing

The `ChatTemplate.py` script is a crucial part of the LLM Summarizer Trainer project. It defines a `ChatTemplate` class that preprocesses conversational chat datasets into the format the trainer expects.

The `ChatTemplate` class is initialized with a tokenizer and has several methods for formatting and loading the data. It lets you configure the chat template, the mapping between the roles and contents of the chat messages and the keys in the dataset, and the dataset to load, so any dataset in chat format can be used.
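To make the expected input shape concrete, here is a small illustrative sketch of what `ChatTemplate.load_data()` does with the paired entries; the field names follow `ChatTemplate.py`, while the content strings are placeholders rather than real dataset rows:

```python
# Raw dataset rows alternate user / assistant, each carrying "role" and "content".
rows = [
    {"role": "user", "content": "<transcription chunk to summarize>"},   # row 0
    {"role": "assistant", "content": "<reference summary>"},             # row 1
]

# load_data() groups the rows two by two, one conversation per pair ...
grouped = [{"user": rows[i], "assistant": rows[i + 1]} for i in range(0, len(rows), 2)]

# ... and formating_messages() rewrites each pair as a "messages" list,
# which formatting_prompts_func() then renders into a single training string.
example = grouped[0]
messages = [
    {"role": example["user"]["role"], "content": example["user"]["content"]},
    {"role": example["assistant"]["role"], "content": example["assistant"]["content"]},
]
print(messages)
```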
+ +For more details on how to use the `ChatTemplate.py` script to preprocess your data, please refer to the [Data Preprocessing Documentation](docs/ChatTemplate.md). + +## Training the Model + +The `trainer.py` script provides functionality to train a model using the Hugging Face's `Trainer` class. It contains two main methods: `trainer` and `Start_Training`. + +The `trainer` method initializes the model and the trainer. It first patches the model and adds fast LoRA weights using the `get_peft_model` method of the `FastLanguageModel` class. It then initializes a Weights & Biases run and sets up the trainer with the model, the training and evaluation datasets, the tokenizer, and the training arguments. + +The `Start_Training` method starts the training process. It checks if there are any saved checkpoints in the `outputs` folder. If there are, it resumes training from the latest checkpoint. If there are no saved checkpoints, it starts training from scratch. + +After training is completed, the method saves the model to the specified directory and logs the end of training. + +The trainer saves checkpoints after every few steps as specified in the training arguments. This allows you to resume training from the latest checkpoint if the training process is interrupted for any reason. + +### Training Metrics + +During the training process, the `SampleGenerationCallback.py` script is used to generate and score summaries at regular intervals. This custom callback class monitors the performance of the model and provides a form of early stopping. It generates a random summary, scores it using the ROUGE metric, and logs the results to Weights & Biases. If the ROUGE-2 score falls below a certain threshold, it stops the training process. + +For more details on how the `SampleGenerationCallback` works and why it's used, please refer to the [Training Metrics Documentation](docs/training_metrics.md). + +For more details on how the `Trainer` works , please refer to the [Training Documentation](docs/training_model.md). + +## Saving and Updating the Model + +The LLM Summarizer Trainer includes functionality to save the fine-tuned model and update the model card on Hugging Face Model Hub. The `ModelSaver` class in `modelSaver.py` is used for this purpose. It allows you to save the model in various formats and update the model card with important information about the model, including its base model, the method used for training, the ROUGE scores achieved, and a link to the training logs on Weights & Biases. + +For more details on how to use the `ModelSaver` class to save the model and update the model card, please refer to the [Save Model Documentation](docs/save_model.md). + + + +## Conclusion + +The `trainer.py` script provides a comprehensive pipeline for fine-tuning a Large Language Model for the task of summarizing text. It includes features for logging, model saving, and testing, making it a versatile tool for model training and evaluation. + +## License + +This project is licensed under the terms of the MIT license. See the [LICENSE](LICENSE) file for details. 
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..88145e5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +wandb +rouge_score +evaluate diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000..d311b93 --- /dev/null +++ b/setup.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +# Create a virtual environment +python3 -m venv env + +# Activate the virtual environment +source env/bin/activate + +# Install the requirements +pip install -r requirements.txt + +# Find CUDA version +CUDA_VERSION=$(python -c "import torch; print(torch.version.cuda)") +# Install additional packages for PyTorch 2.2.1 +PYTORCH_VERSION=$(python -c "import torch; print(torch.version.__version__)") +# Check GPU architecture +GPU_ARCH=$(python -c "import torch; print(torch.cuda.get_device_capability()[0])") + +# if PyTorch version is 2.1.0 +if [[ $PYTORCH_VERSION == "2.1.0" ]]; then + pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.0 triton \ + --index-url https://download.pytorch.org/whl/cu121 + # if GPU architecture is ampere of newer (RTX 30xx, RTX 40xx, A100, H100, L40) + if [[ $GPU_ARCH -ge 8 ]]; then + # if cuda version is 11.8 + if [[ $CUDA_VERSION == "11.8" ]]; then + pip install "unsloth[cu118-ampere] @ git+https://github.com/unslothai/unsloth.git" + # if cuda version is 12.1 + elif [[ $CUDA_VERSION == "12.1" ]]; then + pip install "unsloth[cu121-ampere] @ git+https://github.com/unslothai/unsloth.git" + fi + # else GPU architecture is older (V100, Tesla T4, RTX 20xx) + else + # if cuda version is 11.8 + if [[ $CUDA_VERSION == "11.8" ]]; then + pip install "unsloth[cu118] @ git+https://github.com/unslothai/unsloth.git" + # if cuda version is 12.1 + elif [[ $CUDA_VERSION == "12.1" ]]; then + pip install "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git" + fi + fi +# else if PyTorch version is 2.1.1 +elif [[ $PYTORCH_VERSION == "2.1.1" ]]; then + pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.1 triton \ + --index-url https://download.pytorch.org/whl/cu121 + # if GPU architecture is ampere of newer (RTX 30xx, RTX 40xx, A100, H100, L40) + if [[ $GPU_ARCH -ge 8 ]]; then + # if cuda version is 11.8 + if [[ $CUDA_VERSION == "11.8" ]]; then + pip install "unsloth[cu118-ampere-torch211] @ git+https://github.com/unslothai/unsloth.git" + # if cuda version is 12.1 + elif [[ $CUDA_VERSION == "12.1" ]]; then + pip install "unsloth[cu121-ampere-torch211] @ git+https://github.com/unslothai/unsloth.git" + fi + # else GPU architecture is older (V100, Tesla T4, RTX 20xx) + else + # if cuda version is 11.8 + if [[ $CUDA_VERSION == "11.8" ]]; then + pip install "unsloth[cu118-torch211] @ git+https://github.com/unslothai/unsloth.git" + # if cuda version is 12.1 + elif [[ $CUDA_VERSION == "12.1" ]]; then + pip install "unsloth[cu121-torch211] @ git+https://github.com/unslothai/unsloth.git" + fi + fi +# else if PyTorch version is 2.2.0 +elif [[ $PYTORCH_VERSION == "2.2.0" ]]; then + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ + --index-url https://download.pytorch.org/whl/cu121 + # if GPU architecture is ampere of newer (RTX 30xx, RTX 40xx, A100, H100, L40) + if [[ $GPU_ARCH -ge 8 ]]; then + # if cuda version is 11.8 + if [[ $CUDA_VERSION == "11.8" ]]; then + pip install "unsloth[cu118-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git" + # if cuda version is 12.1 + elif [[ $CUDA_VERSION == "12.1" ]]; then + pip install "unsloth[cu121-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git" + fi + 
# else GPU architecture is older (V100, Tesla T4, RTX 20xx) + else + # if cuda version is 11.8 + if [[ $CUDA_VERSION == "11.8" ]]; then + pip install "unsloth[cu118-torch220] @ git+https://github.com/unslothai/unsloth.git" + # if cuda version is 12.1 + elif [[ $CUDA_VERSION == "12.1" ]]; then + pip install "unsloth[cu121-torch220] @ git+https://github.com/unslothai/unsloth.git" + fi + fi + +# else if PyTorch version is 2.2.1 +elif [[ $PYTORCH_VERSION == "2.2.1" ]]; then + # if GPU architecture is ampere of newer (RTX 30xx, RTX 40xx, A100, H100, L40) + if [[ $GPU_ARCH -ge 8 ]]; then + pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" + pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes + # else GPU architecture is older (V100, Tesla T4, RTX 20xx) + else + pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" + pip install --no-deps xformers trl peft accelerate bitsandbytes + fi +fi \ No newline at end of file diff --git a/snippet.py b/snippet.py new file mode 100644 index 0000000..f65b601 --- /dev/null +++ b/snippet.py @@ -0,0 +1,59 @@ +from pathlib import Path +from unsloth import FastLanguageModel +from modelSaver import ModelSaver +from ChatTemplate import ChatTemplate +import random +import torch +from tests import check_token_threshold_and_truncate, test_dataset, test_text_generation +from evaluate import load as load_metric +from rouge_score import rouge_scorer +""" +INFO:root:Base Model name: mistral-Summarizer-7b-instruct-v0.2-bnb-4bit +INFO:root:Output Model name: mistral-Summarizer-Summarizer-7b-instruct-v0.2-bnb-4bit +INFO:root:Max sequence length: 1024 +INFO:root:Load in 4-bit: True +INFO:root:Fine-tuned model directory: /home/will/model +INFO:root:Weights & Biases run URL: https://wandb.ai/william-derue/LLM-summarizer_trainer/runs/s9xqw6o8 +INFO:root:Weights & Biases run path: william-derue/LLM-summarizer_trainer/s9xqw6o8 +""" + +model_name = "mistral-Summarizer-Summarizer-7b-instruct-v0.2-bnb-4bit" +output_model_name = "mistral-Summarizer-7b-instruct-v0.2-bnb-4bit" +max_seq_length = 1024 +load_in_4bit = True +Fine_tuned_model_directory = Path("/home/will/model") +wandb_run_url = "https://wandb.ai/william-derue/LLM-summarizer_trainer/runs/s9xqw6o8" +wandb_run_path = "william-derue/LLM-summarizer_trainer/s9xqw6o8" +# unsloth/gemma-2b-it-bnb-4bit +# unsloth/llama-2-7b-chat-bnb-4bit +model, tokenizer = FastLanguageModel.from_pretrained( + model_name="unsloth/gemma-2b-it-bnb-4bit", + max_seq_length=max_seq_length, + dtype=None, + load_in_4bit=load_in_4bit, +) + +chatml = ChatTemplate(tokenizer) +if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id +# Load your data +dataset_train, dataset_val = chatml.load_data() + +# Validate the training and validation datasets +test_dataset(dataset_train) +test_dataset(dataset_val) + +# Get the length of the list +length = len(dataset_val["messages"]) +# Generate a random index +index = random.randrange(0, length) +if True : index = 6 # Force index to have to truncate + +print(f"\n\nIndex: {index}\n\n") + +rouge = load_metric("rouge", trust_remote_code=True) + +# Access the element at the random even index +messages_chat = dataset_val[0]["messages"] +test_text_generation(tokenizer, model, messages_chat, max_seq_length) + diff --git a/tests/tests.py b/tests/tests.py new file mode 100644 index 0000000..a91a7de --- /dev/null +++ b/tests/tests.py @@ -0,0 +1,84 @@ +import sys +from unsloth import FastLanguageModel 
+from ChatTemplate import ChatTemplate + +def test_dataset(dataset): + # Check the structure of the dataset + assert all(key in dataset.features for key in ['messages']), "Dataset structure is incorrect" + + # Check the content of the dataset + for example in dataset: + assert 'messages' in example, "Missing 'messages' in example" + + # Check that 'messages' is a list of dictionaries + assert isinstance(example['messages'], list), "'messages' should be a list" + for message in example['messages']: + assert isinstance(message, dict), "Each message should be a dictionary" + assert 'role' in message, "Missing 'role' in message" + assert 'content' in message, "Missing 'content' in message" + + # Check the length of the dataset + assert len(dataset) > 0, "Dataset is empty" + +def check_token_threshold_and_truncate(tokenizer, model, messages_chat, max_seq_length): + # Check if the input token length is less than the max_seq_length + input_token_length = len(tokenizer.apply_chat_template(messages_chat, tokenize=True)) + + if model.config.max_position_embeddings is not None: + max_model_token_config = model.config.max_position_embeddings + else: + max_model_token_config = tokenizer.model_max_length + + MaxTokenCapacityThreshold = (max_model_token_config - (input_token_length + max_seq_length)) < 0 + + if MaxTokenCapacityThreshold: + print("Warning: Maximum token threshold has been reached. Activating truncation to prevent crash. Rouge score will be affected.") + truncation = True + else: + truncation = False + return truncation + +def test_text_generation(tokenizer, model, message_chat, max_seq_length): + for message in message_chat: + if message['role'] == 'assistant': + message['content'] = '' + # Check if the model is in training mode + if model.training: + # If it's in training mode, switch it to inference mode + FastLanguageModel.for_inference(model) + #check if the input token length is less than the max_seq_length, if it is set truncation to True + truncation = check_token_threshold_and_truncate(tokenizer, model, message_chat, max_seq_length) + # Tokenize the input messages + + inputs = tokenizer.apply_chat_template( + message_chat, + tokenize = True, + add_generation_prompt = True, # Must add for generation + return_tensors = "pt", + max_length=max_seq_length, + truncation = truncation, + ).to(device='cuda') + + # Generate the summary + summary_ids = model.generate( + input_ids=inputs, + max_new_tokens=max_seq_length, + do_sample=True, + pad_token_id=tokenizer.pad_token_id, + temperature=0.3, + top_k=20, + top_p=0.95, + repetition_penalty=1.2, + ) + # Decode the summary + summary_text = tokenizer.decode(summary_ids[0][inputs.shape[1]:], skip_special_tokens=True) + # Check if the summary text is not None + assert summary_text is not None, "Final summary text should not be None" + # Split the summary text into lines + summary_lines = summary_text.split('\n') + # Return the first 3 lines + sample_text = '\n'.join(summary_lines[:10]) + sample = f"\n\nGeneration test, small sample result: \n\n{sample_text}\n\n" + print(sample) + return summary_text, sample + diff --git a/trainer.py b/trainer.py new file mode 100644 index 0000000..59d8dc9 --- /dev/null +++ b/trainer.py @@ -0,0 +1,268 @@ +# trainer.py is part of the project "LLM summarizer trainer" and is used to train Large Langage Model to the summarizing task using QLora as fine tuning method. 
+import torch
+from pathlib import Path
+from transformers import TrainingArguments, EarlyStoppingCallback
+from transformers.integrations import WandbCallback
+from trl import SFTTrainer
+from evaluate import load as load_metric
+from unsloth import FastLanguageModel
+from dotenv import load_dotenv
+from huggingface_hub import login
+import os
+import wandb
+import random
+import locale
+import gc
+import glob
+from ChatTemplate import ChatTemplate
+from modelSaver import ModelSaver
+from SampleGenerationCallback import SampleGenerationCallback
+from model_selector import select_model, get_model_list
+from tests import check_token_threshold_and_truncate, test_dataset, test_text_generation
+import logging
+
+class Unsloth_LLM_Trainer():
+    def __init__(self, model_name, load_in_4bit=True, max_seq_length=512, dry_run=False):
+        gc.collect()
+        torch.cuda.empty_cache()
+        locale.getpreferredencoding = lambda: "UTF-8"
+        load_dotenv()
+        # Get the Hugging Face API key from the environment variables
+        self.HUGGING_FACE = os.getenv('HUGGING_FACE')
+        # Get the Weights & Biases API key from the environment variables
+        self.WANDB_API_KEY = os.getenv('WANDB_API_KEY')
+        # Log in to Weights & Biases
+        wandb.login(key=self.WANDB_API_KEY)
+        # Log in to Hugging Face
+        login(self.HUGGING_FACE)
+        # Get the absolute path of the root directory of the project
+        self.root_dir = Path(__file__).resolve().parent.parent
+        # Metrics
+        self.rouge = load_metric("rouge", trust_remote_code=True)
+        self.max_seq_length = max_seq_length
+        # Select the model to fine-tune
+        self.model_name = model_name
+        self.out_model_name = self.GetOutputModelName()
+        # Directory where the fine-tuned model will be saved
+        self.fine_tuned_model_dir = self.root_dir / "model"
+        self.load_in_4bit = load_in_4bit
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Load the model and tokenizer
+        print(f"\n\nLoading model and tokenizer for: {model_name}\n\n")
+        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+            model_name=model_name,
+            max_seq_length=self.max_seq_length,
+            dtype=None,
+            load_in_4bit=self.load_in_4bit,
+        )
+        chatml = ChatTemplate(self.tokenizer)
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        # Load the training and validation data
+        self.dataset_train, self.dataset_val = chatml.load_data()
+        test_dataset(self.dataset_train)
+        test_dataset(self.dataset_val)
+        self.dry_run = dry_run
+        self.wandb_run_url = None
+        self.wandb_run_path = None
+        test_text_generation(self.tokenizer, self.model, self.dataset_val[0]["messages"], self.max_seq_length)
+        logging.basicConfig(filename='logs/training.log', level=logging.INFO)
+
+    def log_end_of_training(self):
+        # Log the values to a local log file
+        logging.info(f"Base Model name: {self.model_name}")
+        logging.info(f"Output Model name: {self.out_model_name}")
+        logging.info(f"Max sequence length: {self.max_seq_length}")
+        logging.info(f"Load in 4-bit: {self.load_in_4bit}")
+        logging.info(f"Fine-tuned model directory: {str(self.fine_tuned_model_dir)}")
+        logging.info(f"Weights & Biases run URL: {self.wandb_run_url}")
+        logging.info(f"Weights & Biases run path: {self.wandb_run_path}")
+
+    def generate_summary(self, messages):
+        # Check if the model is in training mode
+        if self.model.training:
+            # If it is in training mode, switch it to inference mode
+            FastLanguageModel.for_inference(self.model)
+        # Check whether the prompt would overflow the model's context; if so, enable truncation
+        truncation = check_token_threshold_and_truncate(self.tokenizer, self.model, messages, self.max_seq_length)
+        # Tokenize the input messages
+        inputs = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize = True,
+            add_generation_prompt = True,  # Must be added for generation
+            return_tensors = "pt",
+            max_length = self.max_seq_length,
+            truncation = truncation,
+        ).to(device=self.device)
+        # Generate the summary
+        summary_ids = self.model.generate(
+            input_ids=inputs,
+            max_new_tokens=self.max_seq_length,
+            do_sample=True,
+            pad_token_id=self.tokenizer.pad_token_id,
+            temperature=0.3,
+            top_k=20,
+            top_p=0.95,
+            repetition_penalty=1.2,
+        )
+        # Decode the summary, skipping the prompt tokens
+        summary_text = self.tokenizer.decode(summary_ids[0][inputs.shape[1]:], skip_special_tokens=True)
+        return summary_text
+
+    def GetOutputModelName(self):
+        # Get the base name of the model and use it to name the fine-tuned model
+        base_name_parts = self.model_name.split('/')
+        base_name = base_name_parts[-1] if len(base_name_parts) > 1 else base_name_parts[0]
+        base_name_parts = base_name.split('-')
+        if 'Summarizer' in base_name_parts:
+            base_name_parts.remove('Summarizer')
+        base_name_parts.insert(1, 'Summarizer')
+        out_model_name = '-'.join(base_name_parts)
+        return out_model_name
+
+    def GetRandomValidationMessage(self):
+        # Get the length of the validation set
+        length = len(self.dataset_val["messages"])
+        # Generate a random index
+        index = random.randrange(0, length)
+        # Access the conversation at the random index
+        messages_chat = self.dataset_val[index]["messages"]
+        # Remove the reference summary from the messages with role 'assistant'
+        for message in messages_chat:
+            if message['role'] == 'assistant':
+                message['content'] = ''
+        messages_text = self.dataset_val[index]["text"]
+        messages_str = "".join(messages_text)
+        Reference_summary = messages_str.split('assistant', 1)[1]
+        return messages_chat, Reference_summary
+
+    def save_model(self):
+        model_saver = ModelSaver(self.model, self.tokenizer, self.fine_tuned_model_dir, self.out_model_name, self.wandb_run_url, self.wandb_run_path)
+        model_saver.save_model()
+
+    def Test_Model(self):
+        while True:
+            # Get a random validation message
+            message, validate_summary = self.GetRandomValidationMessage()
+            summary_text = self.generate_summary(message)
+            print(f"Validate_Summary : {validate_summary}\n\nGenerated_summary : {summary_text}\n\n")
+
+            # Ask the user for input
+            user_input = input("Enter 'r' to retry, 's' to save, or 'q' to quit: ")
+
+            if user_input.lower() == 'r':
+                continue
+            elif user_input.lower() == 's':
+                self.save_model()
+                print("Model saved.")
+                break
+            elif user_input.lower() == 'q':
+                break
+            else:
+                print("Invalid input. Please enter 'r', 's', or 'q'.")
Please enter 'r', 's', or 'q'.") + + def trainer(self): + model_name = self.model_name + model = self.model + # Do model patching and add fast LoRA weights + model = FastLanguageModel.get_peft_model( + model, + r = 16, + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj",], + lora_alpha = 16, + lora_dropout = 0, # Supports any, but = 0 is optimized + bias = "none", # Supports any, but = "none" is optimized + use_gradient_checkpointing = "unsloth", + random_state = 3407, + max_seq_length = self.max_seq_length, + use_rslora = False, # We support rank stabilized LoRA + loftq_config = None, # And LoftQ + ) + run_name = f"run-{model_name}-{random.randint(0, 100000)}" + run = wandb.init(project="LLM-summarizer_trainer", name=run_name) + self.wandb_run_url = run.get_url() + self.wandb_run_path = run.path + trainer = SFTTrainer( + model = model, + train_dataset = self.dataset_train, + eval_dataset = self.dataset_val, + max_seq_length = self.max_seq_length, + dataset_text_field = "text", + tokenizer = self.tokenizer, + packing=False, + args = TrainingArguments( + fp16_full_eval = True, + per_device_eval_batch_size = 2, + eval_accumulation_steps = 4, + evaluation_strategy = "steps", + eval_steps = 1, + per_device_train_batch_size = 2, + gradient_accumulation_steps = 4, + warmup_steps = 5, + max_steps = 60 if not self.dry_run else 10, + fp16 = not torch.cuda.is_bf16_supported(), + bf16 = torch.cuda.is_bf16_supported(), + logging_steps = 1, + output_dir = "outputs", + save_strategy = "steps", + save_steps = 5, + optim = "adamw_8bit", + weight_decay = 0.01, + lr_scheduler_type = "linear", + seed = 3407, + load_best_model_at_end = True, + ), + callbacks=[SampleGenerationCallback(every_x_steps=5, dataset_val=self.dataset_val, generate_summary=self.generate_summary, score_threshold=0), + EarlyStoppingCallback(early_stopping_patience=10, early_stopping_threshold=0.0), + WandbCallback()], + ) + return trainer + + + def Start_Training(self, trainer): + if glob.glob("outputs/checkpoint-*"): + trainer_stats = trainer.train(resume_from_checkpoint = True) + if wandb.run is not None: + wandb.finish() + print(f"\nTraining completed for {self.out_model_name}\n\n") + trainer.save_model(f"{self.fine_tuned_model_dir}/{self.out_model_name}") + self.log_end_of_training() + else: + trainer_stats = trainer.train(resume_from_checkpoint = False) + if wandb.run is not None: + wandb.finish() + print(f"\nTraining completed for {self.out_model_name}\n\n") + trainer.save_model(f"{self.fine_tuned_model_dir}/{self.out_model_name}") + self.log_end_of_training() + +def main(): + + #list of models available for fine-tuning on unsloth + standard_models, faster_models = get_model_list() + # Select the model to fine-tune + selected_model, is_4bit = select_model(standard_models, faster_models) + # Check if the selected model is a 4-bit model + print("\nSelected model:", selected_model) + print("\nIs the selected model a 4-bit model?", is_4bit) + # Instantiate the trainer with the desired parameters + trainer_instance = Unsloth_LLM_Trainer( + model_name=selected_model, # replace with your model name + load_in_4bit=is_4bit, + max_seq_length=1024, + dry_run=True, + ) + print("\n\nInitialization done\n\n") + # Get the trainer + trainer = trainer_instance.trainer() + print("\n\nTrainer created\n\n") + # Start the training + trainer_instance.Start_Training(trainer) + print("\n\nTraining ended\n\n") + # test and save the model + trainer_instance.Test_Model() + print("\n\nTesting done\n\n") + +if 
__name__ == "__main__": + main() \ No newline at end of file