Commit

first commit

WillIsback committed Apr 13, 2024
1 parent 03cfd0c commit a258a09
Showing 21 changed files with 1,345 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
@@ -10,4 +10,5 @@ _unsloth_sentencepiece_temp
_*
llama.cpp
!dummy.txt
logs
model
43 changes: 43 additions & 0 deletions ChatTemplate.py
@@ -0,0 +1,43 @@
from datasets import load_dataset, Dataset
import pandas as pd
from unsloth.chat_templates import get_chat_template


class ChatTemplate():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def formating_messages(self, example):
        user_chat = {"role": example["user"]["role"], "content": example["user"]["content"]}
        assistant_chat = {"role": example["assistant"]["role"], "content": example["assistant"]["content"]}
        return {"messages": [user_chat, assistant_chat]}

    def formatting_prompts_func(self, examples):
        convos = examples["messages"]
        texts = [self.tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
        return { "text" : texts, }

    def load_data(self):
        self.tokenizer = get_chat_template(
            self.tokenizer,
            chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
            mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"}, # ShareGPT style
            map_eos_token = True, # Maps <|im_end|> to </s> instead
        )
        dataset_train = load_dataset("Labagaite/fr-summarizer-dataset", split = "train")
        dataset_val = load_dataset("Labagaite/fr-summarizer-dataset", split = "validation")
        # Group the data: row i is the user turn, row i+1 the assistant turn
        grouped_data_train = [{"user": dataset_train[i], "assistant": dataset_train[i+1]} for i in range(0, len(dataset_train), 2)]
        grouped_data_val = [{"user": dataset_val[i], "assistant": dataset_val[i+1]} for i in range(0, len(dataset_val), 2)]
        # Convert the list of dictionaries to a DataFrame
        df_train = pd.DataFrame(grouped_data_train)
        df_val = pd.DataFrame(grouped_data_val)
        # Create new Dataset objects
        dataset_train = Dataset.from_pandas(df_train)
        dataset_val = Dataset.from_pandas(df_val)

        dataset_train = dataset_train.map(self.formating_messages, batched = False)
        dataset_train = dataset_train.map(self.formatting_prompts_func, batched = True)
        dataset_val = dataset_val.map(self.formating_messages, batched = False)
        dataset_val = dataset_val.map(self.formatting_prompts_func, batched = True)

        return dataset_train, dataset_val
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 DERUE-William

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
62 changes: 62 additions & 0 deletions SampleGenerationCallback.py
@@ -0,0 +1,62 @@
# Description: This file contains the SampleGenerationCallback class which is used to generate and score summaries during training.
from transformers import TrainerCallback
import wandb
import random
from rouge_score import rouge_scorer


class SampleGenerationCallback(TrainerCallback):
    def __init__(self, every_x_steps=5, dataset_val=None, generate_summary=None, score_threshold=0.2, patience=5, min_delta=0.01, warmup_steps=10):
        self.every_x_steps = every_x_steps
        self.dataset_val = dataset_val
        self.generate_summary = generate_summary
        self.score_threshold = score_threshold
        self.patience = patience
        self.min_delta = min_delta
        self.warmup_steps = warmup_steps
        self.rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.summary_table_data = []
        self.best_score = None
        self.patience_counter = 0

    def generate_and_score_summary(self):
        # Get the length of the list
        length = len(self.dataset_val["messages"])
        # Generate a random index
        index = random.randrange(0, length)
        messages_chat = self.dataset_val[index]["messages"]
        # Remove content from dictionaries with role 'assistant'
        for message in messages_chat:
            if message['role'] == 'assistant':
                message['content'] = ''
        messages_text = self.dataset_val[index]["text"]
        messages_str = "".join(messages_text)
        Reference_summary = messages_str.split('assistant', 1)[1]
        summary_text = self.generate_summary(messages_chat)
        scores = self.rouge.score(Reference_summary, summary_text)
        rouge1 = scores['rouge1'].fmeasure
        rouge2 = scores['rouge2'].fmeasure
        rougeL = scores['rougeL'].fmeasure
        return summary_text, Reference_summary, rouge1, rouge2, rougeL

    def on_step_end(self, args, state, control, model, **kwargs):
        if state.global_step % self.every_x_steps == 0:
            summary_text, Reference_summary, rouge1, rouge2, rougeL = self.generate_and_score_summary()
            self.summary_table_data.append([Reference_summary, summary_text, f"Rouge-1: {rouge1},\n Rouge-2: {rouge2},\n Rouge-L: {rougeL}"])
            my_table = wandb.Table(columns=["Reference_summary", "Generated_summary", "Rouge-Score"], data=self.summary_table_data)
            wandb.log({"summary_table": my_table})

        if state.global_step % args.eval_steps == 0 and state.global_step > self.warmup_steps:
            _, _, rouge1, rouge2, rougeL = self.generate_and_score_summary()
            wandb.log({"ROUGE-1": rouge1, "ROUGE-2": rouge2, "ROUGE-L": rougeL})

            # Check if the performance has improved
            if self.best_score is None or rouge2 > self.best_score + self.min_delta:
                self.best_score = rouge2
                self.patience_counter = 0
            else:
                self.patience_counter += 1

            # Check if the patience has been exceeded
            if self.patience_counter >= self.patience:
                control.should_training_stop = True
                print(f"\033[91m\nEarly stopping at step {state.global_step}, rouge2 score did not improve: {rouge2}\n\033[0m")
45 changes: 45 additions & 0 deletions docs/ChatTemplate.md
@@ -0,0 +1,45 @@
# ChatTemplate

The `ChatTemplate.py` script is a key part of the LLM Summarizer Trainer project. It defines a `ChatTemplate` class used to preprocess datasets in conversational chat format into the form the trainer expects.

## Key Points

The `ChatTemplate` class is initialized with a tokenizer. It has several methods:

- `formating_messages`: This method takes an example from the dataset and formats it into a dictionary with a "messages" key. The value of the "messages" key is a list of dictionaries, each representing a chat message with "role" and "content" keys.

- `formatting_prompts_func`: This method takes a batch of examples and applies the chat template to the "messages" of each example. It returns a dictionary with a "text" key and a list of formatted texts as the value.

- `load_data`: This method loads the training and validation datasets, groups the data, converts the grouped data into a DataFrame, creates a new Dataset object from the DataFrame, and applies the `formating_messages` and `formatting_prompts_func` methods to the datasets.

## Configuration

In the `load_data` method, you can configure the chat template and the mapping by modifying the arguments passed to the `get_chat_template` function. The `chat_template` argument specifies the chat template to use. You can choose from several chat templates as described in this [link](https://github.com/unslothai/unsloth/blob/4606443b77f98a624896d4ca50710255d8436d86/unsloth/chat_templates.py#L258). For example, you can change `chat_template = "chatml"` to `chat_template = "zephyr"` to use the zephyr chat template.

The `mapping` argument specifies the mapping between the roles and contents of the chat messages and the keys in the dataset. You can configure this by modifying the following code:

```python
user_chat = {"role": example["user"]["role"], "content": example["user"]["content"]}
assistant_chat = {"role": example["assistant"]["role"], "content": example["assistant"]["content"]}
```

Here, you need to replace `"user"` and `"assistant"` with the keys present in your dataset. For example, if your dataset uses `"human"` and `"gpt"` as the keys, you can modify the code as follows:

```python
user_chat = {"role": example["human"]["role"], "content": example["human"]["content"]}
assistant_chat = {"role": example["gpt"]["role"], "content": example["gpt"]["content"]}
```

To use a different dataset, you need to modify the arguments passed to the `load_dataset` function. The first argument is the name of the dataset to load. For example, you can change `"Labagaite/fr-summarizer-dataset"` to `"your_dataset_name"` to load your dataset. The `split` argument specifies the split of the dataset to load. Note that this script is designed to work with datasets in chat format where each entry is an instruction and the next entry is the response. The script first groups the entries two by two to form a single entry as a conversation.
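
As a quick illustration of that grouping step, here is a minimal sketch (the two rows below are invented sample data; the comprehension mirrors the one in `load_data`):

```python
# Two consecutive dataset rows: a user turn followed by an assistant turn
rows = [
    {"role": "user", "content": "Résume cette réunion."},
    {"role": "assistant", "content": "Voici le résumé de la réunion..."},
]

# Pair row i (the user turn) with row i+1 (the assistant turn), as load_data does
grouped = [{"user": rows[i], "assistant": rows[i + 1]} for i in range(0, len(rows), 2)]
print(grouped[0]["user"]["content"])       # Résume cette réunion.
print(grouped[0]["assistant"]["content"])  # Voici le résumé de la réunion...
```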

## Usage

To use the `ChatTemplate` class to preprocess a dataset, instantiate it with your tokenizer and call its `load_data` method. This method returns the preprocessed training and validation datasets. Here's an example:

```python
chat_template = ChatTemplate(tokenizer)
dataset_train, dataset_val = chat_template.load_data()
```

In this example, `dataset_train` and `dataset_val` are the preprocessed training and validation datasets, respectively. You can then pass these datasets to the trainer for training.

For more details on how the `ChatTemplate` class works and how to use it, please refer to the `ChatTemplate.py` source and the main README.
Empty file added docs/dummy.txt
Empty file.
40 changes: 40 additions & 0 deletions docs/model_selection.md
@@ -0,0 +1,40 @@
# Model Selection

The `model_selector.py` script provides functionality to select a model from a list of models compatible with Unsloth, search for a model, or select a model from a local folder. It contains two main functions: `select_model` and `get_model_list`.

## select_model Function

The `select_model` function allows the user to select a model in one of three ways:

1. **Search for a model:** The user can enter a search term, and the function will print out all models that contain this term in their name. The user can then enter the name of the model they want to select.

2. **Select a model from a list:** The function prints out a list of standard models and 4x faster models. The user can then enter the name of the model they want to select.

3. **Select a model from a local folder:** The user can enter the path of a local folder containing the model.

The function returns the selected model and a boolean value indicating whether the selected model is a 4x faster model.
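
A compressed sketch of that interactive flow might look like the following (the prompt wording and control flow here are assumptions for illustration; see `model_selector.py` for the real implementation):

```python
def select_model(standard_models, faster_models):
    # Let the user search, pick from a list, or point to a local folder
    choice = input("1) search, 2) pick from list, 3) local folder: ").strip()
    if choice == "1":
        term = input("Search term: ").strip().lower()
        for name in standard_models + faster_models:
            if term in name.lower():
                print(name)
        selected = input("Model name: ").strip()
    elif choice == "2":
        print("Standard models:", *standard_models, sep="\n  ")
        print("4x faster models:", *faster_models, sep="\n  ")
        selected = input("Model name: ").strip()
    else:
        selected = input("Path to local model folder: ").strip()
    # Return the model plus a flag for the 4x faster variants
    return selected, selected in faster_models
```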

## get_model_list Function

The `get_model_list` function retrieves a list of models from the Hugging Face Model Hub. It sends a GET request to the Hugging Face API and parses the response to separate the models into standard models and 4x faster models.

The function returns these two lists of models.
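
As a rough sketch, the request might look like this (the endpoint, query parameter, and the `-bnb-4bit` naming filter are assumptions based on the public Hugging Face Hub API and Unsloth's model naming, not a copy of the actual script):

```python
import requests

def get_model_list():
    # Query the Hugging Face Hub for models published by the unsloth organization
    response = requests.get(
        "https://huggingface.co/api/models",
        params={"author": "unsloth"},
    )
    response.raise_for_status()
    models = [m["id"] for m in response.json()]
    # Unsloth publishes its 4x faster variants with a "-bnb-4bit" suffix
    faster_models = [m for m in models if m.endswith("-bnb-4bit")]
    standard_models = [m for m in models if not m.endswith("-bnb-4bit")]
    return standard_models, faster_models
```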

## Usage

To use the `select_model` function, you first need to get a list of models using the `get_model_list` function. Then, you can pass these lists to the `select_model` function. Here's an example:

```python
standard_models, faster_models = get_model_list()
selected_model, is_4bit = select_model(standard_models, faster_models)
```

This will prompt the user to select a model as described above.

Please note that you need to have the `requests` library installed to use the `get_model_list` function. You can install it with the following command:

```bash
pip install requests
```

This document provides a detailed description of the `model_selector.py` script. For more information on how to use this script in the context of the LLM Summarizer Trainer project, please refer to the main README file.
42 changes: 42 additions & 0 deletions docs/save_model.md
@@ -0,0 +1,42 @@
# ModelSaver

How to use the `ModelSaver` class to save the model.


## Saving the Model

The `ModelSaver` class in `modelSaver.py` is used to save the model. Here's how you can use it:

1. Initialize the `ModelSaver` class with your model, tokenizer, and other necessary parameters:

```python
model_saver = ModelSaver(model, tokenizer, fine_tuned_model_dir, out_model_name, wandb_run_url, wandb_run_path)
```

2. Call the `save_model` method of the `ModelSaver` class:

```python
model_saver.save_model()
```

When you run `save_model`, you will be prompted to enter the types of models you want to save. Options are: '16bit', '4bit', 'lora', 'gguf_q8_0', 'gguf_f16', 'gguf_q4_k_m'. You can enter 'all' to save all types. If you want to save multiple types, separate them with commas.

The `ModelSaver` class will then save your model in the specified formats and update the model card with the training details and performance metrics.

Please replace `model`, `tokenizer`, `fine_tuned_model_dir`, `out_model_name`, `wandb_run_url`, and `wandb_run_path` with your actual parameters. They are automatically retrieved by `trainer.py` during training.
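
For context, each save type corresponds closely to one of Unsloth's saving helpers. The sketch below shows one plausible mapping (it assumes Unsloth's `save_pretrained_merged` and `save_pretrained_gguf` methods; the actual `ModelSaver` implementation may differ):

```python
def save_one(model, tokenizer, out_dir, save_type):
    """Save a fine-tuned Unsloth model in one of the supported formats (sketch)."""
    if save_type in ("16bit", "4bit", "lora"):
        # Merged 16-bit or 4-bit weights, or LoRA adapters only
        method = {"16bit": "merged_16bit", "4bit": "merged_4bit", "lora": "lora"}[save_type]
        model.save_pretrained_merged(out_dir, tokenizer, save_method=method)
    elif save_type.startswith("gguf_"):
        # GGUF export for llama.cpp, e.g. gguf_q8_0 -> quantization_method="q8_0"
        model.save_pretrained_gguf(out_dir, tokenizer, quantization_method=save_type.removeprefix("gguf_"))
```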


## Updating the Model Card

The `ModelSaver` class in `modelSaver.py` also includes functionality to update the model card on the Hugging Face Model Hub. The model card provides important information about the model, including its base model, the method used for training, the ROUGE scores achieved, and a link to the training logs on Weights & Biases.

The `UpdateModelCard` method is used to update the model card. It first retrieves the ROUGE scores from the Weights & Biases run using the `get_wandb_run` method. It then formats the model card content using these scores and other information about the model. Finally, it pushes the updated model card to the Hugging Face Model Hub.

Here's how you can use the `UpdateModelCard` method:

```python
model_saver.UpdateModelCard(save_directory, token)
```

Please replace `save_directory` and `token` with your actual parameters. The `save_directory` is the directory where the model is saved, and `token` is your Hugging Face API token.

The model card is formatted using the `CUSTOM_MODEL_CARD` string, which is a template for the model card content. You can modify this template to include any additional information you want to display on the model card.
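
For reference, such a template could look roughly like this (every field below is illustrative; the real `CUSTOM_MODEL_CARD` string in `modelSaver.py` defines its own fields and layout):

```python
CUSTOM_MODEL_CARD = """
---
base_model: {base_model}
license: mit
---

# {model_name}

Fine-tuned with LoRA for French summarization.

- ROUGE-1: {rouge1}
- ROUGE-2: {rouge2}
- ROUGE-L: {rougeL}

Training logs: [Weights & Biases run]({wandb_run_url})
"""
```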

78 changes: 78 additions & 0 deletions docs/setup.md
@@ -0,0 +1,78 @@
## Setup

### Automated Setup (Linux)

If you are using a Linux system, you can simply run the `setup.sh` script to set up your environment. This script creates a virtual environment, installs the necessary requirements, and configures the environment based on your GPU architecture and PyTorch version.

To run the script, open a terminal and navigate to the directory containing the `setup.sh` file. Then, run the following command:

```bash
./setup.sh
```

### Manual Setup

If you prefer to set up your environment manually or are using a different operating system, follow these steps:

1. **Create a virtual environment**: You can use `venv` to create a virtual environment. Open a terminal and run the following command:

```bash
python3 -m venv env
```

2. **Activate the virtual environment**: The command to activate the environment depends on your operating system:

- On Linux or MacOS, run:

```bash
source env/bin/activate
```

3. **Install the requirements**: The `requirements.txt` file lists the Python packages that your project depends on. You can install these using `pip`:

```bash
pip install -r requirements.txt
```

This will install the following packages:

- `wandb`: Weights & Biases, a tool for tracking and visualizing machine learning experiments.
- `rouge_score`: A Python package for calculating the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score, a common metric for evaluating text summaries.
- `evaluate`: A package for evaluating machine learning models.

4. **Install additional packages based on your GPU architecture and PyTorch version**: Refer to the `setup.sh` script for the specific packages to install based on your GPU architecture (Ampere or older) and PyTorch version (2.1.0, 2.1.1, 2.2.0, or 2.2.1). You can check your GPU architecture and PyTorch version using the following Python commands:

```python
import torch
print(torch.version.cuda) # prints the CUDA version
print(torch.__version__) # prints the PyTorch version
print(torch.cuda.get_device_capability()[0]) # prints the major compute capability (8 or higher means Ampere or newer)
```

Then, install the appropriate packages using `pip`. For example, if your GPU architecture is Ampere or newer and your PyTorch version is 2.2.1, you would run:

```bash
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
```

Remember to replace the commands with the appropriate ones for your GPU architecture and PyTorch version. You can find more details on the [unsloth GitHub](https://github.com/unslothai/unsloth).

## Setting Up Environment Variables

For the application to function correctly, you need to set up several environment variables. These variables hold the API keys for various services that the application uses.

Create a `.env` file in the root directory of the project and add the following lines to it:

```properties
HUGGING_FACE=your_hugging_face_token
WANDB_API_KEY=your_wandb_api_key
# Add more tokens as needed
```

Replace `your_hugging_face_token` and `your_wandb_api_key` with your actual API keys.

- `HUGGING_FACE`: Your Hugging Face API token. You can find this on your Hugging Face account page.
- `WANDB_API_KEY`: Your Weights & Biases API key. You can find this on your Weights & Biases account page.

Remember not to share these tokens with anyone or publish them online. They provide access to your accounts on these services, and anyone with these tokens can use these services as if they were you.
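
In Python, one common way to load these variables is with the `python-dotenv` package (a sketch, assuming the scripts read the tokens this way; install it with `pip install python-dotenv`):

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env file from the current working directory
hf_token = os.getenv("HUGGING_FACE")
wandb_api_key = os.getenv("WANDB_API_KEY")
```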