Commit

first commit

WillIsback committed Apr 13, 2024
1 parent 03cfd0c commit a258a09
Showing 21 changed files with 1,345 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
@@ -10,4 +10,5 @@ _unsloth_sentencepiece_temp
_*
llama.cpp
!dummy.txt
logs
model
43 changes: 43 additions & 0 deletions ChatTemplate.py
@@ -0,0 +1,43 @@
from datasets import load_dataset, Dataset
import pandas as pd
from unsloth.chat_templates import get_chat_template


class ChatTemplate():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def formating_messages(self, example):
        user_chat = {"role": example["user"]["role"], "content": example["user"]["content"]}
        assistant_chat = {"role": example["assistant"]["role"], "content": example["assistant"]["content"]}
        return {"messages": [user_chat, assistant_chat]}

    def formatting_prompts_func(self, examples):
        convos = examples["messages"]
        texts = [self.tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
        return { "text" : texts, }

    def load_data(self):
        self.tokenizer = get_chat_template(
            self.tokenizer,
            chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
            mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"}, # ShareGPT style
            map_eos_token = True, # Maps <|im_end|> to </s> instead
        )
        dataset_train = load_dataset("Labagaite/fr-summarizer-dataset", split = "train")
        dataset_val = load_dataset("Labagaite/fr-summarizer-dataset", split = "validation")
        # Group the data: row i is the user turn, row i+1 the assistant turn
        grouped_data_train = [{"user": dataset_train[i], "assistant": dataset_train[i+1]} for i in range(0, len(dataset_train), 2)]
        grouped_data_val = [{"user": dataset_val[i], "assistant": dataset_val[i+1]} for i in range(0, len(dataset_val), 2)]
        # Convert the list of dictionaries to a DataFrame
        df_train = pd.DataFrame(grouped_data_train)
        df_val = pd.DataFrame(grouped_data_val)
        # Create new Dataset objects
        dataset_train = Dataset.from_pandas(df_train)
        dataset_val = Dataset.from_pandas(df_val)

        dataset_train = dataset_train.map(self.formating_messages, batched = False)
        dataset_train = dataset_train.map(self.formatting_prompts_func, batched = True)
        dataset_val = dataset_val.map(self.formating_messages, batched = False)
        dataset_val = dataset_val.map(self.formatting_prompts_func, batched = True)

        return dataset_train, dataset_val
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 DERUE-William

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
62 changes: 62 additions & 0 deletions SampleGenerationCallback.py
@@ -0,0 +1,62 @@
# Description: This file contains the SampleGenerationCallback class which is used to generate and score summaries during training.
from transformers import TrainerCallback
import wandb
import random
from rouge_score import rouge_scorer


class SampleGenerationCallback(TrainerCallback):
    def __init__(self, every_x_steps=5, dataset_val=None, generate_summary=None, score_threshold=0.2, patience=5, min_delta=0.01, warmup_steps=10):
        self.every_x_steps = every_x_steps
        self.dataset_val = dataset_val
        self.generate_summary = generate_summary
        self.score_threshold = score_threshold
        self.patience = patience
        self.min_delta = min_delta
        self.warmup_steps = warmup_steps
        self.rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.summary_table_data = []
        self.best_score = None
        self.patience_counter = 0

    def generate_and_score_summary(self):
        # Get the length of the list
        length = len(self.dataset_val["messages"])
        # Generate a random index
        index = random.randrange(0, length)
        messages_chat = self.dataset_val[index]["messages"]
        # Remove content from dictionaries with role 'assistant'
        for message in messages_chat:
            if message['role'] == 'assistant':
                message['content'] = ''
        messages_text = self.dataset_val[index]["text"]
        messages_str = "".join(messages_text)
        Reference_summary = messages_str.split('assistant', 1)[1]
        summary_text = self.generate_summary(messages_chat)
        scores = self.rouge.score(Reference_summary, summary_text)
        rouge1 = scores['rouge1'].fmeasure
        rouge2 = scores['rouge2'].fmeasure
        rougeL = scores['rougeL'].fmeasure
        return summary_text, Reference_summary, rouge1, rouge2, rougeL

    def on_step_end(self, args, state, control, model, **kwargs):
        if state.global_step % self.every_x_steps == 0:
            summary_text, Reference_summary, rouge1, rouge2, rougeL = self.generate_and_score_summary()
            self.summary_table_data.append([Reference_summary, summary_text, f"Rouge-1: {rouge1},\n Rouge-2: {rouge2},\n Rouge-L: {rougeL}"])
            my_table = wandb.Table(columns=["Reference_summary", "Generated_summary", "Rouge-Score"], data=self.summary_table_data)
            wandb.log({"summary_table": my_table})

        if state.global_step % args.eval_steps == 0 and state.global_step > self.warmup_steps:
            _, _, rouge1, rouge2, rougeL = self.generate_and_score_summary()
            wandb.log({"ROUGE-1": rouge1, "ROUGE-2": rouge2, "ROUGE-L": rougeL})

            # Check if the performance has improved
            if self.best_score is None or rouge2 > self.best_score + self.min_delta:
                self.best_score = rouge2
                self.patience_counter = 0
            else:
                self.patience_counter += 1

            # Check if the patience has been exceeded
            if self.patience_counter >= self.patience:
                control.should_training_stop = True
                print(f"\033[91m\nEarly stopping at step {state.global_step}, rouge2 score did not improve: {rouge2}\n\033[0m")
45 changes: 45 additions & 0 deletions docs/ChatTemplate.md
@@ -0,0 +1,45 @@
# ChatTemplate

The `ChatTemplate.py` script is a key part of the LLM Summarizer Trainer project. It defines a `ChatTemplate` class used to preprocess datasets in conversational chat format into the form the trainer expects.

## Key Points

The `ChatTemplate` class is initialized with a tokenizer. It has several methods:

- `formating_messages`: This method takes an example from the dataset and formats it into a dictionary with a "messages" key. The value of the "messages" key is a list of dictionaries, each representing a chat message with "role" and "content" keys.

- `formatting_prompts_func`: This method takes a batch of examples and applies the chat template to the "messages" of each example. It returns a dictionary with a "text" key and a list of formatted texts as the value.

- `load_data`: This method loads the training and validation datasets, groups the data, converts the grouped data into a DataFrame, creates a new Dataset object from the DataFrame, and applies the `formating_messages` and `formatting_prompts_func` methods to the datasets.

## Configuration

In the `load_data` method, you can configure the chat template and the mapping by modifying the arguments passed to the `get_chat_template` function. The `chat_template` argument specifies the chat template to use. You can choose from several chat templates as described in this [link](https://github.com/unslothai/unsloth/blob/4606443b77f98a624896d4ca50710255d8436d86/unsloth/chat_templates.py#L258). For example, you can change `chat_template = "chatml"` to `chat_template = "zephyr"` to use the zephyr chat template.

The `mapping` argument specifies the mapping between the roles and contents of the chat messages and the keys in the dataset. You can configure this by modifying the following code:

```python
user_chat = {"role": example["user"]["role"], "content": example["user"]["content"]}
assistant_chat = {"role": example["assistant"]["role"], "content": example["assistant"]["content"]}
```

Here, you need to replace `"user"` and `"assistant"` with the keys present in your dataset. For example, if your dataset uses `"human"` and `"gpt"` as the keys, you can modify the code as follows:

```python
user_chat = {"role": example["human"]["role"], "content": example["human"]["content"]}
assistant_chat = {"role": example["gpt"]["role"], "content": example["gpt"]["content"]}
```

To use a different dataset, you need to modify the arguments passed to the `load_dataset` function. The first argument is the name of the dataset to load. For example, you can change `"Labagaite/fr-summarizer-dataset"` to `"your_dataset_name"` to load your dataset. The `split` argument specifies the split of the dataset to load. Note that this script is designed to work with datasets in chat format where each entry is an instruction and the next entry is the response. The script first groups the entries two by two to form a single entry as a conversation.
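
As a quick illustration of that grouping step, here is a minimal sketch (the two rows below are invented sample data; the comprehension mirrors the one in `load_data`):

```python
# Two consecutive dataset rows: a user turn followed by an assistant turn
rows = [
    {"role": "user", "content": "Résume cette réunion."},
    {"role": "assistant", "content": "Voici le résumé de la réunion..."},
]

# Pair row i (the user turn) with row i+1 (the assistant turn), as load_data does
grouped = [{"user": rows[i], "assistant": rows[i + 1]} for i in range(0, len(rows), 2)]
print(grouped[0]["user"]["content"])       # Résume cette réunion.
print(grouped[0]["assistant"]["content"])  # Voici le résumé de la réunion...
```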

## Usage

To use the `ChatTemplate` class to preprocess a dataset, instantiate it with your tokenizer and call its `load_data` method. This method returns the preprocessed training and validation datasets. Here's an example:

```python
chat_template = ChatTemplate(tokenizer)
dataset_train, dataset_val = chat_template.load_data()
```

In this example, `dataset_train` and `dataset_val` are the preprocessed training and validation datasets, respectively. You can then pass these datasets to the trainer for training.

For more details on how the `ChatTemplate` class works and how to use it, please refer to the `ChatTemplate.py` source and the main README.
Empty file added docs/dummy.txt
Empty file.
40 changes: 40 additions & 0 deletions docs/model_selection.md
@@ -0,0 +1,40 @@
# Model Selection

The `model_selector.py` script provides functionality to select a model from a list of models compatible with Unsloth, search for a model, or select a model from a local folder. It contains two main functions: `select_model` and `get_model_list`.

## select_model Function

The `select_model` function allows the user to select a model in one of three ways:

1. **Search for a model:** The user can enter a search term, and the function will print out all models that contain this term in their name. The user can then enter the name of the model they want to select.

2. **Select a model from a list:** The function prints out a list of standard models and 4x faster models. The user can then enter the name of the model they want to select.

3. **Select a model from a local folder:** The user can enter the path of a local folder containing the model.

The function returns the selected model and a boolean value indicating whether the selected model is a 4x faster model.
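
A compressed sketch of that interactive flow might look like the following (the prompt wording and control flow here are assumptions for illustration; see `model_selector.py` for the real implementation):

```python
def select_model(standard_models, faster_models):
    # Let the user search, pick from a list, or point to a local folder
    choice = input("1) search, 2) pick from list, 3) local folder: ").strip()
    if choice == "1":
        term = input("Search term: ").strip().lower()
        for name in standard_models + faster_models:
            if term in name.lower():
                print(name)
        selected = input("Model name: ").strip()
    elif choice == "2":
        print("Standard models:", *standard_models, sep="\n  ")
        print("4x faster models:", *faster_models, sep="\n  ")
        selected = input("Model name: ").strip()
    else:
        selected = input("Path to local model folder: ").strip()
    # Return the model plus a flag for the 4x faster variants
    return selected, selected in faster_models
```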

## get_model_list Function

The `get_model_list` function retrieves a list of models from the Hugging Face Model Hub. It sends a GET request to the Hugging Face API and parses the response to separate the models into standard models and 4x faster models.

The function returns these two lists of models.
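
As a rough sketch, the request might look like this (the endpoint, query parameter, and the `-bnb-4bit` naming filter are assumptions based on the public Hugging Face Hub API and Unsloth's model naming, not a copy of the actual script):

```python
import requests

def get_model_list():
    # Query the Hugging Face Hub for models published by the unsloth organization
    response = requests.get(
        "https://huggingface.co/api/models",
        params={"author": "unsloth"},
    )
    response.raise_for_status()
    models = [m["id"] for m in response.json()]
    # Unsloth publishes its 4x faster variants with a "-bnb-4bit" suffix
    faster_models = [m for m in models if m.endswith("-bnb-4bit")]
    standard_models = [m for m in models if not m.endswith("-bnb-4bit")]
    return standard_models, faster_models
```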

## Usage

To use the `select_model` function, you first need to get a list of models using the `get_model_list` function. Then, you can pass these lists to the `select_model` function. Here's an example:

```python
standard_models, faster_models = get_model_list()
selected_model, is_4bit = select_model(standard_models, faster_models)
```

This will prompt the user to select a model as described above.

Please note that you need to have the `requests` library installed to use the `get_model_list` function. You can install it with the following command:

```bash
pip install requests
```

This document provides a detailed description of the `model_selector.py` script. For more information on how to use this script in the context of the LLM Summarizer Trainer project, please refer to the main README file.
42 changes: 42 additions & 0 deletions docs/save_model.md
@@ -0,0 +1,42 @@
# ModelSaver

How to use the `ModelSaver` class to save the model.


## Saving the Model

The `ModelSaver` class in `modelSaver.py` is used to save the model. Here's how you can use it:

1. Initialize the `ModelSaver` class with your model, tokenizer, and other necessary parameters:

```python
model_saver = ModelSaver(model, tokenizer, fine_tuned_model_dir, out_model_name, wandb_run_url, wandb_run_path)
```

2. Call the `save_model` method of the `ModelSaver` class:

```python
model_saver.save_model()
```

When you run `save_model`, you will be prompted to enter the types of models you want to save. Options are: '16bit', '4bit', 'lora', 'gguf_q8_0', 'gguf_f16', 'gguf_q4_k_m'. You can enter 'all' to save all types. If you want to save multiple types, separate them with commas.

The `ModelSaver` class will then save your model in the specified formats and update the model card with the training details and performance metrics.

Please replace `model`, `tokenizer`, `fine_tuned_model_dir`, `out_model_name`, `wandb_run_url`, and `wandb_run_path` with your actual parameters. They are automatically retrieved by `trainer.py` during training.
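
For context, each save type corresponds closely to one of Unsloth's saving helpers. The sketch below shows one plausible mapping (it assumes Unsloth's `save_pretrained_merged` and `save_pretrained_gguf` methods; the actual `ModelSaver` implementation may differ):

```python
def save_one(model, tokenizer, out_dir, save_type):
    """Save a fine-tuned Unsloth model in one of the supported formats (sketch)."""
    if save_type in ("16bit", "4bit", "lora"):
        # Merged 16-bit or 4-bit weights, or LoRA adapters only
        method = {"16bit": "merged_16bit", "4bit": "merged_4bit", "lora": "lora"}[save_type]
        model.save_pretrained_merged(out_dir, tokenizer, save_method=method)
    elif save_type.startswith("gguf_"):
        # GGUF export for llama.cpp, e.g. gguf_q8_0 -> quantization_method="q8_0"
        model.save_pretrained_gguf(out_dir, tokenizer, quantization_method=save_type.removeprefix("gguf_"))
```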


## Updating the Model Card

The `ModelSaver` class in `modelSaver.py` also includes functionality to update the model card on the Hugging Face Model Hub. The model card provides important information about the model, including its base model, the method used for training, the ROUGE scores achieved, and a link to the training logs on Weights & Biases.

The `UpdateModelCard` method is used to update the model card. It first retrieves the ROUGE scores from the Weights & Biases run using the `get_wandb_run` method. It then formats the model card content using these scores and other information about the model. Finally, it pushes the updated model card to the Hugging Face Model Hub.

Here's how you can use the `UpdateModelCard` method:

```python
model_saver.UpdateModelCard(save_directory, token)
```

Please replace `save_directory` and `token` with your actual parameters. The `save_directory` is the directory where the model is saved, and `token` is your Hugging Face API token.

The model card is formatted using the `CUSTOM_MODEL_CARD` string, which is a template for the model card content. You can modify this template to include any additional information you want to display on the model card.
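
For reference, such a template could look roughly like this (every field below is illustrative; the real `CUSTOM_MODEL_CARD` string in `modelSaver.py` defines its own fields and layout):

```python
CUSTOM_MODEL_CARD = """
---
base_model: {base_model}
license: mit
---

# {model_name}

Fine-tuned with LoRA for French summarization.

- ROUGE-1: {rouge1}
- ROUGE-2: {rouge2}
- ROUGE-L: {rougeL}

Training logs: [Weights & Biases run]({wandb_run_url})
"""
```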

78 changes: 78 additions & 0 deletions docs/setup.md
@@ -0,0 +1,78 @@
## Setup

### Automated Setup (Linux)

If you are using a Linux system, you can simply run the `setup.sh` script to set up your environment. This script creates a virtual environment, installs the necessary requirements, and configures the environment based on your GPU architecture and PyTorch version.

To run the script, open a terminal and navigate to the directory containing the `setup.sh` file. Then, run the following command:

```bash
./setup.sh
```

### Manual Setup

If you prefer to set up your environment manually or are using a different operating system, follow these steps:

1. **Create a virtual environment**: You can use `venv` to create a virtual environment. Open a terminal and run the following command:

```bash
python3 -m venv env
```

2. **Activate the virtual environment**: The command to activate the environment depends on your operating system:

- On Linux or MacOS, run:

```bash
source env/bin/activate
```

3. **Install the requirements**: The `requirements.txt` file lists the Python packages that your project depends on. You can install these using `pip`:

```bash
pip install -r requirements.txt
```

This will install the following packages:

- `wandb`: Weights & Biases, a tool for tracking and visualizing machine learning experiments.
- `rouge_score`: A Python package for calculating the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score, a common metric for evaluating text summaries.
- `evaluate`: A package for evaluating machine learning models.

4. **Install additional packages based on your GPU architecture and PyTorch version**: Refer to the `setup.sh` script for the specific packages to install based on your GPU architecture (Ampere or older) and PyTorch version (2.1.0, 2.1.1, 2.2.0, or 2.2.1). You can check your GPU architecture and PyTorch version using the following Python commands:

```python
import torch
print(torch.version.cuda) # prints the CUDA version
print(torch.__version__) # prints the PyTorch version
print(torch.cuda.get_device_capability()[0]) # prints the major compute capability (8 or higher means Ampere or newer)
```

Then, install the appropriate packages using `pip`. For example, if your GPU architecture is Ampere or newer and your PyTorch version is 2.2.1, you would run:

```bash
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
```

Remember to replace the commands with the appropriate ones for your GPU architecture and PyTorch version. You can find more details on the [unsloth GitHub](https://github.com/unslothai/unsloth).

## Setting Up Environment Variables

For the application to function correctly, you need to set up several environment variables. These variables hold the API keys for various services that the application uses.

Create a `.env` file in the root directory of the project and add the following lines to it:

```properties
HUGGING_FACE=your_hugging_face_token
WANDB_API_KEY=your_wandb_api_key
# Add more tokens as needed
```

Replace `your_hugging_face_token` and `your_wandb_api_key` with your actual API keys.

- `HUGGING_FACE`: Your Hugging Face API token. You can find this on your Hugging Face account page.
- `WANDB_API_KEY`: Your Weights & Biases API key. You can find this on your Weights & Biases account page.

Remember not to share these tokens with anyone or publish them online. They provide access to your accounts on these services, and anyone with these tokens can use these services as if they were you.
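
In Python, one common way to load these variables is with the `python-dotenv` package (a sketch, assuming the scripts read the tokens this way; install it with `pip install python-dotenv`):

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env file from the current working directory
hf_token = os.getenv("HUGGING_FACE")
wandb_api_key = os.getenv("WANDB_API_KEY")
```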