[v3] Update example scripts to the new v3 training format (UKPLab#2622)

* Update example scripts to the new v3 training format (see the sketch after these notes)

* Add distillation training examples

* Add Matryoshka training examples

* Add NLI training examples

* Add STS training scripts

* Fix accidentally overriding eval set

* Update paraphrases multi-dataset training script

* Convert regular dicts to DatasetDict on Trainer init

* Update Quora duplicate training scripts

* Update "other" training scripts

* Update multilingual conversion script

* Add example scripts to Evaluators

* Add example to ST class itself

* Update docs formatting slightly

* Fix model card snippet

* Add short docstring for similarity_fn_name property
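
For readers unfamiliar with it, the "new v3 training format" referenced in the first note is built around SentenceTransformerTrainer and SentenceTransformerTrainingArguments rather than model.fit(). The following is a minimal sketch of that flow, not taken verbatim from any of the updated scripts; the base model, the tiny in-memory dataset, and the output directory are placeholders chosen purely for illustration.

from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers import losses

# Placeholder base model; any Hugging Face transformer or Sentence Transformer model should work
model = SentenceTransformer("distilroberta-base")

# Tiny in-memory (anchor, positive) dataset, purely for illustration
train_dataset = Dataset.from_dict(
    {
        "anchor": ["A man is eating food.", "A child is playing outside."],
        "positive": ["A person eats a meal.", "A kid plays outdoors."],
    }
)

# In-batch negatives loss; it expects the (anchor, positive) columns in this order
loss = losses.MultipleNegativesRankingLoss(model)

args = SentenceTransformerTrainingArguments(
    output_dir="output/v3-format-sketch",  # placeholder output directory
    num_train_epochs=1,
    per_device_train_batch_size=2,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=loss,
)
trainer.train()

The updated example scripts, including the one below, follow this same pattern with task-specific datasets, losses, and evaluators swapped in; the Trainer takes over the batching, evaluation, checkpointing, and logging that model.fit() previously handled internally.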
tomaarsen authored May 8, 2024
1 parent e7c3f86 commit 4443bf5
Showing 39 changed files with 3,209 additions and 2,767 deletions.
184 changes: 74 additions & 110 deletions examples/training/adaptive_layer/adaptive_layer_nli.py
@@ -1,8 +1,7 @@
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with AdaptiveLayerLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are positive pairs and the contradiction on AllNLI dataset is added as a hard negative.
At every 10% training steps, the model is evaluated on the STS benchmark dataset
with AdaptiveLayerLoss using MultipleNegativesRankingLoss. Entailing texts are used as positive pairs and contradictory
texts are seen as negative pairs. At every 100 training steps, the model is evaluated on the STS benchmark dataset.
Usage:
python adaptive_layer_nli.py
Expand All @@ -11,147 +10,112 @@
python adaptive_layer_nli.py pretrained_transformer_model_name
"""

import math
import traceback
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers import losses
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random

#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1

# Save path of the model
model_save_path = (
"output/adaptive_layer_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)


# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"

if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)

# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")

from sentence_transformers.training_args import BatchSamplers

def add_to_samples(sent1, sent2, label):
if sent1 not in train_data:
train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
train_data[sent1][label].add(sent2)
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
num_train_epochs = 1

train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
sent1 = row["sentence1"].strip()
sent2 = row["sentence2"].strip()

add_to_samples(sent1, sent2, row["label"])
add_to_samples(sent2, sent1, row["label"]) # Also add the opposite


train_samples = []
for sent1, others in train_data.items():
if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
train_samples.append(
InputExample(
texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
)
)
train_samples.append(
InputExample(
texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
)
)

logging.info("Train samples: {}".format(len(train_samples)))
# Save path of the model
output_dir = f"output/adaptive_layer_nli_{model_name.replace('/', '-')}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
# If we want, we can limit the maximum sequence length for the model
# model.max_seq_length = 75
logging.info(model)

# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# 2. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train")
eval_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="dev")
logging.info(train_dataset)

# If you wish, you can limit the number of training samples
# train_dataset = train_dataset.select(range(5000))

# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.AdaptiveLayerLoss(model, train_loss)
# 3. Define our training loss
inner_train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.AdaptiveLayerLoss(model, inner_train_loss)

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
sentences1=stsb_eval_dataset["sentence1"],
sentences2=stsb_eval_dataset["sentence2"],
scores=stsb_eval_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=num_train_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
batch_sampler=BatchSamplers.NO_DUPLICATES, # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=100,
save_total_limit=2,
logging_steps=100,
run_name="adaptive-layer-nli", # Will be used in W&B if `wandb` is installed
)

# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
trainer.train()


##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################


model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
# 7. Evaluate the model performance on the STS Benchmark test dataset
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
sentences1=test_dataset["sentence1"],
sentences2=test_dataset["sentence2"],
scores=test_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
test_evaluator(model)

# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)

# Optionally, save the model to the Hugging Face Hub!
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-nli-adaptive-layer")
except Exception:
logging.error(
"Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{model_name}-nli-adaptive-layer')`."
)
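
Since AdaptiveLayerLoss trains the model so that embeddings pooled from only the first few transformer layers remain useful, a natural follow-up is to load the saved model and truncate its layers at inference time for a speed/quality trade-off. A minimal sketch, assuming the model was saved by step 8 above (the path and layer count are placeholders) and that the backbone is a BERT/RoBERTa-style model whose layers live under auto_model.encoder.layer:

from sentence_transformers import SentenceTransformer, util

# Placeholder path: point this at the `final` directory written by step 8 above
model = SentenceTransformer("path/to/adaptive_layer_nli_model/final")

# Keep only the first few transformer layers. This assumes a BERT/RoBERTa-style
# backbone where the layer stack is stored at `auto_model.encoder.layer`;
# other architectures keep their layers elsewhere.
num_layers = 3
model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:num_layers]

# The truncated model still produces sentence embeddings, just from fewer layers
embeddings = model.encode(["A man is eating food.", "A person eats a meal."])
print(util.cos_sim(embeddings[0], embeddings[1]))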
