diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 000000000..548051825
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,52 @@
+name: Unit tests
+
+on:
+  push:
+    branches:
+      - master
+      - v*-release
+  pull_request:
+    branches:
+      - master
+  workflow_dispatch:
+
+jobs:
+
+  test_sampling:
+    name: Run unit tests
+    strategy:
+      matrix:
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
+        os: [ubuntu-latest, windows-latest]
+      fail-fast: false
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Setup Python environment
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Try to load cached dependencies
+        uses: actions/cache@v3
+        id: restore-cache
+        with:
+          path: ${{ env.pythonLocation }}
+          key: python-dependencies-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ env.pythonLocation }}
+
+      - name: Install external dependencies on cache miss
+        run: |
+          python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir -r requirements.txt
+          python -m pip install --no-cache-dir pytest
+        if: steps.restore-cache.outputs.cache-hit != 'true'
+
+      - name: Install the checked-out sentence-transformers
+        run: python -m pip install .
+
+      - name: Run unit tests
+        shell: bash
+        run: |
+          pytest -sv tests/
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 000000000..5b46e89a2
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+testpaths =
+    tests
+addopts = --strict-markers -m "not slow"
+markers =
+    slow: marks tests as slow
\ No newline at end of file
diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index 6361ec9a5..afacc3a5f 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -189,7 +189,7 @@ def paraphrase_mining_embeddings(embeddings: Tensor,

                     if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
                         added_pairs.add((sorted_i, sorted_j))
-                        pairs_list.append([score, i, j])
+                        pairs_list.append([score, sorted_i, sorted_j])

     # Highest scores first
     pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
diff --git a/tests/test_cross_encoder.py b/tests/test_cross_encoder.py
index f21853954..f2e5ba510 100644
--- a/tests/test_cross_encoder.py
+++ b/tests/test_cross_encoder.py
@@ -5,10 +5,10 @@
 import gzip
 import os
 import unittest
+import pytest

 from torch.utils.data import DataLoader

-import logging
-from sentence_transformers import CrossEncoder, util, LoggingHandler
+from sentence_transformers import CrossEncoder, util
 from sentence_transformers.readers import InputExample
 from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
@@ -22,7 +22,6 @@ def setUp(self):

         #Read STSB
         self.stsb_train_samples = []
-        self.dev_samples = []
         self.test_samples = []
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
@@ -30,15 +29,13 @@
                 score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

-                if row['split'] == 'dev':
-                    self.dev_samples.append(inp_example)
-                elif row['split'] == 'test':
+                if row['split'] == 'test':
                     self.test_samples.append(inp_example)
-                else:
+                elif row['split'] == 'train':
                     self.stsb_train_samples.append(inp_example)

-    def evaluate_stsb_test(self, model, expected_score):
-        evaluator = CECorrelationEvaluator.from_input_examples(self.test_samples, name='sts-test')
+    def evaluate_stsb_test(self, model, expected_score, num_test_samples: int = -1):
+        evaluator = CECorrelationEvaluator.from_input_examples(self.test_samples[:num_test_samples], name='sts-test')
         score = evaluator(model)*100
         print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(score, expected_score))
         assert score > expected_score or abs(score-expected_score) < 0.1
@@ -47,7 +44,8 @@ def test_pretrained_stsb(self):
         model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
         self.evaluate_stsb_test(model, 87.92)

-    def test_train_stsb(self):
+    @pytest.mark.slow
+    def test_train_stsb_slow(self):
         model = CrossEncoder('distilroberta-base', num_labels=1)
         train_dataloader = DataLoader(self.stsb_train_samples, shuffle=True, batch_size=16)
         model.fit(train_dataloader=train_dataloader,
@@ -55,8 +53,10 @@
                   warmup_steps=int(len(train_dataloader)*0.1))
         self.evaluate_stsb_test(model, 75)

-
-
-
-if "__main__" == __name__:
-    unittest.main()
\ No newline at end of file
+    def test_train_stsb(self):
+        model = CrossEncoder('distilroberta-base', num_labels=1)
+        train_dataloader = DataLoader(self.stsb_train_samples[:500], shuffle=True, batch_size=16)
+        model.fit(train_dataloader=train_dataloader,
+                  epochs=1,
+                  warmup_steps=int(len(train_dataloader)*0.1))
+        self.evaluate_stsb_test(model, 50, num_test_samples=100)
diff --git a/tests/test_pretrained_stsb.py b/tests/test_pretrained_stsb.py
index 95974c12b..0bd210871 100644
--- a/tests/test_pretrained_stsb.py
+++ b/tests/test_pretrained_stsb.py
@@ -1,43 +1,42 @@
 """
 Tests that the pretrained models produce the correct scores on the STSbenchmark dataset
 """
+from functools import partial
 from sentence_transformers import SentenceTransformer, InputExample, util
 from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
-import unittest
 import os
 import gzip
 import csv
+import pytest

-class PretrainedSTSbTest(unittest.TestCase):
+def pretrained_model_score(model_name, expected_score, max_test_samples: int = 100):
+    model = SentenceTransformer(model_name)
+    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

-    def pretrained_model_score(self, model_name, expected_score):
-        model = SentenceTransformer(model_name)
-        sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
+    if not os.path.exists(sts_dataset_path):
+        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

-        if not os.path.exists(sts_dataset_path):
-            util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
+    test_samples = []
+    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
+        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
+        for row in reader:
+            score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
+            inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

-        train_samples = []
-        dev_samples = []
-        test_samples = []
-        with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
-            reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
-            for row in reader:
-                score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
-                inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
+            if row['split'] == 'test':
+                test_samples.append(inp_example)
+                if max_test_samples != -1 and len(test_samples) >= max_test_samples:
+                    break

-                if row['split'] == 'dev':
-                    dev_samples.append(inp_example)
-                elif row['split'] == 'test':
-                    test_samples.append(inp_example)
-                else:
-                    train_samples.append(inp_example)
+    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')

-        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
+    score = model.evaluate(evaluator)*100
+    print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
+    assert score > expected_score or abs(score-expected_score) < 0.1

-        score = model.evaluate(evaluator)*100
-        print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
-        assert score > expected_score or abs(score-expected_score) < 0.1
+@pytest.mark.slow
+class TestPretrainedSTSbSlow:
+    pretrained_model_score = partial(pretrained_model_score, max_test_samples=-1)

     def test_bert_base(self):
         self.pretrained_model_score('bert-base-nli-mean-tokens', 77.12)
@@ -45,7 +44,6 @@ def test_bert_base(self):
         self.pretrained_model_score('bert-base-nli-cls-token', 76.30)
         self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 85.14)

-
     def test_bert_large(self):
         self.pretrained_model_score('bert-large-nli-mean-tokens', 79.19)
         self.pretrained_model_score('bert-large-nli-max-tokens', 78.41)
@@ -81,5 +79,47 @@ def test_msmarco(self):

     def test_sentence_t5(self):
         self.pretrained_model_score('sentence-t5-base', 85.52)

-if "__main__" == __name__:
-    unittest.main()
\ No newline at end of file
+
+class TestPretrainedSTSbFast:
+    pretrained_model_score = partial(pretrained_model_score, max_test_samples=100)
+
+    def test_bert_base(self):
+        self.pretrained_model_score('bert-base-nli-mean-tokens', 86.53)
+        self.pretrained_model_score('bert-base-nli-max-tokens', 87.00)
+        self.pretrained_model_score('bert-base-nli-cls-token', 85.93)
+        self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 89.26)
+
+    def test_bert_large(self):
+        self.pretrained_model_score('bert-large-nli-mean-tokens', 90.06)
+        self.pretrained_model_score('bert-large-nli-max-tokens', 90.15)
+        self.pretrained_model_score('bert-large-nli-cls-token', 89.51)
+        self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 92.27)
+
+    def test_roberta(self):
+        self.pretrained_model_score('roberta-base-nli-mean-tokens', 87.91)
+        self.pretrained_model_score('roberta-large-nli-mean-tokens', 89.41)
+        self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 93.39)
+        self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 91.26)
+
+    def test_distilbert(self):
+        self.pretrained_model_score('distilbert-base-nli-mean-tokens', 88.83)
+        self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 91.01)
+        self.pretrained_model_score('paraphrase-distilroberta-base-v1', 90.89)
+
+    def test_multiling(self):
+        self.pretrained_model_score('distiluse-base-multilingual-cased', 88.79)
+        self.pretrained_model_score('paraphrase-xlm-r-multilingual-v1', 92.76)
+        self.pretrained_model_score('paraphrase-multilingual-MiniLM-L12-v2', 92.64)
+
+    def test_mpnet(self):
+        self.pretrained_model_score('paraphrase-mpnet-base-v2', 92.83)
+
+    def test_other_models(self):
+        self.pretrained_model_score('average_word_embeddings_komninos', 68.97)
+
+    def test_msmarco(self):
+        self.pretrained_model_score('msmarco-roberta-base-ance-firstp', 83.61)
+        self.pretrained_model_score('msmarco-distilbert-base-v3', 87.96)
+
+    def test_sentence_t5(self):
+        self.pretrained_model_score('sentence-t5-base', 92.75)
diff --git a/tests/test_train_stsb.py b/tests/test_train_stsb.py
index 7c7195847..b2d72206e 100644
--- a/tests/test_train_stsb.py
+++ b/tests/test_train_stsb.py
@@ -5,6 +5,7 @@
 import gzip
 import os
 import unittest
+import pytest

 from torch.utils.data import DataLoader

@@ -38,7 +39,6 @@ def setUp(self):

         #Read STSB
         self.stsb_train_samples = []
-        self.dev_samples = []
         self.test_samples = []
         with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
             reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
@@ -46,11 +46,9 @@
                 score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
                 inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

-                if row['split'] == 'dev':
-                    self.dev_samples.append(inp_example)
-                elif row['split'] == 'test':
+                if row['split'] == 'test':
                     self.test_samples.append(inp_example)
-                else:
+                elif row['split'] == 'train':
                     self.stsb_train_samples.append(inp_example)

     def evaluate_stsb_test(self, model, expected_score):
@@ -59,7 +57,8 @@
         print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(score, expected_score))
         assert score > expected_score or abs(score-expected_score) < 0.1

-    def test_train_stsb(self):
+    @pytest.mark.slow
+    def test_train_stsb_slow(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
         pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
@@ -75,7 +74,24 @@ def test_train_stsb(self):
         self.evaluate_stsb_test(model, 80.0)

-    def test_train_nli(self):
+    def test_train_stsb(self):
+        word_embedding_model = models.Transformer('distilbert-base-uncased')
+        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        train_dataset = SentencesDataset(self.stsb_train_samples[:100], model)
+        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
+        train_loss = losses.CosineSimilarityLoss(model=model)
+        model.fit(train_objectives=[(train_dataloader, train_loss)],
+                  evaluator=None,
+                  epochs=1,
+                  evaluation_steps=1000,
+                  warmup_steps=int(len(train_dataloader)*0.1),
+                  use_amp=True)
+
+        self.evaluate_stsb_test(model, 60.0)
+
+    @pytest.mark.slow
+    def test_train_nli_slow(self):
         word_embedding_model = models.Transformer('distilbert-base-uncased')
         pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
@@ -90,7 +106,17 @@
         self.evaluate_stsb_test(model, 50.0)

+    def test_train_nli(self):
+        word_embedding_model = models.Transformer('distilbert-base-uncased')
+        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        train_dataset = SentencesDataset(self.nli_train_samples[:100], model=model)
+        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
+        train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=3)
+        model.fit(train_objectives=[(train_dataloader, train_loss)],
+                  evaluator=None,
+                  epochs=1,
+                  warmup_steps=int(len(train_dataloader) * 0.1),
+                  use_amp=True)
-
-if "__main__" == __name__:
-    unittest.main()
\ No newline at end of file
+        self.evaluate_stsb_test(model, 50.0)
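
Reviewer note on the one-line change in sentence_transformers/util.py: the hunk context above shows that candidate pairs are deduplicated by the sorted index tuple, so appending the raw (i, j) instead of (sorted_i, sorted_j) can return pairs whose index order does not match the dedup key. The snippet below is a hypothetical, self-contained sketch of that dedup pattern, not the library's actual mining loop; the toy scores dict and index values are invented purely for illustration.

    # Toy illustration of the dedup pattern behind the util.py fix (hypothetical data).
    scores = {
        (0, 2): 0.9, (2, 0): 0.9,   # the same sentence pair, seen from both directions
        (3, 1): 0.8, (1, 3): 0.8,
    }

    added_pairs = set()
    pairs_list = []
    for (i, j), score in scores.items():
        sorted_i, sorted_j = min(i, j), max(i, j)
        if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
            added_pairs.add((sorted_i, sorted_j))
            # Appending the sorted indices keeps the returned pair consistent with the
            # dedup key; appending the raw (i, j) would emit [0.8, 3, 1] for the second pair.
            pairs_list.append([score, sorted_i, sorted_j])

    # Highest scores first, as in paraphrase_mining_embeddings
    print(sorted(pairs_list, key=lambda x: x[0], reverse=True))  # [[0.9, 0, 2], [0.8, 1, 3]]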