diff --git a/README.md b/README.md
index 300debaf0..faf016bbc 100644
--- a/README.md
+++ b/README.md
@@ -380,6 +380,28 @@ results = mteb.load_results(models=models, tasks=tasks)
 df = results_to_dataframe(results)
 ```
+
+
+<details>
+  <summary> Annotate Contamination in the training data of a model </summary>
+
+
+### Annotate Contamination
+
+Have you found contamination in the training data of a model? Please let us know, either by opening an issue or, ideally, by submitting a PR
+annotating the training datasets of the model:
+
+```py
+model_w_contamination = ModelMeta(
+    name="model-with-contamination",
+    ...
+    training_datasets={
+        "ArguAna":  # name of the dataset within MTEB
+            ["test"],  # the splits that have been trained on
+    },
+    ...
+)
+```
+
+</details>
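The annotation above feeds the `training_datasets` field added to `ModelMeta` in the diff below, which `mteb` uses to decide whether a model is zero-shot on a benchmark. As a rough, standalone sketch of that idea (the benchmark task list and the overlap check here are illustrative assumptions, not APIs introduced by this patch):

```python
# Illustrative only: compare a ModelMeta-style training_datasets annotation
# against a benchmark's task names to flag potential train/test overlap.
training_datasets = {"ArguAna": ["test"]}  # as annotated on the model above

benchmark_tasks = ["ArguAna", "NFCorpus", "SciFact"]  # hypothetical benchmark task names

overlap = {
    task: splits
    for task, splits in training_datasets.items()
    if task in benchmark_tasks
}

if overlap:
    print(f"Annotated contamination found; the model is not zero-shot here: {overlap}")
else:
    print("No annotated overlap; the model can be considered zero-shot on these tasks.")
```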
diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py
index 8dcac9ab0..3dca66b0f 100644
--- a/mteb/evaluation/evaluators/RetrievalEvaluator.py
+++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -83,10 +83,10 @@ def __call__(
                 corpus, queries, self.top_k, instructions=instructions, **kwargs
             )
         elif (
-            hasattr(self.retriever.model, "mteb_model_meta")
-            and self.retriever.model.mteb_model_meta.name == "bm25s"
+            hasattr(self.retriever.model.model, "mteb_model_meta")
+            and self.retriever.model.model.mteb_model_meta.name == "bm25s"
         ):
-            return self.retriever.model.search(
+            return self.retriever.model.model.search(
                 corpus,
                 queries,
                 self.top_k,
diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index 68deb02f2..38b77432e 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -76,6 +76,9 @@ class ModelMeta(BaseModel):
        zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we
            assume that all models are evaluated non-zero-shot unless specified otherwise.
        citation: The citation for the model. This is a bibtex string.
+       training_datasets: A dictionary of datasets that the model was trained on. Names should be given as they appear in `mteb`, for example
+           {"ArguAna": ["test"]} if the model is trained on the ArguAna test set. This field is used to determine whether a model generalizes zero-shot to
+           a benchmark, as well as to mark dataset contamination.
        adapted_from: Name of the model from which this model is adapted from. For quantizations, fine-tunes, long doc extensions, etc.
        superseded_by: Name of the model that supersedes this model, e.g. nvidia/NV-Embed-v2 supersedes v1.
     """
@@ -99,7 +102,7 @@ class ModelMeta(BaseModel):
     reference: STR_URL | None = None
     similarity_fn_name: DISTANCE_METRICS | None = None
     use_instructions: bool | None = None
-    zero_shot_benchmarks: list[str] | None = None
+    training_datasets: dict[str, list[str]] | None = None
     adapted_from: str | None = None
     superseded_by: str | None = None
     citation: str | None = None
diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py
index 1848b9e4e..7d1161cdd 100644
--- a/mteb/models/bm25.py
+++ b/mteb/models/bm25.py
@@ -17,7 +17,7 @@ def bm25_loader(**kwargs):
         import Stemmer
     except ImportError:
         raise ImportError(
-            "bm25s or Stemmer is not installed. Please install it with `pip install bm25s Stemmer`."
+            "bm25s or Stemmer is not installed. Please install it with `pip install bm25s PyStemmer`."
        )

    class BM25Search(DRESModel, Wrapper):
@@ -58,7 +58,17 @@ def search(
         ) -> dict[str, dict[str, float]]:
             logger.info("Encoding Corpus...")
             corpus_ids = list(corpus.keys())
-            corpus_with_ids = [{"doc_id": cid, **corpus[cid]} for cid in corpus_ids]
+            corpus_with_ids = [
+                {
+                    "doc_id": cid,
+                    **(
+                        {"text": corpus[cid]}
+                        if isinstance(corpus[cid], str)
+                        else corpus[cid]
+                    ),
+                }
+                for cid in corpus_ids
+            ]

             corpus_texts = [
                 "\n".join([doc.get("title", ""), doc["text"]])
diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py
index 2ed0b76a9..3f07a0d23 100644
--- a/mteb/models/cohere_models.py
+++ b/mteb/models/cohere_models.py
@@ -5,6 +5,7 @@

 import numpy as np
 import torch
+import tqdm

 from mteb.encoder_interface import PromptType
 from mteb.model_meta import ModelMeta
@@ -140,25 +141,43 @@ def __init__(
         )

     def _embed(
-        self, sentences: list[str], cohere_task_type: str, retries: int = 5
+        self,
+        sentences: list[str],
+        cohere_task_type: str,
+        show_progress_bar: bool = False,
+        retries: int = 5,
     ) -> torch.Tensor:
         import cohere  # type: ignore

+        max_batch_size = 256
+
+        batches = [
+            sentences[i : i + max_batch_size]
+            for i in range(0, len(sentences), max_batch_size)
+        ]
+
         client = cohere.Client()
-        while retries > 0:  # Cohere's API is not always reliable
-            try:
-                response = client.embed(
-                    texts=list(sentences),
-                    model=self.model_name,
-                    input_type=cohere_task_type,
-                )
-                break
-            except Exception as e:
-                print(f"Retrying... {retries} retries left.")
-                retries -= 1
-                if retries == 0:
-                    raise e
-        return torch.tensor(response.embeddings)
+
+        all_embeddings = []
+
+        for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar):
+            while retries > 0:  # Cohere's API is not always reliable
+                try:
+                    response = client.embed(
+                        texts=batch,
+                        model=self.model_name,
+                        input_type=cohere_task_type,
+                    )
+                    break
+                except Exception as e:
+                    print(f"Retrying... {retries} retries left.")
+                    retries -= 1
+                    if retries == 0:
+                        raise e
+
+            all_embeddings.extend(torch.tensor(response.embeddings).numpy())
+
+        return np.array(all_embeddings)

     def encode(
         self,
@@ -168,13 +187,24 @@ def encode(
         sentences: list[str],
         *,
         task_name: str,
         prompt_type: PromptType | None = None,
         **kwargs: Any,
     ) -> np.ndarray:
-        cohere_task_type = self.get_prompt_name(
-            self.model_prompts, task_name, prompt_type
-        )
+        prompt_name = self.get_prompt_name(self.model_prompts, task_name, prompt_type)
+        cohere_task_type = self.model_prompts.get(prompt_name)
+
         if cohere_task_type is None:
             # search_document is recommended if unknown (https://cohere.com/blog/introducing-embed-v3)
             cohere_task_type = "search_document"
-        return self._embed(sentences, cohere_task_type=cohere_task_type).numpy()
+
+        show_progress_bar = (
+            False
+            if "show_progress_bar" not in kwargs
+            else kwargs.pop("show_progress_bar")
+        )
+
+        return self._embed(
+            sentences,
+            cohere_task_type=cohere_task_type,
+            show_progress_bar=show_progress_bar,
+        )

 model_prompts = {
diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index ca2b32b2a..adf96fbe4 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -5,6 +5,7 @@
 from typing import Any

 import numpy as np
+import tqdm

 from mteb.model_meta import ModelMeta
 from mteb.requires_package import requires_package
@@ -68,9 +69,15 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
             for i in range(0, len(trimmed_sentences), max_batch_size)
         ]

+        show_progress_bar = (
+            False
+            if "show_progress_bar" not in kwargs
+            else kwargs.pop("show_progress_bar")
+        )
+
         all_embeddings = []

-        for sublist in sublists:
+        for sublist in tqdm.tqdm(sublists, leave=False, disable=not show_progress_bar):
             try:
                 response = self._client.embeddings.create(
                     input=sublist,
diff --git a/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py b/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py
index c22d15afc..6f7d188b7 100644
--- a/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py
+++ b/mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py
@@ -43,6 +43,9 @@ def _load_publichealthqa_data(
     answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))}

     for row in data:
+        if row["question"] is None or row["answer"] is None:
+            # There are some questions and answers that are None in the original dataset, specifically in the Arabic subset.
+            continue
         question = row["question"]
         answer = row["answer"]
         query_id = f"Q{question_ids[question]}"
diff --git a/pyproject.toml b/pyproject.toml
index 70e6bd5c9..90abd3575 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "mteb"
-version = "1.22.0"
+version = "1.23.0"
 description = "Massive Text Embedding Benchmark"
 readme = "README.md"
 authors = [