Skip to content

Commit

Permalink
Merge branch 'refs/heads/main' into update_imports
Browse files Browse the repository at this point in the history
# Conflicts:
#	mteb/model_meta.py
  • Loading branch information
Samoed committed Dec 8, 2024
2 parents 54a7f5c + 2550a27 commit 07f1391
Show file tree
Hide file tree
Showing 8 changed files with 102 additions and 27 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,28 @@ results = mteb.load_results(models=models, tasks=tasks)
df = results_to_dataframe(results)
```

</details>


<details>
<summary> Annotate Contamination in the training data of a model </summary>

### Annotate Contamination

Have you found contamination in the training data of a model? Please let us know, either by opening an issue or, ideally, by submitting a PR
annotating the training datasets of the model:

```py
model_w_contamination = ModelMeta(
name = "model-with-contamination"
...
training_datasets: {"ArguAna": # name of dataset within MTEB
["test"]} # the splits that have been trained on
...
)
```


</details>

<details>
Expand Down
6 changes: 3 additions & 3 deletions mteb/evaluation/evaluators/RetrievalEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,10 @@ def __call__(
corpus, queries, self.top_k, instructions=instructions, **kwargs
)
elif (
hasattr(self.retriever.model, "mteb_model_meta")
and self.retriever.model.mteb_model_meta.name == "bm25s"
hasattr(self.retriever.model.model, "mteb_model_meta")
and self.retriever.model.model.mteb_model_meta.name == "bm25s"
):
return self.retriever.model.search(
return self.retriever.model.model.search(
corpus,
queries,
self.top_k,
Expand Down
5 changes: 4 additions & 1 deletion mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ class ModelMeta(BaseModel):
zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
are evaluated non-zero-shot unless specified otherwise.
citation: The citation for the model. This is a bibtex string.
training_datasets: A dictionary of datasets that the model was trained on. Names should be given as they appear in `mteb`, for example
{"ArguAna": ["test"]} if the model is trained on the ArguAna test set. This field is used to determine if a model generalizes zero-shot to
a benchmark, as well as to mark dataset contamination.
adapted_from: Name of the model from which this model is adapted from. For quantizations, fine-tunes, long doc extensions, etc.
superseded_by: Name of the model that supersedes this model, e.g. nvidia/NV-Embed-v2 supersedes v1.
"""
Expand All @@ -99,7 +102,7 @@ class ModelMeta(BaseModel):
reference: STR_URL | None = None
similarity_fn_name: DISTANCE_METRICS | None = None
use_instructions: bool | None = None
zero_shot_benchmarks: list[str] | None = None
training_datasets: dict[str, list[str]] | None = None
adapted_from: str | None = None
superseded_by: str | None = None
citation: str | None = None
Expand Down
14 changes: 12 additions & 2 deletions mteb/models/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def bm25_loader(**kwargs):
import Stemmer
except ImportError:
raise ImportError(
"bm25s or Stemmer is not installed. Please install it with `pip install bm25s Stemmer`."
"bm25s or Stemmer is not installed. Please install it with `pip install bm25s PyStemmer`."
)

class BM25Search(DRESModel, Wrapper):
Expand Down Expand Up @@ -58,7 +58,17 @@ def search(
) -> dict[str, dict[str, float]]:
logger.info("Encoding Corpus...")
corpus_ids = list(corpus.keys())
corpus_with_ids = [{"doc_id": cid, **corpus[cid]} for cid in corpus_ids]
corpus_with_ids = [
{
"doc_id": cid,
**(
{"text": corpus[cid]}
if isinstance(corpus[cid], str)
else corpus[cid]
),
}
for cid in corpus_ids
]

corpus_texts = [
"\n".join([doc.get("title", ""), doc["text"]])
Expand Down
68 changes: 49 additions & 19 deletions mteb/models/cohere_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import numpy as np
import torch
import tqdm

from mteb.encoder_interface import PromptType
from mteb.model_meta import ModelMeta
Expand Down Expand Up @@ -140,25 +141,43 @@ def __init__(
)

def _embed(
self, sentences: list[str], cohere_task_type: str, retries: int = 5
self,
sentences: list[str],
cohere_task_type: str,
show_progress_bar: bool = False,
retries: int = 5,
) -> torch.Tensor:
import cohere # type: ignore

max_batch_size = 256

batches = [
sentences[i : i + max_batch_size]
for i in range(0, len(sentences), max_batch_size)
]

client = cohere.Client()
while retries > 0: # Cohere's API is not always reliable
try:
response = client.embed(
texts=list(sentences),
model=self.model_name,
input_type=cohere_task_type,
)
break
except Exception as e:
print(f"Retrying... {retries} retries left.")
retries -= 1
if retries == 0:
raise e
return torch.tensor(response.embeddings)

all_embeddings = []

for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar):
while retries > 0: # Cohere's API is not always reliable
try:
response = client.embed(
texts=batch,
model=self.model_name,
input_type=cohere_task_type,
)
break
except Exception as e:
print(f"Retrying... {retries} retries left.")
retries -= 1
if retries == 0:
raise e

all_embeddings.extend(torch.tensor(response.embeddings).numpy())

return np.array(all_embeddings)

def encode(
self,
Expand All @@ -168,13 +187,24 @@ def encode(
prompt_type: PromptType | None = None,
**kwargs: Any,
) -> np.ndarray:
cohere_task_type = self.get_prompt_name(
self.model_prompts, task_name, prompt_type
)
prompt_name = self.get_prompt_name(self.model_prompts, task_name, prompt_type)
cohere_task_type = self.model_prompts.get(prompt_name)

if cohere_task_type is None:
# search_document is recommended if unknown (https://cohere.com/blog/introducing-embed-v3)
cohere_task_type = "search_document"
return self._embed(sentences, cohere_task_type=cohere_task_type).numpy()

show_progress_bar = (
False
if "show_progress_bar" not in kwargs
else kwargs.pop("show_progress_bar")
)

return self._embed(
sentences,
cohere_task_type=cohere_task_type,
show_progress_bar=show_progress_bar,
)


model_prompts = {
Expand Down
9 changes: 8 additions & 1 deletion mteb/models/openai_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any

import numpy as np
import tqdm

from mteb.model_meta import ModelMeta
from mteb.requires_package import requires_package
Expand Down Expand Up @@ -68,9 +69,15 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
for i in range(0, len(trimmed_sentences), max_batch_size)
]

show_progress_bar = (
False
if "show_progress_bar" not in kwargs
else kwargs.pop("show_progress_bar")
)

all_embeddings = []

for sublist in sublists:
for sublist in tqdm.tqdm(sublists, leave=False, disable=not show_progress_bar):
try:
response = self._client.embeddings.create(
input=sublist,
Expand Down
3 changes: 3 additions & 0 deletions mteb/tasks/Retrieval/multilingual/PublicHealthQARetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def _load_publichealthqa_data(
answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))}

for row in data:
if row["question"] is None or row["answer"] is None:
# There are some questions and answers that are None in the original dataset, specifically in the Arabic subset.
continue
question = row["question"]
answer = row["answer"]
query_id = f"Q{question_ids[question]}"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "mteb"
version = "1.22.0"
version = "1.23.0"
description = "Massive Text Embedding Benchmark"
readme = "README.md"
authors = [
Expand Down

0 comments on commit 07f1391

Please sign in to comment.