Skip to content

Commit

Permalink
fix: add more model meta (jina, e5) (#1537)
Browse files Browse the repository at this point in the history
* add e5 model meta

* address review comments
  • Loading branch information
isaac-chung authored Dec 4, 2024
1 parent 5fa7b7b commit 36bab4d
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 7 deletions.
4 changes: 2 additions & 2 deletions mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class ModelMeta(BaseModel):
zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
are evaluated non-zero-shot unless specified otherwise.
adapted_from: Name of the model from which this model is adapted. For quantizations, fine-tunes, long doc extensions, etc.
supersedes: Name of the model that this model supersedes, e.g. nvidia/NV-Embed-v2 supersedes v1.
superseded_by: Name of the model that supersedes this model, e.g. nvidia/NV-Embed-v1 is superseded by nvidia/NV-Embed-v2.
"""

model_config = ConfigDict(extra="forbid")
Expand All @@ -99,7 +99,7 @@ class ModelMeta(BaseModel):
use_instructions: bool | None = None
zero_shot_benchmarks: list[str] | None = None
adapted_from: str | None = None
supersedes: str | None = None
superseded_by: str | None = None

def to_dict(self):
dict_repr = self.model_dump()
Expand Down
60 changes: 57 additions & 3 deletions mteb/models/e5_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,15 +238,17 @@
open_weights=True,
revision="1c644c92ad3ba1efdad3f1451a637716616a20e8",
release_date=E5_PAPER_RELEASE_DATE,
n_parameters=278_000_000,
n_parameters=109_000_000,
memory_usage=None,
embed_dim=768,
license="mit",
max_tokens=514,
max_tokens=512,
reference="https://huggingface.co/intfloat/e5-base-v2",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
superseded_by=None,
adapted_from=None,
)

e5_eng_large_v2 = ModelMeta(
Expand All @@ -261,7 +263,7 @@
open_weights=True,
revision="b322e09026e4ea05f42beadf4d661fb4e101d311",
release_date=E5_PAPER_RELEASE_DATE,
n_parameters=560_000_000,
n_parameters=335_000_000,
memory_usage=None,
embed_dim=1024,
license="mit",
Expand All @@ -270,4 +272,56 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
superseded_by=None,
adapted_from=None,
)

# Metadata entry for intfloat/e5-large; `superseded_by` names intfloat/e5-large-v2.
e5_large = ModelMeta(
    # The loader is bound to the same model name and revision recorded below.
    loader=partial(
        sentence_transformers_loader,
        model_name="intfloat/e5-large",
        revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81",
        model_prompts=model_prompts,
    ),
    # Identity and provenance.
    name="intfloat/e5-large",
    reference="https://huggingface.co/intfloat/e5-large",
    revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81",
    release_date="2022-12-26",
    languages=["eng-Latn"],
    open_weights=True,
    license="apache-2.0",
    # Size and limits.
    n_parameters=335_000_000,
    memory_usage=None,
    embed_dim=1024,
    max_tokens=512,
    # Usage.
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    # Lineage.
    adapted_from=None,
    superseded_by="intfloat/e5-large-v2",
)

# Metadata entry for intfloat/e5-base; `superseded_by` names intfloat/e5-base-v2.
e5_base = ModelMeta(
    # The loader is bound to the same model name and revision recorded below.
    loader=partial(
        sentence_transformers_loader,
        model_name="intfloat/e5-base",
        revision="b533fe4636f4a2507c08ddab40644d20b0006d6a",
        model_prompts=model_prompts,
    ),
    # Identity and provenance.
    name="intfloat/e5-base",
    reference="https://huggingface.co/intfloat/e5-base",
    revision="b533fe4636f4a2507c08ddab40644d20b0006d6a",
    release_date="2022-12-26",
    languages=["eng-Latn"],
    open_weights=True,
    license="apache-2.0",
    # Size and limits.
    n_parameters=109_000_000,
    memory_usage=None,
    embed_dim=768,
    max_tokens=512,
    # Usage.
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    # Lineage.
    adapted_from=None,
    superseded_by="intfloat/e5-base-v2",
)
88 changes: 86 additions & 2 deletions mteb/models/sentence_transformers_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
)

paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta(
Expand All @@ -92,6 +94,8 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
)

paraphrase_multilingual_mpnet_base_v2 = ModelMeta(
Expand All @@ -109,6 +113,8 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
)

labse = ModelMeta(
Expand All @@ -126,6 +132,8 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
)

multi_qa_MiniLM_L6_cos_v1 = ModelMeta(
Expand All @@ -143,7 +151,7 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
supersedes=None,
superseded_by=None,
adapted_from=None,
)

Expand All @@ -162,6 +170,82 @@
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
supersedes="sentence-transformers/all-mpnet-base-v1",
superseded_by=None,
adapted_from=None,
)

# Metadata entry for jinaai/jina-embeddings-v2-base-en (no loader is supplied here).
jina_embeddings_v2_base_en = ModelMeta(
    # Identity and provenance.
    name="jinaai/jina-embeddings-v2-base-en",
    reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en",
    revision="6e85f575bc273f1fd840a658067d0157933c83f0",  # can be any
    release_date="2023-09-27",
    languages=["eng-Latn"],
    open_weights=True,
    license="apache-2.0",
    # Size and limits.
    n_parameters=137_000_000,
    memory_usage=None,
    embed_dim=768,
    max_tokens=8192,
    # Usage.
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    # Lineage.
    adapted_from=None,
    superseded_by=None,
)

# Metadata entry for jinaai/jina-embeddings-v2-small-en (no loader is supplied here).
jina_embeddings_v2_small_en = ModelMeta(
    # Identity and provenance.
    name="jinaai/jina-embeddings-v2-small-en",
    reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en",
    # NOTE(review): revision is an empty string, unlike the sibling entries that
    # carry a commit hash — confirm the intended revision and fill it in.
    revision="",  # can be any
    release_date="2023-09-27",
    languages=["eng-Latn"],
    open_weights=True,
    license="apache-2.0",
    # Size and limits.
    n_parameters=32_700_000,
    memory_usage=None,
    embed_dim=512,
    max_tokens=8192,
    # Usage.
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    # Lineage.
    adapted_from=None,
    superseded_by=None,
)

# Metadata entry for jinaai/jina-embedding-b-en-v1; `superseded_by` names
# jinaai/jina-embeddings-v2-base-en.
jina_embedding_b_en_v1 = ModelMeta(
    # Identity and provenance.
    name="jinaai/jina-embedding-b-en-v1",
    reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1",
    revision="aa0645035294a8c0607ce5bb700aba982cdff32c",  # can be any
    release_date="2023-07-07",
    languages=["eng-Latn"],
    open_weights=True,
    license="apache-2.0",
    # Size and limits.
    n_parameters=110_000_000,
    memory_usage=None,
    embed_dim=768,
    max_tokens=512,
    # Usage.
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    # Lineage.
    adapted_from=None,
    superseded_by="jinaai/jina-embeddings-v2-base-en",
)

# Metadata entry for jinaai/jina-embedding-s-en-v1; `superseded_by` names
# jinaai/jina-embeddings-v2-small-en.
jina_embedding_s_en_v1 = ModelMeta(
    # Identity and provenance.
    name="jinaai/jina-embedding-s-en-v1",
    reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1",
    revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08",  # can be any
    release_date="2023-07-07",
    languages=["eng-Latn"],
    open_weights=True,
    license="apache-2.0",
    # Size and limits.
    n_parameters=35_000_000,
    memory_usage=None,
    embed_dim=512,
    max_tokens=512,
    # Usage.
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    # Lineage.
    adapted_from=None,
    superseded_by="jinaai/jina-embeddings-v2-small-en",
)

0 comments on commit 36bab4d

Please sign in to comment.