Skip to content

Commit

Permalink
feat: add new arctic v2.0 models (#1574)
Browse files Browse the repository at this point in the history
* feat: add new arctic v2.0 models

* chore: make lint
  • Loading branch information
dbuades authored Dec 10, 2024
1 parent e605c7b commit 53756ad
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 32 deletions.
187 changes: 155 additions & 32 deletions mteb/models/arctic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,82 @@

from mteb.model_meta import ModelMeta, sentence_transformers_loader

# Metadata record for the English-only Snowflake Arctic Embed M v1.5 checkpoint.
# The revision pinned inside the loader is the same one recorded on the record.
# NOTE(review): a query prompt is configured on the loader while
# use_instructions is False -- confirm that the flag is intended.
arctic_m_v1_5 = ModelMeta(
    # Identity and provenance.
    name="Snowflake/snowflake-arctic-embed-m-v1.5",
    revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5",
    release_date="2024-07-08",  # initial commit of hf model.
    license="apache-2.0",
    open_weights=True,
    adapted_from=None,
    superseded_by=None,
    # Size and capacity.
    n_parameters=109_000_000,
    memory_usage=None,
    max_tokens=512,
    embed_dim=768,
    # Usage characteristics.
    languages=["eng_Latn"],
    framework=["Sentence Transformers", "PyTorch"],
    similarity_fn_name="cosine",
    use_instructions=False,
    # How the model is instantiated; only queries receive a prompt prefix.
    loader=partial(
        sentence_transformers_loader,
        model_name="Snowflake/snowflake-arctic-embed-m-v1.5",
        revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
        model_prompts={
            "query": "Represent this sentence for searching relevant passages: "
        },
    ),
)

# Language tags ("<three-letter language>_<four-letter script>") shared by the
# v2.0 model records in this file.
# The entries are deliberately NOT sorted by their three-letter code; the order
# presumably mirrors the upstream language listing, so it is kept as-is.
# fmt: off
LANGUAGES_V2_0 = [
    "afr_Latn", "ara_Arab", "aze_Latn", "bel_Cyrl", "bul_Cyrl", "ben_Beng",
    "cat_Latn", "ceb_Latn", "ces_Latn", "cym_Latn", "dan_Latn", "deu_Latn",
    "ell_Grek", "eng_Latn", "spa_Latn", "est_Latn", "eus_Latn", "fas_Arab",
    "fin_Latn", "fra_Latn", "glg_Latn", "guj_Gujr", "heb_Hebr", "hin_Deva",
    "hrv_Latn", "hat_Latn", "hun_Latn", "hye_Armn", "ind_Latn", "isl_Latn",
    "ita_Latn", "jpn_Jpan", "jav_Latn", "kat_Geor", "kaz_Cyrl", "khm_Khmr",
    "kan_Knda", "kor_Hang", "kir_Cyrl", "lao_Laoo", "lit_Latn", "lav_Latn",
    "mkd_Cyrl", "mal_Mlym", "mon_Cyrl", "mar_Deva", "msa_Latn", "mya_Mymr",
    "nep_Deva", "nld_Latn", "pan_Guru", "pol_Latn", "por_Latn", "que_Latn",
    "ron_Latn", "rus_Cyrl", "sin_Sinh", "slk_Latn", "slv_Latn", "som_Latn",
    "sqi_Latn", "srp_Cyrl", "swe_Latn", "swa_Latn", "tam_Taml", "tel_Telu",
    "tha_Thai", "tgl_Latn", "tur_Latn", "ukr_Cyrl", "urd_Arab", "vie_Latn",
    "yor_Latn", "zho_Hans",
]
# fmt: on

arctic_embed_xs = ModelMeta(
loader=partial(
Expand Down Expand Up @@ -118,7 +167,7 @@
languages=["eng_Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
n_parameters=137_000_000,
memory_usage=None,
max_tokens=2048,
embed_dim=768,
Expand All @@ -127,10 +176,9 @@
similarity_fn_name="cosine",
use_instructions=False,
adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised",
superseded_by=None,
superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
)


arctic_embed_l = ModelMeta(
loader=partial(
sentence_transformers_loader,
Expand All @@ -143,14 +191,89 @@
languages=["eng_Latn"],
open_weights=True,
framework=["Sentence Transformers", "PyTorch"],
n_parameters=109_000_000,
n_parameters=335_000_000,
memory_usage=None,
max_tokens=512,
embed_dim=768,
embed_dim=1024,
license="apache-2.0",
reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l",
similarity_fn_name="cosine",
use_instructions=False,
adapted_from="intfloat/e5-base-unsupervised",
superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0",
)

# Metadata record for the English-only Snowflake Arctic Embed M v1.5 checkpoint,
# marked as superseded by the M v2.0 model.
# The revision pinned inside the loader is the same one recorded on the record.
# NOTE(review): a query prompt is configured on the loader while
# use_instructions is False -- confirm that the flag is intended.
arctic_embed_m_v1_5 = ModelMeta(
    # Identity and provenance.
    name="Snowflake/snowflake-arctic-embed-m-v1.5",
    revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5",
    release_date="2024-07-08",  # initial commit of hf model.
    license="apache-2.0",
    open_weights=True,
    adapted_from=None,
    superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0",
    # Size and capacity.
    n_parameters=109_000_000,
    memory_usage=None,
    max_tokens=512,
    embed_dim=768,
    # Usage characteristics.
    languages=["eng_Latn"],
    framework=["Sentence Transformers", "PyTorch"],
    similarity_fn_name="cosine",
    use_instructions=False,
    # How the model is instantiated; only queries receive a prompt prefix.
    loader=partial(
        sentence_transformers_loader,
        model_name="Snowflake/snowflake-arctic-embed-m-v1.5",
        revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
        model_prompts={
            "query": "Represent this sentence for searching relevant passages: "
        },
    ),
)

# Metadata record for the Snowflake Arctic Embed M v2.0 checkpoint, which
# declares the shared LANGUAGES_V2_0 list as its supported languages.
# The revision pinned inside the loader is the same one recorded on the record.
# NOTE(review): unlike the v1.5 record in this file, no model_prompts are passed
# to the loader -- confirm whether queries need a prefix for this model.
# NOTE(review): adapted_from names a GTE checkpoint; verify whether loading
# requires trust_remote_code=True to be forwarded through the loader.
arctic_embed_m_v2_0 = ModelMeta(
    # Identity and provenance.
    name="Snowflake/snowflake-arctic-embed-m-v2.0",
    revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc",
    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0",
    release_date="2024-12-04",  # initial commit of hf model.
    license="apache-2.0",
    open_weights=True,
    adapted_from="Alibaba-NLP/gte-multilingual-base",
    superseded_by=None,
    # Size and capacity.
    n_parameters=305_000_000,
    memory_usage=None,
    max_tokens=8192,
    embed_dim=768,
    # Usage characteristics.
    languages=LANGUAGES_V2_0,
    framework=["Sentence Transformers", "PyTorch"],
    similarity_fn_name="cosine",
    use_instructions=False,
    # How the model is instantiated.
    loader=partial(
        sentence_transformers_loader,
        model_name="Snowflake/snowflake-arctic-embed-m-v2.0",
        revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc",
    ),
)

# Metadata record for the Snowflake Arctic Embed L v2.0 checkpoint, which
# declares the shared LANGUAGES_V2_0 list as its supported languages.
# The revision pinned inside the loader is the same one recorded on the record.
# NOTE(review): unlike the v1.5 record in this file, no model_prompts are passed
# to the loader -- confirm whether queries need a prefix for this model.
arctic_embed_l_v2_0 = ModelMeta(
    # Identity and provenance.
    name="Snowflake/snowflake-arctic-embed-l-v2.0",
    revision="edc2df7b6c25794b340229ca082e7c78782e6374",
    reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0",
    release_date="2024-12-04",  # initial commit of hf model.
    license="apache-2.0",
    open_weights=True,
    adapted_from="BAAI/bge-m3-retromae",
    superseded_by=None,
    # Size and capacity.
    n_parameters=568_000_000,
    memory_usage=None,
    max_tokens=8192,
    embed_dim=1024,
    # Usage characteristics.
    languages=LANGUAGES_V2_0,
    framework=["Sentence Transformers", "PyTorch"],
    similarity_fn_name="cosine",
    use_instructions=False,
    # How the model is instantiated.
    loader=partial(
        sentence_transformers_loader,
        model_name="Snowflake/snowflake-arctic-embed-l-v2.0",
        revision="edc2df7b6c25794b340229ca082e7c78782e6374",
    ),
)
2 changes: 2 additions & 0 deletions mteb/models/misc_models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from mteb.model_meta import ModelMeta

Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
Expand Down

0 comments on commit 53756ad

Please sign in to comment.