diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index 9ac70fd63..eadc4065f 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -4,33 +4,82 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -arctic_m_v1_5 = ModelMeta( - loader=partial( - sentence_transformers_loader, - model_name="Snowflake/snowflake-arctic-embed-m-v1.5", - revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", - model_prompts={ - "query": "Represent this sentence for searching relevant passages: " - }, - ), - name="Snowflake/snowflake-arctic-embed-m-v1.5", - revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", - release_date="2024-07-08", # initial commit of hf model. - languages=["eng_Latn"], - open_weights=True, - framework=["Sentence Transformers", "PyTorch"], - n_parameters=109_000_000, - memory_usage=None, - max_tokens=512, - embed_dim=768, - license="apache-2.0", - reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5", - similarity_fn_name="cosine", - use_instructions=False, - adapted_from=None, - superseded_by=None, -) - +LANGUAGES_V2_0 = [ + "afr_Latn", + "ara_Arab", + "aze_Latn", + "bel_Cyrl", + "bul_Cyrl", + "ben_Beng", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "ell_Grek", + "eng_Latn", + "spa_Latn", + "est_Latn", + "eus_Latn", + "fas_Arab", + "fin_Latn", + "fra_Latn", + "glg_Latn", + "guj_Gujr", + "heb_Hebr", + "hin_Deva", + "hrv_Latn", + "hat_Latn", + "hun_Latn", + "hye_Armn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jpn_Jpan", + "jav_Latn", + "kat_Geor", + "kaz_Cyrl", + "khm_Khmr", + "kan_Knda", + "kor_Hang", + "kir_Cyrl", + "lao_Laoo", + "lit_Latn", + "lav_Latn", + "mkd_Cyrl", + "mal_Mlym", + "mon_Cyrl", + "mar_Deva", + "msa_Latn", + "mya_Mymr", + "nep_Deva", + "nld_Latn", + "pan_Guru", + "pol_Latn", + "por_Latn", + "que_Latn", + "ron_Latn", + "rus_Cyrl", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "som_Latn", + "sqi_Latn", + "srp_Cyrl", + "swe_Latn", + "swa_Latn", + "tam_Taml", + "tel_Telu", + "tha_Thai", + "tgl_Latn", + "tur_Latn", + "ukr_Cyrl", + "urd_Arab", + "vie_Latn", + "yor_Latn", + "zho_Hans", +] arctic_embed_xs = ModelMeta( loader=partial( @@ -118,7 +167,7 @@ languages=["eng_Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], - n_parameters=109_000_000, + n_parameters=137_000_000, memory_usage=None, max_tokens=2048, embed_dim=768, @@ -127,10 +176,9 @@ similarity_fn_name="cosine", use_instructions=False, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", - superseded_by=None, + superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", ) - arctic_embed_l = ModelMeta( loader=partial( sentence_transformers_loader, @@ -143,14 +191,89 @@ languages=["eng_Latn"], open_weights=True, framework=["Sentence Transformers", "PyTorch"], - n_parameters=109_000_000, + n_parameters=335_000_000, memory_usage=None, max_tokens=512, - embed_dim=768, + embed_dim=1024, license="apache-2.0", reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l", similarity_fn_name="cosine", use_instructions=False, adapted_from="intfloat/e5-base-unsupervised", + superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", +) + +arctic_embed_m_v1_5 = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="Snowflake/snowflake-arctic-embed-m-v1.5", + revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", + model_prompts={ + "query": "Represent this sentence for searching relevant passages: " + }, + ), + name="Snowflake/snowflake-arctic-embed-m-v1.5", + revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47", + release_date="2024-07-08", # initial commit of hf model. + languages=["eng_Latn"], + open_weights=True, + framework=["Sentence Transformers", "PyTorch"], + n_parameters=109_000_000, + memory_usage=None, + max_tokens=512, + embed_dim=768, + license="apache-2.0", + reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5", + similarity_fn_name="cosine", + use_instructions=False, + adapted_from=None, + superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", +) + +arctic_embed_m_v2_0 = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="Snowflake/snowflake-arctic-embed-m-v2.0", + revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc", + ), + name="Snowflake/snowflake-arctic-embed-m-v2.0", + revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc", + release_date="2024-12-04", # initial commit of hf model. + languages=LANGUAGES_V2_0, + open_weights=True, + framework=["Sentence Transformers", "PyTorch"], + n_parameters=305_000_000, + memory_usage=None, + max_tokens=8192, + embed_dim=768, + license="apache-2.0", + reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0", + similarity_fn_name="cosine", + use_instructions=False, + adapted_from="Alibaba-NLP/gte-multilingual-base", + superseded_by=None, +) + +arctic_embed_l_v2_0 = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="Snowflake/snowflake-arctic-embed-l-v2.0", + revision="edc2df7b6c25794b340229ca082e7c78782e6374", + ), + name="Snowflake/snowflake-arctic-embed-l-v2.0", + revision="edc2df7b6c25794b340229ca082e7c78782e6374", + release_date="2024-12-04", # initial commit of hf model. + languages=LANGUAGES_V2_0, + open_weights=True, + framework=["Sentence Transformers", "PyTorch"], + n_parameters=568_000_000, + memory_usage=None, + max_tokens=8192, + embed_dim=1024, + license="apache-2.0", + reference="https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0", + similarity_fn_name="cosine", + use_instructions=False, + adapted_from="BAAI/bge-m3-retromae", superseded_by=None, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 2429cce39..61dc549b1 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from mteb.model_meta import ModelMeta Haon_Chen__speed_embedding_7b_instruct = ModelMeta(