From ce8c17541e61ca259bf73f1b0d634a9cea3f93bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 9 Dec 2024 01:28:18 +0100 Subject: [PATCH] fix: Added metadata for miscellaneous models (#1557) * Added script for generating metadata, and metadata for the listed models * Added misc models to overview * Fixed misc metas * Removed unnecessary imports * Added logic to retrieve base model information * Added base models to misc meta * Added superseded_by to sentence-croissant models * Added training datasets to mis models --- mteb/models/misc_models.py | 1632 ++++++++++++++++++++++++++++++++++ mteb/models/overview.py | 2 + scripts/generate_metadata.py | 259 ++++++ 3 files changed, 1893 insertions(+) create mode 100644 mteb/models/misc_models.py create mode 100644 scripts/generate_metadata.py diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py new file mode 100644 index 000000000..2429cce39 --- /dev/null +++ b/mteb/models/misc_models.py @@ -0,0 +1,1632 @@ +from mteb.model_meta import ModelMeta + +Haon_Chen__speed_embedding_7b_instruct = ModelMeta( + name="Haon-Chen/speed-embedding-7b-instruct", + revision="c167e9a8144b397622ce47b85d9edcdeecef3d3f", + release_date="2024-10-31", + languages=["eng_Latn"], + loader=None, + n_parameters=7110660096, + memory_usage=None, + max_tokens=32768.0, + embed_dim=None, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="mistralai/Mistral-7B-v0.1", + superseded_by=None, +) +Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta( + name="Gameselo/STS-multilingual-mpnet-base-v2", + revision="449f917af30f590fc31f9ffb226c94f21a2f47b8", + release_date="2024-06-07", + languages=[], + loader=None, + n_parameters=278043648, + memory_usage=None, + max_tokens=514.0, + 
embed_dim=768, + license=None, + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", + superseded_by=None, +) +HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta( + name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + revision="45e42c89990c40aca042659133fc8b13c28634b5", + release_date="2024-10-23", + languages=None, + loader=None, + n_parameters=494032768, + memory_usage=None, + max_tokens=131072.0, + embed_dim=896, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_1022_filtered_v2_inst_3node_g8_1e-5_sin-0.1_mrl", + superseded_by=None, +) +HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta( + name="HIT-TMG/KaLM-embedding-multilingual-mini-v1", + revision="8a82a0cd2b322b91723e252486f7cce6fd8ac9d3", + release_date="2024-08-27", + languages=None, + loader=None, + n_parameters=494032768, + memory_usage=None, + max_tokens=131072.0, + embed_dim=896, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + 
adapted_from="/mnt/shgeminicephfs/wx-dc-plt-hpc/xinshuohu/Output/Embedding/Qwen2-0.5B-eos_mean_pretrain_0806_1e-4_uen_sft_0902_filtered_v2_3node_g8_1e-5_sin-0.1", + superseded_by=None, +) +Hum_Works__lodestone_base_4096_v1 = ModelMeta( + name="Hum-Works/lodestone-base-4096-v1", + revision="9bbc2d0b57dd2198aea029404b0f976712a7d966", + release_date="2023-08-25", + languages=["eng_Latn"], + loader=None, + n_parameters=None, + memory_usage=None, + max_tokens=None, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={ + "s2orc": ["train"], + "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], + "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ + "train" + ], + "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ + "train" + ], + "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ + "train" + ], + "sentence-transformers/reddit-title-body": ["train"], + "msmarco": ["train"], + "gooaq": ["train"], + "yahoo_answers_topics": ["train"], + "code_search_net": ["train"], + "search_qa": ["train"], + "eli5": ["train"], + "snli": ["train"], + "multi_nli": ["train"], + "wikihow": ["train"], + "natural_questions": ["train"], + "trivia_qa": ["train"], + "embedding-data/sentence-compression": ["train"], + "embedding-data/flickr30k-captions": ["train"], + "embedding-data/altlex": ["train"], + "embedding-data/simple-wiki": ["train"], + "embedding-data/QQP": ["train"], + "embedding-data/SPECTER": ["train"], + "embedding-data/PAQ_pairs": ["train"], + "embedding-data/WikiAnswers": ["train"], + "sentence-transformers/embedding-training-data": ["train"], + }, + adapted_from="hum-lodestone-v1", + superseded_by=None, +) +Jaume__gemma_2b_embeddings = 
ModelMeta( + name="Jaume/gemma-2b-embeddings", + revision="86431f65d7c3f66b2af096c61e614a2958f191f1", + release_date="2024-06-29", + languages=[], + loader=None, + n_parameters=2506172416, + memory_usage=None, + max_tokens=8192.0, + embed_dim=2048, + license=None, + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Jaume/gemma-2b-embeddings", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={}, + adapted_from="google/gemma-2b", + superseded_by=None, +) +BeastyZ__e5_R_mistral_7b = ModelMeta( + name="BeastyZ/e5-R-mistral-7b", + revision="3f810a6a7fd220369ad248e3705cf13d71803602", + release_date="2024-06-28", + languages=["eng_Latn"], + loader=None, + n_parameters=7241732096, + memory_usage=None, + max_tokens=32768.0, + embed_dim=None, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"BeastyZ/E5-R": ["train"]}, + adapted_from="/ConRetriever/public_weight_mistral", + superseded_by=None, +) +Lajavaness__bilingual_embedding_base = ModelMeta( + name="Lajavaness/bilingual-embedding-base", + revision="0bfc54bb2aa2666dd84715289c7ef58a95eb4d8d", + release_date="2024-06-26", + languages=None, + loader=None, + n_parameters=278043648, + memory_usage=None, + max_tokens=514.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="dangvantuan/bilingual_impl", + superseded_by=None, +) +Lajavaness__bilingual_embedding_large = ModelMeta( + 
name="Lajavaness/bilingual-embedding-large", + revision="e83179d7a66e8aed1b3015e98bb5ae234ed89598", + release_date="2024-06-24", + languages=["fra_Latn", "eng_Latn"], + loader=None, + n_parameters=559890432, + memory_usage=None, + max_tokens=514.0, + embed_dim=1024, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="dangvantuan/bilingual_impl", + superseded_by=None, +) +Lajavaness__bilingual_embedding_small = ModelMeta( + name="Lajavaness/bilingual-embedding-small", + revision="ed4a1dd814de0db81d4a4e287c296a03194463e3", + release_date="2024-07-17", + languages=["fra_Latn", "eng_Latn"], + loader=None, + n_parameters=117653760, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="dangvantuan/bilingual_impl", + superseded_by=None, +) +Mihaiii__Bulbasaur = ModelMeta( + name="Mihaiii/Bulbasaur", + revision="6876f839e18ae36224049a41194a431953f08747", + release_date="2024-04-27", + languages=None, + loader=None, + n_parameters=17389824, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Mihaiii/Bulbasaur", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Mihaiii/qa-assistant": ["train"]}, + adapted_from="Mihaiii/dwsdwass", + superseded_by=None, +) 
+Mihaiii__Ivysaur = ModelMeta( + name="Mihaiii/Ivysaur", + revision="65914d976f45beb4bda7485c39d88865b4ce6554", + release_date="2024-04-27", + languages=None, + loader=None, + n_parameters=22713216, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Mihaiii/Ivysaur", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Mihaiii/qa-assistant": ["train"]}, + adapted_from="Mihaiii/jhjghjgh", + superseded_by=None, +) +Mihaiii__Squirtle = ModelMeta( + name="Mihaiii/Squirtle", + revision="5b991da48a9286637a256d4a35aab87a1a57b78a", + release_date="2024-04-30", + languages=None, + loader=None, + n_parameters=15615360, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Mihaiii/Squirtle", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Mihaiii/qa-assistant": ["train"]}, + adapted_from="Mihaiii/test21", + superseded_by=None, +) +Mihaiii__Venusaur = ModelMeta( + name="Mihaiii/Venusaur", + revision="0dc817f0addbb7bab8feeeeaded538f9ffeb3419", + release_date="2024-04-29", + languages=None, + loader=None, + n_parameters=15615360, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Mihaiii/Venusaur", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Mihaiii/qa-assistant": ["train"]}, + adapted_from="Mihaiii/test14", + superseded_by=None, +) +Mihaiii__Wartortle = ModelMeta( + name="Mihaiii/Wartortle", + 
revision="14caca5253414d38a7d28b62d1b7c30ef3293a87", + release_date="2024-04-30", + languages=None, + loader=None, + n_parameters=17389824, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Mihaiii/Wartortle", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Mihaiii/qa-assistant": ["train"]}, + adapted_from="Mihaiii/test22", + superseded_by=None, +) +Mihaiii__gte_micro = ModelMeta( + name="Mihaiii/gte-micro", + revision="6fd2397cb9dfa7c901aedf9a2a44d3c888ccafdd", + release_date="2024-04-21", + languages=None, + loader=None, + n_parameters=17389824, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Mihaiii/gte-micro", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from=None, + superseded_by=None, +) +Mihaiii__gte_micro_v4 = ModelMeta( + name="Mihaiii/gte-micro-v4", + revision="78e1a4b348f8524c3ab2e3e3475788f5adb8c98f", + release_date="2024-04-22", + languages=None, + loader=None, + n_parameters=19164288, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Mihaiii/gte-micro-v4", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from=None, + superseded_by=None, +) +OrdalieTech__Solon_embeddings_large_0_1 = ModelMeta( + name="OrdalieTech/Solon-embeddings-large-0.1", + revision="9f6465f6ea2f6d10c6294bc15d84edf87d47cdef", + release_date="2023-12-09", + languages=["fra_Latn"], + loader=None, + 
n_parameters=559890432, + memory_usage=None, + max_tokens=514.0, + embed_dim=1024, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="solon-large-06-BIG", + superseded_by=None, +) +Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta( + name="Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", + revision="d0361a36f6fe69febfc8550d0918abab174f6f30", + release_date="2024-06-16", + languages=["ara_Arab"], + loader=None, + n_parameters=135193344, + memory_usage=None, + max_tokens=512.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + adapted_from="aubmindlab/bert-base-arabertv02", + superseded_by=None, +) +Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMeta( + name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", + revision="6916465c43b984e955aa6dc72851474f0128f428", + release_date="2024-06-25", + languages=["ara_Arab"], + loader=None, + n_parameters=117653760, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", + similarity_fn_name="cosine", + use_instructions=None, + 
training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + superseded_by=None, +) +Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta( + name="Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", + revision="1ca467cc576bd76666a4d21b24ee43afa914dd10", + release_date="2024-06-14", + languages=["ara_Arab"], + loader=None, + n_parameters=278043648, + memory_usage=None, + max_tokens=514.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", + superseded_by=None, +) +Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta( + name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", + revision="ee6d5e33c78ed582ade47fd452a74ea52aa5bfe2", + release_date="2024-06-16", + languages=["ara_Arab"], + loader=None, + n_parameters=470926848, + memory_usage=None, + max_tokens=512.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + adapted_from="sentence-transformers/LaBSE", + superseded_by=None, +) +Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta( + 
name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", + revision="2628cb641e040f44328195fadcdfb58e6d5cffa7", + release_date="2024-06-15", + languages=["ara_Arab"], + loader=None, + n_parameters=109486464, + memory_usage=None, + max_tokens=514.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + adapted_from="tomaarsen/mpnet-base-all-nli-triplet", + superseded_by=None, +) +Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta( + name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", + revision="ecf3274e164f057c4a3dd70691cae0265d87a9d0", + release_date="2024-06-17", + languages=["ara_Arab"], + loader=None, + n_parameters=162841344, + memory_usage=None, + max_tokens=512.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + adapted_from="UBC-NLP/MARBERTv2", + superseded_by=None, +) +consciousAI__cai_lunaris_text_embeddings = ModelMeta( + name="consciousAI/cai-lunaris-text-embeddings", + revision="8332c464d13505968ff7a6e2213f36fd8730b4c7", + release_date="2023-06-22", + languages=None, + loader=None, + n_parameters=None, + memory_usage=None, + max_tokens=512.0, + embed_dim=1024, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, 
+ framework=["PyTorch"], + reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="/root/.cache/torch/sentence_transformers/intfloat_e5-large-v2", + superseded_by=None, +) +consciousAI__cai_stellaris_text_embeddings = ModelMeta( + name="consciousAI/cai-stellaris-text-embeddings", + revision="c000ec4b29588daf0f4a0b2ad4e72ee807d8efc0", + release_date="2023-06-23", + languages=None, + loader=None, + n_parameters=None, + memory_usage=None, + max_tokens=514.0, + embed_dim=768, + license=None, + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="/root/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v1/", + superseded_by=None, +) +manu__bge_m3_custom_fr = ModelMeta( + name="manu/bge-m3-custom-fr", + revision="ed3ef88678ba83ddf4c0fab71a93cb90d89a9078", + release_date="2024-04-11", + languages=None, + loader=None, + n_parameters=567754752, + memory_usage=None, + max_tokens=8194.0, + embed_dim=1024, + license=None, + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/manu/bge-m3-custom-fr", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="data/bge-m3-custom", + superseded_by=None, +) +manu__sentence_croissant_alpha_v0_2 = ModelMeta( + name="manu/sentence_croissant_alpha_v0.2", + revision="4610b8cea65d7dd59e0b04af50753933fe5b29b2", + release_date="2024-03-15", + languages=None, + loader=None, + n_parameters=1279887360, + memory_usage=None, + max_tokens=2048.0, + embed_dim=2048, + license=None, + open_weights=True, + public_training_data=False, + 
public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="croissantllm/CroissantCool", + superseded_by="manu/sentence_croissant_alpha_v0.3", +) +manu__sentence_croissant_alpha_v0_3 = ModelMeta( + name="manu/sentence_croissant_alpha_v0.3", + revision="4ac16754f3d81aba76cc32955dc9ee4122df96eb", + release_date="2024-04-26", + languages=None, + loader=None, + n_parameters=1279887360, + memory_usage=None, + max_tokens=2048.0, + embed_dim=2048, + license=None, + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="croissantllm/CroissantCool-v0.2", + superseded_by="manu/sentence_croissant_alpha_v0.4", +) +manu__sentence_croissant_alpha_v0_4 = ModelMeta( + name="manu/sentence_croissant_alpha_v0.4", + revision="0ce6372e6a3c21134dcf26dcde13cca869c767fc", + release_date="2024-04-27", + languages=["fra_Latn", "eng_Latn"], + loader=None, + n_parameters=1279887360, + memory_usage=None, + max_tokens=2048.0, + embed_dim=2048, + license="mit", + open_weights=True, + public_training_data=True, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={"manu/embedding_data_v2_100k": ["train"]}, + adapted_from="croissantllm/CroissantCool-v0.2", + superseded_by=None, +) +thenlper__gte_base = ModelMeta( + name="thenlper/gte-base", + revision="c078288308d8dee004ab72c6191778064285ec0c", + release_date="2023-07-27", + languages=["eng_Latn"], + loader=None, + n_parameters=109482752, + 
memory_usage=None, + max_tokens=512.0, + embed_dim=768, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/thenlper/gte-base", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from=None, + superseded_by=None, +) +thenlper__gte_large = ModelMeta( + name="thenlper/gte-large", + revision="4bef63f39fcc5e2d6b0aae83089f307af4970164", + release_date="2023-07-27", + languages=["eng_Latn"], + loader=None, + n_parameters=335142400, + memory_usage=None, + max_tokens=512.0, + embed_dim=1024, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/thenlper/gte-large", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from=None, + superseded_by=None, +) +thenlper__gte_small = ModelMeta( + name="thenlper/gte-small", + revision="17e1f347d17fe144873b1201da91788898c639cd", + release_date="2023-07-27", + languages=["eng_Latn"], + loader=None, + n_parameters=33360512, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/thenlper/gte-small", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from=None, + superseded_by=None, +) +OrlikB__KartonBERT_USE_base_v1 = ModelMeta( + name="OrlikB/KartonBERT-USE-base-v1", + revision="1f59dd58fe57995c0e867d5e29f03763eae99645", + release_date="2024-09-30", + languages=["pol_Latn"], + loader=None, + n_parameters=103705344, + memory_usage=None, + max_tokens=512.0, + embed_dim=768, + license="gpl-3.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + 
reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="KartonBERT-USE-base-v1", + superseded_by=None, +) +OrlikB__st_polish_kartonberta_base_alpha_v1 = ModelMeta( + name="OrlikB/st-polish-kartonberta-base-alpha-v1", + revision="5590a0e2d7bb43674e44d7076b3ff157f7d4a1cb", + release_date="2023-11-12", + languages=["pol_Latn"], + loader=None, + n_parameters=None, + memory_usage=None, + max_tokens=514.0, + embed_dim=768, + license="lgpl", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="st-polish-kartonberta-base-alpha-v1", + superseded_by=None, +) +sdadas__mmlw_e5_base = ModelMeta( + name="sdadas/mmlw-e5-base", + revision="f10628ed55b5ec400502aff439bd714a6da0af30", + release_date="2023-11-17", + languages=["pol_Latn"], + loader=None, + n_parameters=278043648, + memory_usage=None, + max_tokens=514.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/sdadas/mmlw-e5-base", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="intfloat/multilingual-e5-base", + superseded_by=None, +) +dwzhu__e5_base_4k = ModelMeta( + name="dwzhu/e5-base-4k", + revision="1b5664b8cb2bccd8c309429b7bfe5864402e8fbc", + release_date="2024-03-28", + languages=["eng_Latn"], + loader=None, + n_parameters=None, + memory_usage=None, + max_tokens=4096.0, + embed_dim=None, + license="mit", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/dwzhu/e5-base-4k", + similarity_fn_name="cosine", + 
use_instructions=None, + training_datasets=None, + adapted_from="/mnt/default/longembed/models/intfloat/e5-base-v2", + superseded_by=None, +) +sdadas__mmlw_e5_large = ModelMeta( + name="sdadas/mmlw-e5-large", + revision="5c143fb045ebed664fd85b43fc45155999eb110f", + release_date="2023-11-17", + languages=["pol_Latn"], + loader=None, + n_parameters=559890432, + memory_usage=None, + max_tokens=514.0, + embed_dim=1024, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/sdadas/mmlw-e5-large", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="intfloat/multilingual-e5-large", + superseded_by=None, +) +sdadas__mmlw_e5_small = ModelMeta( + name="sdadas/mmlw-e5-small", + revision="ff1298cb6d997f18b794d2f3d73cad2ba2ad739a", + release_date="2023-11-17", + languages=["pol_Latn"], + loader=None, + n_parameters=117653760, + memory_usage=None, + max_tokens=512.0, + embed_dim=384, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/sdadas/mmlw-e5-small", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="intfloat/multilingual-e5-small", + superseded_by=None, +) +sdadas__mmlw_roberta_base = ModelMeta( + name="sdadas/mmlw-roberta-base", + revision="0ac7f23f6c96af601fa6a17852bd08d5136d6365", + release_date="2023-11-17", + languages=["pol_Latn"], + loader=None, + n_parameters=124442880, + memory_usage=None, + max_tokens=514.0, + embed_dim=768, + license="apache-2.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/sdadas/mmlw-roberta-base", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + adapted_from="sdadas/polish-roberta-base-v2", + 
superseded_by=None,
+)
+sdadas__mmlw_roberta_large = ModelMeta(
+    name="sdadas/mmlw-roberta-large",
+    revision="b8058066a8de32d0737b3cd82d8b4f4108745af9",
+    release_date="2023-11-17",
+    languages=["pol_Latn"],
+    loader=None,
+    n_parameters=434961408,
+    memory_usage=None,
+    max_tokens=514.0,
+    embed_dim=1024,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/sdadas/mmlw-roberta-large",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="sdadas/polish-roberta-large-v2",
+    superseded_by=None,
+)
+izhx__udever_bloom_1b1 = ModelMeta(
+    name="izhx/udever-bloom-1b1",
+    revision="7bf1ee29878cb040b2708a691aa4b61f27eaa252",
+    release_date="2023-10-24",
+    languages=[
+        "aka_Latn",
+        "ara_Arab",
+        "asm_Beng",
+        "bam_Latn",
+        "ben_Beng",
+        "cat_Latn",
+        "eng_Latn",
+        "spa_Latn",
+        "eus_Latn",
+        "fon_Latn",
+        "fra_Latn",
+        "guj_Gujr",
+        "hin_Deva",
+        "ind_Latn",
+        "ibo_Latn",
+        "kik_Latn",
+        "kan_Knda",
+        "lug_Latn",
+        "lin_Latn",
+        "mal_Mlym",
+        "mar_Deva",
+        "nep_Deva",
+        "nso_Latn",
+        "nya_Latn",
+        "ori_Orya",
+        "pan_Guru",
+        "por_Latn",
+        "run_Latn",
+        "kin_Latn",
+        "sna_Latn",
+        "sot_Latn",
+        "swa_Latn",
+        "tam_Taml",
+        "tel_Telu",
+        "tsn_Latn",
+        "tso_Latn",
+        "tum_Latn",
+        "twi_Latn",
+        "urd_Arab",
+        "vie_Latn",
+        "wol_Latn",
+        "xho_Latn",
+        "yor_Latn",
+        "zho_Hans",
+        "zul_Latn",
+    ],
+    loader=None,
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=None,
+    embed_dim=None,
+    license="bigscience-bloom-rail-1.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/izhx/udever-bloom-1b1",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="bigscience/bloom-1b1",
+    superseded_by=None,
+)
+izhx__udever_bloom_3b = ModelMeta(
+    name="izhx/udever-bloom-3b",
+    revision="4edd8affe80ca89ba0f6b6ba4103fc7f25fc57b2",
+    release_date="2023-10-24",
+    languages=[
+        "aka_Latn",
+        "ara_Arab",
+        "asm_Beng",
+        "bam_Latn",
+        "ben_Beng",
+        "cat_Latn",
+        "eng_Latn",
+        "spa_Latn",
+        "eus_Latn",
+        "fon_Latn",
+        "fra_Latn",
+        "guj_Gujr",
+        "hin_Deva",
+        "ind_Latn",
+        "ibo_Latn",
+        "kik_Latn",
+        "kan_Knda",
+        "lug_Latn",
+        "lin_Latn",
+        "mal_Mlym",
+        "mar_Deva",
+        "nep_Deva",
+        "nso_Latn",
+        "nya_Latn",
+        "ori_Orya",
+        "pan_Guru",
+        "por_Latn",
+        "run_Latn",
+        "kin_Latn",
+        "sna_Latn",
+        "sot_Latn",
+        "swa_Latn",
+        "tam_Taml",
+        "tel_Telu",
+        "tsn_Latn",
+        "tso_Latn",
+        "tum_Latn",
+        "twi_Latn",
+        "urd_Arab",
+        "vie_Latn",
+        "wol_Latn",
+        "xho_Latn",
+        "yor_Latn",
+        "zho_Hans",
+        "zul_Latn",
+    ],
+    loader=None,
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=None,
+    embed_dim=None,
+    license="bigscience-bloom-rail-1.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/izhx/udever-bloom-3b",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="bigscience/bloom-3b",
+    superseded_by=None,
+)
+izhx__udever_bloom_560m = ModelMeta(
+    name="izhx/udever-bloom-560m",
+    revision="b2a723e355946ec5a5c5fbed3459766627ded2bb",
+    release_date="2023-10-24",
+    languages=[
+        "aka_Latn",
+        "ara_Arab",
+        "asm_Beng",
+        "bam_Latn",
+        "ben_Beng",
+        "cat_Latn",
+        "eng_Latn",
+        "spa_Latn",
+        "eus_Latn",
+        "fon_Latn",
+        "fra_Latn",
+        "guj_Gujr",
+        "hin_Deva",
+        "ind_Latn",
+        "ibo_Latn",
+        "kik_Latn",
+        "kan_Knda",
+        "lug_Latn",
+        "lin_Latn",
+        "mal_Mlym",
+        "mar_Deva",
+        "nep_Deva",
+        "nso_Latn",
+        "nya_Latn",
+        "ori_Orya",
+        "pan_Guru",
+        "por_Latn",
+        "run_Latn",
+        "kin_Latn",
+        "sna_Latn",
+        "sot_Latn",
+        "swa_Latn",
+        "tam_Taml",
+        "tel_Telu",
+        "tsn_Latn",
+        "tso_Latn",
+        "tum_Latn",
+        "twi_Latn",
+        "urd_Arab",
+        "vie_Latn",
+        "wol_Latn",
+        "xho_Latn",
+        "yor_Latn",
+        "zho_Hans",
+        "zul_Latn",
+    ],
+    loader=None,
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=None,
+    embed_dim=None,
+    license="bigscience-bloom-rail-1.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/izhx/udever-bloom-560m",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="bigscience/bloom-560m",
+    superseded_by=None,
+)
+izhx__udever_bloom_7b1 = ModelMeta(
+    name="izhx/udever-bloom-7b1",
+    revision="18e8d3e6dbd94868584877f2e72a105a17df22ef",
+    release_date="2023-10-24",
+    languages=[
+        "aka_Latn",
+        "ara_Arab",
+        "asm_Beng",
+        "bam_Latn",
+        "ben_Beng",
+        "cat_Latn",
+        "eng_Latn",
+        "spa_Latn",
+        "eus_Latn",
+        "fon_Latn",
+        "fra_Latn",
+        "guj_Gujr",
+        "hin_Deva",
+        "ind_Latn",
+        "ibo_Latn",
+        "kik_Latn",
+        "kan_Knda",
+        "lug_Latn",
+        "lin_Latn",
+        "mal_Mlym",
+        "mar_Deva",
+        "nep_Deva",
+        "nso_Latn",
+        "nya_Latn",
+        "ori_Orya",
+        "pan_Guru",
+        "por_Latn",
+        "run_Latn",
+        "kin_Latn",
+        "sna_Latn",
+        "sot_Latn",
+        "swa_Latn",
+        "tam_Taml",
+        "tel_Telu",
+        "tsn_Latn",
+        "tso_Latn",
+        "tum_Latn",
+        "twi_Latn",
+        "urd_Arab",
+        "vie_Latn",
+        "wol_Latn",
+        "xho_Latn",
+        "yor_Latn",
+        "zho_Hans",
+        "zul_Latn",
+    ],
+    loader=None,
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=None,
+    embed_dim=None,
+    license="bigscience-bloom-rail-1.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/izhx/udever-bloom-7b1",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="bigscience/bloom-7b1",
+    superseded_by=None,
+)
+avsolatorio__GIST_Embedding_v0 = ModelMeta(
+    name="avsolatorio/GIST-Embedding-v0",
+    revision="bf6b2e55e92f510a570ad4d7d2da2ec8cd22590c",
+    release_date="2024-01-31",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=109482240,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=768,
+    license="mit",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+avsolatorio__GIST_all_MiniLM_L6_v2 = ModelMeta(
+    name="avsolatorio/GIST-all-MiniLM-L6-v2",
+    revision="ea89dfad053bba14677bb784a4269898abbdce44",
+    release_date="2024-02-03",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=22713216,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=384,
+    license="mit",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+avsolatorio__GIST_large_Embedding_v0 = ModelMeta(
+    name="avsolatorio/GIST-large-Embedding-v0",
+    revision="7831200e2f7819b994490c091cf3258a2b821f0c",
+    release_date="2024-02-14",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=335141888,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=1024,
+    license="mit",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+avsolatorio__GIST_small_Embedding_v0 = ModelMeta(
+    name="avsolatorio/GIST-small-Embedding-v0",
+    revision="d6c4190f9e01b9994dc7cac99cf2f2b85cfb57bc",
+    release_date="2024-02-03",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=33360000,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=384,
+    license="mit",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
+    name="bigscience/sgpt-bloom-7b1-msmarco",
+    revision="dc579f3d2d5a0795eba2049e16c3e36c74007ad3",
+    release_date="2022-08-26",
+    languages=None,
+    loader=None,
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=None,
+    embed_dim=4096,
+    license=None,
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,  # "_name_or_path" was a local path, not a Hub id
+    superseded_by=None,
+)
+aari1995__German_Semantic_STS_V2 = ModelMeta(
+    name="aari1995/German_Semantic_STS_V2",
+    revision="22912542b0ec7a7ef369837e28ffe6352a27afc9",
+    release_date="2022-11-17",
+    languages=["deu_Latn"],
+    loader=None,
+    n_parameters=335736320,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=1024,
+    license=None,
+    open_weights=True,
+    public_training_data=True,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/aari1995/German_Semantic_STS_V2",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets={"stsb_multi_mt": ["train"]},
+    adapted_from=None,  # "_name_or_path" was a local path, not a Hub id
+    superseded_by=None,
+)
+abhinand__MedEmbed_small_v0_1 = ModelMeta(
+    name="abhinand/MedEmbed-small-v0.1",
+    revision="40a5850d046cfdb56154e332b4d7099b63e8d50e",
+    release_date="2024-10-20",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=33360000,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=384,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=True,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets={
+        "MedicalQARetrieval": ["train"],
+        "NFCorpus": ["train"],
+        "PublicHealthQA": ["train"],
+        "TRECCOVID": ["train"],
+        "ArguAna": ["train"],
+    },
+    adapted_from=None,  # "_name_or_path" was a local path, not a Hub id
+    superseded_by=None,
+)
+avsolatorio__NoInstruct_small_Embedding_v0 = ModelMeta(
+    name="avsolatorio/NoInstruct-small-Embedding-v0",
+    revision="b38747000553d8268915c95a55fc87e707c9aadd",
+    release_date="2024-05-01",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=33360000,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=384,
+    license="mit",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+brahmairesearch__slx_v0_1 = ModelMeta(
+    name="brahmairesearch/slx-v0.1",
+    revision="688c83fd1a7f34b25575a2bc26cfd87c11b4ce71",
+    release_date="2024-08-13",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=22713216,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=384,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/brahmairesearch/slx-v0.1",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+deepfile__embedder_100p = ModelMeta(
+    name="deepfile/embedder-100p",
+    revision="aa02f08f11517977fbcdc94dc9dbf9a1ca152d9b",
+    release_date="2023-07-24",
+    languages=None,
+    loader=None,
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=514.0,
+    embed_dim=768,
+    license=None,
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/deepfile/embedder-100p",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+    superseded_by=None,
+)
+deepvk__USER_bge_m3 = ModelMeta(
+    name="deepvk/USER-bge-m3",
+    revision="0cc6cfe48e260fb0474c753087a69369e88709ae",
+    release_date="2024-07-05",
+    languages=["rus_Cyrl"],
+    loader=None,
+    n_parameters=359026688,
+    memory_usage=None,
+    max_tokens=8194.0,
+    embed_dim=1024,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=True,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/deepvk/USER-bge-m3",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets={
+        "deepvk/ru-HNP": ["train"],
+        "deepvk/ru-WANLI": ["train"],
+        "Shitao/bge-m3-data": ["train"],
+        "RussianNLP/russian_super_glue": ["train"],
+        "reciTAL/mlsum": ["train"],
+        "Milana/russian_keywords": ["train"],
+        "IlyaGusev/gazeta": ["train"],
+        "d0rj/gsm8k-ru": ["train"],
+        "bragovo/dsum_ru": ["train"],
+        "CarlBrendt/Summ_Dialog_News": ["train"],
+    },
+    adapted_from=None,  # "_name_or_path" was the repo's own name
+    superseded_by=None,
+)
+infgrad__stella_base_en_v2 = ModelMeta(
+    name="infgrad/stella-base-en-v2",
+    revision="c9e80ff9892d80b39dc54e30a7873f91ea161034",
+    release_date="2023-10-19",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=None,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=None,
+    license="mit",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/infgrad/stella-base-en-v2",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+malenia1__ternary_weight_embedding = ModelMeta(
+    name="malenia1/ternary-weight-embedding",
+    revision="a1208fb7f646647bb62639fd2e1eb6cc2ef3738e",
+    release_date="2024-10-23",
+    languages=None,
+    loader=None,
+    n_parameters=98688000,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=1024,
+    license=None,
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/malenia1/ternary-weight-embedding",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,  # "_name_or_path" was the repo's own name
+    superseded_by=None,
+)
+omarelshehy__arabic_english_sts_matryoshka = ModelMeta(
+    name="omarelshehy/arabic-english-sts-matryoshka",
+    revision="763d116fbe8bf7883c64635c862feeaa3768bb64",
+    release_date="2024-10-13",
+    languages=["ara_Arab", "eng_Latn"],
+    loader=None,
+    n_parameters=559890432,
+    memory_usage=None,
+    max_tokens=514.0,
+    embed_dim=1024,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="FacebookAI/xlm-roberta-large",
+    superseded_by=None,
+)
+openbmb__MiniCPM_Embedding = ModelMeta(
+    name="openbmb/MiniCPM-Embedding",
+    revision="c0cb2de33fb366e17c30f9d53142ff11bc18e049",
+    release_date="2024-09-04",
+    languages=["zho_Hans", "eng_Latn"],
+    loader=None,
+    n_parameters=2724880896,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=2304,
+    license=None,
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/openbmb/MiniCPM-Embedding",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+)
+shibing624__text2vec_base_multilingual = ModelMeta(
+    name="shibing624/text2vec-base-multilingual",
+    revision="6633dc49e554de7105458f8f2e96445c6598e9d1",
+    release_date="2023-06-22",
+    languages=[
+        "zho_Hans",
+        "eng_Latn",
+        "deu_Latn",
+        "fra_Latn",
+        "ita_Latn",
+        "nld_Latn",
+        "por_Latn",
+        "pol_Latn",
+        "rus_Cyrl",
+    ],
+    loader=None,
+    n_parameters=117654272,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=384,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=True,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/shibing624/text2vec-base-multilingual",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets={"shibing624/nli-zh-all": ["train"]},
+    adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    superseded_by=None,
+)
+silma_ai__silma_embeddding_matryoshka_v0_1 = ModelMeta(
+    name="silma-ai/silma-embeddding-matryoshka-v0.1",
+    revision="a520977a9542ebdb8a7206df6b7ff6977f1886ea",
+    release_date="2024-10-12",
+    languages=["ara_Arab", "eng_Latn"],
+    loader=None,
+    n_parameters=135193344,
+    memory_usage=None,
+    max_tokens=512.0,
+    embed_dim=768,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,  # "_name_or_path" was a local path, not a Hub id
+    superseded_by=None,
+)
+zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta(
+    name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+    revision="3e6076bdc2ff592a2f95fbc04570e51db5aa0c0c",
+    release_date="2024-08-30",
+    languages=["eng_Latn"],
+    loader=None,
+    n_parameters=7110660096,
+    memory_usage=None,
+    max_tokens=32768.0,
+    embed_dim=4096,
+    license="mit",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from="intfloat/e5-mistral-7b-instruct",
+    superseded_by=None,
+)
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index 7418ee98f..f1b90f6c0 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -22,6 +22,7 @@
     gte_models,
     jina_models,
     llm2vec_models,
+    misc_models,
     model2vec_models,
     mxbai_models,
     nomic_models,
@@ -53,6 +54,7 @@
     llm2vec_models,
     mxbai_models,
     model2vec_models,
+    misc_models,
     nomic_models,
     openai_models,
     ru_sentence_models,
diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py
new file mode 100644
index 000000000..a96604446
--- /dev/null
+++ b/scripts/generate_metadata.py
@@ -0,0 +1,294 @@
+from __future__ import annotations
+
+import json
+import warnings
+from pathlib import Path
+
+import iso639
+from huggingface_hub import HfApi, ModelCard, hf_hub_download
+from tqdm import tqdm
+
+from mteb.model_meta import ModelMeta
+
+# Hugging Face Hub repositories to generate ModelMeta entries for.
+to_keep = [
+    "Haon-Chen/speed-embedding-7b-instruct",
+    "Gameselo/STS-multilingual-mpnet-base-v2",
+    "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1",
+    "HIT-TMG/KaLM-embedding-multilingual-mini-v1",
+    "Hum-Works/lodestone-base-4096-v1",
+    "Jaume/gemma-2b-embeddings",
+    "BeastyZ/e5-R-mistral-7b",
+    "Lajavaness/bilingual-embedding-base",
+    "Lajavaness/bilingual-embedding-large",
+    "Lajavaness/bilingual-embedding-small",
+    "Mihaiii/Bulbasaur",
+    "Mihaiii/Ivysaur",
+    "Mihaiii/Squirtle",
+    "Mihaiii/Venusaur",
+    "Mihaiii/Wartortle",
+    "Mihaiii/gte-micro",
+    "Mihaiii/gte-micro-v4",
+    "OrdalieTech/Solon-embeddings-large-0.1",
+    "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
+    "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
+    "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
+    "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
+    "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
+    "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka",
+    "consciousAI/cai-lunaris-text-embeddings",
+    "consciousAI/cai-stellaris-text-embeddings",
+    "manu/bge-m3-custom-fr",
+    "manu/sentence_croissant_alpha_v0.2",
+    "manu/sentence_croissant_alpha_v0.3",
+    "manu/sentence_croissant_alpha_v0.4",
+    "thenlper/gte-base",
+    "thenlper/gte-large",
+    "thenlper/gte-small",
+    "OrlikB/KartonBERT-USE-base-v1",
+    "OrlikB/st-polish-kartonberta-base-alpha-v1",
+    "sdadas/mmlw-e5-base",  # some models are monolingual adaptations of other models (I would include them for now)
+    "dwzhu/e5-base-4k",  # e.g. this is a long-document adaptation of e5
+    "sdadas/mmlw-e5-large",
+    "sdadas/mmlw-e5-small",
+    "sdadas/mmlw-roberta-base",
+    "sdadas/mmlw-roberta-large",
+    "izhx/udever-bloom-1b1",
+    "izhx/udever-bloom-3b",
+    "izhx/udever-bloom-560m",
+    "izhx/udever-bloom-7b1",
+    "avsolatorio/GIST-Embedding-v0",
+    "avsolatorio/GIST-all-MiniLM-L6-v2",
+    "avsolatorio/GIST-large-Embedding-v0",
+    "avsolatorio/GIST-small-Embedding-v0",
+    "bigscience/sgpt-bloom-7b1-msmarco",
+    "aari1995/German_Semantic_STS_V2",
+    "abhinand/MedEmbed-small-v0.1",
+    "avsolatorio/NoInstruct-small-Embedding-v0",
+    "brahmairesearch/slx-v0.1",
+    "deepfile/embedder-100p",
+    "deepvk/USER-bge-m3",
+    "infgrad/stella-base-en-v2",
+    "malenia1/ternary-weight-embedding",
+    "omarelshehy/arabic-english-sts-matryoshka",
+    "openbmb/MiniCPM-Embedding",
+    "shibing624/text2vec-base-multilingual",
+    "silma-ai/silma-embeddding-matryoshka-v0.1",
+    "zeta-alpha-ai/Zeta-Alpha-E5-Mistral",
+]
+
+# Script code appended to each three-letter language code by convert_code().
+lang_to_script = {
+    "bam": "Latn",
+    "zul": "Latn",
+    "tsn": "Latn",
+    "rus": "Cyrl",
+    "mar": "Deva",
+    "ori": "Orya",
+    "swa": "Latn",
+    "vie": "Latn",
+    "nld": "Latn",
+    "kan": "Knda",
+    "yor": "Latn",
+    "urd": "Arab",
+    "guj": "Gujr",
+    "eng": "Latn",
+    "tso": "Latn",
+    "zho": "Hans",  # Can also be "Hant" depending on region
+    "deu": "Latn",
+    "sna": "Latn",
+    "nso": "Latn",
+    "pol": "Latn",
+    "sot": "Latn",
+    "mal": "Mlym",
+    "cat": "Latn",
+    "lug": "Latn",
+    "spa": "Latn",
+    "wol": "Latn",
+    "tum": "Latn",
+    "xho": "Latn",
+    "fra": "Latn",
+    "tam": "Taml",
+    "pan": "Guru",
+    "twi": "Latn",
+    "tel": "Telu",
+    "ibo": "Latn",
+    "kik": "Latn",
+    "run": "Latn",
+    "hin": "Deva",
+    "ben": "Beng",
+    "fon": "Latn",
+    "ita": "Latn",
+    "nya": "Latn",
+    "aka": "Latn",
+    "por": "Latn",
+    "asm": "Beng",
+    "eus": "Latn",
+    "lin": "Latn",
+    "nep": "Deva",
+    "kin": "Latn",
+    "ind": "Latn",
+    "ara": "Arab",
+}
+
+
+def convert_code(code: str) -> str | None:
+    """Convert a model-card language code to ``<iso639-3>_<script>`` form.
+
+    Returns None, after printing the reason, when ``iso639`` cannot match the code
+    or the matched language has no entry in ``lang_to_script``.
+    """
+    try:
+        lang_code = iso639.Language.match(code).part3
+        script = lang_to_script[lang_code]
+        return f"{lang_code}_{script}"
+    except Exception as e:
+        print(f"Couldn't convert {code}, reason: {e}")
+        return None
+
+
+api = HfApi()
+
+
+def _load_hub_json(model_name: str, filename: str) -> dict:
+    """Download ``filename`` from the ``model_name`` repo and parse it as JSON."""
+    file_path = hf_hub_download(repo_id=model_name, filename=filename)
+    with open(file_path, encoding="utf-8") as in_file:
+        return json.loads(in_file.read())
+
+
+def get_embedding_dimensions(model_name: str) -> int | None:
+    """Return ``word_embedding_dimension`` from ``1_Pooling/config.json``, or None."""
+    try:
+        pooling_config = _load_hub_json(model_name, "1_Pooling/config.json")
+        return pooling_config.get("word_embedding_dimension", None)
+    except Exception as e:
+        print(f"Couldn't get embedding size for {model_name}, reason: {e}")
+        return None
+
+
+def get_max_token(model_name: str) -> int | None:
+    """Return ``max_position_embeddings`` from the model's ``config.json``, or None."""
+    try:
+        config = _load_hub_json(model_name, "config.json")
+        return config.get("max_position_embeddings", None)
+    except Exception as e:
+        print(f"Couldn't get max token count for {model_name}, reason: {e}")
+        return None
+
+
+def _is_hub_model_id(candidate: object, model_name: str) -> bool:
+    """Return True if ``candidate`` looks like a Hub id other than ``model_name``.
+
+    ``_name_or_path`` can hold a directory on the author's machine or the
+    repository's own name; neither identifies a base model. This is a heuristic:
+    a relative path with a single separator still passes.
+    """
+    if not isinstance(candidate, str) or not candidate:
+        return False
+    if candidate in (model_name, model_name.split("/")[-1]):
+        return False  # self-reference, with or without the org prefix
+    if candidate.startswith(("/", ".", "~")) or "\\" in candidate:
+        return False  # absolute, relative, home-relative or Windows path
+    # Hub ids are "name" or "org/name"; more separators mean a nested local path.
+    return candidate.count("/") <= 1
+
+
+def get_base_model(model_name: str) -> str | None:
+    """Return the Hub id stored in ``_name_or_path`` of ``config.json``, or None.
+
+    None is returned when the value is missing, is not a usable Hub id (see
+    ``_is_hub_model_id``), or the config cannot be downloaded.
+    """
+    try:
+        config = _load_hub_json(model_name, "config.json")
+        base_model = config.get("_name_or_path", None)
+        if _is_hub_model_id(base_model, model_name):
+            return base_model
+        return None
+    except Exception as e:
+        print(f"Couldn't get base model for {model_name}, reason: {e}")
+        return None
+
+
+def model_meta_from_hf_hub(model_name: str) -> ModelMeta:
+    """Build a ModelMeta for ``model_name`` from its model card and repository info.
+
+    If anything fails, a warning is issued and a ModelMeta with only ``name`` set
+    (revision, languages and release_date are None) is returned instead.
+    """
+    try:
+        card = ModelCard.load(model_name)
+        card_data = card.data.to_dict()
+        frameworks = ["PyTorch"]
+        if card_data.get("library_name", None) == "sentence-transformers":
+            frameworks.append("Sentence Transformers")
+        languages = card_data.get("language", None)
+        if isinstance(languages, str):
+            languages = [languages]
+        if languages is not None:
+            converted = (convert_code(code) for code in languages)
+            languages = [code for code in converted if code is not None]
+        repo_info = api.repo_info(model_name)
+        revision = repo_info.sha
+        release_date = repo_info.created_at.strftime("%Y-%m-%d")
+        try:
+            n_parameters = repo_info.safetensors.total
+        except Exception as e:
+            print(f"Couldn't get model size for {model_name}, reason: {e}")
+            n_parameters = None
+        n_dimensions = get_embedding_dimensions(model_name)
+        datasets = card_data.get("datasets", None)
+        if isinstance(datasets, str):
+            datasets = [datasets]
+        if datasets is not None:
+            training_datasets = {ds: ["train"] for ds in datasets}
+        else:
+            training_datasets = None
+        return ModelMeta(
+            name=model_name,
+            revision=revision,
+            release_date=release_date,
+            languages=languages,
+            license=card_data.get("license", None),
+            framework=frameworks,
+            n_parameters=n_parameters,
+            public_training_data=bool(datasets),
+            adapted_from=get_base_model(model_name),
+            training_datasets=training_datasets,
+            open_weights=True,
+            superseded_by=None,
+            max_tokens=get_max_token(model_name),
+            embed_dim=n_dimensions,
+            similarity_fn_name="cosine",
+            reference=f"https://huggingface.co/{model_name}",
+        )
+    except Exception as e:
+        warnings.warn(f"Failed to extract metadata from model {model_name}: {e}.")
+        return ModelMeta(
+            name=model_name,
+            revision=None,
+            languages=None,
+            release_date=None,
+        )
+
+
+def code_from_meta(meta: ModelMeta) -> str:
+    """Render ``meta`` as a Python assignment to a variable named after the model."""
+    template = "{variable_name} = {meta}\n"
+    variable_name = meta.name.replace("/", "__").replace("-", "_").replace(".", "_")
+    return template.format(variable_name=variable_name, meta=repr(meta))
+
+
+def main():
+    """Write the generated entries to ``mteb/models/misc_models.py``."""
+    out_path = Path("mteb/models/misc_models.py")
+    with open(out_path, "w", encoding="utf-8") as out_file:
+        out_file.write("from mteb.model_meta import ModelMeta\n\n")
+        for model in tqdm(to_keep, desc="Generating metadata for all models."):
+            meta = model_meta_from_hf_hub(model)
+            out_file.write(code_from_meta(meta))
+
+
+if __name__ == "__main__":
+    main()