diff --git a/mteb-zh/mteb_zh/models.py b/mteb-zh/mteb_zh/models.py index 25dfdd4..f8178cf 100644 --- a/mteb-zh/mteb_zh/models.py +++ b/mteb-zh/mteb_zh/models.py @@ -1,3 +1,4 @@ +import json import os import time from enum import Enum @@ -6,6 +7,7 @@ import numpy as np import openai +import requests import torch from sentence_transformers import SentenceTransformer from tqdm import tqdm @@ -24,6 +26,7 @@ class ModelType(str, Enum): luotuo = 'luotuo' erlangshen = 'erlangshen' openai = 'openai' + minimax = 'minimax' azure = 'azure' @@ -63,6 +66,13 @@ def load_model(model_type: ModelType, model_id: str | None = None) -> MTEBModel: return ErLangShenModel(model_name='IDEA-CCNL/Erlangshen-SimCSE-110M-Chinese') else: return ErLangShenModel(model_name=model_id) + case ModelType.minimax: + if model_id is None: + return MiniMaxModel() + else: + if model_id not in {'db', 'query'}: + raise ValueError(f'Unknown model type: {model_id}') + return MiniMaxModel(embedding_type=model_id) case _: raise ValueError(f'Unknown model type: {model_type}') @@ -73,6 +83,25 @@ def generate_batch(data: Iterable[T], batch_size: int = 32) -> Generator[list[T] yield batch +class MiniMaxModel: + def __init__(self, embedding_type: str = 'db', group_id: str | None = None, api_key: str | None = None) -> None: + self.embedding_type = embedding_type + self.group_id = group_id or os.environ['MINIMAX_GROUP_ID'] + self.api_key = api_key or os.environ['MINIMAX_API_KEY'] + self.url = f'https://api.minimax.chat/v1/embeddings?GroupId={self.group_id}' + + def encode(self, sentences: list[str], batch_size: int = 32, **kwargs) -> list[np.ndarray]: + headers = {'Authorization': f'Bearer {self.api_key}', 'Content-Type': 'application/json'} + + embeddings = [] + for batch_sentence in tqdm(generate_batch(sentences, batch_size), total=len(sentences) // batch_size): + data = {'texts': batch_sentence, 'model': 'embo-01', 'type': 'db'} + response = requests.post(self.url, headers=headers, data=json.dumps(data)).json() + for embedding in response['vectors']: + embeddings.append(np.array(embedding)) + return embeddings + + class OpenAIModel: def __init__( self, diff --git a/mteb-zh/readme.md b/mteb-zh/readme.md index 364ff43..c7d4dd9 100644 --- a/mteb-zh/readme.md +++ b/mteb-zh/readme.md @@ -12,6 +12,10 @@ MTEB-zh 是一个使用 [MTEB](https://github.com/embeddings-benchmark/mteb) 框 - [x] [UER](https://huggingface.co/uer/sbert-base-chinese-nli) - [x] [ErLangShen](https://huggingface.co/IDEA-CCNL/Erlangshen-SimCSE-110M-Chinese) - [x] [openai](https://openai.com/blog/new-and-improved-embedding-model) +- [x] [minimax](https://api.minimax.chat/login) +- [x] [luotuo](https://github.com/LC1332/Luotuo-Text-Embedding) + +> luotuo 和 minimax 都是在实验和测试阶段,因此只是在接口上支持了这两个模型,但并未进行评测。 ## 评测