From 355b3b8f2a1fb8ee47af7490f1bf491a5cd7a6dd Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Fri, 29 Nov 2024 14:50:41 +0900
Subject: [PATCH 01/13] fix: add sentence trimming to OpenAIWrapper

---
 mteb/models/openai_models.py | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 50967e898..cd03a135d 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -5,6 +5,7 @@ from typing import Any
 
 import numpy as np
+import tiktoken
 
 from mteb.model_meta import ModelMeta
 from mteb.requires_package import requires_package
@@ -15,13 +16,22 @@
 
 
 class OpenAIWrapper(Wrapper):
-    def __init__(self, model_name: str, embed_dim: int | None = None, **kwargs) -> None:
+    def __init__(
+        self,
+        model_name: str,
+        max_seq_length: int,
+        tokenizer_name: str = "cl100k_base",
+        embed_dim: int | None = None,
+        **kwargs,
+    ) -> None:
         requires_package(self, "openai", "Openai text embedding")
         from openai import OpenAI
 
         self._client = OpenAI()
         self._model_name = model_name
         self._embed_dim = embed_dim
+        self._max_seq_length = max_seq_length
+        self._tokenizer_name = tokenizer_name
 
     def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
         requires_package(self, "openai", "Openai text embedding")
@@ -32,10 +42,22 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
                 "Reducing embedding size available only for text-embedding-3-* models"
             )
 
+        trimmed_sentences = []
+        for sentence in sentences:
+            encoding = tiktoken.get_encoding(self._tokenizer_name)
+            encoded_sentence = encoding.encode(sentence)
+            if len(encoded_sentence) > self._max_seq_length:
+                trimmed_sentence = encoding.decode(
+                    encoded_sentence[: self._max_seq_length]
+                )
+                trimmed_sentences.append(trimmed_sentence)
+            else:
+                trimmed_sentences.append(sentence)
+
         max_batch_size = 2048
         sublists = [
-            sentences[i : i + max_batch_size]
-            for i in range(0, len(sentences), max_batch_size)
+            trimmed_sentences[i : i + max_batch_size]
+            for i in range(0, len(trimmed_sentences), max_batch_size)
         ]
 
         all_embeddings = []

From 32fe482a3c849b068563864ffc276e9a9397a62f Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Fri, 29 Nov 2024 15:27:48 +0900
Subject: [PATCH 02/13] fix: import tiktoken library inside encode function

---
 mteb/models/openai_models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index cd03a135d..08ff7f592 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -5,7 +5,6 @@ from typing import Any
 
 import numpy as np
-import tiktoken
 
 from mteb.model_meta import ModelMeta
 from mteb.requires_package import requires_package
@@ -20,7 +19,7 @@ def __init__(
         self,
         model_name: str,
         max_seq_length: int,
-        tokenizer_name: str = "cl100k_base",
+        tokenizer_name: str = "cl100k_base",  # since all models use this tokenizer now
         embed_dim: int | None = None,
         **kwargs,
     ) -> None:
@@ -35,6 +34,7 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
         requires_package(self, "openai", "Openai text embedding")
+        import tiktoken
         from openai import NotGiven
 
         if self._model_name == "text-embedding-ada-002" and self._embed_dim is not None:

From f76e1c3e162098142098eed3356634aa1f8be9b5 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Fri, 29 Nov 2024 10:19:50 +0000
Subject: [PATCH 03/13] fix: check tokenizer library installed and update
 ModelMeta to pass tokenizer_name

---
 mteb/model_meta.py           |  2 ++
 mteb/models/openai_models.py | 12 ++++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index 4a8146b3d..0b7ac9258 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -53,6 +53,7 @@ class ModelMeta(BaseModel):
     Attributes:
         loader: the function that loads the model. If None it will just default to loading the model using the sentence transformer library.
         name: The name of the model, ideally the name on huggingface.
+        tokenizer_name: The name of the tokenizer used by the model.
         n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the number of parameters is not known (e.g. for proprietary models) or if the loader returns a SentenceTransformer model from which it can be derived.
         memory_usage: The amount of memory the model uses in GB. Can be None if the memory usage is not known (e.g. for proprietary models).
@@ -80,6 +81,7 @@ class ModelMeta(BaseModel):
 
     name: str | None
     revision: str | None
+    tokenizer_name: str | None
     release_date: STR_DATE | None
     languages: list[ISO_LANGUAGE_SCRIPT] | None
diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 6a6a4cd59..425902ed0 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -18,7 +18,7 @@ class OpenAIWrapper(Wrapper):
     def __init__(
         self,
         model_name: str,
-        max_seq_length: int,
+        max_tokens: int,
         tokenizer_name: str = "cl100k_base",  # since all models use this tokenizer now
         embed_dim: int | None = None,
         **kwargs,
@@ -29,11 +29,12 @@ def __init__(
         self._client = OpenAI()
         self._model_name = model_name
         self._embed_dim = embed_dim
-        self._max_seq_length = max_seq_length
+        self._max_tokens = max_tokens
         self._tokenizer_name = tokenizer_name
 
     def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
         requires_package(self, "openai", "Openai text embedding")
+        requires_package(self, "tiktoken", "Tiktoken package")
         import tiktoken
         from openai import NotGiven
 
@@ -46,9 +47,9 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
         for sentence in sentences:
             encoding = tiktoken.get_encoding(self._tokenizer_name)
             encoded_sentence = encoding.encode(sentence)
-            if len(encoded_sentence) > self._max_seq_length:
+            if len(encoded_sentence) > self._max_tokens:
                 trimmed_sentence = encoding.decode(
-                    encoded_sentence[: self._max_seq_length]
+                    encoded_sentence[: self._max_tokens]
                 )
                 trimmed_sentences.append(trimmed_sentence)
             else:
                 trimmed_sentences.append(sentence)
@@ -80,6 +81,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
 text_embedding_3_small = ModelMeta(
     name="openai/text-embedding-3-small",
     revision="1",
+    tokenizer_name="cl100k_base",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
     loader=partial(OpenAIWrapper, model_name="text-embedding-3-small"),
@@ -97,6 +99,7 @@
 text_embedding_3_large = ModelMeta(
     name="openai/text-embedding-3-large",
     revision="1",
+    tokenizer_name="cl100k_base",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
     loader=partial(OpenAIWrapper, model_name="text-embedding-3-large"),
@@ -111,6 +114,7 @@
 text_embedding_ada_002 = ModelMeta(
     name="openai/text-embedding-ada-002",
     revision="1",
+    tokenizer_name="cl100k_base",
     release_date="2022-12-15",
     languages=None,  # supported languages not specified
     loader=partial(OpenAIWrapper, model_name="text-embedding-ada-002"),

From 21a2937c826df86ddabecb03f867400fdfff477a Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Fri, 29 Nov 2024 10:25:45 +0000
Subject: [PATCH 04/13] fix: pass tokenizer_name, max_tokens to loader

---
 mteb/models/openai_models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 425902ed0..31c42819c 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -84,7 +84,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     tokenizer_name="cl100k_base",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
-    loader=partial(OpenAIWrapper, model_name="text-embedding-3-small"),
+    loader=partial(OpenAIWrapper, model_name="text-embedding-3-small", tokenizer_name="cl100k_base", max_tokens=8192),
     max_tokens=8191,
     embed_dim=1536,
     open_weights=False,
@@ -102,7 +102,7 @@
     tokenizer_name="cl100k_base",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
-    loader=partial(OpenAIWrapper, model_name="text-embedding-3-large"),
+    loader=partial(OpenAIWrapper, model_name="text-embedding-3-large", tokenizer_name="cl100k_base", max_tokens=8192),
     max_tokens=8191,
     embed_dim=3072,
     open_weights=False,
@@ -117,7 +117,7 @@
     tokenizer_name="cl100k_base",
     release_date="2022-12-15",
     languages=None,  # supported languages not specified
-    loader=partial(OpenAIWrapper, model_name="text-embedding-ada-002"),
+    loader=partial(OpenAIWrapper, model_name="text-embedding-ada-002", tokenizer_name="cl100k_base", max_tokens=8192),
     max_tokens=8191,
     embed_dim=1536,
     open_weights=False,

From 43e2463ae6efa3e57a9671fd5fa07bee858d6c71 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Fri, 29 Nov 2024 10:32:34 +0000
Subject: [PATCH 05/13] fix: make tokenizer_name None for default

---
 mteb/model_meta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index 0b7ac9258..db472d3bc 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -81,9 +81,9 @@ class ModelMeta(BaseModel):
 
     name: str | None
     revision: str | None
-    tokenizer_name: str | None
     release_date: STR_DATE | None
     languages: list[ISO_LANGUAGE_SCRIPT] | None
+    tokenizer_name: str | None = None
     loader: Callable[..., Encoder] | None = None
     n_parameters: int | None = None
     memory_usage: float | None = None

From d58c84bcfac0bb03ffad934d8239cc9ba87f4938 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Fri, 29 Nov 2024 10:52:55 +0000
Subject: [PATCH 06/13] fix: delete changes for ModelMeta

---
 mteb/model_meta.py           |  2 --
 mteb/models/openai_models.py | 28 +++++++++++++++++++---------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/mteb/model_meta.py b/mteb/model_meta.py
index db472d3bc..4a8146b3d 100644
--- a/mteb/model_meta.py
+++ b/mteb/model_meta.py
@@ -53,7 +53,6 @@ class ModelMeta(BaseModel):
     Attributes:
         loader: the function that loads the model. If None it will just default to loading the model using the sentence transformer library.
         name: The name of the model, ideally the name on huggingface.
-        tokenizer_name: The name of the tokenizer used by the model.
         n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the number of parameters is not known (e.g. for proprietary models) or if the loader returns a SentenceTransformer model from which it can be derived.
         memory_usage: The amount of memory the model uses in GB. Can be None if the memory usage is not known (e.g. for proprietary models).
@@ -83,7 +82,6 @@ class ModelMeta(BaseModel):
     revision: str | None
     release_date: STR_DATE | None
     languages: list[ISO_LANGUAGE_SCRIPT] | None
-    tokenizer_name: str | None = None
     loader: Callable[..., Encoder] | None = None
     n_parameters: int | None = None
     memory_usage: float | None = None
diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 31c42819c..72cf35c13 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -48,9 +48,7 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
             encoding = tiktoken.get_encoding(self._tokenizer_name)
             encoded_sentence = encoding.encode(sentence)
             if len(encoded_sentence) > self._max_tokens:
-                trimmed_sentence = encoding.decode(
-                    encoded_sentence[: self._max_tokens]
-                )
+                trimmed_sentence = encoding.decode(encoded_sentence[: self._max_tokens])
                 trimmed_sentences.append(trimmed_sentence)
             else:
                 trimmed_sentences.append(sentence)
@@ -81,10 +79,14 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
 text_embedding_3_small = ModelMeta(
     name="openai/text-embedding-3-small",
     revision="1",
-    tokenizer_name="cl100k_base",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
-    loader=partial(OpenAIWrapper, model_name="text-embedding-3-small", tokenizer_name="cl100k_base", max_tokens=8192),
+    loader=partial(
+        OpenAIWrapper,
+        model_name="text-embedding-3-small",
+        tokenizer_name="cl100k_base",
+        max_tokens=8192,
+    ),
     max_tokens=8191,
     embed_dim=1536,
     open_weights=False,
@@ -99,10 +101,14 @@
 text_embedding_3_large = ModelMeta(
     name="openai/text-embedding-3-large",
     revision="1",
-    tokenizer_name="cl100k_base",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
-    loader=partial(OpenAIWrapper, model_name="text-embedding-3-large", tokenizer_name="cl100k_base", max_tokens=8192),
+    loader=partial(
+        OpenAIWrapper,
+        model_name="text-embedding-3-large",
+        tokenizer_name="cl100k_base",
+        max_tokens=8192,
+    ),
     max_tokens=8191,
     embed_dim=3072,
     open_weights=False,
@@ -114,10 +120,14 @@
 text_embedding_ada_002 = ModelMeta(
     name="openai/text-embedding-ada-002",
     revision="1",
-    tokenizer_name="cl100k_base",
     release_date="2022-12-15",
     languages=None,  # supported languages not specified
-    loader=partial(OpenAIWrapper, model_name="text-embedding-ada-002", tokenizer_name="cl100k_base", max_tokens=8192),
+    loader=partial(
+        OpenAIWrapper,
+        model_name="text-embedding-ada-002",
+        tokenizer_name="cl100k_base",
+        max_tokens=8192,
+    ),
     max_tokens=8191,
     embed_dim=1536,
     open_weights=False,

From 278de9cfd520e97a2de0b46509aee4c17be0604a Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Sun, 1 Dec 2024 23:05:30 +0900
Subject: [PATCH 07/13] fix: fix revision to 2 for OpenAI models

---
 mteb/models/openai_models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 72cf35c13..557975670 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -78,7 +78,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
 text_embedding_3_small = ModelMeta(
     name="openai/text-embedding-3-small",
-    revision="1",
+    revision="2",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
     loader=partial(
@@ -100,7 +100,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
 )
 text_embedding_3_large = ModelMeta(
     name="openai/text-embedding-3-large",
-    revision="1",
+    revision="2",
     release_date="2024-01-25",
     languages=None,  # supported languages not specified
     loader=partial(
@@ -119,7 +119,7 @@
 )
 text_embedding_ada_002 = ModelMeta(
     name="openai/text-embedding-ada-002",
-    revision="1",
+    revision="2",
     release_date="2022-12-15",
     languages=None,  # supported languages not specified
     loader=partial(

From d1433dc32bf1caeedb40dd0b9f333d6bf9d7ca11 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Tue, 3 Dec 2024 15:22:27 +0900
Subject: [PATCH 08/13] fix: add docstring for OpenAIWrapper

---
 mteb/models/openai_models.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 557975670..8e4b28d65 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -23,6 +23,9 @@ def __init__(
         embed_dim: int | None = None,
         **kwargs,
     ) -> None:
+        """Wrapper for OpenAI's embedding API.
+        To handle documents larger than 8192 tokens, we truncate the document to the specified sequence length.
+        """
         requires_package(self, "openai", "Openai text embedding")
         from openai import OpenAI
@@ -42,7 +45,7 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
             logger.warning(
                 "Reducing embedding size available only for text-embedding-3-* models"
             )
-
+        
         trimmed_sentences = []

From 5cc69602277bf9d812f094898780b77fe7bfbaa0 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Tue, 3 Dec 2024 15:34:46 +0900
Subject: [PATCH 09/13] fix: lint

---
 mteb/leaderboard/app.py      | 1 -
 mteb/models/openai_models.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index c4e5e80ef..c51dc7a50 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -5,7 +5,6 @@ from pathlib import Path
 
 import gradio as gr
-import pandas as pd
 from gradio_rangeslider import RangeSlider
 
 import mteb
diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 8fbe1acfe..211e3ce74 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -45,7 +45,7 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
         logger.warning(
             "Reducing embedding size available only for text-embedding-3-* models"
         )
-        
+
         trimmed_sentences = []

From 23600ea2e6a717c4bee72f86ef6f6ef2e460d5e4 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Wed, 4 Dec 2024 15:08:14 +0900
Subject: [PATCH 10/13] feat: add openai optional dependency set

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 3ce2d4a5e..34ffc2b70 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,6 +61,7 @@ leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8"]
 flagembedding = ["FlagEmbedding"]
 jina = ["einops>=0.8.0"]
 flash_attention = ["flash-attn>=2.6.3"]
+openai = ["openai>=1.41.0", "tiktoken>=0.8.0"]
 
 
 [tool.coverage.report]

From 80bb95fe90f3aa24cd1d0a1d8f75d866691be7a6 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Wed, 4 Dec 2024 18:24:45 +0900
Subject: [PATCH 11/13] fix: add sleep for too many requests

---
 mteb/models/openai_models.py | 62 +++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index 211e3ce74..36188f7ed 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -19,7 +19,7 @@ def __init__(
         self,
         model_name: str,
         max_tokens: int,
-        tokenizer_name: str = "cl100k_base", # since all models use this tokenizer now
+        tokenizer_name: str = "cl100k_base",  # since all models use this tokenizer now
         embed_dim: int | None = None,
         **kwargs,
     ) -> None:
@@ -28,17 +28,23 @@ def __init__(
         """
         requires_package(self, "openai", "Openai text embedding")
         from openai import OpenAI
+        requires_package(self, "tiktoken", "Tiktoken package")
+        import tiktoken
 
         self._client = OpenAI()
         self._model_name = model_name
         self._embed_dim = embed_dim
         self._max_tokens = max_tokens
-        self._tokenizer_name = tokenizer_name
+        self._encoding = tiktoken.get_encoding(tokenizer_name)
+
+    def truncate_text_tokens(self, text):
+        """Truncate a string to have `max_tokens` according to the given encoding."""
+        truncated_sentence = self._encoding.encode(text)[:self._max_tokens]
+        return self._encoding.decode(truncated_sentence)
 
     def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
         requires_package(self, "openai", "Openai text embedding")
-        requires_package(self, "tiktoken", "Tiktoken package")
-        import tiktoken
+
         from openai import NotGiven
 
         if self._model_name == "text-embedding-ada-002" and self._embed_dim is not None:
@@ -48,11 +54,10 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
 
         trimmed_sentences = []
         for sentence in sentences:
-            encoding = tiktoken.get_encoding(self._tokenizer_name)
-            encoded_sentence = encoding.encode(sentence)
+            encoded_sentence = self._encoding.encode(sentence)
             if len(encoded_sentence) > self._max_tokens:
-                trimmed_sentence = encoding.decode(encoded_sentence[: self._max_tokens])
-                trimmed_sentences.append(trimmed_sentence)
+                truncated_sentence = self.truncate_text_tokens(sentence)
+                trimmed_sentences.append(truncated_sentence)
             else:
                 trimmed_sentences.append(sentence)
@@ -65,12 +70,34 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
 
         all_embeddings = []
 
         for sublist in sublists:
-            response = self._client.embeddings.create(
-                input=sublist,
-                model=self._model_name,
-                encoding_format="float",
-                dimensions=self._embed_dim or NotGiven(),
-            )
+            try:
+                response = self._client.embeddings.create(
+                    input=sublist,
+                    model=self._model_name,
+                    encoding_format="float",
+                    dimensions=self._embed_dim or NotGiven(),
+                )
+            except Exception as e:
+                # Sleep due to too many requests
+                logger.info("Sleeping for 10 seconds due to error: %s", e)
+                import time
+                time.sleep(10)
+                try:
+                    response = self._client.embeddings.create(
+                        input=sublist,
+                        model=self._model_name,
+                        encoding_format="float",
+                        dimensions=self._embed_dim or NotGiven(),
+                    )
+                except Exception as e:
+                    logger.info("Sleeping for 60 seconds due to error: %s", e)
+                    time.sleep(60)
+                    response = self._client.embeddings.create(
+                        input=sublist,
+                        model=self._model_name,
+                        encoding_format="float",
+                        dimensions=self._embed_dim or NotGiven(),
+                    )
             all_embeddings.extend(self._to_numpy(response))
 
         return np.array(all_embeddings)
@@ -126,7 +153,12 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     revision="2",
     release_date="2022-12-15",
     languages=None,  # supported languages not specified
-    loader=partial(OpenAIWrapper, model_name="text-embedding-ada-002"),
+    loader=partial(
+        OpenAIWrapper,
+        model_name="text-embedding-ada-002",
+        tokenizer_name="cl100k_base",
+        max_tokens=8192,
+    ),
     reference="https://openai.com/index/new-and-improved-embedding-model/",
     max_tokens=8191,
     embed_dim=1536,
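Review note on PATCH 11: the retry logic above hard-codes two nested try/except blocks with fixed 10- and 60-second sleeps. The same idea reads more simply as a single backoff loop; the sketch below is illustrative only (the helper name and its parameters are not part of the patch):

    import logging
    import time

    logger = logging.getLogger(__name__)

    def create_embeddings_with_retry(
        client, model_name, inputs, dimensions, max_attempts=4, base_delay=10.0
    ):
        """Retry an embeddings request with exponential backoff (10s, 20s, 40s, ...)."""
        for attempt in range(max_attempts):
            try:
                return client.embeddings.create(
                    input=inputs,
                    model=model_name,
                    encoding_format="float",
                    dimensions=dimensions,
                )
            except Exception as exc:  # ideally narrowed to openai.RateLimitError
                if attempt == max_attempts - 1:
                    raise  # give up after the final attempt
                delay = base_delay * (2 ** attempt)
                logger.info("Embedding request failed (%s); retrying in %.0f s", exc, delay)
                time.sleep(delay)

Note also that the openai v1 client retries transient failures itself and accepts a max_retries option at construction time, which may make manual retries partly redundant.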
reference="https://openai.com/index/new-and-improved-embedding-model/", max_tokens=8191, embed_dim=1536, From 881913d2605c65a013917673af95aa895e82c0ad Mon Sep 17 00:00:00 2001 From: yjoonjang Date: Wed, 4 Dec 2024 18:27:01 +0900 Subject: [PATCH 12/13] fix: add lint --- evaluate.py | 210 +++++++++++++++++++++++++++++++++++ mteb/models/openai_models.py | 10 +- 2 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 evaluate.py diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 000000000..8eff24cd3 --- /dev/null +++ b/evaluate.py @@ -0,0 +1,210 @@ +# pip install git+https://github.com/taeminlee/mteb.git@ontheit 후 사용 +# streamlit run leaderboard.py 로 결과 확인 + +"""Example script for benchmarking all datasets constituting the MTEB Korean leaderboard & average scores""" + +from __future__ import annotations + +import argparse +import logging +import os +import traceback +from multiprocessing import Process, current_process + +import numpy as np +import torch +from sentence_transformers import SentenceTransformer +from sentence_transformers.models import StaticEmbedding + +# from dotenv import load_dotenv +from setproctitle import setproctitle + +import mteb +from mteb import MTEB, get_tasks +from mteb.encoder_interface import PromptType +from mteb.models.instruct_wrapper import instruct_wrapper +from mteb.models.openai_models import OpenAIWrapper +from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper +from mteb.requires_package import requires_package + +# import tiktoken + +logger = logging.getLogger("main") + + +class CustomOpenAIWrapper(OpenAIWrapper): + def encode(self, sentences: list[str], **kwargs) -> np.ndarray: + requires_package(self, "openai", "Openai text embedding") + from openai import NotGiven + + if self._model_name == "text-embedding-ada-002" and self._embed_dim is not None: + logger.warning( + "Reducing embedding size available only for text-embedding-3-* models" + ) + + trimmed_sentences = [] + for sentence in sentences: + encoding = tiktoken.get_encoding("cl100k_base") + encoded_sentence = encoding.encode(sentence) + if len(encoded_sentence) > 8191: + trimmed_sentence = encoding.decode(encoded_sentence[:8191]) + trimmed_sentences.append(trimmed_sentence) + else: + trimmed_sentences.append(sentence) + + max_batch_size = 2048 + sublists = [ + trimmed_sentences[i : i + max_batch_size] + for i in range(0, len(trimmed_sentences), max_batch_size) + ] + + all_embeddings = [] + + for sublist in sublists: + response = self._client.embeddings.create( + input=sublist, + model=self._model_name, + encoding_format="float", + dimensions=self._embed_dim or NotGiven(), + ) + all_embeddings.extend(self._to_numpy(response)) + + return np.array(all_embeddings) + + +# load_dotenv() + +parser = argparse.ArgumentParser(description="Extract contexts") +parser.add_argument("--quantize", default=False, type=bool, help="quantize embeddings") +args = parser.parse_args() + +logging.basicConfig(level=logging.INFO) + +logger = logging.getLogger("main") + +TASK_LIST_CLASSIFICATION = [] + +TASK_LIST_CLUSTERING = [] + +TASK_LIST_PAIR_CLASSIFICATION = [] + +TASK_LIST_RERANKING = [] + +TASK_LIST_RETRIEVAL = [ + # "Ko-StrategyQA", + "AutoRAGRetrieval", + # "MIRACLRetrieval", + # "PublicHealthQA", + # "BelebeleRetrieval", + # "MrTidyRetrieval", + # "MultiLongDocRetrieval", + # "XPQARetrieval" +] + +TASK_LIST_STS = [] + +TASK_LIST = ( + TASK_LIST_CLASSIFICATION + + TASK_LIST_CLUSTERING + + TASK_LIST_PAIR_CLASSIFICATION + + TASK_LIST_RERANKING + + 
TASK_LIST_RETRIEVAL + + TASK_LIST_STS +) + + +model_names = [ + "BAAI/bge-m3/sparse", # 8192 +] + + +def evaluate_model(model_name, gpu_id): + try: + # Set the environment variable for the specific GPU + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + + model = None + if not os.path.exists(model_name): + if "m2v" in model_name: + static_embedding = StaticEmbedding.from_model2vec(model_name) + model = SentenceTransformer(modules=[static_embedding]) + else: + if model_name == "nlpai-lab/KoE5": + # mE5 기반의 모델이므로, 해당 프롬프트를 추가시킵니다. + model_prompts = { + PromptType.query.value: "query: ", + PromptType.passage.value: "passage: ", + } + model = SentenceTransformerWrapper( + model=model_name, model_prompts=model_prompts + ) + elif model_name == "BAAI/bge-multilingual-gemma2": + instruction_template = "{instruction}\n" + model = instruct_wrapper( + model_name_or_path=model_name, + instruction_template=instruction_template, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype=torch.float16, + normalized=True, + ) + elif "text-embedding-3" in model_name: + model = CustomOpenAIWrapper(model_name) + else: + model = mteb.get_model(model_name) + # from mteb.models.bge_models import BGEM3Wrapper + # model = BGEM3Wrapper(model_name) + else: + file_name = os.path.join(model_name, "model.safetensors") + if os.path.exists(file_name): + if "m2v" in model_name: + static_embedding = StaticEmbedding.from_model2vec(model_name) + model = SentenceTransformer(modules=[static_embedding]) + else: + model = mteb.get_model(model_name) + + if model: + setproctitle(f"{model_name}-{gpu_id}") + print( + f"Running task: {TASK_LIST} / {model_name} on GPU {gpu_id} in process {current_process().name}" + ) + evaluation = MTEB( + tasks=get_tasks( + tasks=TASK_LIST, languages=["kor-Kore", "kor-Hang", "kor_Hang"] + ) + ) + # 48GB VRAM 기준 적합한 batch sizes + if "multilingual-e5" in model_name: + batch_size = 256 + elif "jina" in model_name: + batch_size = 8 + elif "bge-m3" in model_name: + batch_size = 32 + elif "gemma2" in model_name: + batch_size = 256 + elif "Salesforce" in model_name: + batch_size = 128 + else: + batch_size = 64 + + evaluation.run( + model, + output_folder=f"results/{model_name}", + encode_kwargs={"batch_size": batch_size}, + ) + except Exception as ex: + print(ex) + traceback.print_exc() + + +if __name__ == "__main__": + processes = [] + for i, model_name in enumerate(model_names): + gpu_id = i + 3 # Cycle through available GPUs + p = Process(target=evaluate_model, args=(model_name, gpu_id)) + p.start() + processes.append(p) + + for p in processes: + p.join() diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 36188f7ed..ca2b32b2a 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -19,7 +19,7 @@ def __init__( self, model_name: str, max_tokens: int, - tokenizer_name: str = "cl100k_base", # since all models use this tokenizer now + tokenizer_name: str = "cl100k_base", # since all models use this tokenizer now embed_dim: int | None = None, **kwargs, ) -> None: @@ -28,6 +28,7 @@ def __init__( """ requires_package(self, "openai", "Openai text embedding") from openai import OpenAI + requires_package(self, "tiktoken", "Tiktoken package") import tiktoken @@ -36,15 +37,15 @@ def __init__( self._embed_dim = embed_dim self._max_tokens = max_tokens self._encoding = tiktoken.get_encoding(tokenizer_name) - + def truncate_text_tokens(self, text): """Truncate a string to have `max_tokens` according to the given encoding.""" - truncated_sentence = 
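Review note on PATCH 12: truncate_text_tokens is the core of the truncation behavior the series adds. A standalone sketch of the same logic is below; the function name and defaults are illustrative, not part of the patch, and it assumes tiktoken is installed:

    import tiktoken

    def truncate_to_max_tokens(
        text: str, max_tokens: int = 8192, tokenizer_name: str = "cl100k_base"
    ) -> str:
        """Token-level truncation, mirroring OpenAIWrapper.truncate_text_tokens."""
        encoding = tiktoken.get_encoding(tokenizer_name)
        tokens = encoding.encode(text)
        if len(tokens) <= max_tokens:
            return text  # already short enough; skip the decode round-trip
        return encoding.decode(tokens[:max_tokens])

Decoding a token slice can end mid-word, which is acceptable for truncation before embedding. One inconsistency worth flagging: the loaders pass max_tokens=8192 while the ModelMeta entries advertise max_tokens=8191, the documented context limit for these models.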
From c6086ba49405199768f21a432281454479365ce5 Mon Sep 17 00:00:00 2001
From: yjoonjang
Date: Wed, 4 Dec 2024 18:27:36 +0900
Subject: [PATCH 13/13] fix: delete evaluate file

---
 evaluate.py | 210 ----------------------------------------------------
 1 file changed, 210 deletions(-)
 delete mode 100644 evaluate.py

diff --git a/evaluate.py b/evaluate.py
deleted file mode 100644
index 8eff24cd3..000000000
--- a/evaluate.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Use after installing: pip install git+https://github.com/taeminlee/mteb.git@ontheit
-# Check the results with: streamlit run leaderboard.py
-
-"""Example script for benchmarking all datasets constituting the MTEB Korean leaderboard & average scores"""
-
-from __future__ import annotations
-
-import argparse
-import logging
-import os
-import traceback
-from multiprocessing import Process, current_process
-
-import numpy as np
-import torch
-from sentence_transformers import SentenceTransformer
-from sentence_transformers.models import StaticEmbedding
-
-# from dotenv import load_dotenv
-from setproctitle import setproctitle
-
-import mteb
-from mteb import MTEB, get_tasks
-from mteb.encoder_interface import PromptType
-from mteb.models.instruct_wrapper import instruct_wrapper
-from mteb.models.openai_models import OpenAIWrapper
-from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper
-from mteb.requires_package import requires_package
-
-# import tiktoken
-
-logger = logging.getLogger("main")
-
-
-class CustomOpenAIWrapper(OpenAIWrapper):
-    def encode(self, sentences: list[str], **kwargs) -> np.ndarray:
-        requires_package(self, "openai", "Openai text embedding")
-        from openai import NotGiven
-
-        if self._model_name == "text-embedding-ada-002" and self._embed_dim is not None:
-            logger.warning(
-                "Reducing embedding size available only for text-embedding-3-* models"
-            )
-
-        trimmed_sentences = []
-        for sentence in sentences:
-            encoding = tiktoken.get_encoding("cl100k_base")
-            encoded_sentence = encoding.encode(sentence)
-            if len(encoded_sentence) > 8191:
-                trimmed_sentence = encoding.decode(encoded_sentence[:8191])
-                trimmed_sentences.append(trimmed_sentence)
-            else:
-                trimmed_sentences.append(sentence)
-
-        max_batch_size = 2048
-        sublists = [
-            trimmed_sentences[i : i + max_batch_size]
-            for i in range(0, len(trimmed_sentences), max_batch_size)
-        ]
-
-        all_embeddings = []
-
-        for sublist in sublists:
-            response = self._client.embeddings.create(
-                input=sublist,
-                model=self._model_name,
-                encoding_format="float",
-                dimensions=self._embed_dim or NotGiven(),
-            )
-            all_embeddings.extend(self._to_numpy(response))
-
-        return np.array(all_embeddings)
-
-
-# load_dotenv()
-
-parser = argparse.ArgumentParser(description="Extract contexts")
-parser.add_argument("--quantize", default=False, type=bool, help="quantize embeddings")
-args = parser.parse_args()
-
-logging.basicConfig(level=logging.INFO)
-
-logger = logging.getLogger("main")
-
-TASK_LIST_CLASSIFICATION = []
-
-TASK_LIST_CLUSTERING = []
-
-TASK_LIST_PAIR_CLASSIFICATION = []
-
-TASK_LIST_RERANKING = []
-
-TASK_LIST_RETRIEVAL = [
-    # "Ko-StrategyQA",
-    "AutoRAGRetrieval",
-    # "MIRACLRetrieval",
-    # "PublicHealthQA",
-    # "BelebeleRetrieval",
-    # "MrTidyRetrieval",
-    # "MultiLongDocRetrieval",
-    # "XPQARetrieval"
-]
-
-TASK_LIST_STS = []
-
-TASK_LIST = (
-    TASK_LIST_CLASSIFICATION
-    + TASK_LIST_CLUSTERING
-    + TASK_LIST_PAIR_CLASSIFICATION
-    + TASK_LIST_RERANKING
-    + TASK_LIST_RETRIEVAL
-    + TASK_LIST_STS
-)
-
-
-model_names = [
-    "BAAI/bge-m3/sparse",  # 8192
-]
-
-
-def evaluate_model(model_name, gpu_id):
-    try:
-        # Set the environment variable for the specific GPU
-        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
-
-        model = None
-        if not os.path.exists(model_name):
-            if "m2v" in model_name:
-                static_embedding = StaticEmbedding.from_model2vec(model_name)
-                model = SentenceTransformer(modules=[static_embedding])
-            else:
-                if model_name == "nlpai-lab/KoE5":
-                    # This model is based on mE5, so add the corresponding prompts.
-                    model_prompts = {
-                        PromptType.query.value: "query: ",
-                        PromptType.passage.value: "passage: ",
-                    }
-                    model = SentenceTransformerWrapper(
-                        model=model_name, model_prompts=model_prompts
-                    )
-                elif model_name == "BAAI/bge-multilingual-gemma2":
-                    instruction_template = "{instruction}\n"
-                    model = instruct_wrapper(
-                        model_name_or_path=model_name,
-                        instruction_template=instruction_template,
-                        attn="cccc",
-                        pooling_method="lasttoken",
-                        mode="embedding",
-                        torch_dtype=torch.float16,
-                        normalized=True,
-                    )
-                elif "text-embedding-3" in model_name:
-                    model = CustomOpenAIWrapper(model_name)
-                else:
-                    model = mteb.get_model(model_name)
-                    # from mteb.models.bge_models import BGEM3Wrapper
-                    # model = BGEM3Wrapper(model_name)
-        else:
-            file_name = os.path.join(model_name, "model.safetensors")
-            if os.path.exists(file_name):
-                if "m2v" in model_name:
-                    static_embedding = StaticEmbedding.from_model2vec(model_name)
-                    model = SentenceTransformer(modules=[static_embedding])
-                else:
-                    model = mteb.get_model(model_name)
-
-        if model:
-            setproctitle(f"{model_name}-{gpu_id}")
-            print(
-                f"Running task: {TASK_LIST} / {model_name} on GPU {gpu_id} in process {current_process().name}"
-            )
-            evaluation = MTEB(
-                tasks=get_tasks(
-                    tasks=TASK_LIST, languages=["kor-Kore", "kor-Hang", "kor_Hang"]
-                )
-            )
-            # Batch sizes that fit in 48GB of VRAM
-            if "multilingual-e5" in model_name:
-                batch_size = 256
-            elif "jina" in model_name:
-                batch_size = 8
-            elif "bge-m3" in model_name:
-                batch_size = 32
-            elif "gemma2" in model_name:
-                batch_size = 256
-            elif "Salesforce" in model_name:
-                batch_size = 128
-            else:
-                batch_size = 64
-
-            evaluation.run(
-                model,
-                output_folder=f"results/{model_name}",
-                encode_kwargs={"batch_size": batch_size},
-            )
-    except Exception as ex:
-        print(ex)
-        traceback.print_exc()
-
-
-if __name__ == "__main__":
-    processes = []
-    for i, model_name in enumerate(model_names):
-        gpu_id = i + 3  # Cycle through available GPUs
-        p = Process(target=evaluate_model, args=(model_name, gpu_id))
-        p.start()
-        processes.append(p)
-
-    for p in processes:
-        p.join()