Skip to content

Commit

Permalink
fix: sleep and retry embedding requests on rate-limit (too many requests) errors
Browse files Browse the repository at this point in the history
  • Loading branch information
yjoonjang committed Dec 4, 2024
1 parent 23600ea commit 80bb95f
Showing 1 changed file with 47 additions and 15 deletions.
62 changes: 47 additions & 15 deletions mteb/models/openai_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(
self,
model_name: str,
max_tokens: int,
tokenizer_name: str = "cl100k_base", # since all models use this tokenizer now
tokenizer_name: str = "cl100k_base", # since all models use this tokenizer now
embed_dim: int | None = None,
**kwargs,
) -> None:
Expand All @@ -28,17 +28,23 @@ def __init__(
"""
requires_package(self, "openai", "Openai text embedding")
from openai import OpenAI
requires_package(self, "tiktoken", "Tiktoken package")
import tiktoken

self._client = OpenAI()
self._model_name = model_name
self._embed_dim = embed_dim
self._max_tokens = max_tokens
self._tokenizer_name = tokenizer_name
self._encoding = tiktoken.get_encoding(tokenizer_name)

def truncate_text_tokens(self, text):
    """Clip *text* so that its token encoding fits within ``max_tokens``.

    The string is tokenized with the wrapper's tiktoken encoding
    (``self._encoding``), the token sequence is cut to at most
    ``self._max_tokens`` entries, and the surviving tokens are decoded
    back into a string, which is returned.
    """
    token_ids = self._encoding.encode(text)
    clipped = token_ids[: self._max_tokens]
    return self._encoding.decode(clipped)

def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
requires_package(self, "openai", "Openai text embedding")
requires_package(self, "tiktoken", "Tiktoken package")
import tiktoken

from openai import NotGiven

if self._model_name == "text-embedding-ada-002" and self._embed_dim is not None:
Expand All @@ -48,11 +54,10 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:

trimmed_sentences = []
for sentence in sentences:
encoding = tiktoken.get_encoding(self._tokenizer_name)
encoded_sentence = encoding.encode(sentence)
encoded_sentence = self._encoding.encode(sentence)
if len(encoded_sentence) > self._max_tokens:
trimmed_sentence = encoding.decode(encoded_sentence[: self._max_tokens])
trimmed_sentences.append(trimmed_sentence)
truncated_sentence = self.truncate_text_tokens(sentence)
trimmed_sentences.append(truncated_sentence)
else:
trimmed_sentences.append(sentence)

Expand All @@ -65,12 +70,34 @@ def encode(self, sentences: list[str], **kwargs: Any) -> np.ndarray:
all_embeddings = []

for sublist in sublists:
response = self._client.embeddings.create(
input=sublist,
model=self._model_name,
encoding_format="float",
dimensions=self._embed_dim or NotGiven(),
)
try:
response = self._client.embeddings.create(
input=sublist,
model=self._model_name,
encoding_format="float",
dimensions=self._embed_dim or NotGiven(),
)
except Exception as e:
# Sleep due to too many requests
logger.info("Sleeping for 10 seconds due to error", e)
import time
time.sleep(10)
try:
response = self._client.embeddings.create(
input=sublist,
model=self._model_name,
encoding_format="float",
dimensions=self._embed_dim or NotGiven(),
)
except Exception as e:
logger.info("Sleeping for 60 seconds due to error", e)
time.sleep(60)
response = self._client.embeddings.create(
input=sublist,
model=self._model_name,
encoding_format="float",
dimensions=self._embed_dim or NotGiven(),
)
all_embeddings.extend(self._to_numpy(response))

return np.array(all_embeddings)
Expand Down Expand Up @@ -126,7 +153,12 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
revision="2",
release_date="2022-12-15",
languages=None, # supported languages not specified
loader=partial(OpenAIWrapper, model_name="text-embedding-ada-002"),
loader=partial(
OpenAIWrapper,
model_name="text-embedding-ada-002",
tokenizer_name="cl100k_base",
max_tokens=8192,
),
reference="https://openai.com/index/new-and-improved-embedding-model/",
max_tokens=8191,
embed_dim=1536,
Expand Down

0 comments on commit 80bb95f

Please sign in to comment.