Skip to content

Commit

Permalink
fix: HuggingFaceTEITextEmbedder returning embedding of incorrect shape when used with Docker endpoint (#7319)
Browse files Browse the repository at this point in the history

* Fix HuggingFaceTEITextEmbedder

* Update haystack/components/embedders/hugging_face_tei_text_embedder.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* Improve imports; Add additional tests

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
awinml and anakin87 authored Mar 7, 2024
1 parent 95837ab commit 8d7a583
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.utils.hf import HFModelType, check_valid_model

with LazyImport(message="Run 'pip install transformers'") as transformers_import:
with LazyImport(message="Run 'pip install huggingface_hub'") as huggingface_hub_import:
from huggingface_hub import InferenceClient

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -79,7 +79,7 @@ def __init__(
:param embedding_separator:
Separator used to concatenate the meta fields to the Document text.
"""
transformers_import.check()
huggingface_hub_import.check()

if url:
r = urlparse(url)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.utils.hf import HFModelType, check_valid_model

with LazyImport(message="Run 'pip install transformers'") as transformers_import:
with LazyImport(message="Run 'pip install huggingface_hub'") as huggingface_hub_import:
from huggingface_hub import InferenceClient

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -62,7 +62,7 @@ def __init__(
:param suffix:
A string to add at the end of each text.
"""
transformers_import.check()
huggingface_hub_import.check()

if url:
r = urlparse(url)
Expand Down Expand Up @@ -135,8 +135,8 @@ def run(self, text: str):

text_to_embed = self.prefix + text + self.suffix

embedding = self.client.feature_extraction(text=text_to_embed)
embeddings = self.client.feature_extraction(text=[text_to_embed])
# The client returns a numpy array
embedding = embedding.tolist()
embedding = embeddings.tolist()[0]

return {"embedding": embedding}
5 changes: 5 additions & 0 deletions releasenotes/notes/hf-tei-bug-fix-07732c672600aadd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
fixes:
- |
Fixes `HuggingFaceTEITextEmbedder` returning an embedding of incorrect shape when used with a
Text-Embedding-Inference endpoint deployed using Docker.
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import numpy as np
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from haystack.utils.auth import Secret

from haystack.components.embedders.hugging_face_tei_document_embedder import HuggingFaceTEIDocumentEmbedder
from haystack.dataclasses import Document
from haystack.utils.auth import Secret


@pytest.fixture
Expand Down Expand Up @@ -222,6 +222,29 @@ def test_run(self, mock_check_valid_model):
assert len(doc.embedding) == 384
assert all(isinstance(x, float) for x in doc.embedding)

@pytest.mark.flaky(reruns=5, reruns_delay=5)
@pytest.mark.integration
def test_run_inference_api_endpoint(self):
docs = [
Document(content="I love cheese", meta={"topic": "Cuisine"}),
Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
]

embedder = HuggingFaceTEIDocumentEmbedder(
model="sentence-transformers/all-MiniLM-L6-v2", meta_fields_to_embed=["topic"], embedding_separator=" | "
)

result = embedder.run(documents=docs)
documents_with_embeddings = result["documents"]

assert isinstance(documents_with_embeddings, list)
assert len(documents_with_embeddings) == len(docs)
for doc in documents_with_embeddings:
assert isinstance(doc, Document)
assert isinstance(doc.embedding, list)
assert len(doc.embedding) == 384
assert all(isinstance(x, float) for x in doc.embedding)

def test_run_custom_batch_size(self, mock_check_valid_model):
docs = [
Document(content="I love cheese", meta={"topic": "Cuisine"}),
Expand Down
15 changes: 12 additions & 3 deletions test/components/embedders/test_hugging_face_tei_text_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import numpy as np
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from haystack.utils.auth import Secret

from haystack.components.embedders.hugging_face_tei_text_embedder import HuggingFaceTEITextEmbedder
from haystack.utils.auth import Secret


@pytest.fixture
Expand All @@ -17,7 +17,7 @@ def mock_check_valid_model():


def mock_embedding_generation(text, **kwargs):
response = np.random.rand(384)
response = np.array([np.random.rand(384) for i in range(len(text))])
return response


Expand Down Expand Up @@ -107,7 +107,16 @@ def test_run(self, mock_check_valid_model):

result = embedder.run(text="The food was delicious")

mock_embedding_patch.assert_called_once_with(text="prefix The food was delicious suffix")
mock_embedding_patch.assert_called_once_with(text=["prefix The food was delicious suffix"])

assert len(result["embedding"]) == 384
assert all(isinstance(x, float) for x in result["embedding"])

@pytest.mark.flaky(reruns=5, reruns_delay=5)
@pytest.mark.integration
def test_run_inference_api_endpoint(self):
embedder = HuggingFaceTEITextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
result = embedder.run(text="The food was delicious")

assert len(result["embedding"]) == 384
assert all(isinstance(x, float) for x in result["embedding"])
Expand Down

0 comments on commit 8d7a583

Please sign in to comment.