From 301993e721741fec651df8b10120a6bbb20cf593 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 29 Feb 2024 12:15:57 +0100 Subject: [PATCH 1/3] wip --- .../components/embedders/jina/text_embedder.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index f99882f13..39f548714 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -17,12 +17,15 @@ class JinaTextEmbedder: Usage example: ```python - from jina_haystack import JinaTextEmbedder + import os + from haystack_integrations.components.embedders.jina import JinaTextEmbedder - text_to_embed = "I love pizza!" + os.environ("JINA_API_KEY") = "YOUR_JINA_API_KEY" text_embedder = JinaTextEmbedder() + text_to_embed = "I love pizza!" + print(text_embedder.run(text_to_embed)) # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], @@ -39,11 +42,10 @@ def __init__( suffix: str = "", ): """ - Create an JinaTextEmbedder component. - :param api_key: The Jina API key. It can be explicitly provided or automatically read from the - environment variable JINA_API_KEY (recommended). - :param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/` + environment variable `JINA_API_KEY` (recommended). + :param model: The name of the Jina model to use. + Check the list of available models on [Jina documentation](https://jina.ai/embeddings/). :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. """ From 873e3ac7c114db989aef134f950c56dc4ec29de8 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 29 Feb 2024 12:58:14 +0100 Subject: [PATCH 2/3] jina - review docstrings --- .../embedders/jina/document_embedder.py | 31 +++++++++++++------ .../embedders/jina/text_embedder.py | 27 ++++++++++++---- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 2ecc01bc3..9549d5c77 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -19,13 +19,15 @@ class JinaDocumentEmbedder: Usage example: ```python + import os from haystack import Document - from jina_haystack import JinaDocumentEmbedder - - doc = Document(content="I love pizza!") + from haystack_integrations.components.embedders.jina import JinaDocumentEmbedder + os.environ("JINA_API_KEY") = "YOUR_JINA_API_KEY" document_embedder = JinaDocumentEmbedder() + doc = Document(content="I love pizza!") + result = document_embedder.run([doc]) print(result['documents'][0].embedding) @@ -45,9 +47,9 @@ def __init__( embedding_separator: str = "\n", ): """ - Create a JinaDocumentEmbedder component. :param api_key: The Jina API key. - :param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/` + :param model: The name of the Jina model to use. + Check the list of available models on [Jina documentation](https://jina.ai/embeddings/). :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param batch_size: Number of Documents to encode at once. @@ -83,8 +85,9 @@ def _get_telemetry_data(self) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]: """ - This method overrides the default serializer in order to avoid leaking the `api_key` value passed - to the constructor. + Serializes the component to a dictionary. + :returns: + Dictionary with serialized data. """ return default_to_dict( self, @@ -100,6 +103,13 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "JinaDocumentEmbedder": + """ + Deserializes the component from a dictionary. + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) @@ -151,10 +161,13 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List @component.output_types(documents=List[Document], meta=Dict[str, Any]) def run(self, documents: List[Document]): """ - Embed a list of Documents. - The embedding of each Document is stored in the `embedding` field of the Document. + Compute the embeddings for a list of Documents. :param documents: A list of Documents to embed. + :returns: A dictionary with following keys: + - `documents`: List of Documents, each with an `embedding` field containing the computed embedding. + - `meta`: A dictionary with metadata including the model name and usage statistics. + :raises TypeError: If the input is not a list of Documents. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): msg = ( diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 39f548714..79f1e540f 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -13,7 +13,7 @@ @component class JinaTextEmbedder: """ - A component for embedding strings using Jina models. + A component for embedding strings using Jina AI models. Usage example: ```python @@ -44,7 +44,7 @@ def __init__( """ :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable `JINA_API_KEY` (recommended). - :param model: The name of the Jina model to use. + :param model: The name of the Jina model to use. Check the list of available models on [Jina documentation](https://jina.ai/embeddings/). :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. @@ -73,22 +73,37 @@ def _get_telemetry_data(self) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]: """ - This method overrides the default serializer in order to avoid leaking the `api_key` value passed - to the constructor. + Serializes the component to a dictionary. + :returns: + Dictionary with serialized data. """ - return default_to_dict( self, api_key=self.api_key.to_dict(), model=self.model_name, prefix=self.prefix, suffix=self.suffix ) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder": + """ + Deserializes the component from a dictionary. + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) @component.output_types(embedding=List[float], meta=Dict[str, Any]) def run(self, text: str): - """Embed a string.""" + """ + Embed a string. + + :param text: The string to embed. + :returns: A dictionary with following keys: + - `embedding`: The embedding of the input string. + - `meta`: A dictionary with metadata including the model name and usage statistics. + :raises TypeError: If the input is not a string. + """ if not isinstance(text, str): msg = ( "JinaTextEmbedder expects a string as an input." From b235482569e289a3a157914fabf30043c711d097 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 29 Feb 2024 15:09:13 +0100 Subject: [PATCH 3/3] requested changes --- .../components/embedders/jina/document_embedder.py | 6 ++++-- .../components/embedders/jina/text_embedder.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py index 9549d5c77..6bcd94220 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py @@ -19,11 +19,11 @@ class JinaDocumentEmbedder: Usage example: ```python - import os from haystack import Document from haystack_integrations.components.embedders.jina import JinaDocumentEmbedder - os.environ("JINA_API_KEY") = "YOUR_JINA_API_KEY" + # Make sure that the environment variable JINA_API_KEY is set + document_embedder = JinaDocumentEmbedder() doc = Document(content="I love pizza!") @@ -47,6 +47,8 @@ def __init__( embedding_separator: str = "\n", ): """ + Create a JinaDocumentEmbedder component. + :param api_key: The Jina API key. :param model: The name of the Jina model to use. Check the list of available models on [Jina documentation](https://jina.ai/embeddings/). diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py index 79f1e540f..6398122a4 100644 --- a/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py +++ b/integrations/jina/src/haystack_integrations/components/embedders/jina/text_embedder.py @@ -17,10 +17,9 @@ class JinaTextEmbedder: Usage example: ```python - import os from haystack_integrations.components.embedders.jina import JinaTextEmbedder - os.environ("JINA_API_KEY") = "YOUR_JINA_API_KEY" + # Make sure that the environment variable JINA_API_KEY is set text_embedder = JinaTextEmbedder() @@ -42,6 +41,8 @@ def __init__( suffix: str = "", ): """ + Create a JinaTextEmbedder component. + :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable `JINA_API_KEY` (recommended). :param model: The name of the Jina model to use.