From 16c6457ed79cfe2fa8508903a8b9b9133fdf2187 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 1 Mar 2024 10:07:48 +0100 Subject: [PATCH 1/7] initial import --- .../gradient/gradient_document_embedder.py | 17 ++++++++++++++--- .../gradient/gradient_text_embedder.py | 16 ++++++++++++---- .../components/generators/gradient/base.py | 15 ++++++++++++--- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py index 4ccfb9da5..29a238cf9 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py @@ -23,8 +23,10 @@ def _alt_progress_bar(x: Any) -> Any: class GradientDocumentEmbedder: """ A component for computing Document embeddings using Gradient AI API. + The embedding of each Document is stored in the `embedding` field of the Document. + Usage example: ```python embedder = GradientDocumentEmbedder(model="bge_large") p = Pipeline() @@ -53,7 +55,7 @@ def __init__( :param batch_size: Update cycle for tqdm progress bar, default is to update every 32_768 docs. :param access_token: The Gradient access token. :param workspace_id: The Gradient workspace ID. - :param host: The Gradient host. By default it uses https://api.gradient.ai/. + :param host: The Gradient host. By default, it uses https://api.gradient.ai/. :param progress_bar: Whether to show a progress bar while embedding the documents. """ self._batch_size = batch_size @@ -75,8 +77,12 @@ def _get_telemetry_data(self) -> Dict[str, Any]: def to_dict(self) -> dict: """ - Serialize the component to a Python dictionary. + Serialize this component to a dictionary. + + :returns: + The serialized component as a dictionary. """ + return default_to_dict( self, model=self._model_name, @@ -91,13 +97,17 @@ def to_dict(self) -> dict: def from_dict(cls, data: Dict[str, Any]) -> "GradientDocumentEmbedder": """ Deserialize this component from a dictionary. + + :param data: The dictionary representation of this component. + :returns: + The deserialized component instance. """ deserialize_secrets_inplace(data["init_parameters"], keys=["access_token", "workspace_id"]) return default_from_dict(cls, data) def warm_up(self) -> None: """ - Load the embedding model. + Initializes the component. """ if not hasattr(self, "_embedding_model"): self._embedding_model = self._gradient.get_embeddings_model(slug=self._model_name) @@ -125,6 +135,7 @@ def _generate_embeddings(self, documents: List[Document], batch_size: int) -> Li def run(self, documents: List[Document]): """ Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. :param documents: A list of Documents to embed. 
diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py index 029d5c52f..0588df5d4 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py @@ -8,8 +8,9 @@ @component class GradientTextEmbedder: """ - A component for embedding strings using models hosted on Gradient AI (https://gradient.ai). + A component for embedding strings using models hosted on [Gradient AI](https://gradient.ai). + Usage example: ```python embedder = GradientTextEmbedder(model="bge_large") p = Pipeline() @@ -34,7 +35,7 @@ def __init__( :param model: The name of the model to use. :param access_token: The Gradient access token. :param workspace_id: The Gradient workspace ID. - :param host: The Gradient host. By default it uses https://api.gradient.ai/. + :param host: The Gradient host. By default, it uses https://api.gradient.ai/. """ self._host = host self._model_name = model @@ -53,7 +54,10 @@ def _get_telemetry_data(self) -> Dict[str, Any]: def to_dict(self) -> dict: """ - Serialize the component to a Python dictionary. + Serialize this component to a dictionary. + + :returns: + The serialized component as a dictionary. """ return default_to_dict( self, @@ -67,13 +71,17 @@ def to_dict(self) -> dict: def from_dict(cls, data: Dict[str, Any]) -> "GradientTextEmbedder": """ Deserialize this component from a dictionary. + + :param data: The dictionary representation of this component. + :returns: + The deserialized component instance. """ deserialize_secrets_inplace(data["init_parameters"], keys=["access_token", "workspace_id"]) return default_from_dict(cls, data) def warm_up(self) -> None: """ - Load the embedding model. + Initializes the component. """ if not hasattr(self, "_embedding_model"): self._embedding_model = self._gradient.get_embeddings_model(slug=self._model_name) diff --git a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py index 9176c3e4b..f75735181 100644 --- a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py +++ b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py @@ -16,6 +16,7 @@ class GradientGenerator: Queries the LLM using Gradient AI's SDK ('gradientai' package). See [Gradient AI API](https://docs.gradient.ai/docs/sdk-quickstart) for more details. + Usage example: ```python llm = GradientGenerator(base_model_slug="llama2-7b-chat") llm.warm_up() @@ -41,16 +42,16 @@ def __init__( Create a GradientGenerator component. :param access_token: The Gradient access token. If not provided it's read from the environment - variable GRADIENT_ACCESS_TOKEN. + variable `GRADIENT_ACCESS_TOKEN`. :param base_model_slug: The base model slug to use. - :param host: The Gradient host. By default it uses https://api.gradient.ai/. + :param host: The Gradient host. By default, it uses (gradient.ai)[https://api.gradient.ai/]. :param max_generated_token_count: The maximum number of tokens to generate. :param model_adapter_id: The model adapter ID to use. :param temperature: The temperature to use. :param top_k: The top k to use. :param top_p: The top p to use. 
:param workspace_id: The Gradient workspace ID. If not provided it's read from the environment - variable GRADIENT_WORKSPACE_ID. + variable `GRADIENT_WORKSPACE_ID`. """ self._access_token = access_token self._base_model_slug = base_model_slug @@ -84,6 +85,9 @@ def __init__( def to_dict(self) -> Dict[str, Any]: """ Serialize this component to a dictionary. + + :returns: + The serialized component as a dictionary. """ return default_to_dict( self, @@ -102,7 +106,12 @@ def to_dict(self) -> Dict[str, Any]: def from_dict(cls, data: Dict[str, Any]) -> "GradientGenerator": """ Deserialize this component from a dictionary. + + :param data: The dictionary representation of this component. + :returns: + The deserialized component instance. """ + deserialize_secrets_inplace(data["init_parameters"], keys=["access_token", "workspace_id"]) return default_from_dict(cls, data) From 811900405423f3e1a5eef52726cbfaa1f4af9077 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 1 Mar 2024 10:19:35 +0100 Subject: [PATCH 2/7] initial import --- .../components/embedders/gradient/gradient_document_embedder.py | 2 +- .../components/embedders/gradient/gradient_text_embedder.py | 2 +- .../components/generators/gradient/base.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py index 29a238cf9..eae85eaaa 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py @@ -55,7 +55,7 @@ def __init__( :param batch_size: Update cycle for tqdm progress bar, default is to update every 32_768 docs. :param access_token: The Gradient access token. :param workspace_id: The Gradient workspace ID. - :param host: The Gradient host. By default, it uses https://api.gradient.ai/. + :param host: The Gradient host. By default, it uses [Gradient AI](https://api.gradient.ai/). :param progress_bar: Whether to show a progress bar while embedding the documents. """ self._batch_size = batch_size diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py index 0588df5d4..fdd37a17a 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py @@ -35,7 +35,7 @@ def __init__( :param model: The name of the model to use. :param access_token: The Gradient access token. :param workspace_id: The Gradient workspace ID. - :param host: The Gradient host. By default, it uses https://api.gradient.ai/. + :param host: The Gradient host. By default, it uses [Gradient AI](https://api.gradient.ai/). 
""" self._host = host self._model_name = model diff --git a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py index f75735181..e3be06a6c 100644 --- a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py +++ b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py @@ -44,7 +44,7 @@ def __init__( :param access_token: The Gradient access token. If not provided it's read from the environment variable `GRADIENT_ACCESS_TOKEN`. :param base_model_slug: The base model slug to use. - :param host: The Gradient host. By default, it uses (gradient.ai)[https://api.gradient.ai/]. + :param host: The Gradient host. By default, it uses [Gradient AI](https://api.gradient.ai/). :param max_generated_token_count: The maximum number of tokens to generate. :param model_adapter_id: The model adapter ID to use. :param temperature: The temperature to use. From 3d0dfd80f518b3baa801597213cdafebcc6edb77 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 1 Mar 2024 17:46:40 +0100 Subject: [PATCH 3/7] adding returned Dict --- .../embedders/gradient/gradient_document_embedder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py index eae85eaaa..6829f0af8 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py @@ -139,6 +139,10 @@ def run(self, documents: List[Document]): The embedding of each Document is stored in the `embedding` field of the Document. :param documents: A list of Documents to embed. + :returns: + A dictionary with the following keys: + - documents: The embedded Documents. + """ if not isinstance(documents, list) or documents and any(not isinstance(doc, Document) for doc in documents): msg = "GradientDocumentEmbedder expects a list of Documents as input.\ From 56f302e56932a6485bad4c0cb83fc313d781fa1a Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 4 Mar 2024 15:32:40 +0100 Subject: [PATCH 4/7] attending PR comments --- .../gradient/gradient_document_embedder.py | 17 ++++++++++++++--- .../gradient/gradient_text_embedder.py | 4 ++++ .../components/generators/gradient/base.py | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py index 6829f0af8..c5dded9f1 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py @@ -28,13 +28,24 @@ class GradientDocumentEmbedder: Usage example: ```python + from haystack_integrations.components.embedders.gradient import GradientDocumentEmbedder + from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever + from haystack import Pipeline + embedder = GradientDocumentEmbedder(model="bge_large") + + documents = [ + Document(content="My name is Jean and I live in Paris."), + Document(content="My name is Mark and I live in Berlin."), + Document(content="My name is Giorgio and I live in Rome."), + ] + p = Pipeline() p.add_component(embedder, name="document_embedder") - p.add_component(instance=GradientDocumentEmbedder( + p.add_component(instance=GradientDocumentEmbedder(), name="document_embedder") p.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="document_writer") p.connect("document_embedder", "document_writer") - p.run({"document_embedder": {"documents": documents}}) + p.run(data={"document_embedder": {"documents": documents}}) ``` """ @@ -141,7 +152,7 @@ def run(self, documents: List[Document]): :param documents: A list of Documents to embed. :returns: A dictionary with the following keys: - - documents: The embedded Documents. + - `documents`: The embedded Documents. 
""" if not isinstance(documents, list) or documents and any(not isinstance(doc, Document) for doc in documents): diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py index fdd37a17a..a33ad3d40 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py @@ -12,6 +12,10 @@ class GradientTextEmbedder: Usage example: ```python + from haystack_integrations.components.embedders.gradient import GradientTextEmbedder + from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever + from haystack import Pipeline + embedder = GradientTextEmbedder(model="bge_large") p = Pipeline() p.add_component(instance=embedder, name="text_embedder") diff --git a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py index e3be06a6c..5dee99a5c 100644 --- a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py +++ b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py @@ -18,6 +18,8 @@ class GradientGenerator: Usage example: ```python + from haystack_integrations.components.generators.gradient import GradientGenerator + llm = GradientGenerator(base_model_slug="llama2-7b-chat") llm.warm_up() print(llm.run(prompt="What is the meaning of life?")) From 564e3415135fa16283e8c87c66619c287ecee24f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 4 Mar 2024 17:25:29 +0100 Subject: [PATCH 5/7] attending PR comments --- .../gradient/gradient_document_embedder.py | 19 ++++++++++--------- .../gradient/gradient_text_embedder.py | 11 +++++++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py index c5dded9f1..56aeb0a0a 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py @@ -28,11 +28,12 @@ class GradientDocumentEmbedder: Usage example: ```python - from haystack_integrations.components.embedders.gradient import GradientDocumentEmbedder - from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever from haystack import Pipeline + from haystack.document_stores.in_memory import InMemoryDocumentStore + from haystack.components.writers import DocumentWriter + from haystack import Document - embedder = GradientDocumentEmbedder(model="bge_large") + from haystack_integrations.components.embedders.gradient import GradientDocumentEmbedder documents = [ Document(content="My name is Jean and I live in Paris."), @@ -40,12 +41,12 @@ class GradientDocumentEmbedder: Document(content="My name is Giorgio and I live in Rome."), ] - p = Pipeline() - p.add_component(embedder, name="document_embedder") - p.add_component(instance=GradientDocumentEmbedder(), name="document_embedder") - p.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), 
name="document_writer") - p.connect("document_embedder", "document_writer") - p.run(data={"document_embedder": {"documents": documents}}) + indexing_pipeline = Pipeline() + indexing_pipeline.add_component(instance=GradientDocumentEmbedder(), name="document_embedder") + indexing_pipeline.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="document_writer") + indexing_pipeline.connect("document_embedder", "document_writer") + indexing_pipeline.run({"document_embedder": {"documents": documents}}) + >>> {'document_writer': {'documents_written': 3}} ``` """ diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py index a33ad3d40..77b2d6250 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py @@ -14,14 +14,17 @@ class GradientTextEmbedder: ```python from haystack_integrations.components.embedders.gradient import GradientTextEmbedder from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever + from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack import Pipeline - embedder = GradientTextEmbedder(model="bge_large") + embedder = p = Pipeline() - p.add_component(instance=embedder, name="text_embedder") - p.add_component(instance=InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()), name="retriever") + p.add_component("text_embedder", GradientTextEmbedder(model="bge-large")) + p.add_component("retriever", InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore())) p.connect("text_embedder", "retriever") - p.run("embed me!!!") + p.run(data={"text_embedder": {"text":"You can embed me put I'll return no matching documents"}}) + >>> No Documents found with embeddings. Returning empty list. To generate embeddings, use a DocumentEmbedder. + >>> {'retriever': {'documents': []}} ``` """ From 72393c0a65bcd3fd0e9612ea33b9039839625842 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 5 Mar 2024 09:33:24 +0100 Subject: [PATCH 6/7] linting --- .../embedders/gradient/gradient_document_embedder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py index 56aeb0a0a..a868c6c1b 100644 --- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py @@ -43,7 +43,9 @@ class GradientDocumentEmbedder: indexing_pipeline = Pipeline() indexing_pipeline.add_component(instance=GradientDocumentEmbedder(), name="document_embedder") - indexing_pipeline.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="document_writer") + indexing_pipeline.add_component( + instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="document_writer") + ) indexing_pipeline.connect("document_embedder", "document_writer") indexing_pipeline.run({"document_embedder": {"documents": documents}}) >>> {'document_writer': {'documents_written': 3}} From d63dde61c66c089f166ad26879ef48856229b66b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 5 Mar 2024 09:38:19 +0100 Subject: [PATCH 7/7] fixing doc --- .../components/generators/gradient/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py index 5dee99a5c..71b39d309 100644 --- a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py +++ b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py @@ -43,7 +43,7 @@ def __init__( """ Create a GradientGenerator component. - :param access_token: The Gradient access token. If not provided it's read from the environment + :param access_token: The Gradient access token as a `Secret`. If not provided it's read from the environment variable `GRADIENT_ACCESS_TOKEN`. :param base_model_slug: The base model slug to use. :param host: The Gradient host. By default, it uses [Gradient AI](https://api.gradient.ai/). @@ -52,7 +52,7 @@ def __init__( :param temperature: The temperature to use. :param top_k: The top k to use. :param top_p: The top p to use. - :param workspace_id: The Gradient workspace ID. If not provided it's read from the environment + :param workspace_id: The Gradient workspace ID as a `Secret`. If not provided it's read from the environment variable `GRADIENT_WORKSPACE_ID`. """ self._access_token = access_token