From 877f2c51e32b7c98ac2b3a99399d74019ddbb3f3 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 18 Oct 2023 22:42:32 +0200 Subject: [PATCH] Update text embedding component (#532) PR that modifies the text embedding component: * Changed the name of the component to `embed_text` for consistency (we already have an `embed_image` component) * Added VertexAI as a possible model to use * Changed the base image to pytorch since some of the dependencies in the requirements have CUDA deps * Fixed some tests (mainly path related) Tested the component with the CC RAG pipeline and it works fine (added the [`op`](https://github.com/ml6team/fondant/pull/528/commits/fcf86640db82310cc05ffbaddce07bbb8915608b) here). The component runs fine (exit code 0); however, there seems to be an error that pops up since the connection with the API does not seem to shut down gracefully. ``` rag-cc-pipeline-embed_text-1 | [2023-10-18 13:58:45,227 | root | INFO] Writing data... [########################################] | 100% Completed | 52.61 ss rag-cc-pipeline-embed_text-1 | [2023-10-18 13:59:37,967 | fondant.executor | INFO] Saving output manifest to gs://soy-audio-379412_kfp-artifacts/custom_artifact/rag-cc-pipeline/rag-cc-pipeline-20231018155832/embed_text/manifest.json rag-cc-pipeline-embed_text-1 | [2023-10-18 13:59:37,967 | fondant.executor | INFO] Writing cache key to gs://soy-audio-379412_kfp-artifacts/custom_artifact/rag-cc-pipeline/cache/dd3a24cb288cd0eaba12063d2885bc9d.txt rag-cc-pipeline-embed_text-1 | Traceback (most recent call last): rag-cc-pipeline-embed_text-1 | File "src/python/grpcio/grpc/_cython/_cygrpc/aio/grpc_aio.pyx.pxi", line 110, in grpc._cython.cygrpc.shutdown_grpc_aio rag-cc-pipeline-embed_text-1 | File "src/python/grpcio/grpc/_cython/_cygrpc/aio/grpc_aio.pyx.pxi", line 114, in grpc._cython.cygrpc.shutdown_grpc_aio rag-cc-pipeline-embed_text-1 | File "src/python/grpcio/grpc/_cython/_cygrpc/aio/grpc_aio.pyx.pxi", line 78, in grpc._cython.cygrpc._actual_aio_shutdown 
rag-cc-pipeline-embed_text-1 | AttributeError: 'NoneType' object has no attribute 'POLLER' rag-cc-pipeline-embed_text-1 | Exception ignored in: 'grpc._cython.cygrpc.AioChannel.__dealloc__' rag-cc-pipeline-embed_text-1 | Traceback (most recent call last): rag-cc-pipeline-embed_text-1 | File "src/python/grpcio/grpc/_cython/_cygrpc/aio/grpc_aio.pyx.pxi", line 110, in grpc._cython.cygrpc.shutdown_grpc_aio rag-cc-pipeline-embed_text-1 | File "src/python/grpcio/grpc/_cython/_cygrpc/aio/grpc_aio.pyx.pxi", line 114, in grpc._cython.cygrpc.shutdown_grpc_aio rag-cc-pipeline-embed_text-1 | File "src/python/grpcio/grpc/_cython/_cygrpc/aio/grpc_aio.pyx.pxi", line 78, in grpc._cython.cygrpc._actual_aio_shutdown rag-cc-pipeline-embed_text-1 | AttributeError: 'NoneType' object has no attribute 'POLLER' ``` Normally the shutdown would be handled by a client, but in this case we're initializing the model directly. --- .../Dockerfile | 18 +++++++--- .../README.md | 23 +++++++----- .../fondant_component.yaml | 16 ++++++--- .../requirements.txt | 1 + .../src/main.py | 36 ++++++++++++++----- components/embed_text/test_requirements.txt | 1 + .../tests/embed_text_test.py} | 11 +++--- .../tests/hello_world_embedding.txt | 0 .../tests/lorem_300.txt | 0 .../tests/lorem_400.txt | 0 components/generate_embeddings/src/utils.py | 6 ---- 11 files changed, 75 insertions(+), 37 deletions(-) rename components/{generate_embeddings => embed_text}/Dockerfile (59%) rename components/{generate_embeddings => embed_text}/README.md (68%) rename components/{generate_embeddings => embed_text}/fondant_component.yaml (79%) rename components/{generate_embeddings => embed_text}/requirements.txt (81%) rename components/{generate_embeddings => embed_text}/src/main.py (60%) create mode 100644 components/embed_text/test_requirements.txt rename components/{generate_embeddings/tests/generate_embeddings_test.py => embed_text/tests/embed_text_test.py} (80%) rename components/{generate_embeddings => 
embed_text}/tests/hello_world_embedding.txt (100%) rename components/{generate_embeddings => embed_text}/tests/lorem_300.txt (100%) rename components/{generate_embeddings => embed_text}/tests/lorem_400.txt (100%) delete mode 100644 components/generate_embeddings/src/utils.py diff --git a/components/generate_embeddings/Dockerfile b/components/embed_text/Dockerfile similarity index 59% rename from components/generate_embeddings/Dockerfile rename to components/embed_text/Dockerfile index 0d28da125..fd1dbc22f 100644 --- a/components/generate_embeddings/Dockerfile +++ b/components/embed_text/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.8-slim as base +FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime as base # System dependencies RUN apt-get update && \ @@ -6,17 +6,25 @@ RUN apt-get update && \ apt-get install git -y # Install requirements -COPY requirements.txt / +COPY requirements.txt ./ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching ARG FONDANT_VERSION=main RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + # Set the working directory to the component folder -WORKDIR /component/src +WORKDIR /component +COPY src/ src/ +ENV PYTHONPATH "${PYTHONPATH}:./src" -# Copy over src-files -COPY src/ . +FROM base as test +COPY test_requirements.txt . 
+RUN pip3 install --no-cache-dir -r test_requirements.txt +COPY tests/ tests/ +RUN python -m pytest tests +FROM base +WORKDIR /component/src ENTRYPOINT ["fondant", "execute", "main"] \ No newline at end of file diff --git a/components/generate_embeddings/README.md b/components/embed_text/README.md similarity index 68% rename from components/generate_embeddings/README.md rename to components/embed_text/README.md index 53a404be6..9b2692d5b 100644 --- a/components/generate_embeddings/README.md +++ b/components/embed_text/README.md @@ -1,4 +1,4 @@ -# Generate embeddings +# Embed text ### Description Component that generates embeddings of text passages. @@ -22,9 +22,10 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| model_provider | str | The provider of the model - corresponding to langchain embedding classes. Currently the following providers are supported: aleph_alpha, cohere, huggingface, openai. | huggingface | -| model | str | The model to generate embeddings from. Choose an available model name to pass to the model provider's langchain embedding class. | all-MiniLM-L6-v2 | +| model_provider | str | The provider of the model - corresponding to langchain embedding classes. Currently the following providers are supported: aleph_alpha, cohere, huggingface, openai, vertexai. | huggingface | +| model | str | The model to generate embeddings from. Choose an available model name to pass to the model provider's langchain embedding class. | / | | api_keys | dict | The API keys to use for the model provider that are written to environment variables.Pass only the keys required by the model provider or conveniently pass all keys you will ever need. Pay attention how to name the dictionary keys so that they can be used by the model provider. | / | +| auth_kwargs | dict | Additional keyword arguments required for api initialization/authentication. 
| / | ### Usage @@ -34,15 +35,21 @@ You can add this component to your pipeline using the following code: from fondant.pipeline import ComponentOp -generate_embeddings_op = ComponentOp.from_registry( - name="generate_embeddings", +embed_text_op = ComponentOp.from_registry( + name="embed_text", arguments={ - # Add arguments # "model_provider": "huggingface", - # "model": "all-MiniLM-L6-v2", + # "model": , # "api_keys": {}, + # "auth_kwargs": {}, } ) -pipeline.add_op(generate_embeddings_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(embed_text_op, dependencies=[...]) #Add previous component as dependency ``` +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` diff --git a/components/generate_embeddings/fondant_component.yaml b/components/embed_text/fondant_component.yaml similarity index 79% rename from components/generate_embeddings/fondant_component.yaml rename to components/embed_text/fondant_component.yaml index 4e21e3605..7b893eb57 100644 --- a/components/generate_embeddings/fondant_component.yaml +++ b/components/embed_text/fondant_component.yaml @@ -1,6 +1,6 @@ -name: Generate embeddings +name: Embed text description: Component that generates embeddings of text passages. -image: generate_embeddings:latest +image: embed_text:latest consumes: text: @@ -22,7 +22,8 @@ args: model_provider: description: | The provider of the model - corresponding to langchain embedding classes. - Currently the following providers are supported: aleph_alpha, cohere, huggingface, openai. + Currently the following providers are supported: aleph_alpha, cohere, huggingface, openai, + vertexai. type: str default: huggingface model: @@ -30,12 +31,19 @@ args: The model to generate embeddings from. Choose an available model name to pass to the model provider's langchain embedding class. 
type: str - default: all-MiniLM-L6-v2 + default: None api_keys: description: | The API keys to use for the model provider that are written to environment variables. Pass only the keys required by the model provider or conveniently pass all keys you will ever need. Pay attention how to name the dictionary keys so that they can be used by the model provider. type: dict + default: {} + auth_kwargs: + description: | + Additional keyword arguments required for api initialization/authentication. + type: dict + default: {} + \ No newline at end of file diff --git a/components/generate_embeddings/requirements.txt b/components/embed_text/requirements.txt similarity index 81% rename from components/generate_embeddings/requirements.txt rename to components/embed_text/requirements.txt index 9953d205c..c3a913339 100644 --- a/components/generate_embeddings/requirements.txt +++ b/components/embed_text/requirements.txt @@ -1,5 +1,6 @@ aleph_alpha_client==3.5.1 cohere==4.27 +google-cloud-aiplatform==1.34.0 langchain==0.0.313 openai==0.28.1 pandas==1.5.0 diff --git a/components/generate_embeddings/src/main.py b/components/embed_text/src/main.py similarity index 60% rename from components/generate_embeddings/src/main.py rename to components/embed_text/src/main.py index 9f40749bf..c8c2acfde 100644 --- a/components/generate_embeddings/src/main.py +++ b/components/embed_text/src/main.py @@ -1,5 +1,7 @@ import logging +import os +import google.cloud.aiplatform as aip import pandas as pd from fondant.component import PandasTransformComponent from langchain.embeddings import ( @@ -7,27 +9,45 @@ CohereEmbeddings, HuggingFaceEmbeddings, OpenAIEmbeddings, + VertexAIEmbeddings, ) +from langchain.schema.embeddings import Embeddings from retry import retry -from utils import to_env_vars logger = logging.getLogger(__name__) -class GenerateEmbeddingsComponent(PandasTransformComponent): +def to_env_vars(api_keys: dict): + for key, value in api_keys.items(): + os.environ[key] = value + + +class 
EmbedTextComponent(PandasTransformComponent): def __init__( self, *_, model_provider: str, model: str, api_keys: dict, + auth_kwargs: dict, ): - self.model_provider = model_provider - self.model = model + self.embedding_model = self.get_embedding_model( + model_provider, + model, + auth_kwargs, + ) to_env_vars(api_keys) - def get_embedding_model(self, model_provider, model: str): + @staticmethod + def get_embedding_model( + model_provider, + model: str, + auth_kwargs: dict, + ) -> Embeddings: + if model_provider == "vertexai": + aip.init(**auth_kwargs) + return VertexAIEmbeddings(model=model) # contains a first selection of embedding models if model_provider == "aleph_alpha": return AlephAlphaAsymmetricSemanticEmbedding(model=model) @@ -41,13 +61,11 @@ def get_embedding_model(self, model_provider, model: str): raise ValueError(msg) @retry() # make sure to keep trying even when api call limit is reached - def get_embeddings_vectors(self, embedding_model, texts): - return embedding_model.embed_documents(texts.tolist()) + def get_embeddings_vectors(self, texts): + return self.embedding_model.embed_documents(texts.tolist()) def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - embedding_model = self.get_embedding_model(self.model_provider, self.model) dataframe[("text", "embedding")] = self.get_embeddings_vectors( - embedding_model, dataframe[("text", "data")], ) return dataframe diff --git a/components/embed_text/test_requirements.txt b/components/embed_text/test_requirements.txt new file mode 100644 index 000000000..de1887bec --- /dev/null +++ b/components/embed_text/test_requirements.txt @@ -0,0 +1 @@ +pytest==7.4.2 \ No newline at end of file diff --git a/components/generate_embeddings/tests/generate_embeddings_test.py b/components/embed_text/tests/embed_text_test.py similarity index 80% rename from components/generate_embeddings/tests/generate_embeddings_test.py rename to components/embed_text/tests/embed_text_test.py index 8d9804d0f..61ddc06d4 100644 
--- a/components/generate_embeddings/tests/generate_embeddings_test.py +++ b/components/embed_text/tests/embed_text_test.py @@ -4,7 +4,7 @@ import pandas as pd -from components.generate_embeddings.src.main import GenerateEmbeddingsComponent +from src.main import EmbedTextComponent def embeddings_close(a, b): @@ -13,9 +13,9 @@ def embeddings_close(a, b): def test_run_component_test(): """Test generate embeddings component.""" - with open("lorem_300.txt", encoding="utf-8") as f: + with open("tests/lorem_300.txt", encoding="utf-8") as f: lorem_300 = f.read() - with open("lorem_400.txt", encoding="utf-8") as f: + with open("tests/lorem_400.txt", encoding="utf-8") as f: lorem_400 = f.read() # Given: Dataframe with text @@ -29,15 +29,16 @@ def test_run_component_test(): dataframe = pd.concat({"text": pd.DataFrame(data)}, axis=1, names=["text", "data"]) - component = GenerateEmbeddingsComponent( + component = EmbedTextComponent( model_provider="huggingface", model="all-MiniLM-L6-v2", api_keys={}, + auth_kwargs={}, ) dataframe = component.transform(dataframe=dataframe) - with open("hello_world_embedding.txt", encoding="utf-8") as f: + with open("tests/hello_world_embedding.txt", encoding="utf-8") as f: hello_world_embedding = f.read() hello_world_embedding = json.loads(hello_world_embedding) diff --git a/components/generate_embeddings/tests/hello_world_embedding.txt b/components/embed_text/tests/hello_world_embedding.txt similarity index 100% rename from components/generate_embeddings/tests/hello_world_embedding.txt rename to components/embed_text/tests/hello_world_embedding.txt diff --git a/components/generate_embeddings/tests/lorem_300.txt b/components/embed_text/tests/lorem_300.txt similarity index 100% rename from components/generate_embeddings/tests/lorem_300.txt rename to components/embed_text/tests/lorem_300.txt diff --git a/components/generate_embeddings/tests/lorem_400.txt b/components/embed_text/tests/lorem_400.txt similarity index 100% rename from 
components/generate_embeddings/tests/lorem_400.txt rename to components/embed_text/tests/lorem_400.txt diff --git a/components/generate_embeddings/src/utils.py b/components/generate_embeddings/src/utils.py deleted file mode 100644 index 34af078bc..000000000 --- a/components/generate_embeddings/src/utils.py +++ /dev/null @@ -1,6 +0,0 @@ -import os - - -def to_env_vars(api_keys: dict): - for key, value in api_keys.items(): - os.environ[key] = value