Merge main into feature/move-integration-test
mrchtr committed Jan 11, 2024
2 parents b42bbdd + 16888c8 commit c774827
Showing 33 changed files with 589 additions and 50 deletions.
6 changes: 3 additions & 3 deletions components/caption_images/README.md
@@ -31,9 +31,9 @@ The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
- | model_id | str | Id of the BLIP model on the Hugging Face hub | Salesforce/blip-image-captioning-base |
- | batch_size | int | Batch size to use for inference | 8 |
- | max_new_tokens | int | Maximum token length of each caption | 50 |
+ | model_id | | Id of the BLIP model on the Hugging Face hub | Salesforce/blip-image-captioning-base |
+ | batch_size | | Batch size to use for inference | 8 |
+ | max_new_tokens | | Maximum token length of each caption | 50 |

<a id="caption_images#usage"></a>
## Usage
2 changes: 1 addition & 1 deletion components/chunk_text/README.md
@@ -43,7 +43,7 @@ The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
- | chunk_strategy | int | The strategy to use for chunking the text. One of ['RecursiveCharacterTextSplitter', 'HTMLHeaderTextSplitter', 'CharacterTextSplitter', 'Language', 'MarkdownHeaderTextSplitter', 'MarkdownTextSplitter', 'SentenceTransformersTokenTextSplitter', 'LatexTextSplitter', 'SpacyTextSplitter', 'TokenTextSplitter', 'NLTKTextSplitter', 'PythonCodeTextSplitter', 'character', 'NLTK', 'SpaCy'] | RecursiveCharacterTextSplitter |
+ | chunk_strategy | str | The strategy to use for chunking the text. One of ['RecursiveCharacterTextSplitter', 'HTMLHeaderTextSplitter', 'CharacterTextSplitter', 'Language', 'MarkdownHeaderTextSplitter', 'MarkdownTextSplitter', 'SentenceTransformersTokenTextSplitter', 'LatexTextSplitter', 'SpacyTextSplitter', 'TokenTextSplitter', 'NLTKTextSplitter', 'PythonCodeTextSplitter', 'character', 'NLTK', 'SpaCy'] | RecursiveCharacterTextSplitter |
| chunk_kwargs | dict | The arguments to pass to the chunking strategy | / |
| language_text_splitter | str | The programming language to use for splitting text into sentences if "language" is selected as the splitter. Check https://python.langchain.com/docs/modules/data_connection/document_transformers/code_splitter for more information on supported languages. | / |

4 changes: 2 additions & 2 deletions components/embed_images/README.md
@@ -31,8 +31,8 @@ The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
- | model_id | str | Model id of a CLIP model on the Hugging Face hub | openai/clip-vit-large-patch14 |
- | batch_size | int | Batch size to use when embedding | 8 |
+ | model_id | | Model id of a CLIP model on the Hugging Face hub | openai/clip-vit-large-patch14 |
+ | batch_size | | Batch size to use when embedding | 8 |

<a id="embed_images#usage"></a>
## Usage
12 changes: 6 additions & 6 deletions components/evaluate_ragas/README.md
@@ -34,9 +34,9 @@ The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
- | module | str | Module from which the LLM is imported. Defaults to langchain.llms | langchain.llms |
- | llm_name | str | Name of the selected llm | / |
- | llm_kwargs | dict | Arguments of the selected llm | / |
+ | llm_module_name | | Module from which the LLM is imported. Defaults to langchain.llms | langchain.chat_models |
+ | llm_class_name | | Name of the selected llm | ChatOpenAI |
+ | llm_kwargs | | Arguments of the selected llm | {'model_name': 'gpt-3.5-turbo'} |

<a id="evalute_ragas#usage"></a>
## Usage
@@ -55,9 +55,9 @@ dataset = dataset.apply(
     "evaluate_ragas",
     arguments={
         # Add arguments
-        # "module": "langchain.llms",
-        # "llm_name": ,
-        # "llm_kwargs": {},
+        # "llm_module_name": "langchain.chat_models",
+        # "llm_class_name": "ChatOpenAI",
+        # "llm_kwargs": {'model_name': 'gpt-3.5-turbo'},
     },
     produces={
         <field_name>: <field_schema>,
8 changes: 5 additions & 3 deletions components/evaluate_ragas/fondant_component.yaml
@@ -19,13 +19,15 @@ produces:


 args:
-  module:
+  llm_module_name:
     description: Module from which the LLM is imported. Defaults to langchain.llms
     type: str
-    default: "langchain.llms"
-  llm_name:
+    default: "langchain.chat_models"
+  llm_class_name:
     description: Name of the selected llm
     type: str
+    default: "ChatOpenAI"
   llm_kwargs:
     description: Arguments of the selected llm
     type: dict
+    default: {"model_name":"gpt-3.5-turbo"}
29 changes: 17 additions & 12 deletions components/evaluate_ragas/src/main.py
@@ -11,24 +11,26 @@ class RetrieverEval(PandasTransformComponent):
     def __init__(
         self,
         *,
-        module: str,
-        llm_name: str,
+        llm_module_name: str,
+        llm_class_name: str,
         llm_kwargs: dict,
         produces: t.Dict[str, t.Any],
         **kwargs,
     ) -> None:
         """
         Args:
+            llm_module_name: Module from which the LLM is imported. Defaults to
+                langchain.chat_models
+            llm_class_name: Name of the selected llm. Defaults to ChatOpenAI
-            module: Module from which the LLM is imported. Defaults to langchain.llms
-            llm_name: Name of the selected llm
             llm_kwargs: Arguments of the selected llm
             produces: RAGAS metrics to compute.
             kwargs: Unhandled keyword arguments passed in by Fondant.
         """
         self.llm = self.extract_llm(
-            module=module,
-            model_name=llm_name,
-            model_kwargs=llm_kwargs,
+            llm_module_name=llm_module_name,
+            llm_class_name=llm_class_name,
+            llm_kwargs=llm_kwargs,
         )
         self.gpt_wrapper = LangchainLLM(llm=self.llm)
         self.metric_functions = self.extract_metric_functions(
@@ -38,13 +40,16 @@ def __init__(

     # import the metric functions selected
     @staticmethod
-    def import_from(module, name):
-        module = __import__(module, fromlist=[name])
-        return getattr(module, name)
+    def import_from(module_name: str, element_name: str):
+        module = __import__(module_name, fromlist=[element_name])
+        return getattr(module, element_name)
 
-    def extract_llm(self, module, model_name, model_kwargs):
-        module = self.import_from(module, model_name)
-        return module(**model_kwargs)
+    def extract_llm(self, llm_module_name: str, llm_class_name: str, llm_kwargs: dict):
+        module = self.import_from(
+            module_name=llm_module_name,
+            element_name=llm_class_name,
+        )
+        return module(**llm_kwargs)
 
     def extract_metric_functions(self, metrics: list):
         functions = []
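The renamed `import_from` / `extract_llm` helpers above implement a small dynamic-import pattern: the module path and class name arrive as strings and are resolved at runtime. A standalone sketch of the same pattern, using the component's default values (a sketch, not the component code itself):

```python
# Standalone sketch of the dynamic-import pattern used by RetrieverEval.extract_llm.
def import_from(module_name: str, element_name: str):
    """Import `element_name` (a class or function) from `module_name` at runtime."""
    module = __import__(module_name, fromlist=[element_name])
    return getattr(module, element_name)


llm_class = import_from("langchain.chat_models", "ChatOpenAI")  # component defaults
llm = llm_class(model_name="gpt-3.5-turbo")  # assumes OPENAI_API_KEY is set
```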
2 changes: 1 addition & 1 deletion components/filter_language/README.md
@@ -29,7 +29,7 @@ The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
- | language | str | A valid language code or identifier (e.g., "en", "fr", "de"). | en |
+ | language | | A valid language code or identifier (e.g., "en", "fr", "de"). | en |

<a id="filter_language#usage"></a>
## Usage
3 changes: 3 additions & 0 deletions components/index_aws_opensearch/src/main.py
@@ -39,6 +39,9 @@ def __init__(
         )
         self.create_index(index_body)
 
+    def teardown(self) -> None:
+        self.client.close()
+
     def create_index(self, index_body: Dict[str, Any]):
         """Creates an index if not existing in AWS OpenSearch.
3 changes: 3 additions & 0 deletions components/index_qdrant/src/main.py
@@ -47,6 +47,9 @@ def __init__(
         self.batch_size = batch_size
         self.parallelism = parallelism
 
+    def teardown(self) -> None:
+        self.client.close()
+
     def write(self, dataframe: dd.DataFrame) -> None:
         """
         Writes the data from the given Dask DataFrame to the Qdrant collection.
3 changes: 3 additions & 0 deletions components/index_weaviate/src/main.py
@@ -53,6 +53,9 @@ def __init__(
             },
         )
 
+    def teardown(self) -> None:
+        del self.client
+
     def write(self, dataframe: dd.DataFrame) -> None:
         with self.client.batch as batch:
             for part in tqdm(
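All three index components gain the same lifecycle hook: the client opened in `__init__` is released again in `teardown()` once writing has finished. A minimal, self-contained sketch of that pattern with a dummy client standing in for the OpenSearch/Qdrant/Weaviate clients (hypothetical names, and it assumes Fondant invokes `teardown()` after the component completes):

```python
# Sketch of the write/teardown lifecycle added above; DummyClient is hypothetical.
import dask.dataframe as dd
import pandas as pd


class DummyClient:
    def upsert(self, rows: pd.DataFrame) -> None:
        print(f"indexed {len(rows)} rows")

    def close(self) -> None:
        print("connection closed")


class ExampleIndexComponent:
    def __init__(self) -> None:
        self.client = DummyClient()  # real components open OpenSearch/Qdrant/Weaviate clients here

    def write(self, dataframe: dd.DataFrame) -> None:
        for part in dataframe.partitions:
            self.client.upsert(part.compute())

    def teardown(self) -> None:
        self.client.close()  # release the connection once all partitions are written


component = ExampleIndexComponent()
component.write(dd.from_pandas(pd.DataFrame({"text": ["a", "b"]}), npartitions=1))
component.teardown()
```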
30 changes: 30 additions & 0 deletions components/load_from_pdf/Dockerfile
@@ -0,0 +1,30 @@
FROM --platform=linux/amd64 python:3.8-slim as base

# System dependencies
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

# Install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

# Set the working directory to the component folder
WORKDIR /component
COPY src/ src/

FROM base as test
COPY tests/ tests/
RUN pip3 install --no-cache-dir -r tests/requirements.txt
RUN python -m pytest tests

FROM base
COPY tests/ tests/
WORKDIR /component/src
ENTRYPOINT ["fondant", "execute", "main"]

69 changes: 69 additions & 0 deletions components/load_from_pdf/README.md
@@ -0,0 +1,69 @@
# Load from pdf

<a id="load_from_pdf#description"></a>
## Description
Load pdf data stored locally or remotely using langchain loaders.


<a id="load_from_pdf#inputs_outputs"></a>
## Inputs / outputs

<a id="load_from_pdf#consumes"></a>
### Consumes


**This component does not consume data.**


<a id="load_from_pdf#produces"></a>
### Produces
**This component produces:**

- pdf_path: string
- file_name: string
- text: string



<a id="load_from_pdf#arguments"></a>
## Arguments

The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
| pdf_path | str | The path to a pdf file or a folder containing pdf files to load. Can be a local path or a remote path. If the path is remote, the loader class will be determined by the scheme of the path. | / |
| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / |
| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / |
| n_partitions | int | Number of partitions of the dask dataframe. If not specified, the number of partitions will be equal to the number of CPU cores. Set to high values if the data is large and the pipeline is running out of memory. | / |

<a id="load_from_pdf#usage"></a>
## Usage

You can add this component to your pipeline using the following code:

```python
from fondant.pipeline import Pipeline


pipeline = Pipeline(...)

dataset = pipeline.read(
    "load_from_pdf",
    arguments={
        # Add arguments
        # "pdf_path": ,
        # "n_rows_to_load": 0,
        # "index_column": ,
        # "n_partitions": 0,
    },
)
```

<a id="load_from_pdf#testing"></a>
## Testing

You can run the tests using docker with BuildKit. From this directory, run:
```
docker build . --target test
```
41 changes: 41 additions & 0 deletions components/load_from_pdf/fondant_component.yaml
@@ -0,0 +1,41 @@
name: Load from pdf
description: |
  Load pdf data stored locally or remotely using langchain loaders.
image: fndnt/load_from_pdf:dev
tags:
  - Data loading

produces:
  pdf_path:
    type: string
  file_name:
    type: string
  text:
    type: string

args:
  pdf_path:
    description: |
      The path to a pdf file or a folder containing pdf files to load.
      Can be a local path or a remote path. If the path is remote, the loader class will be
      determined by the scheme of the path.
    type: str
  n_rows_to_load:
    description: |
      Optional argument that defines the number of rows to load. Useful for testing pipeline runs
      on a small scale
    type: int
    default: None
  index_column:
    description: |
      Column to set index to in the load component, if not specified a default globally unique
      index will be set
    type: str
    default: None
  n_partitions:
    description: |
      Number of partitions of the dask dataframe. If not specified, the number of partitions will
      be equal to the number of CPU cores. Set to high values if the data is large and the pipeline
      is running out of memory.
    type: int
    default: None
1 change: 1 addition & 0 deletions components/load_from_pdf/requirements.txt
@@ -0,0 +1 @@
PyMuPDF==1.23.8
