From e67715775802e6b98133a7bab4d7ba85010d59c5 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 11 Jan 2024 10:41:48 +0100 Subject: [PATCH 1/6] Add teardown method (#767) PR that adds a teardown method to every component. Useful for shutting down database connections and clients instead of it happening abruptly after the container is shutdown. Related to https://github.com/ml6team/fondant-use-cases/issues/59 --- docs/components/component_spec.md | 35 +++++++++++++++++ src/fondant/component/component.py | 3 ++ src/fondant/component/executor.py | 4 +- tests/component/test_component.py | 63 ++++++++++++++++++++++++++++++ tests/examples/__init__.py | 0 5 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 tests/examples/__init__.py diff --git a/docs/components/component_spec.md b/docs/components/component_spec.md index 9299305a2..65e121c18 100644 --- a/docs/components/component_spec.md +++ b/docs/components/component_spec.md @@ -301,4 +301,39 @@ class ExampleComponent(PandasTransformComponent): Returns: A pandas dataframe containing the transformed data """ +``` + +Afterwards, we pass all keyword arguments to the `__init__()` method of the component. + + +You can also use a `teardown()` method to perform any cleanup after the component has been executed. +This is a good place to close any open connections or files. 
+ +```python +import pandas as pd +from fondant.component import PandasTransformComponent +from my_library import Client + + def __init__(self, *, client_url, **kwargs) -> None: + """ + Args: + x_argument: An argument passed to the component + """ + # Initialize your component here based on the arguments + self.client = Client(client_url) + + def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: + """Implement your custom logic in this single method + + Args: + dataframe: A Pandas dataframe containing the data + + Returns: + A pandas dataframe containing the transformed data + """ + + def teardown(self): + """Perform any cleanup after the component has been executed + """ + self.client.shutdown() ``` \ No newline at end of file diff --git a/src/fondant/component/component.py b/src/fondant/component/component.py index 82d539b84..7d9a86113 100644 --- a/src/fondant/component/component.py +++ b/src/fondant/component/component.py @@ -26,6 +26,9 @@ def __init__( ): pass + def teardown(self) -> None: + """Method called after the component has been executed.""" + class DaskLoadComponent(BaseComponent): """Component that loads data and returns a Dask DataFrame.""" diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 3026eb625..db3140703 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -335,7 +335,7 @@ def _run_execution( input_manifest: Manifest, ) -> Manifest: logging.info("Executing component") - component = component_cls( + component: Component = component_cls( consumes=self.operation_spec.inner_consumes, produces=self.operation_spec.inner_produces, **self.user_arguments, @@ -350,6 +350,8 @@ def _run_execution( ) self._write_data(dataframe=output_df, manifest=output_manifest) + component.teardown() + return output_manifest def execute(self, component_cls: t.Type[Component]) -> None: diff --git a/tests/component/test_component.py b/tests/component/test_component.py index 
397ab210e..191e6e329 100644 --- a/tests/component/test_component.py +++ b/tests/component/test_component.py @@ -298,6 +298,69 @@ def load(self): load.mock.assert_called_once() +@pytest.mark.usefixtures("_patched_data_writing") +def test_teardown_method(metadata): + # Mock CLI arguments load + operation_spec = OperationSpec( + ComponentSpec.from_file(components_path / "component.yaml"), + ) + + sys.argv = [ + "", + "--metadata", + metadata.to_json(), + "--flag", + "success", + "--value", + "1", + "--output_manifest_path", + str(components_path / "output_manifest.json"), + "--operation_spec", + operation_spec.to_json(), + "--cache", + "False", + "--produces", + "{}", + ] + + class MockClient: + def __init__(self): + self.is_connected = True + + def shutdown(self): + if self.is_connected: + self.is_connected = False + + client = MockClient() + + class MyLoadComponent(DaskLoadComponent): + def __init__(self, *, flag, value, **kwargs): + self.flag = flag + self.value = value + self.client = client + + def load(self): + data = { + "id": [0, 1], + "captions_data": ["hello world", "this is another caption"], + } + return dd.DataFrame.from_dict(data, npartitions=N_PARTITIONS) + + def teardown(self) -> None: + self.client.shutdown() + + executor_factory = ExecutorFactory(MyLoadComponent) + executor = executor_factory.get_executor() + assert executor.input_partition_rows is None + + teardown = patch_method_class(MyLoadComponent.teardown) + assert client.is_connected is True + with mock.patch.object(MyLoadComponent, "teardown", teardown): + executor.execute(MyLoadComponent) + teardown.mock.assert_called_once() + assert client.is_connected is False + + @pytest.mark.usefixtures("_patched_data_loading", "_patched_data_writing") def test_dask_transform_component(metadata): operation_spec = OperationSpec( diff --git a/tests/examples/__init__.py b/tests/examples/__init__.py new file mode 100644 index 000000000..e69de29bb From b422fc315b0a46ee3b67da619299adfae6f81395 Mon Sep 17 
00:00:00 2001 From: Philippe Moussalli Date: Thu, 11 Jan 2024 10:47:38 +0100 Subject: [PATCH 2/6] Add load from pdf component (#765) Fixes https://github.com/ml6team/fondant-use-cases/issues/54 PR that adds the functionality to load pdf documents from different local and remote storage. The implementation differs from the suggested solution at [#54](https://github.com/ml6team/fondant-use-cases/issues/54) since: * Accumulating different loaders and loading each document individually seems to be inefficient since it would require the initialization of a client, temp storage, ... on every invocation [link](https://github.com/langchain-ai/langchain/blob/04caf07dee2e2843ab720e5b8f0c0e83d0b86a3e/libs/community/langchain_community/document_loaders/gcs_file.py#L62) * The langchain cloud loaders don't have a unified interface * Each would requires specific arguments to be passed (in contrast fsspec is much simpler) * Only the google loader enables defining a custom loader class, the rest uses the `Unstructured` loader which requires a lot of system and cuda dependencies to have it installed (a lot of overhead for just loading pdfs) The current implementation relies on copying the pdfs to a temporary local storage and loading them using the `PyPDFDirectoryLoader`, they are then loaded lazily. The assumption for now is that the loaded docs won't exceed the storage of the device which should be valid for most use cases. Later on, we can think on how to optimize this further. 
--- components/load_from_pdf/Dockerfile | 30 +++++ components/load_from_pdf/README.md | 69 ++++++++++ .../load_from_pdf/fondant_component.yaml | 41 ++++++ components/load_from_pdf/requirements.txt | 1 + components/load_from_pdf/src/main.py | 127 ++++++++++++++++++ .../load_from_pdf/tests/component_test.py | 47 +++++++ .../tests/fondant_component.yaml | 34 +++++ components/load_from_pdf/tests/pytest.ini | 2 + .../load_from_pdf/tests/requirements.txt | 1 + .../load_from_pdf/tests/test_file/dummy.pdf | Bin 0 -> 13264 bytes .../tests/test_folder/dummy_1.pdf | Bin 0 -> 13264 bytes .../tests/test_folder/dummy_2.pdf | Bin 0 -> 13264 bytes 12 files changed, 352 insertions(+) create mode 100644 components/load_from_pdf/Dockerfile create mode 100644 components/load_from_pdf/README.md create mode 100644 components/load_from_pdf/fondant_component.yaml create mode 100644 components/load_from_pdf/requirements.txt create mode 100644 components/load_from_pdf/src/main.py create mode 100644 components/load_from_pdf/tests/component_test.py create mode 100644 components/load_from_pdf/tests/fondant_component.yaml create mode 100644 components/load_from_pdf/tests/pytest.ini create mode 100644 components/load_from_pdf/tests/requirements.txt create mode 100644 components/load_from_pdf/tests/test_file/dummy.pdf create mode 100644 components/load_from_pdf/tests/test_folder/dummy_1.pdf create mode 100644 components/load_from_pdf/tests/test_folder/dummy_2.pdf diff --git a/components/load_from_pdf/Dockerfile b/components/load_from_pdf/Dockerfile new file mode 100644 index 000000000..a0810612f --- /dev/null +++ b/components/load_from_pdf/Dockerfile @@ -0,0 +1,30 @@ +FROM --platform=linux/amd64 python:3.8-slim as base + +# System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# Install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage 
caching +ARG FONDANT_VERSION=main +RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component +COPY src/ src/ + +FROM base as test +COPY tests/ tests/ +RUN pip3 install --no-cache-dir -r tests/requirements.txt +RUN python -m pytest tests + +FROM base +COPY tests/ tests/ +WORKDIR /component/src +ENTRYPOINT ["fondant", "execute", "main"] + diff --git a/components/load_from_pdf/README.md b/components/load_from_pdf/README.md new file mode 100644 index 000000000..d257f3dc9 --- /dev/null +++ b/components/load_from_pdf/README.md @@ -0,0 +1,69 @@ +# Load from pdf + + +## Description +Load pdf data stored locally or remote using langchain loaders. + + + +## Inputs / outputs + + +### Consumes + + +**This component does not consume data.** + + + +### Produces +**This component produces:** + +- pdf_path: string +- file_name: string +- text: string + + + + +## Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| pdf_path | str | The path to a pdf file or a folder containing pdf files to load. Can be a local path or a remote path. If the path is remote, the loader class will be determined by the scheme of the path. | / | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / | +| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / | +| n_partitions | int | Number of partitions of the dask dataframe. If not specified, the number of partitions will be equal to the number of CPU cores. Set to high values if the data is large and the pipeline is running out of memory. 
| / | + + +## Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import Pipeline + + +pipeline = Pipeline(...) + +dataset = pipeline.read( + "load_from_pdf", + arguments={ + # Add arguments + # "pdf_path": , + # "n_rows_to_load": 0, + # "index_column": , + # "n_partitions": 0, + }, +) +``` + + +## Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` diff --git a/components/load_from_pdf/fondant_component.yaml b/components/load_from_pdf/fondant_component.yaml new file mode 100644 index 000000000..d1ec61476 --- /dev/null +++ b/components/load_from_pdf/fondant_component.yaml @@ -0,0 +1,41 @@ +name: Load from pdf +description: | + Load pdf data stored locally or remote using langchain loaders. +image: fndnt/load_from_pdf:dev +tags: + - Data loading + +produces: + pdf_path: + type: string + file_name: + type: string + text: + type: string + +args: + pdf_path: + description: | + The path to a pdf file or a folder containing pdf files to load. + Can be a local path or a remote path. If the path is remote, the loader class will be + determined by the scheme of the path. + type: str + n_rows_to_load: + description: | + Optional argument that defines the number of rows to load. Useful for testing pipeline runs + on a small scale + type: int + default: None + index_column: + description: | + Column to set index to in the load component, if not specified a default globally unique + index will be set + type: str + default: None + n_partitions: + description: | + Number of partitions of the dask dataframe. If not specified, the number of partitions will + be equal to the number of CPU cores. Set to high values if the data is large and the pipeline + is running out of memory. 
+ type: int + default: None diff --git a/components/load_from_pdf/requirements.txt b/components/load_from_pdf/requirements.txt new file mode 100644 index 000000000..9b0233e4f --- /dev/null +++ b/components/load_from_pdf/requirements.txt @@ -0,0 +1 @@ +PyMuPDF==1.23.8 \ No newline at end of file diff --git a/components/load_from_pdf/src/main.py b/components/load_from_pdf/src/main.py new file mode 100644 index 000000000..f088f45ff --- /dev/null +++ b/components/load_from_pdf/src/main.py @@ -0,0 +1,127 @@ +import logging +import os +import typing as t + +import dask.dataframe as dd +import fitz +import fsspec as fs +import pandas as pd +from fondant.component import DaskLoadComponent +from fondant.core.component_spec import OperationSpec + +logger = logging.getLogger(__name__) + + +class PDFReader(DaskLoadComponent): + def __init__( + self, + spec: OperationSpec, + *, + pdf_path: str, + n_rows_to_load: t.Optional[int] = None, + index_column: t.Optional[str] = None, + n_partitions: t.Optional[int] = None, + ) -> None: + """ + Args: + spec: the operation spec for the component + pdf_path: Path to the PDF file + n_rows_to_load: optional argument that defines the number of rows to load. + Useful for testing pipeline runs on a small scale. + index_column: Column to set index to in the load component, if not specified a default + globally unique index will be set. + n_partitions: Number of partitions of the dask dataframe. If not specified, the number + of partitions will be equal to the number of CPU cores. Set to high values if + the data is large and the pipeline is running out of memory. 
+ """ + self.spec = spec + self.pdf_path = pdf_path + self.n_rows_to_load = n_rows_to_load + self.index_column = index_column + self.protocol = fs.utils.get_protocol(self.pdf_path) + self.fs, _ = fs.core.url_to_fs(self.pdf_path) + self.n_partitions = n_partitions if n_partitions is not None else os.cpu_count() + + def set_df_index(self, dask_df: dd.DataFrame) -> dd.DataFrame: + if self.index_column is None: + logger.info( + "Index column not specified, setting a globally unique index", + ) + + def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): + """Function that sets a unique index based on the partition and row number.""" + dataframe["id"] = 1 + dataframe["id"] = ( + str(partition_info["number"]) + + "_" + + (dataframe.id.cumsum()).astype(str) + ) + dataframe.index = dataframe.pop("id") + return dataframe + + def _get_meta_df() -> pd.DataFrame: + meta_dict = {"id": pd.Series(dtype="object")} + for field_name, field in self.spec.inner_produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) + return pd.DataFrame(meta_dict).set_index("id") + + meta = _get_meta_df() + dask_df = dask_df.map_partitions(_set_unique_index, meta=meta) + else: + logger.info(f"Setting `{self.index_column}` as index") + dask_df = dask_df.set_index(self.index_column, drop=True) + + return dask_df + + def load_pdf_from_fs(self, file_path: str): + with self.fs.open(file_path, "rb") as pdf_file: + pdf_bytes = pdf_file.read() + + documents = fitz.open("pdf", pdf_bytes) + # get all text + text = "".join([document.get_text() for document in documents]) + documents.close() + + return text + + def process_pdf(self, row): + file_path = row["pdf_path"] + text = self.load_pdf_from_fs(file_path) + row["file_name"] = file_path.split("/")[-1] # Extracting filename + row["text"] = text + return row + + def load(self) -> dd.DataFrame: + try: + file_paths = self.fs.ls(self.pdf_path) + except NotADirectoryError: + file_paths = [self.pdf_path] + + 
file_paths = [ + file_path for file_path in file_paths if file_path.endswith(".pdf") + ] + + if self.n_rows_to_load is not None: + file_paths = file_paths[: self.n_rows_to_load] + + dask_df = dd.from_pandas( + pd.DataFrame({"pdf_path": file_paths}), + npartitions=self.n_partitions, + ) + + meta_dict = {} + for field_name, field in self.spec.inner_produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) + meta_dict = pd.DataFrame(meta_dict) + + dask_df = dask_df.map_partitions( + lambda part: part.apply(self.process_pdf, axis=1), + meta=meta_dict, + ) + + dask_df = self.set_df_index(dask_df) + return dask_df diff --git a/components/load_from_pdf/tests/component_test.py b/components/load_from_pdf/tests/component_test.py new file mode 100644 index 000000000..41c8eb66c --- /dev/null +++ b/components/load_from_pdf/tests/component_test.py @@ -0,0 +1,47 @@ +from pathlib import Path + +import yaml +from fondant.core.component_spec import ComponentSpec, OperationSpec + +from src.main import PDFReader + + +def test_pdf_reader(): + """Test the component with the ArxivReader. + + This test requires a stable internet connection, both to download the loader, and to download + the papers from Arxiv. 
+ """ + with open(Path(__file__).with_name("fondant_component.yaml")) as f: + print(f.name) + spec = ComponentSpec(yaml.safe_load(f)) + spec = OperationSpec(spec) + + pdf_path = ["tests/test_file/dummy.pdf", "tests/test_folder"] + + for path in pdf_path: + component = PDFReader( + spec=spec, + pdf_path=path, + n_rows_to_load=None, + index_column=None, + ) + + output_dataframe = component.load().compute() + + assert output_dataframe.columns.tolist() == ["pdf_path", "file_name", "text"] + + if path == "tests/test_file/dummy.pdf": + assert output_dataframe.shape == (1, 3) + assert output_dataframe["file_name"].tolist() == ["dummy.pdf"] + assert output_dataframe["text"].tolist() == ["Dummy PDF file\n"] + else: + assert output_dataframe.shape == (2, 3) + assert output_dataframe["file_name"].tolist() == [ + "dummy_2.pdf", + "dummy_1.pdf", + ] + assert output_dataframe["text"].tolist() == [ + "Dummy PDF file\n", + "Dummy PDF file\n", + ] diff --git a/components/load_from_pdf/tests/fondant_component.yaml b/components/load_from_pdf/tests/fondant_component.yaml new file mode 100644 index 000000000..b255587e9 --- /dev/null +++ b/components/load_from_pdf/tests/fondant_component.yaml @@ -0,0 +1,34 @@ +name: Load from pdf +description: | + Load pdf data stored locally or remote using langchain loaders. +image: fndnt/load_from_pdf:dev +tags: + - Data loading + +produces: + pdf_path: + type: string + file_name: + type: string + text: + type: string + +args: + pdf_path: + description: | + The path to the a pdf file or a folder containing pdf files to load. + Can be a local path or a remote path. If the path is remote, the loader class will be + determined by the scheme of the path. + type: str + n_rows_to_load: + description: | + Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs + on a small scale + type: int + default: None + index_column: + description: | + Column to set index to in the load component, if not specified a default globally unique + index will be set + type: str + default: None diff --git a/components/load_from_pdf/tests/pytest.ini b/components/load_from_pdf/tests/pytest.ini new file mode 100644 index 000000000..bf6a8a517 --- /dev/null +++ b/components/load_from_pdf/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = ../src \ No newline at end of file diff --git a/components/load_from_pdf/tests/requirements.txt b/components/load_from_pdf/tests/requirements.txt new file mode 100644 index 000000000..2a929edcc --- /dev/null +++ b/components/load_from_pdf/tests/requirements.txt @@ -0,0 +1 @@ +pytest==7.4.2 diff --git a/components/load_from_pdf/tests/test_file/dummy.pdf b/components/load_from_pdf/tests/test_file/dummy.pdf new file mode 100644 index 0000000000000000000000000000000000000000..774c2ea70c55104973794121eae56bcad918da97 GIT binary patch literal 13264 zcmaibWmsIxvUW%|5FkJZ7A&~y%m9Oj;I6>~WPrgfxD$eVfZ*=#?hsspJHa(bATYRn zGueBev(G*EKHr+BrK+pDs^6;aH9u<6Dv3$30@ygwX}fZ|TDt1G($Rqw927PN=I8~c_R69-cY5S*jJE@5Wr0JUS6u!J~3#h`{ZMo=LkbbALoD8vfgB}Fh|2>mhOnfS$3 zNV5}8Ox=$fj;C0=UKy*{myZZPRVS|0mqr-HxZAy;()@wxQ}MN`QWAZTXb3Z&Om9W2 zbnA^OWoQbAW|3W^fw#J;YzDato8*`rHQs+@W70D&SyT{wb`SN*3nI z5G%$wJlq932=n{60Eii*9H8dFih2ks?QY=>nAFL=5g^P@#b{YUEHt0S$D7WbX zx%TzvzIK%zpvzLEd9LNr0ch#LFf_(9 zEGt0C9v~%b54vynAc{~;v&2?S(-sTTft@9CABMNFZHtY1W0-99CEbUNfp_yu{LDBz z@8z^$LPN$wX4Hi+dZQs6K3QiKKF0}Nme@EII;;F}IplC(YvT*C3-Oh#(A}e5pIz01 zyR}D2|ftBF0T=1moHZy}$wS*PSCmSzHQ%x z2tCQQCx4jt7w1cuhY69~eH`31KC4)ZZJ^)f=IabocAkBPa zEeg25yPX&9-i_N(Qiq!I3RDrfx&0t^i)&MSQ1D(w%|%#LTNr>1cPiltAYO;6kBn(B?r11c^Bz~#)z5~~V+*`U)lDFtKbZ|;? 
z&4wTUtK=KE&uQIWUQv1mDE;LIhXXgx44PMa@%Z<7a& zx45^oYSnei^~%}`?!O-+cgfSmn_c?`=Gmm*Z^I(96ve&$zDs|)r84)IEEiE1kfQ$q zm3km*m1)PjdU9nkk9BTlidI1~M|O~WfP7AUu2T}d>5is9l$<%;7r2&Re06w>W$KM~ zqITBTd=Ln>^crw`_N?{ z;2d_=E0n!*NisQ|XYuX9q3+UcqdA(MC45|>2tz^c6HdZOmXTB?X2Elx@_0f)1z&-gS;UxN`>Ll-kWb0X0 zTrQis=w9sJ(q7k|@|k3SA~DJ@uMXP@4(Mgn+LJC+3F~3NHW71pIzY(aHg~{O+squi zWO_|F>78)L5*gcRXXRD9IzQ(ddSxh}E7(8sC~EYrOz$9BkSMBCkGGO9FuZ{#*mW+h zvwE7d)6Ag=a*R5URs>}qdqb_E6g)kN2Wel;pWe9=hZ)XvRZR!RQg&gxAPGj8J0!gR zrdV<2@MZQ?_Ocbd5@0zI?t>$z3eD80_h^{DI)H5lk`T4lbn8kteH3%fOBH^g26#lLN2&P^s zr&d05GDs)u_8OKzCgNxllk5pLC<2wKmghL{zW%}5^}%S$?d=3OzjaSzT3>uWYikZN z2ZcR7*L|%UMs|u)wMi7#vkN?cxlBcyAM80Tyzzv&zHMF1TH9?Mx5&E57P^)^zE5N| z^foq}!--if$Uj=U6Tc>EM!Pv)e^_SZSdvtQ=@>)(ONejQ!XW8u6>ESl<*s^6cH;Q1 z#n}nL{#|{l}}@td^zNSA;R{`3A&Jjr8L9(3^2FSyZ1W9$%;!XP#N2 z-SAzyRfxtgq^py7_3*GJFO%x_v<`xJ46`~S*IukgQDKfLxzFnS&GYL!1LA{I z!c#{A90{k(b*tUfbgjOH>}{#V;%^O+LUU<*#QkLtWzjho*Kb?Cr&wC38%wxpn}^Wy zG6EpV9x3xioCWA6H6=aE3)%jmZePu#Ji7wy0CmkDZNG`a{J1i-2`Bt&UrFb&<~V$^ zy9i`R1<35M&{mtCz144%v#7LKBTPPApjoV}#W-gDc5cn;A@Mbt#zXUK@J9^vj*ME( zo8(%K{c-KDr8n1-I&Mjn)*i|pF|7l*`fXvo8-z&j{$NOfUPM-xILbX1D29IHp|__B zL*JQ8*7-VrZVY*&$!PiE%zv@osg`qx0M8+w9iy7Az7;HYezs;5NRvrdNM~t@o}5Gc zjagk3Y_>6!Ct;ITqhu3FojJO^(^SG-($M4|frkp?4y-QoSmFcw9Z%(z?eC0kGi9@? 
zm(vAgXU|%!6_)CrnqYL-Hj@B5hA?#8C3G^cjd?0dMSZ!wbe%O4bWvlIG=nwOEInVj zhjzd`Bry8sXBTfIUr+juZH5JyE#7~UQiwR!gmG@wm}aNyo`13xEo)tzP64MWWG|j8 z8u8a2_=C2FdRZ9(eG&Au`@$mY9vvWldP-@wj5@38H0W2V8wnaQO?!)qoS_J=(ieoI zOvH}mkBRh_p1oTW66+?3u-GH2Ex~c=BQiwpJ zJlF7O2PBaCojRRL_mp44*Iq}vcRFpBD>V9M7do5{w&b;4^<_V~Vr{+O_&hz9k5Sm` zq3|%Z(6B5~wz2k0iH-QlafAa>1%ZebdxkR;6SdA?@dK|4Jf8PIO%64Fpw$6RYG2R# zX>Iq(xf`5Xk)79-@;BAQjlWu|w@Ss3sJv3Ew&%lBu-H?vYsC8XPJD!lkv*A~z_-k= zLOaM?B5}$Sf-KF5BWHoB51WFA{GlweQna618{*tqVn)YKUVq?khU_=QER9uW?N17xgAponbjg0W`=>f;sulH3?st)Y_@k$We2-__a>^{E78lUiI13qq!3# zwxMEl75MK1q`~J>ST#?`mUx#vr%-jwpZ+DV;W!0KNkZmO#sK)zt)H@`EQl6RRWhwb z0&E7|fG~@z)wlK1-RsxN#8Gr)D5=xpv=b}=CWPbwz@(9bIhD0Crd-Q>qEo>~Gh{X7 z77AK5>TfF0wK!?7Nx!<5uDy?D{Qg$SEc_R3J9EuH!Z@qmEJ*QRRHd3BPirM6783nv zAnab$>rhdDJ6pO@%Ox(}BYw{Ba<3|=A%Fg5_Hfxj{%CfzZCFO{?%h&=?%CNBvi&p; z(otqN>+5giLLa^*G?xzN30=IgQrV+r7dW4bX;zKtuD)O$UnwAKC?CpkPt{77nUArH ze-jKcCfRrOlp(Q^b&W}mrgt4n%wikNxeSBBE_n>K-IOIzi6!<)xGRYA)wGgqp^s@d46N#krDHPc#9SOgXhI7Vbj?B z%c6@8dCOGPYBoNE#3N7HD^ihbC9*xGm6chu;?fcuv)s01keHHZ1vXl5D;29O7wZBr zyPzyLZHKMtUI%PK+*X2zTFtaDzU1qn(H=hRRj-SoJw7I5i%4b0u=&InEAKgoae-lp zXk0SkjlJ52HruS*1QykTZ&aCN`PbcKuw$1st{peJ@&aF^aR@~{XA@L&YvK%+VU}G4 ze5iuesu&i6=*#nvHbm_v-ZLr5^Ij#|YSAper4XpsH;0x(2h1-tIobIy;0~2a( z!G($SB!iu#P;;hGeI~C`O=-3|d~zoB0!`*JrU-)Ko_X5#kSpy5o^z49RG;{j#l~45 zF?X9Ih4IdviT(8@+q|`BveLTprbESZ6^2I&ew|V3pDXRe9gSyXT)zzqKQ;gCD;p+( zM)2(;YJ%P5)X(N3ZSn>dn6UIcEcvQOXZBn}uD!7V0yXr$f+d@eTSYoquPit2S8cPW zA8t3dX)Cv{0cKF`@e|PP(xS0|z2_R0(P6)#+kC$0^5- z$7Hs|bOQanE z1oJ;uh(dYiDt}mVmtC3&HaGT6-dY429v#ySHJ7V)C8ow=PSmnEI)=b3_RJsU(S*+J zV$p3>RkK?DFvTc;(-T=h!1u~CP!pE=0eSSu#c@N7S0Z57CPg}!5z{QL#`2v?DJDt^ zCGN{0p-&&=)Sb28Xlo;ZXc^CGdwL9prf30uu$y5aPeWD6WIk4%%~DEhTiwOvy!rS% z&3z#DWo2qBA*=M2xIu=_R0sbrmP;Y?_rRa^k}3WYU6n9H^(})Zi-woMKKXfgbab@J zWx3DUr0MLpdDYk_LO8As}d*Z=x^K+uIv#T&SnY6&C$9 zBn1u`G#TBt+n5b%a;Cr0h^sm5Fl^OdxJ^8IebW);DWATq#Ba=#rggj*wNKy5NMzz& zBm`bk9bcSVPJbC`dHrI>o^=LSvTFpT`VAK`x_naOpvS~*l2$1vIk$avBA!|aeZ+7c 
z$_9Zzh>fc4$uX&w@-$VORCscG(B)OA@SPj>BNY3gxkkcPgNi9bE=?&3A4`3ekrdsb zn~`M;p8I>4?@@ZI{9Afv(tC@pp@Oe5BYUw-%&J_WaTBGls)&d8q?t$i<<@=_CNfH! z4H!ww7#gkp_^`bxZaJI9@C+A9x7@E1ZRoG5PL?w3GDi>`8Qq%I+0ygfT78%{Zt#mP zqX0CzaHKn@hAOQsv=^8UbfpuyFnT8Ht++Vmmx$~09!e{5t8fMkEjr~tfIxMlIpr4zGwvEIWKC2`Q#C)c7QF9wet?hE zLKoU?t@nqm=iBc` z8_((*(i(g}7z)3{%SJ!uya{?Ir-2^Fiap*VC4pF@N zpL5F*DG+(taLhdu4DbyAP(0&60n@%?G~hHugBI^-X6@_YOu}8UqwbQ8V`2vwDRLMz z)aRFo+r1f?5idT9xRF`cjgx$a-IpH3AH|bs$emw}d23*3aU0hYNh4(D0o-Z+wIX{d zeann?lzjgsAt62`er@<$`G755?i7tl%CHNgXp}#j>j&S1n5wZ;ofNbI>B2*4L1}@3 zq(LzPqn()w{KBsX!5*a&=dv<}t=R%II;TcQatbnKM7S4Q1PQIoT=^$#=>Y(m{mBYtl5W z6}|l4kxikOcJ`C3o{TSxIi?8|N6sH7Lkhq5qttl@uBTA|-cBluU$hU0&xYKvNidrL z4q>|j76}G1Db23Fa|XlFm%W&jW0h#7B$_FD-ZhqJ5#7i!0ZmCrereX z|Jlf`<1zR2akFe|boWv-r=}kM03o|%$mZA7Of2T99u~e56~6sh$P=yk9f!H6msn)n zvFOLF?W?iqi6fK9C)a42Sgt0kz4#M6 z-UY6451Er~=V;ITs1O-q*>}{;bs74MMZ(Z&=Z{5#q+i@cw^vI#0|Dh~-Dh-tn2I(S zTXXp-bLEG{p0#BbIqIcTM|DWZmr`&br8u)jQ`CR*^+g_fIX%=K+)x}F%Oak-Uh$6nIHUavnNV5M7YffU80QPRD%y>T{bIzn<6Rsy zb6cW6`?0EwSn;uJddPn@`?^Cry2s(6ccP1ykKr!kmDg2~zbTJq@+e(z5N>ZNr|8$j zPi-~ofp7E|Xx1#H+f@UR@AS}iLP!}}dRwf{u!avAq-_hNw#uaoOD{2jo*eRn8$~bDK`h1&ssOC6ekGV38+hU!KR z+kpnSzT;y#o|V2h|F?SY4-z1MFxz0;)@Lk`H>Cj zSl@fR%*@F79;HJcsX%L8_d!%TwmQyi$|n&C{oBMJ9~Xm!@@#lZdz(WB9SgJ#NIC%@ zy+~ZnI|4E`7f@W0Y9I@N7UTs1fTPD-ZiU%Lr2MnP+2h8AGh?(WGVf>h@W-_M>jRkD z(KNxvo(UJ7)o+*t%fCcM10;2XM$1NAFKwhp(c917^io_ynn-yv58IFIF*UJUw*2Ma zm?a-a1yp9B?WxpLzap-c^$HKkX_IfT_W8Lqaltl*A%vZSZWAe`Kv}vjz}>Tc;Hw9T zA+Nc49X&{WDmxY~ReV0YceXdL!$9mTL$Q@_vXIW6I{G=`$KR7jFcE&IsHwnKX;KldV#YL z(xwKAB5cFiz+r6m*5iJvo&E)XQqVWjmA}BfyVS&dm9&Y%$Sp^sW!JE3iI0v(kQHdo zmhWk|gC!e@CFKPv4BE*U;mYo0y}J0J-Fhu!c%v+paQf9+3Ed2EkfPt(D7|Ok#t)^PGr3Y)RGfvO=k;@Xry=Cf3fLCQ# zi`%oCt+vyB-t{iEgI&+2dczmnMXj>EOmSpMuuL8Ob`1$D;fc$wM6j2HH4Q$ zqaoj&M$2sLhpptdJMbs!krJId=iOd}HdP4Lt@yf42OZ{pOoQ4_gShz_sMoWYX}yQd zDQ8(tc7UvTt%`0#?9K!C^J>GpucEnBhnsWg102Z=uzOlwez^q^j7nV$krID#wC}A$ zcRfc2)T5Y~({6@1`{yL-Lzs;miT@C9|1SIFBMK7cz*E;v2H|EStZphjfb5mGMpw{q 
z!pl;Vw772tuvDH4o$;j4u8)@=m+&BIf4Ix(u75P?Q{4Y8^uvpq)mCW(enuQc)hx$B zOY{`_*%~bm%k*x6y;)D8_-yYbMsC8y#1H}89X;M=a#*HT>d*NFf}x$pQ&X?nFtvzA zKH|l8y;frsm|&}<%&*}Yu}Yn0M=Jy8qe%<1qXRR%Nut}Aqr+1pQS*D7Cp`+8Y`RO02p14DyVOmSYlEzZ;9&JzYhtybMZ%e4s zlks=V(+aJ!LK-()3ox`%9c)lx#3#y4{ulL6KpG|&>9`n?Uh#m3G-mZy-3h98Scyja zH^3Pb7?P z+2hAkyvg}g$#)n$Gs2fL19JNOZ|~>Nx(|}lmwesC!>?Y~72mpf4XZ8t^TIwbCk;i0 z+a2ymSZ^=OrtrSH!(y#Vn!8KWk#O7<1-!if+`dDDy18U7wS3k$lIeM}Z0fhYqI)+x zo*o4*S$S|hGf6vL>PaQ(OQ_%eskx-G-FV|dXHbTH<#w@RbeIx9I$d$xqHh`{*&d3y zevlYNk)}w@cuu4A$^DYJsOvO7VBaom@Rx@gb$V5IKJ{Xue16H-1H0j=U0brW-aVRG znWCQRkESBmD^4?a7mB@!jf2>(Hs=Bd-;XX1oEilevb9axB^NhIPLO>jl03S+Rw|fx z&oIsIk(~W!4$zzKF|uSR<@S#;{r;fKup)iDaxz_9JouroY>XHcrN(Mm@UHV?-8bCh zXGfY~7U`rCasv(h-R*ava)^ zF1`BMT*n3xQBTdM?`n&h2Ecf*XXuLo7Zyl_El(v~oh>}mK01$%0a@#uzyiX_g>Bav2XWwH%YekAxU%pBT!p*?%cS#zA zv;^eDC#KZP@7o=^GDc_V8<3w>`*L(+=A#(fcH)dGjqM}Vk_el+c>B`{9xm<>IZ-Zm zLL!-Yf*3nju_(8ZGUd9*K`iofWW+BYFnZF&+a|=yxqV?oUOcG#ulnSR$DMs|e5Tph%WW zVjzE3nMh7+rG!}av)+~;o$#+EHyPX zzOUO?^#)Jh*t^b7pTW+I%f;xy&JMPCO&5RR``BmHX-Mw{qoJp9BjKea$;A9%>-iEZ zvuUBm%0j5UWax~`ue!K6dDdip+zs3f{+qQKqH;9C(1Z@95()-Ew=`BdLh2VS3zI8qYGH&&7m9+vpUc+x8l!i-ATXKhw34XL2;ya_VIQz!OL^)8mtqnb?q=~&^h-$;Zn^HRZ2p(gH z39An;`AWT=i&VP0u&CUe7OYW51Icv=q%Vc7%Zm z_uAp9n}osEUdk2*pV)*i`WRSa-FWtCwGqS-75@K#V0)r;+0(0XVp9vnb7lWiMj!q= z>Zf(ioa@gSwA55Jil$lh)%4U<)$j@HTQU2KwuUUsZA*2O^QTKobak8g0Qb~ROMTW7 zfTF2yF*na6i(lQ*Nq^rPen^0>$$b`K!Kp{FVa-VF`kCiXZg0Vtr}i*rcpny_YOR!} z+?Jiv?dWlT`}o$s9Fxt%%684d7ek-q-Q~jS*I5+8HtvSw+Rp!D=+gVr!gqcYy9K74 z&eClx6f6{1Din;ynjz?XZlJ~W7^A@0wiHIt8$aou;f>MYpU%gUlDwAK*nX0#vHtyl z_C=B+ZkOffY|oR^2>(+IlZCTMFirZMhn>bqzR=38hvJpcM4-@gUYY7_k^G*FW9;5r zc9q4c>C?hd{uS3{MThN*(w!3e05e?bI#SNlo$U&%>((Dz0_JeqbG|}!wI$& z%q2JQ)Vas;i0RYqNXW!CC~QK%u$K$beGI zT2KuzMjus26(zmofK;m2gY%d*o~sHBKA#`RBNc9c*-GLmbgh?*9V;^TBSot2E%~Q5 zl+R!WA_h_JT;+irbJ#Z-tSy-;B^t&&dOSwPV(T!CB)no8Y4sP%k(MD^0P!NL1vK&7 z`3luW2$gkI#Zf>IZT2=m4R&e@d zeo#B=Q|9`w8}%|)f%GBjYO01&Dk5qjm$+#1yia#CE=Sh~88Vdp%|VU}0a6mF@JkhUY&~W3f#rHK-1Qdo 
z>0*z5?#-hQUY}k^X7~1bkI?($-~3#c3mF4Cl@2%|0@1=ARZ z^qlNaN63&>;O_~mmto}?tAhznb}p;GpyIq1Z^yf<_6Ui~cpbbP;uV7W!+ke>wYG-f zPPz2~%UgSs(>vsKFle%uo=WIDYz;BR!doAy)aQ0QCpE_Wz1XK+3Kpr=V_H8w zqzaizn9ALx#?fo-N)_CtENYH*1|ID|x=xa9d#;9~1Wgrcx^8=evrfky*Xj`269~A;kh^O|ewZnM}=SmM7NX=?h#jjLh&1kIT+A z)If4luYo@s+e_L&eRJ$gw1`)>u#efOq=M0iYIPS$GII0z`T56eNxK@~Y%*^~Q&w$1b)jM9Z~kuRc~YX`6r#ySCskW5cq|#a39s;ZiaL~OdEpgu z1k*sKkLZ&?6fAi=)77yKI1xii%)@DG8r}663xkJcwLTj?s`h{GP@_2}`A|;w7zrzk4QOQ*O$(e|M^<`vLD*1^i>Nr*= z+A`y@f{!zLi)ys9OrFM5`Qw0292Ciyq>zC>8(TkG1O;#UUh?#I08kuwpS_vhufJ0v&p^Yr`=^WG7!qVG(8n9u7=J64fr zQq7B|9rzl7s)I_|8UeVp?=cqGILQ}0O(n+^vJz=vFBU9JmG$=DWzi+qCHw@D0a7`M zA`%pmU8+8W{u0{2*^tg&3;I&i`4`{YJe_n8 z{viTJZL?$}#l9w${3mydrW>Z%nY!WXf$HJv5$Zw4F%7^mXWsZ-s&olv31;C*KlH)j z?j?Eika^cI`l>)WJ*ga?%>0HwJm{%<)OP8pdvwMG@fm;Ca`jfy7ixY-sic42*f&ld zJg3(O0~;=Zsp@cdUj@&Zj~#~LX=F5Ws@!Ik0-~(wlbJO6&)S~s6WrAW9lrQ%6+S03 z&P&xJ{;BC%2s%J#uxZy3=Fc}fkwE9(T}QAK9b{FT!L3^PQ~;#X$T|9v&JFq)ru$h|ls zvPxYyWT}V&Dol3#)t6pVE4nIClEq=r++eGcG-tkOW4{n$Ra~3z?`@_gXRUiR`SrhY4K z#>C+t>pNtm>!Zw*;p^qI0|g<)Ob`r0jaN6asw2ZGLT}bMbHnQ$OH8cR7{Rq?=4%&x z2Qe&O`w$~b%fuo>fkgT`PVx=uto@&SdDpIXL)<da|A*x(b?o zdUj^iN+B9%;2{1URo7=%m@r*RJi3fQNO_`AZY;b#tClm;A}NQF#!Y;pMMdh=^fO@9 z>J>Xv^joKJM>M7x=xh!oSLO3JlxVwTn$DPHdGsnkAvB)9d)IE6ZHgd1vd+Z;W1d682CBy4zti z&6;T6!rzSKIy&zKKfAx9J%7q-=Mac{u-_GIYEaZt*`h25Ne?ch`E_c2{pGA<;nVkx z102u6#||N$g5MhA{!rFwaI(;8$S{1DePGc^L~j6?Q$2QMIO09 zPdma#_kX(|;oOau(pX877ac9V4O8x3g{Mdbr6oS)7 zN0v#H_j!bhUNl;q>GrkeA~){;lCg@&Mg5(z%E1HV`d7{>_}@9JZ(VJn>=HKC4q{My zLpw8D2OD@&E}T?=SV7rE-XI?4H+E(aOI8sZOC$NW=!leE6MG6ycn2;fB4XpB!^#Z= zQ?P=-+!R0#4h{+c2LPbUF6{uZG&6i-ZDI+f;6P`8V{ZtxcA((p;6i6ds6r4x005m` z6k;m{H8U}FK+J;+syaZe)G2u2J;eI(G+`)^0+C~@0#BIzJLi_?-}e8NR15?I|34|k zx>2LneiYApj|7nW4k1sp9h-vz^G);Jq7ONB*clw!(IJ2QT3sYWS)>yb_Ual2Um3r5 zw706UJD48HLY73$&Gm=sl|EYND&Uk>VT!eN_p49f6HS<{TU>u{4&#WYh1dwy^E8il ziH`_=$2m8k)y$Q2yDZQluP+AZbND!Yi7Co@fwHnw2pV1bo*=wGx2n7Urt$y1@imz1&#&nK47Nw zT-dLY@^1NHY?5B#-Qf9?`lA_={@NnLpmwJGQG7&oU}0>) 
ziZ`GdjY(jIKi2Q?e+d=de}nq3pkP;ZG;lyf$Xh!{=x?qF#2$)p%>NM^W_I=tqNWf# zgv;e1fAtY=)-W@2FtyhKb8%3Bfj|mw00#vR4=)857d&XdU z(4fLD4>dA_AWjHkeJ)-u3LZ|NF1w_ijiW6*A6^xXD#Y5}7O{k(E4!#F{9rhl8A4Sg zMcAb&9N>rx39*a9v4(4~r$8jq|MLt0{*hTPYU2nu0sub&aQG~$!9>qU@%LGVw1{ZAdD5crj3WAdl2KV62-uIT7sX=aUZ*>8aV1F3(c z_P=p-FtxG!8!9*^U<3>RcoByeFaipAK|lhB5)AqaI)n^@hmeEwxOw0OKK@%C0pZ{C z5o^F{FbEE(DEt!$_$B<8DlYiaV7ME855ql#Py+_S#o(c8`L;d6lqRR~$cn(zq-4};(pf)4`xt=`PWS`7YO27?$MdgtpDP{`vCa4 z{2x3Z5bm@8-~oUj5Zv+q!Gl}N`CoDX0N4M*gTIpgb1nb?;)Y)s|FIqb0Ot6gw!m#h zTnhg~j+YZ2)c?r?0yzIm4hZ1=FTFrc;D6}=a`OJeW(PY6{AFi{I1;L6ZcsR+>?$@k z@FNVDLEL!K*2XpzfZwk|I3Y%%Lm?mm76XGtKw?0k2(JV$kO#;s#>p!o!6gRf5#f;l j@(7{-|3%=32kuUL2Z)`+Z(jm{U>-0!Ev>ks1p5C2Hj`#V literal 0 HcmV?d00001 diff --git a/components/load_from_pdf/tests/test_folder/dummy_1.pdf b/components/load_from_pdf/tests/test_folder/dummy_1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..774c2ea70c55104973794121eae56bcad918da97 GIT binary patch literal 13264 zcmaibWmsIxvUW%|5FkJZ7A&~y%m9Oj;I6>~WPrgfxD$eVfZ*=#?hsspJHa(bATYRn zGueBev(G*EKHr+BrK+pDs^6;aH9u<6Dv3$30@ygwX}fZ|TDt1G($Rqw927PN=I8~c_R69-cY5S*jJE@5Wr0JUS6u!J~3#h`{ZMo=LkbbALoD8vfgB}Fh|2>mhOnfS$3 zNV5}8Ox=$fj;C0=UKy*{myZZPRVS|0mqr-HxZAy;()@wxQ}MN`QWAZTXb3Z&Om9W2 zbnA^OWoQbAW|3W^fw#J;YzDato8*`rHQs+@W70D&SyT{wb`SN*3nI z5G%$wJlq932=n{60Eii*9H8dFih2ks?QY=>nAFL=5g^P@#b{YUEHt0S$D7WbX zx%TzvzIK%zpvzLEd9LNr0ch#LFf_(9 zEGt0C9v~%b54vynAc{~;v&2?S(-sTTft@9CABMNFZHtY1W0-99CEbUNfp_yu{LDBz z@8z^$LPN$wX4Hi+dZQs6K3QiKKF0}Nme@EII;;F}IplC(YvT*C3-Oh#(A}e5pIz01 zyR}D2|ftBF0T=1moHZy}$wS*PSCmSzHQ%x z2tCQQCx4jt7w1cuhY69~eH`31KC4)ZZJ^)f=IabocAkBPa zEeg25yPX&9-i_N(Qiq!I3RDrfx&0t^i)&MSQ1D(w%|%#LTNr>1cPiltAYO;6kBn(B?r11c^Bz~#)z5~~V+*`U)lDFtKbZ|;? 
z&4wTUtK=KE&uQIWUQv1mDE;LIhXXgx44PMa@%Z<7a& zx45^oYSnei^~%}`?!O-+cgfSmn_c?`=Gmm*Z^I(96ve&$zDs|)r84)IEEiE1kfQ$q zm3km*m1)PjdU9nkk9BTlidI1~M|O~WfP7AUu2T}d>5is9l$<%;7r2&Re06w>W$KM~ zqITBTd=Ln>^crw`_N?{ z;2d_=E0n!*NisQ|XYuX9q3+UcqdA(MC45|>2tz^c6HdZOmXTB?X2Elx@_0f)1z&-gS;UxN`>Ll-kWb0X0 zTrQis=w9sJ(q7k|@|k3SA~DJ@uMXP@4(Mgn+LJC+3F~3NHW71pIzY(aHg~{O+squi zWO_|F>78)L5*gcRXXRD9IzQ(ddSxh}E7(8sC~EYrOz$9BkSMBCkGGO9FuZ{#*mW+h zvwE7d)6Ag=a*R5URs>}qdqb_E6g)kN2Wel;pWe9=hZ)XvRZR!RQg&gxAPGj8J0!gR zrdV<2@MZQ?_Ocbd5@0zI?t>$z3eD80_h^{DI)H5lk`T4lbn8kteH3%fOBH^g26#lLN2&P^s zr&d05GDs)u_8OKzCgNxllk5pLC<2wKmghL{zW%}5^}%S$?d=3OzjaSzT3>uWYikZN z2ZcR7*L|%UMs|u)wMi7#vkN?cxlBcyAM80Tyzzv&zHMF1TH9?Mx5&E57P^)^zE5N| z^foq}!--if$Uj=U6Tc>EM!Pv)e^_SZSdvtQ=@>)(ONejQ!XW8u6>ESl<*s^6cH;Q1 z#n}nL{#|{l}}@td^zNSA;R{`3A&Jjr8L9(3^2FSyZ1W9$%;!XP#N2 z-SAzyRfxtgq^py7_3*GJFO%x_v<`xJ46`~S*IukgQDKfLxzFnS&GYL!1LA{I z!c#{A90{k(b*tUfbgjOH>}{#V;%^O+LUU<*#QkLtWzjho*Kb?Cr&wC38%wxpn}^Wy zG6EpV9x3xioCWA6H6=aE3)%jmZePu#Ji7wy0CmkDZNG`a{J1i-2`Bt&UrFb&<~V$^ zy9i`R1<35M&{mtCz144%v#7LKBTPPApjoV}#W-gDc5cn;A@Mbt#zXUK@J9^vj*ME( zo8(%K{c-KDr8n1-I&Mjn)*i|pF|7l*`fXvo8-z&j{$NOfUPM-xILbX1D29IHp|__B zL*JQ8*7-VrZVY*&$!PiE%zv@osg`qx0M8+w9iy7Az7;HYezs;5NRvrdNM~t@o}5Gc zjagk3Y_>6!Ct;ITqhu3FojJO^(^SG-($M4|frkp?4y-QoSmFcw9Z%(z?eC0kGi9@? 
zm(vAgXU|%!6_)CrnqYL-Hj@B5hA?#8C3G^cjd?0dMSZ!wbe%O4bWvlIG=nwOEInVj zhjzd`Bry8sXBTfIUr+juZH5JyE#7~UQiwR!gmG@wm}aNyo`13xEo)tzP64MWWG|j8 z8u8a2_=C2FdRZ9(eG&Au`@$mY9vvWldP-@wj5@38H0W2V8wnaQO?!)qoS_J=(ieoI zOvH}mkBRh_p1oTW66+?3u-GH2Ex~c=BQiwpJ zJlF7O2PBaCojRRL_mp44*Iq}vcRFpBD>V9M7do5{w&b;4^<_V~Vr{+O_&hz9k5Sm` zq3|%Z(6B5~wz2k0iH-QlafAa>1%ZebdxkR;6SdA?@dK|4Jf8PIO%64Fpw$6RYG2R# zX>Iq(xf`5Xk)79-@;BAQjlWu|w@Ss3sJv3Ew&%lBu-H?vYsC8XPJD!lkv*A~z_-k= zLOaM?B5}$Sf-KF5BWHoB51WFA{GlweQna618{*tqVn)YKUVq?khU_=QER9uW?N17xgAponbjg0W`=>f;sulH3?st)Y_@k$We2-__a>^{E78lUiI13qq!3# zwxMEl75MK1q`~J>ST#?`mUx#vr%-jwpZ+DV;W!0KNkZmO#sK)zt)H@`EQl6RRWhwb z0&E7|fG~@z)wlK1-RsxN#8Gr)D5=xpv=b}=CWPbwz@(9bIhD0Crd-Q>qEo>~Gh{X7 z77AK5>TfF0wK!?7Nx!<5uDy?D{Qg$SEc_R3J9EuH!Z@qmEJ*QRRHd3BPirM6783nv zAnab$>rhdDJ6pO@%Ox(}BYw{Ba<3|=A%Fg5_Hfxj{%CfzZCFO{?%h&=?%CNBvi&p; z(otqN>+5giLLa^*G?xzN30=IgQrV+r7dW4bX;zKtuD)O$UnwAKC?CpkPt{77nUArH ze-jKcCfRrOlp(Q^b&W}mrgt4n%wikNxeSBBE_n>K-IOIzi6!<)xGRYA)wGgqp^s@d46N#krDHPc#9SOgXhI7Vbj?B z%c6@8dCOGPYBoNE#3N7HD^ihbC9*xGm6chu;?fcuv)s01keHHZ1vXl5D;29O7wZBr zyPzyLZHKMtUI%PK+*X2zTFtaDzU1qn(H=hRRj-SoJw7I5i%4b0u=&InEAKgoae-lp zXk0SkjlJ52HruS*1QykTZ&aCN`PbcKuw$1st{peJ@&aF^aR@~{XA@L&YvK%+VU}G4 ze5iuesu&i6=*#nvHbm_v-ZLr5^Ij#|YSAper4XpsH;0x(2h1-tIobIy;0~2a( z!G($SB!iu#P;;hGeI~C`O=-3|d~zoB0!`*JrU-)Ko_X5#kSpy5o^z49RG;{j#l~45 zF?X9Ih4IdviT(8@+q|`BveLTprbESZ6^2I&ew|V3pDXRe9gSyXT)zzqKQ;gCD;p+( zM)2(;YJ%P5)X(N3ZSn>dn6UIcEcvQOXZBn}uD!7V0yXr$f+d@eTSYoquPit2S8cPW zA8t3dX)Cv{0cKF`@e|PP(xS0|z2_R0(P6)#+kC$0^5- z$7Hs|bOQanE z1oJ;uh(dYiDt}mVmtC3&HaGT6-dY429v#ySHJ7V)C8ow=PSmnEI)=b3_RJsU(S*+J zV$p3>RkK?DFvTc;(-T=h!1u~CP!pE=0eSSu#c@N7S0Z57CPg}!5z{QL#`2v?DJDt^ zCGN{0p-&&=)Sb28Xlo;ZXc^CGdwL9prf30uu$y5aPeWD6WIk4%%~DEhTiwOvy!rS% z&3z#DWo2qBA*=M2xIu=_R0sbrmP;Y?_rRa^k}3WYU6n9H^(})Zi-woMKKXfgbab@J zWx3DUr0MLpdDYk_LO8As}d*Z=x^K+uIv#T&SnY6&C$9 zBn1u`G#TBt+n5b%a;Cr0h^sm5Fl^OdxJ^8IebW);DWATq#Ba=#rggj*wNKy5NMzz& zBm`bk9bcSVPJbC`dHrI>o^=LSvTFpT`VAK`x_naOpvS~*l2$1vIk$avBA!|aeZ+7c 
z$_9Zzh>fc4$uX&w@-$VORCscG(B)OA@SPj>BNY3gxkkcPgNi9bE=?&3A4`3ekrdsb zn~`M;p8I>4?@@ZI{9Afv(tC@pp@Oe5BYUw-%&J_WaTBGls)&d8q?t$i<<@=_CNfH! z4H!ww7#gkp_^`bxZaJI9@C+A9x7@E1ZRoG5PL?w3GDi>`8Qq%I+0ygfT78%{Zt#mP zqX0CzaHKn@hAOQsv=^8UbfpuyFnT8Ht++Vmmx$~09!e{5t8fMkEjr~tfIxMlIpr4zGwvEIWKC2`Q#C)c7QF9wet?hE zLKoU?t@nqm=iBc` z8_((*(i(g}7z)3{%SJ!uya{?Ir-2^Fiap*VC4pF@N zpL5F*DG+(taLhdu4DbyAP(0&60n@%?G~hHugBI^-X6@_YOu}8UqwbQ8V`2vwDRLMz z)aRFo+r1f?5idT9xRF`cjgx$a-IpH3AH|bs$emw}d23*3aU0hYNh4(D0o-Z+wIX{d zeann?lzjgsAt62`er@<$`G755?i7tl%CHNgXp}#j>j&S1n5wZ;ofNbI>B2*4L1}@3 zq(LzPqn()w{KBsX!5*a&=dv<}t=R%II;TcQatbnKM7S4Q1PQIoT=^$#=>Y(m{mBYtl5W z6}|l4kxikOcJ`C3o{TSxIi?8|N6sH7Lkhq5qttl@uBTA|-cBluU$hU0&xYKvNidrL z4q>|j76}G1Db23Fa|XlFm%W&jW0h#7B$_FD-ZhqJ5#7i!0ZmCrereX z|Jlf`<1zR2akFe|boWv-r=}kM03o|%$mZA7Of2T99u~e56~6sh$P=yk9f!H6msn)n zvFOLF?W?iqi6fK9C)a42Sgt0kz4#M6 z-UY6451Er~=V;ITs1O-q*>}{;bs74MMZ(Z&=Z{5#q+i@cw^vI#0|Dh~-Dh-tn2I(S zTXXp-bLEG{p0#BbIqIcTM|DWZmr`&br8u)jQ`CR*^+g_fIX%=K+)x}F%Oak-Uh$6nIHUavnNV5M7YffU80QPRD%y>T{bIzn<6Rsy zb6cW6`?0EwSn;uJddPn@`?^Cry2s(6ccP1ykKr!kmDg2~zbTJq@+e(z5N>ZNr|8$j zPi-~ofp7E|Xx1#H+f@UR@AS}iLP!}}dRwf{u!avAq-_hNw#uaoOD{2jo*eRn8$~bDK`h1&ssOC6ekGV38+hU!KR z+kpnSzT;y#o|V2h|F?SY4-z1MFxz0;)@Lk`H>Cj zSl@fR%*@F79;HJcsX%L8_d!%TwmQyi$|n&C{oBMJ9~Xm!@@#lZdz(WB9SgJ#NIC%@ zy+~ZnI|4E`7f@W0Y9I@N7UTs1fTPD-ZiU%Lr2MnP+2h8AGh?(WGVf>h@W-_M>jRkD z(KNxvo(UJ7)o+*t%fCcM10;2XM$1NAFKwhp(c917^io_ynn-yv58IFIF*UJUw*2Ma zm?a-a1yp9B?WxpLzap-c^$HKkX_IfT_W8Lqaltl*A%vZSZWAe`Kv}vjz}>Tc;Hw9T zA+Nc49X&{WDmxY~ReV0YceXdL!$9mTL$Q@_vXIW6I{G=`$KR7jFcE&IsHwnKX;KldV#YL z(xwKAB5cFiz+r6m*5iJvo&E)XQqVWjmA}BfyVS&dm9&Y%$Sp^sW!JE3iI0v(kQHdo zmhWk|gC!e@CFKPv4BE*U;mYo0y}J0J-Fhu!c%v+paQf9+3Ed2EkfPt(D7|Ok#t)^PGr3Y)RGfvO=k;@Xry=Cf3fLCQ# zi`%oCt+vyB-t{iEgI&+2dczmnMXj>EOmSpMuuL8Ob`1$D;fc$wM6j2HH4Q$ zqaoj&M$2sLhpptdJMbs!krJId=iOd}HdP4Lt@yf42OZ{pOoQ4_gShz_sMoWYX}yQd zDQ8(tc7UvTt%`0#?9K!C^J>GpucEnBhnsWg102Z=uzOlwez^q^j7nV$krID#wC}A$ zcRfc2)T5Y~({6@1`{yL-Lzs;miT@C9|1SIFBMK7cz*E;v2H|EStZphjfb5mGMpw{q 
z!pl;Vw772tuvDH4o$;j4u8)@=m+&BIf4Ix(u75P?Q{4Y8^uvpq)mCW(enuQc)hx$B zOY{`_*%~bm%k*x6y;)D8_-yYbMsC8y#1H}89X;M=a#*HT>d*NFf}x$pQ&X?nFtvzA zKH|l8y;frsm|&}<%&*}Yu}Yn0M=Jy8qe%<1qXRR%Nut}Aqr+1pQS*D7Cp`+8Y`RO02p14DyVOmSYlEzZ;9&JzYhtybMZ%e4s zlks=V(+aJ!LK-()3ox`%9c)lx#3#y4{ulL6KpG|&>9`n?Uh#m3G-mZy-3h98Scyja zH^3Pb7?P z+2hAkyvg}g$#)n$Gs2fL19JNOZ|~>Nx(|}lmwesC!>?Y~72mpf4XZ8t^TIwbCk;i0 z+a2ymSZ^=OrtrSH!(y#Vn!8KWk#O7<1-!if+`dDDy18U7wS3k$lIeM}Z0fhYqI)+x zo*o4*S$S|hGf6vL>PaQ(OQ_%eskx-G-FV|dXHbTH<#w@RbeIx9I$d$xqHh`{*&d3y zevlYNk)}w@cuu4A$^DYJsOvO7VBaom@Rx@gb$V5IKJ{Xue16H-1H0j=U0brW-aVRG znWCQRkESBmD^4?a7mB@!jf2>(Hs=Bd-;XX1oEilevb9axB^NhIPLO>jl03S+Rw|fx z&oIsIk(~W!4$zzKF|uSR<@S#;{r;fKup)iDaxz_9JouroY>XHcrN(Mm@UHV?-8bCh zXGfY~7U`rCasv(h-R*ava)^ zF1`BMT*n3xQBTdM?`n&h2Ecf*XXuLo7Zyl_El(v~oh>}mK01$%0a@#uzyiX_g>Bav2XWwH%YekAxU%pBT!p*?%cS#zA zv;^eDC#KZP@7o=^GDc_V8<3w>`*L(+=A#(fcH)dGjqM}Vk_el+c>B`{9xm<>IZ-Zm zLL!-Yf*3nju_(8ZGUd9*K`iofWW+BYFnZF&+a|=yxqV?oUOcG#ulnSR$DMs|e5Tph%WW zVjzE3nMh7+rG!}av)+~;o$#+EHyPX zzOUO?^#)Jh*t^b7pTW+I%f;xy&JMPCO&5RR``BmHX-Mw{qoJp9BjKea$;A9%>-iEZ zvuUBm%0j5UWax~`ue!K6dDdip+zs3f{+qQKqH;9C(1Z@95()-Ew=`BdLh2VS3zI8qYGH&&7m9+vpUc+x8l!i-ATXKhw34XL2;ya_VIQz!OL^)8mtqnb?q=~&^h-$;Zn^HRZ2p(gH z39An;`AWT=i&VP0u&CUe7OYW51Icv=q%Vc7%Zm z_uAp9n}osEUdk2*pV)*i`WRSa-FWtCwGqS-75@K#V0)r;+0(0XVp9vnb7lWiMj!q= z>Zf(ioa@gSwA55Jil$lh)%4U<)$j@HTQU2KwuUUsZA*2O^QTKobak8g0Qb~ROMTW7 zfTF2yF*na6i(lQ*Nq^rPen^0>$$b`K!Kp{FVa-VF`kCiXZg0Vtr}i*rcpny_YOR!} z+?Jiv?dWlT`}o$s9Fxt%%684d7ek-q-Q~jS*I5+8HtvSw+Rp!D=+gVr!gqcYy9K74 z&eClx6f6{1Din;ynjz?XZlJ~W7^A@0wiHIt8$aou;f>MYpU%gUlDwAK*nX0#vHtyl z_C=B+ZkOffY|oR^2>(+IlZCTMFirZMhn>bqzR=38hvJpcM4-@gUYY7_k^G*FW9;5r zc9q4c>C?hd{uS3{MThN*(w!3e05e?bI#SNlo$U&%>((Dz0_JeqbG|}!wI$& z%q2JQ)Vas;i0RYqNXW!CC~QK%u$K$beGI zT2KuzMjus26(zmofK;m2gY%d*o~sHBKA#`RBNc9c*-GLmbgh?*9V;^TBSot2E%~Q5 zl+R!WA_h_JT;+irbJ#Z-tSy-;B^t&&dOSwPV(T!CB)no8Y4sP%k(MD^0P!NL1vK&7 z`3luW2$gkI#Zf>IZT2=m4R&e@d zeo#B=Q|9`w8}%|)f%GBjYO01&Dk5qjm$+#1yia#CE=Sh~88Vdp%|VU}0a6mF@JkhUY&~W3f#rHK-1Qdo 
z>0*z5?#-hQUY}k^X7~1bkI?($-~3#c3mF4Cl@2%|0@1=ARZ z^qlNaN63&>;O_~mmto}?tAhznb}p;GpyIq1Z^yf<_6Ui~cpbbP;uV7W!+ke>wYG-f zPPz2~%UgSs(>vsKFle%uo=WIDYz;BR!doAy)aQ0QCpE_Wz1XK+3Kpr=V_H8w zqzaizn9ALx#?fo-N)_CtENYH*1|ID|x=xa9d#;9~1Wgrcx^8=evrfky*Xj`269~A;kh^O|ewZnM}=SmM7NX=?h#jjLh&1kIT+A z)If4luYo@s+e_L&eRJ$gw1`)>u#efOq=M0iYIPS$GII0z`T56eNxK@~Y%*^~Q&w$1b)jM9Z~kuRc~YX`6r#ySCskW5cq|#a39s;ZiaL~OdEpgu z1k*sKkLZ&?6fAi=)77yKI1xii%)@DG8r}663xkJcwLTj?s`h{GP@_2}`A|;w7zrzk4QOQ*O$(e|M^<`vLD*1^i>Nr*= z+A`y@f{!zLi)ys9OrFM5`Qw0292Ciyq>zC>8(TkG1O;#UUh?#I08kuwpS_vhufJ0v&p^Yr`=^WG7!qVG(8n9u7=J64fr zQq7B|9rzl7s)I_|8UeVp?=cqGILQ}0O(n+^vJz=vFBU9JmG$=DWzi+qCHw@D0a7`M zA`%pmU8+8W{u0{2*^tg&3;I&i`4`{YJe_n8 z{viTJZL?$}#l9w${3mydrW>Z%nY!WXf$HJv5$Zw4F%7^mXWsZ-s&olv31;C*KlH)j z?j?Eika^cI`l>)WJ*ga?%>0HwJm{%<)OP8pdvwMG@fm;Ca`jfy7ixY-sic42*f&ld zJg3(O0~;=Zsp@cdUj@&Zj~#~LX=F5Ws@!Ik0-~(wlbJO6&)S~s6WrAW9lrQ%6+S03 z&P&xJ{;BC%2s%J#uxZy3=Fc}fkwE9(T}QAK9b{FT!L3^PQ~;#X$T|9v&JFq)ru$h|ls zvPxYyWT}V&Dol3#)t6pVE4nIClEq=r++eGcG-tkOW4{n$Ra~3z?`@_gXRUiR`SrhY4K z#>C+t>pNtm>!Zw*;p^qI0|g<)Ob`r0jaN6asw2ZGLT}bMbHnQ$OH8cR7{Rq?=4%&x z2Qe&O`w$~b%fuo>fkgT`PVx=uto@&SdDpIXL)<da|A*x(b?o zdUj^iN+B9%;2{1URo7=%m@r*RJi3fQNO_`AZY;b#tClm;A}NQF#!Y;pMMdh=^fO@9 z>J>Xv^joKJM>M7x=xh!oSLO3JlxVwTn$DPHdGsnkAvB)9d)IE6ZHgd1vd+Z;W1d682CBy4zti z&6;T6!rzSKIy&zKKfAx9J%7q-=Mac{u-_GIYEaZt*`h25Ne?ch`E_c2{pGA<;nVkx z102u6#||N$g5MhA{!rFwaI(;8$S{1DePGc^L~j6?Q$2QMIO09 zPdma#_kX(|;oOau(pX877ac9V4O8x3g{Mdbr6oS)7 zN0v#H_j!bhUNl;q>GrkeA~){;lCg@&Mg5(z%E1HV`d7{>_}@9JZ(VJn>=HKC4q{My zLpw8D2OD@&E}T?=SV7rE-XI?4H+E(aOI8sZOC$NW=!leE6MG6ycn2;fB4XpB!^#Z= zQ?P=-+!R0#4h{+c2LPbUF6{uZG&6i-ZDI+f;6P`8V{ZtxcA((p;6i6ds6r4x005m` z6k;m{H8U}FK+J;+syaZe)G2u2J;eI(G+`)^0+C~@0#BIzJLi_?-}e8NR15?I|34|k zx>2LneiYApj|7nW4k1sp9h-vz^G);Jq7ONB*clw!(IJ2QT3sYWS)>yb_Ual2Um3r5 zw706UJD48HLY73$&Gm=sl|EYND&Uk>VT!eN_p49f6HS<{TU>u{4&#WYh1dwy^E8il ziH`_=$2m8k)y$Q2yDZQluP+AZbND!Yi7Co@fwHnw2pV1bo*=wGx2n7Urt$y1@imz1&#&nK47Nw zT-dLY@^1NHY?5B#-Qf9?`lA_={@NnLpmwJGQG7&oU}0>) 
ziZ`GdjY(jIKi2Q?e+d=de}nq3pkP;ZG;lyf$Xh!{=x?qF#2$)p%>NM^W_I=tqNWf# zgv;e1fAtY=)-W@2FtyhKb8%3Bfj|mw00#vR4=)857d&XdU z(4fLD4>dA_AWjHkeJ)-u3LZ|NF1w_ijiW6*A6^xXD#Y5}7O{k(E4!#F{9rhl8A4Sg zMcAb&9N>rx39*a9v4(4~r$8jq|MLt0{*hTPYU2nu0sub&aQG~$!9>qU@%LGVw1{ZAdD5crj3WAdl2KV62-uIT7sX=aUZ*>8aV1F3(c z_P=p-FtxG!8!9*^U<3>RcoByeFaipAK|lhB5)AqaI)n^@hmeEwxOw0OKK@%C0pZ{C z5o^F{FbEE(DEt!$_$B<8DlYiaV7ME855ql#Py+_S#o(c8`L;d6lqRR~$cn(zq-4};(pf)4`xt=`PWS`7YO27?$MdgtpDP{`vCa4 z{2x3Z5bm@8-~oUj5Zv+q!Gl}N`CoDX0N4M*gTIpgb1nb?;)Y)s|FIqb0Ot6gw!m#h zTnhg~j+YZ2)c?r?0yzIm4hZ1=FTFrc;D6}=a`OJeW(PY6{AFi{I1;L6ZcsR+>?$@k z@FNVDLEL!K*2XpzfZwk|I3Y%%Lm?mm76XGtKw?0k2(JV$kO#;s#>p!o!6gRf5#f;l j@(7{-|3%=32kuUL2Z)`+Z(jm{U>-0!Ev>ks1p5C2Hj`#V literal 0 HcmV?d00001 diff --git a/components/load_from_pdf/tests/test_folder/dummy_2.pdf b/components/load_from_pdf/tests/test_folder/dummy_2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..774c2ea70c55104973794121eae56bcad918da97 GIT binary patch literal 13264 zcmaibWmsIxvUW%|5FkJZ7A&~y%m9Oj;I6>~WPrgfxD$eVfZ*=#?hsspJHa(bATYRn zGueBev(G*EKHr+BrK+pDs^6;aH9u<6Dv3$30@ygwX}fZ|TDt1G($Rqw927PN=I8~c_R69-cY5S*jJE@5Wr0JUS6u!J~3#h`{ZMo=LkbbALoD8vfgB}Fh|2>mhOnfS$3 zNV5}8Ox=$fj;C0=UKy*{myZZPRVS|0mqr-HxZAy;()@wxQ}MN`QWAZTXb3Z&Om9W2 zbnA^OWoQbAW|3W^fw#J;YzDato8*`rHQs+@W70D&SyT{wb`SN*3nI z5G%$wJlq932=n{60Eii*9H8dFih2ks?QY=>nAFL=5g^P@#b{YUEHt0S$D7WbX zx%TzvzIK%zpvzLEd9LNr0ch#LFf_(9 zEGt0C9v~%b54vynAc{~;v&2?S(-sTTft@9CABMNFZHtY1W0-99CEbUNfp_yu{LDBz z@8z^$LPN$wX4Hi+dZQs6K3QiKKF0}Nme@EII;;F}IplC(YvT*C3-Oh#(A}e5pIz01 zyR}D2|ftBF0T=1moHZy}$wS*PSCmSzHQ%x z2tCQQCx4jt7w1cuhY69~eH`31KC4)ZZJ^)f=IabocAkBPa zEeg25yPX&9-i_N(Qiq!I3RDrfx&0t^i)&MSQ1D(w%|%#LTNr>1cPiltAYO;6kBn(B?r11c^Bz~#)z5~~V+*`U)lDFtKbZ|;? 
z&4wTUtK=KE&uQIWUQv1mDE;LIhXXgx44PMa@%Z<7a& zx45^oYSnei^~%}`?!O-+cgfSmn_c?`=Gmm*Z^I(96ve&$zDs|)r84)IEEiE1kfQ$q zm3km*m1)PjdU9nkk9BTlidI1~M|O~WfP7AUu2T}d>5is9l$<%;7r2&Re06w>W$KM~ zqITBTd=Ln>^crw`_N?{ z;2d_=E0n!*NisQ|XYuX9q3+UcqdA(MC45|>2tz^c6HdZOmXTB?X2Elx@_0f)1z&-gS;UxN`>Ll-kWb0X0 zTrQis=w9sJ(q7k|@|k3SA~DJ@uMXP@4(Mgn+LJC+3F~3NHW71pIzY(aHg~{O+squi zWO_|F>78)L5*gcRXXRD9IzQ(ddSxh}E7(8sC~EYrOz$9BkSMBCkGGO9FuZ{#*mW+h zvwE7d)6Ag=a*R5URs>}qdqb_E6g)kN2Wel;pWe9=hZ)XvRZR!RQg&gxAPGj8J0!gR zrdV<2@MZQ?_Ocbd5@0zI?t>$z3eD80_h^{DI)H5lk`T4lbn8kteH3%fOBH^g26#lLN2&P^s zr&d05GDs)u_8OKzCgNxllk5pLC<2wKmghL{zW%}5^}%S$?d=3OzjaSzT3>uWYikZN z2ZcR7*L|%UMs|u)wMi7#vkN?cxlBcyAM80Tyzzv&zHMF1TH9?Mx5&E57P^)^zE5N| z^foq}!--if$Uj=U6Tc>EM!Pv)e^_SZSdvtQ=@>)(ONejQ!XW8u6>ESl<*s^6cH;Q1 z#n}nL{#|{l}}@td^zNSA;R{`3A&Jjr8L9(3^2FSyZ1W9$%;!XP#N2 z-SAzyRfxtgq^py7_3*GJFO%x_v<`xJ46`~S*IukgQDKfLxzFnS&GYL!1LA{I z!c#{A90{k(b*tUfbgjOH>}{#V;%^O+LUU<*#QkLtWzjho*Kb?Cr&wC38%wxpn}^Wy zG6EpV9x3xioCWA6H6=aE3)%jmZePu#Ji7wy0CmkDZNG`a{J1i-2`Bt&UrFb&<~V$^ zy9i`R1<35M&{mtCz144%v#7LKBTPPApjoV}#W-gDc5cn;A@Mbt#zXUK@J9^vj*ME( zo8(%K{c-KDr8n1-I&Mjn)*i|pF|7l*`fXvo8-z&j{$NOfUPM-xILbX1D29IHp|__B zL*JQ8*7-VrZVY*&$!PiE%zv@osg`qx0M8+w9iy7Az7;HYezs;5NRvrdNM~t@o}5Gc zjagk3Y_>6!Ct;ITqhu3FojJO^(^SG-($M4|frkp?4y-QoSmFcw9Z%(z?eC0kGi9@? 
zm(vAgXU|%!6_)CrnqYL-Hj@B5hA?#8C3G^cjd?0dMSZ!wbe%O4bWvlIG=nwOEInVj zhjzd`Bry8sXBTfIUr+juZH5JyE#7~UQiwR!gmG@wm}aNyo`13xEo)tzP64MWWG|j8 z8u8a2_=C2FdRZ9(eG&Au`@$mY9vvWldP-@wj5@38H0W2V8wnaQO?!)qoS_J=(ieoI zOvH}mkBRh_p1oTW66+?3u-GH2Ex~c=BQiwpJ zJlF7O2PBaCojRRL_mp44*Iq}vcRFpBD>V9M7do5{w&b;4^<_V~Vr{+O_&hz9k5Sm` zq3|%Z(6B5~wz2k0iH-QlafAa>1%ZebdxkR;6SdA?@dK|4Jf8PIO%64Fpw$6RYG2R# zX>Iq(xf`5Xk)79-@;BAQjlWu|w@Ss3sJv3Ew&%lBu-H?vYsC8XPJD!lkv*A~z_-k= zLOaM?B5}$Sf-KF5BWHoB51WFA{GlweQna618{*tqVn)YKUVq?khU_=QER9uW?N17xgAponbjg0W`=>f;sulH3?st)Y_@k$We2-__a>^{E78lUiI13qq!3# zwxMEl75MK1q`~J>ST#?`mUx#vr%-jwpZ+DV;W!0KNkZmO#sK)zt)H@`EQl6RRWhwb z0&E7|fG~@z)wlK1-RsxN#8Gr)D5=xpv=b}=CWPbwz@(9bIhD0Crd-Q>qEo>~Gh{X7 z77AK5>TfF0wK!?7Nx!<5uDy?D{Qg$SEc_R3J9EuH!Z@qmEJ*QRRHd3BPirM6783nv zAnab$>rhdDJ6pO@%Ox(}BYw{Ba<3|=A%Fg5_Hfxj{%CfzZCFO{?%h&=?%CNBvi&p; z(otqN>+5giLLa^*G?xzN30=IgQrV+r7dW4bX;zKtuD)O$UnwAKC?CpkPt{77nUArH ze-jKcCfRrOlp(Q^b&W}mrgt4n%wikNxeSBBE_n>K-IOIzi6!<)xGRYA)wGgqp^s@d46N#krDHPc#9SOgXhI7Vbj?B z%c6@8dCOGPYBoNE#3N7HD^ihbC9*xGm6chu;?fcuv)s01keHHZ1vXl5D;29O7wZBr zyPzyLZHKMtUI%PK+*X2zTFtaDzU1qn(H=hRRj-SoJw7I5i%4b0u=&InEAKgoae-lp zXk0SkjlJ52HruS*1QykTZ&aCN`PbcKuw$1st{peJ@&aF^aR@~{XA@L&YvK%+VU}G4 ze5iuesu&i6=*#nvHbm_v-ZLr5^Ij#|YSAper4XpsH;0x(2h1-tIobIy;0~2a( z!G($SB!iu#P;;hGeI~C`O=-3|d~zoB0!`*JrU-)Ko_X5#kSpy5o^z49RG;{j#l~45 zF?X9Ih4IdviT(8@+q|`BveLTprbESZ6^2I&ew|V3pDXRe9gSyXT)zzqKQ;gCD;p+( zM)2(;YJ%P5)X(N3ZSn>dn6UIcEcvQOXZBn}uD!7V0yXr$f+d@eTSYoquPit2S8cPW zA8t3dX)Cv{0cKF`@e|PP(xS0|z2_R0(P6)#+kC$0^5- z$7Hs|bOQanE z1oJ;uh(dYiDt}mVmtC3&HaGT6-dY429v#ySHJ7V)C8ow=PSmnEI)=b3_RJsU(S*+J zV$p3>RkK?DFvTc;(-T=h!1u~CP!pE=0eSSu#c@N7S0Z57CPg}!5z{QL#`2v?DJDt^ zCGN{0p-&&=)Sb28Xlo;ZXc^CGdwL9prf30uu$y5aPeWD6WIk4%%~DEhTiwOvy!rS% z&3z#DWo2qBA*=M2xIu=_R0sbrmP;Y?_rRa^k}3WYU6n9H^(})Zi-woMKKXfgbab@J zWx3DUr0MLpdDYk_LO8As}d*Z=x^K+uIv#T&SnY6&C$9 zBn1u`G#TBt+n5b%a;Cr0h^sm5Fl^OdxJ^8IebW);DWATq#Ba=#rggj*wNKy5NMzz& zBm`bk9bcSVPJbC`dHrI>o^=LSvTFpT`VAK`x_naOpvS~*l2$1vIk$avBA!|aeZ+7c 
z$_9Zzh>fc4$uX&w@-$VORCscG(B)OA@SPj>BNY3gxkkcPgNi9bE=?&3A4`3ekrdsb zn~`M;p8I>4?@@ZI{9Afv(tC@pp@Oe5BYUw-%&J_WaTBGls)&d8q?t$i<<@=_CNfH! z4H!ww7#gkp_^`bxZaJI9@C+A9x7@E1ZRoG5PL?w3GDi>`8Qq%I+0ygfT78%{Zt#mP zqX0CzaHKn@hAOQsv=^8UbfpuyFnT8Ht++Vmmx$~09!e{5t8fMkEjr~tfIxMlIpr4zGwvEIWKC2`Q#C)c7QF9wet?hE zLKoU?t@nqm=iBc` z8_((*(i(g}7z)3{%SJ!uya{?Ir-2^Fiap*VC4pF@N zpL5F*DG+(taLhdu4DbyAP(0&60n@%?G~hHugBI^-X6@_YOu}8UqwbQ8V`2vwDRLMz z)aRFo+r1f?5idT9xRF`cjgx$a-IpH3AH|bs$emw}d23*3aU0hYNh4(D0o-Z+wIX{d zeann?lzjgsAt62`er@<$`G755?i7tl%CHNgXp}#j>j&S1n5wZ;ofNbI>B2*4L1}@3 zq(LzPqn()w{KBsX!5*a&=dv<}t=R%II;TcQatbnKM7S4Q1PQIoT=^$#=>Y(m{mBYtl5W z6}|l4kxikOcJ`C3o{TSxIi?8|N6sH7Lkhq5qttl@uBTA|-cBluU$hU0&xYKvNidrL z4q>|j76}G1Db23Fa|XlFm%W&jW0h#7B$_FD-ZhqJ5#7i!0ZmCrereX z|Jlf`<1zR2akFe|boWv-r=}kM03o|%$mZA7Of2T99u~e56~6sh$P=yk9f!H6msn)n zvFOLF?W?iqi6fK9C)a42Sgt0kz4#M6 z-UY6451Er~=V;ITs1O-q*>}{;bs74MMZ(Z&=Z{5#q+i@cw^vI#0|Dh~-Dh-tn2I(S zTXXp-bLEG{p0#BbIqIcTM|DWZmr`&br8u)jQ`CR*^+g_fIX%=K+)x}F%Oak-Uh$6nIHUavnNV5M7YffU80QPRD%y>T{bIzn<6Rsy zb6cW6`?0EwSn;uJddPn@`?^Cry2s(6ccP1ykKr!kmDg2~zbTJq@+e(z5N>ZNr|8$j zPi-~ofp7E|Xx1#H+f@UR@AS}iLP!}}dRwf{u!avAq-_hNw#uaoOD{2jo*eRn8$~bDK`h1&ssOC6ekGV38+hU!KR z+kpnSzT;y#o|V2h|F?SY4-z1MFxz0;)@Lk`H>Cj zSl@fR%*@F79;HJcsX%L8_d!%TwmQyi$|n&C{oBMJ9~Xm!@@#lZdz(WB9SgJ#NIC%@ zy+~ZnI|4E`7f@W0Y9I@N7UTs1fTPD-ZiU%Lr2MnP+2h8AGh?(WGVf>h@W-_M>jRkD z(KNxvo(UJ7)o+*t%fCcM10;2XM$1NAFKwhp(c917^io_ynn-yv58IFIF*UJUw*2Ma zm?a-a1yp9B?WxpLzap-c^$HKkX_IfT_W8Lqaltl*A%vZSZWAe`Kv}vjz}>Tc;Hw9T zA+Nc49X&{WDmxY~ReV0YceXdL!$9mTL$Q@_vXIW6I{G=`$KR7jFcE&IsHwnKX;KldV#YL z(xwKAB5cFiz+r6m*5iJvo&E)XQqVWjmA}BfyVS&dm9&Y%$Sp^sW!JE3iI0v(kQHdo zmhWk|gC!e@CFKPv4BE*U;mYo0y}J0J-Fhu!c%v+paQf9+3Ed2EkfPt(D7|Ok#t)^PGr3Y)RGfvO=k;@Xry=Cf3fLCQ# zi`%oCt+vyB-t{iEgI&+2dczmnMXj>EOmSpMuuL8Ob`1$D;fc$wM6j2HH4Q$ zqaoj&M$2sLhpptdJMbs!krJId=iOd}HdP4Lt@yf42OZ{pOoQ4_gShz_sMoWYX}yQd zDQ8(tc7UvTt%`0#?9K!C^J>GpucEnBhnsWg102Z=uzOlwez^q^j7nV$krID#wC}A$ zcRfc2)T5Y~({6@1`{yL-Lzs;miT@C9|1SIFBMK7cz*E;v2H|EStZphjfb5mGMpw{q 
z!pl;Vw772tuvDH4o$;j4u8)@=m+&BIf4Ix(u75P?Q{4Y8^uvpq)mCW(enuQc)hx$B zOY{`_*%~bm%k*x6y;)D8_-yYbMsC8y#1H}89X;M=a#*HT>d*NFf}x$pQ&X?nFtvzA zKH|l8y;frsm|&}<%&*}Yu}Yn0M=Jy8qe%<1qXRR%Nut}Aqr+1pQS*D7Cp`+8Y`RO02p14DyVOmSYlEzZ;9&JzYhtybMZ%e4s zlks=V(+aJ!LK-()3ox`%9c)lx#3#y4{ulL6KpG|&>9`n?Uh#m3G-mZy-3h98Scyja zH^3Pb7?P z+2hAkyvg}g$#)n$Gs2fL19JNOZ|~>Nx(|}lmwesC!>?Y~72mpf4XZ8t^TIwbCk;i0 z+a2ymSZ^=OrtrSH!(y#Vn!8KWk#O7<1-!if+`dDDy18U7wS3k$lIeM}Z0fhYqI)+x zo*o4*S$S|hGf6vL>PaQ(OQ_%eskx-G-FV|dXHbTH<#w@RbeIx9I$d$xqHh`{*&d3y zevlYNk)}w@cuu4A$^DYJsOvO7VBaom@Rx@gb$V5IKJ{Xue16H-1H0j=U0brW-aVRG znWCQRkESBmD^4?a7mB@!jf2>(Hs=Bd-;XX1oEilevb9axB^NhIPLO>jl03S+Rw|fx z&oIsIk(~W!4$zzKF|uSR<@S#;{r;fKup)iDaxz_9JouroY>XHcrN(Mm@UHV?-8bCh zXGfY~7U`rCasv(h-R*ava)^ zF1`BMT*n3xQBTdM?`n&h2Ecf*XXuLo7Zyl_El(v~oh>}mK01$%0a@#uzyiX_g>Bav2XWwH%YekAxU%pBT!p*?%cS#zA zv;^eDC#KZP@7o=^GDc_V8<3w>`*L(+=A#(fcH)dGjqM}Vk_el+c>B`{9xm<>IZ-Zm zLL!-Yf*3nju_(8ZGUd9*K`iofWW+BYFnZF&+a|=yxqV?oUOcG#ulnSR$DMs|e5Tph%WW zVjzE3nMh7+rG!}av)+~;o$#+EHyPX zzOUO?^#)Jh*t^b7pTW+I%f;xy&JMPCO&5RR``BmHX-Mw{qoJp9BjKea$;A9%>-iEZ zvuUBm%0j5UWax~`ue!K6dDdip+zs3f{+qQKqH;9C(1Z@95()-Ew=`BdLh2VS3zI8qYGH&&7m9+vpUc+x8l!i-ATXKhw34XL2;ya_VIQz!OL^)8mtqnb?q=~&^h-$;Zn^HRZ2p(gH z39An;`AWT=i&VP0u&CUe7OYW51Icv=q%Vc7%Zm z_uAp9n}osEUdk2*pV)*i`WRSa-FWtCwGqS-75@K#V0)r;+0(0XVp9vnb7lWiMj!q= z>Zf(ioa@gSwA55Jil$lh)%4U<)$j@HTQU2KwuUUsZA*2O^QTKobak8g0Qb~ROMTW7 zfTF2yF*na6i(lQ*Nq^rPen^0>$$b`K!Kp{FVa-VF`kCiXZg0Vtr}i*rcpny_YOR!} z+?Jiv?dWlT`}o$s9Fxt%%684d7ek-q-Q~jS*I5+8HtvSw+Rp!D=+gVr!gqcYy9K74 z&eClx6f6{1Din;ynjz?XZlJ~W7^A@0wiHIt8$aou;f>MYpU%gUlDwAK*nX0#vHtyl z_C=B+ZkOffY|oR^2>(+IlZCTMFirZMhn>bqzR=38hvJpcM4-@gUYY7_k^G*FW9;5r zc9q4c>C?hd{uS3{MThN*(w!3e05e?bI#SNlo$U&%>((Dz0_JeqbG|}!wI$& z%q2JQ)Vas;i0RYqNXW!CC~QK%u$K$beGI zT2KuzMjus26(zmofK;m2gY%d*o~sHBKA#`RBNc9c*-GLmbgh?*9V;^TBSot2E%~Q5 zl+R!WA_h_JT;+irbJ#Z-tSy-;B^t&&dOSwPV(T!CB)no8Y4sP%k(MD^0P!NL1vK&7 z`3luW2$gkI#Zf>IZT2=m4R&e@d zeo#B=Q|9`w8}%|)f%GBjYO01&Dk5qjm$+#1yia#CE=Sh~88Vdp%|VU}0a6mF@JkhUY&~W3f#rHK-1Qdo 
z>0*z5?#-hQUY}k^X7~1bkI?($-~3#c3mF4Cl@2%|0@1=ARZ z^qlNaN63&>;O_~mmto}?tAhznb}p;GpyIq1Z^yf<_6Ui~cpbbP;uV7W!+ke>wYG-f zPPz2~%UgSs(>vsKFle%uo=WIDYz;BR!doAy)aQ0QCpE_Wz1XK+3Kpr=V_H8w zqzaizn9ALx#?fo-N)_CtENYH*1|ID|x=xa9d#;9~1Wgrcx^8=evrfky*Xj`269~A;kh^O|ewZnM}=SmM7NX=?h#jjLh&1kIT+A z)If4luYo@s+e_L&eRJ$gw1`)>u#efOq=M0iYIPS$GII0z`T56eNxK@~Y%*^~Q&w$1b)jM9Z~kuRc~YX`6r#ySCskW5cq|#a39s;ZiaL~OdEpgu z1k*sKkLZ&?6fAi=)77yKI1xii%)@DG8r}663xkJcwLTj?s`h{GP@_2}`A|;w7zrzk4QOQ*O$(e|M^<`vLD*1^i>Nr*= z+A`y@f{!zLi)ys9OrFM5`Qw0292Ciyq>zC>8(TkG1O;#UUh?#I08kuwpS_vhufJ0v&p^Yr`=^WG7!qVG(8n9u7=J64fr zQq7B|9rzl7s)I_|8UeVp?=cqGILQ}0O(n+^vJz=vFBU9JmG$=DWzi+qCHw@D0a7`M zA`%pmU8+8W{u0{2*^tg&3;I&i`4`{YJe_n8 z{viTJZL?$}#l9w${3mydrW>Z%nY!WXf$HJv5$Zw4F%7^mXWsZ-s&olv31;C*KlH)j z?j?Eika^cI`l>)WJ*ga?%>0HwJm{%<)OP8pdvwMG@fm;Ca`jfy7ixY-sic42*f&ld zJg3(O0~;=Zsp@cdUj@&Zj~#~LX=F5Ws@!Ik0-~(wlbJO6&)S~s6WrAW9lrQ%6+S03 z&P&xJ{;BC%2s%J#uxZy3=Fc}fkwE9(T}QAK9b{FT!L3^PQ~;#X$T|9v&JFq)ru$h|ls zvPxYyWT}V&Dol3#)t6pVE4nIClEq=r++eGcG-tkOW4{n$Ra~3z?`@_gXRUiR`SrhY4K z#>C+t>pNtm>!Zw*;p^qI0|g<)Ob`r0jaN6asw2ZGLT}bMbHnQ$OH8cR7{Rq?=4%&x z2Qe&O`w$~b%fuo>fkgT`PVx=uto@&SdDpIXL)<da|A*x(b?o zdUj^iN+B9%;2{1URo7=%m@r*RJi3fQNO_`AZY;b#tClm;A}NQF#!Y;pMMdh=^fO@9 z>J>Xv^joKJM>M7x=xh!oSLO3JlxVwTn$DPHdGsnkAvB)9d)IE6ZHgd1vd+Z;W1d682CBy4zti z&6;T6!rzSKIy&zKKfAx9J%7q-=Mac{u-_GIYEaZt*`h25Ne?ch`E_c2{pGA<;nVkx z102u6#||N$g5MhA{!rFwaI(;8$S{1DePGc^L~j6?Q$2QMIO09 zPdma#_kX(|;oOau(pX877ac9V4O8x3g{Mdbr6oS)7 zN0v#H_j!bhUNl;q>GrkeA~){;lCg@&Mg5(z%E1HV`d7{>_}@9JZ(VJn>=HKC4q{My zLpw8D2OD@&E}T?=SV7rE-XI?4H+E(aOI8sZOC$NW=!leE6MG6ycn2;fB4XpB!^#Z= zQ?P=-+!R0#4h{+c2LPbUF6{uZG&6i-ZDI+f;6P`8V{ZtxcA((p;6i6ds6r4x005m` z6k;m{H8U}FK+J;+syaZe)G2u2J;eI(G+`)^0+C~@0#BIzJLi_?-}e8NR15?I|34|k zx>2LneiYApj|7nW4k1sp9h-vz^G);Jq7ONB*clw!(IJ2QT3sYWS)>yb_Ual2Um3r5 zw706UJD48HLY73$&Gm=sl|EYND&Uk>VT!eN_p49f6HS<{TU>u{4&#WYh1dwy^E8il ziH`_=$2m8k)y$Q2yDZQluP+AZbND!Yi7Co@fwHnw2pV1bo*=wGx2n7Urt$y1@imz1&#&nK47Nw zT-dLY@^1NHY?5B#-Qf9?`lA_={@NnLpmwJGQG7&oU}0>) 
ziZ`GdjY(jIKi2Q?e+d=de}nq3pkP;ZG;lyf$Xh!{=x?qF#2$)p%>NM^W_I=tqNWf# zgv;e1fAtY=)-W@2FtyhKb8%3Bfj|mw00#vR4=)857d&XdU z(4fLD4>dA_AWjHkeJ)-u3LZ|NF1w_ijiW6*A6^xXD#Y5}7O{k(E4!#F{9rhl8A4Sg zMcAb&9N>rx39*a9v4(4~r$8jq|MLt0{*hTPYU2nu0sub&aQG~$!9>qU@%LGVw1{ZAdD5crj3WAdl2KV62-uIT7sX=aUZ*>8aV1F3(c z_P=p-FtxG!8!9*^U<3>RcoByeFaipAK|lhB5)AqaI)n^@hmeEwxOw0OKK@%C0pZ{C z5o^F{FbEE(DEt!$_$B<8DlYiaV7ME855ql#Py+_S#o(c8`L;d6lqRR~$cn(zq-4};(pf)4`xt=`PWS`7YO27?$MdgtpDP{`vCa4 z{2x3Z5bm@8-~oUj5Zv+q!Gl}N`CoDX0N4M*gTIpgb1nb?;)Y)s|FIqb0Ot6gw!m#h zTnhg~j+YZ2)c?r?0yzIm4hZ1=FTFrc;D6}=a`OJeW(PY6{AFi{I1;L6ZcsR+>?$@k z@FNVDLEL!K*2XpzfZwk|I3Y%%Lm?mm76XGtKw?0k2(JV$kO#;s#>p!o!6gRf5#f;l j@(7{-|3%=32kuUL2Z)`+Z(jm{U>-0!Ev>ks1p5C2Hj`#V literal 0 HcmV?d00001 From 616a9a3e458c64c155fe2f85d89e0ab772fad72c Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 11 Jan 2024 13:36:47 +0100 Subject: [PATCH 3/6] Fix ragas component (#759) Fix to issue that @mrchtr raised * `model_name` was actually referring to the class name used to import the model which can be `OpenAI` or `ChatOpenAI`. This caused some issues because we were trying to set `model_name` to `gpt-4`. Changed the arguments and default to reflect that. * ChatOpenAI seems to be the new default of defining models, changed the default model class to that one When initializing GPT-4 with `OpenAI()`: ``` UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. 
Instead, please use: `from langchain_community.chat_models import ChatOpenAI` ``` Also how it's set in RAGAS [LINK](https://docs.ragas.io/en/latest/howtos/customisations/llms.html#:~:text=Copy%20code-,from%20langchain.chat_models%20import%20ChatOpenAI,-gpt4%20%3D%20ChatOpenAI) Changes will be required to RAG pipeline after new release --- components/evaluate_ragas/README.md | 12 ++++---- .../evaluate_ragas/fondant_component.yaml | 8 +++-- components/evaluate_ragas/src/main.py | 29 +++++++++++-------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/components/evaluate_ragas/README.md b/components/evaluate_ragas/README.md index 2e25a1897..f52f3ff48 100644 --- a/components/evaluate_ragas/README.md +++ b/components/evaluate_ragas/README.md @@ -34,9 +34,9 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| module | str | Module from which the LLM is imported. Defaults to langchain.llms | langchain.llms | -| llm_name | str | Name of the selected llm | / | -| llm_kwargs | dict | Arguments of the selected llm | / | +| llm_module_name | str | Module from which the LLM is imported. 
Defaults to langchain.llms | langchain.chat_models | +| llm_class_name | str | Name of the selected llm | ChatOpenAI | +| llm_kwargs | dict | Arguments of the selected llm | {'model_name': 'gpt-3.5-turbo'} | ## Usage @@ -55,9 +55,9 @@ dataset = dataset.apply( "evaluate_ragas", arguments={ # Add arguments - # "module": "langchain.llms", - # "llm_name": , - # "llm_kwargs": {}, + # "llm_module_name": "langchain.chat_models", + # "llm_class_name": "ChatOpenAI", + # "llm_kwargs": {'model_name': 'gpt-3.5-turbo'}, }, produces={ : , diff --git a/components/evaluate_ragas/fondant_component.yaml b/components/evaluate_ragas/fondant_component.yaml index cfa6fe22d..5dc3db0bb 100644 --- a/components/evaluate_ragas/fondant_component.yaml +++ b/components/evaluate_ragas/fondant_component.yaml @@ -19,13 +19,15 @@ produces: args: - module: + llm_module_name: description: Module from which the LLM is imported. Defaults to langchain.llms type: str - default: "langchain.llms" - llm_name: + default: "langchain.chat_models" + llm_class_name: description: Name of the selected llm type: str + default: "ChatOpenAI" llm_kwargs: description: Arguments of the selected llm type: dict + default: {"model_name":"gpt-3.5-turbo"} diff --git a/components/evaluate_ragas/src/main.py b/components/evaluate_ragas/src/main.py index 184b50b24..ba299b2fb 100644 --- a/components/evaluate_ragas/src/main.py +++ b/components/evaluate_ragas/src/main.py @@ -11,24 +11,26 @@ class RetrieverEval(PandasTransformComponent): def __init__( self, *, - module: str, - llm_name: str, + llm_module_name: str, + llm_class_name: str, llm_kwargs: dict, produces: t.Dict[str, t.Any], **kwargs, ) -> None: """ Args: + llm_module_name: Module from which the LLM is imported. Defaults to + langchain.chat_models + llm_class_name: Name of the selected llm. Defaults to ChatOpenAI module: Module from which the LLM is imported. 
Defaults to langchain.llms - llm_name: Name of the selected llm llm_kwargs: Arguments of the selected llm produces: RAGAS metrics to compute. kwargs: Unhandled keyword arguments passed in by Fondant. """ self.llm = self.extract_llm( - module=module, - model_name=llm_name, - model_kwargs=llm_kwargs, + llm_module_name=llm_module_name, + llm_class_name=llm_class_name, + llm_kwargs=llm_kwargs, ) self.gpt_wrapper = LangchainLLM(llm=self.llm) self.metric_functions = self.extract_metric_functions( @@ -38,13 +40,16 @@ def __init__( # import the metric functions selected @staticmethod - def import_from(module, name): - module = __import__(module, fromlist=[name]) - return getattr(module, name) + def import_from(module_name: str, element_name: str): + module = __import__(module_name, fromlist=[element_name]) + return getattr(module, element_name) - def extract_llm(self, module, model_name, model_kwargs): - module = self.import_from(module, model_name) - return module(**model_kwargs) + def extract_llm(self, llm_module_name: str, llm_class_name: str, llm_kwargs: dict): + module = self.import_from( + module_name=llm_module_name, + element_name=llm_class_name, + ) + return module(**llm_kwargs) def extract_metric_functions(self, metrics: list): functions = [] From 836a5bba4c577aa93f1dde714bdd725981c39dc9 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 11 Jan 2024 14:25:55 +0100 Subject: [PATCH 4/6] Fixing data type in chunk_text component (#772) Co-authored-by: Philippe Moussalli --- components/chunk_text/README.md | 2 +- components/chunk_text/fondant_component.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/chunk_text/README.md b/components/chunk_text/README.md index 26aab59fa..490f9e4e2 100644 --- a/components/chunk_text/README.md +++ b/components/chunk_text/README.md @@ -43,7 +43,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | 
------- | -| chunk_strategy | int | The strategy to use for chunking the text. One of ['RecursiveCharacterTextSplitter', 'HTMLHeaderTextSplitter', 'CharacterTextSplitter', 'Language', 'MarkdownHeaderTextSplitter', 'MarkdownTextSplitter', 'SentenceTransformersTokenTextSplitter', 'LatexTextSplitter', 'SpacyTextSplitter', 'TokenTextSplitter', 'NLTKTextSplitter', 'PythonCodeTextSplitter', 'character', 'NLTK', 'SpaCy'] | RecursiveCharacterTextSplitter | +| chunk_strategy | str | The strategy to use for chunking the text. One of ['RecursiveCharacterTextSplitter', 'HTMLHeaderTextSplitter', 'CharacterTextSplitter', 'Language', 'MarkdownHeaderTextSplitter', 'MarkdownTextSplitter', 'SentenceTransformersTokenTextSplitter', 'LatexTextSplitter', 'SpacyTextSplitter', 'TokenTextSplitter', 'NLTKTextSplitter', 'PythonCodeTextSplitter', 'character', 'NLTK', 'SpaCy'] | RecursiveCharacterTextSplitter | | chunk_kwargs | dict | The arguments to pass to the chunking strategy | / | | language_text_splitter | str | The programming language to use for splitting text into sentences if "language" is selected as the splitter. Check https://python.langchain.com/docs/modules/data_connection/document_transformers/code_splitter for more information on supported languages. 
| / | diff --git a/components/chunk_text/fondant_component.yaml b/components/chunk_text/fondant_component.yaml index bb1d0088e..b673ef5ff 100644 --- a/components/chunk_text/fondant_component.yaml +++ b/components/chunk_text/fondant_component.yaml @@ -35,7 +35,7 @@ args: 'SentenceTransformersTokenTextSplitter', 'LatexTextSplitter', 'SpacyTextSplitter', 'TokenTextSplitter', 'NLTKTextSplitter', 'PythonCodeTextSplitter', 'character', 'NLTK', 'SpaCy'] - type: int + type: str default: RecursiveCharacterTextSplitter chunk_kwargs: description: The arguments to pass to the chunking strategy From c67f7a1e99f35f396957bb85ce0d2e8d506d9daa Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 11 Jan 2024 14:56:37 +0100 Subject: [PATCH 5/6] Add teardown method to components (#773) **Weaviate:** https://github.com/weaviate/weaviate-python-client/blob/fa28abb55c96c0e1034d1c24cc1780f25a369ad4/weaviate/client.py#L295C1-L298C37 **Qdrant:** https://github.com/qdrant/qdrant-client/blob/6d019e67a133bea87a96bce388f5b901cdae1287/qdrant_client/qdrant_client.py#L131 **Opensearch:** https://github.com/opensearch-project/opensearch-py/blob/2ab3a40307d23a63d83977aa3adf146b6f02b5fa/opensearchpy/client/__init__.py#L254 ClipClient does not have an implement method https://github.com/rom1504/clip-retrieval/blob/37e482b99ee36fe51aa4f8ab1e6930dc809edb6d/clip_retrieval/clip_client.py#L17 --- components/index_aws_opensearch/src/main.py | 3 +++ components/index_qdrant/src/main.py | 3 +++ components/index_weaviate/src/main.py | 3 +++ components/retrieve_from_weaviate/src/main.py | 3 +++ 4 files changed, 12 insertions(+) diff --git a/components/index_aws_opensearch/src/main.py b/components/index_aws_opensearch/src/main.py index b04820e82..f1d7cd995 100644 --- a/components/index_aws_opensearch/src/main.py +++ b/components/index_aws_opensearch/src/main.py @@ -39,6 +39,9 @@ def __init__( ) self.create_index(index_body) + def teardown(self) -> None: + self.client.close() + def create_index(self, 
index_body: Dict[str, Any]): """Creates an index if not existing in AWS OpenSearch. diff --git a/components/index_qdrant/src/main.py b/components/index_qdrant/src/main.py index e74019308..90fd202cb 100644 --- a/components/index_qdrant/src/main.py +++ b/components/index_qdrant/src/main.py @@ -47,6 +47,9 @@ def __init__( self.batch_size = batch_size self.parallelism = parallelism + def teardown(self) -> None: + self.client.close() + def write(self, dataframe: dd.DataFrame) -> None: """ Writes the data from the given Dask DataFrame to the Qdrant collection. diff --git a/components/index_weaviate/src/main.py b/components/index_weaviate/src/main.py index fb3a94f3a..9761b2351 100644 --- a/components/index_weaviate/src/main.py +++ b/components/index_weaviate/src/main.py @@ -53,6 +53,9 @@ def __init__( }, ) + def teardown(self) -> None: + del self.client + def write(self, dataframe: dd.DataFrame) -> None: with self.client.batch as batch: for part in tqdm( diff --git a/components/retrieve_from_weaviate/src/main.py b/components/retrieve_from_weaviate/src/main.py index 7bd3fe9dc..a1de66b11 100644 --- a/components/retrieve_from_weaviate/src/main.py +++ b/components/retrieve_from_weaviate/src/main.py @@ -24,6 +24,9 @@ def __init__( self.class_name = class_name self.k = top_k + def teardown(self) -> None: + del self.client + def retrieve_chunks(self, vector_query: list): """Get results from weaviate database.""" result = ( From 16888c80e69b21b62b8ba42b7e00a526ef77d20b Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 11 Jan 2024 15:08:50 +0100 Subject: [PATCH 6/6] Handle docker compose errors (#769) PR that handles errors when docker compose faces an issue, we now a raise a valid python error with the error command that is being thrown. This make it more clear for the user that something went off with the pipeline. 
@janvanlooy --- src/fondant/core/exceptions.py | 4 ++++ src/fondant/pipeline/runner.py | 16 +++++++++---- tests/pipeline/test_runner.py | 43 +++++++++++++++++++++++++++++----- tests/test_cli.py | 40 ++++++++++++++++++++++++------- 4 files changed, 84 insertions(+), 19 deletions(-) diff --git a/src/fondant/core/exceptions.py b/src/fondant/core/exceptions.py index 4143f389a..5560b9aab 100644 --- a/src/fondant/core/exceptions.py +++ b/src/fondant/core/exceptions.py @@ -25,3 +25,7 @@ class InvalidTypeSchema(ValidationError, FondantException): class UnsupportedTypeAnnotation(FondantException): """Thrown when an unsupported type annotation is encountered during type inference.""" + + +class PipelineRunError(ValidationError, FondantException): + """Thrown when a pipeline run results in an error.""" diff --git a/src/fondant/pipeline/runner.py b/src/fondant/pipeline/runner.py index 1b5d4491c..62b158c1c 100644 --- a/src/fondant/pipeline/runner.py +++ b/src/fondant/pipeline/runner.py @@ -6,6 +6,7 @@ import yaml +from fondant.core.exceptions import PipelineRunError from fondant.pipeline import Pipeline from fondant.pipeline.compiler import ( DockerCompiler, @@ -38,15 +39,23 @@ def _run(self, input_spec: str, *args, **kwargs): "--pull", "always", "--remove-orphans", + "--abort-on-container-exit", ] print("Starting pipeline run...") # copy the current environment with the DOCKER_DEFAULT_PLATFORM argument - subprocess.call( # nosec + output = subprocess.run( # nosec cmd, env=dict(os.environ, DOCKER_DEFAULT_PLATFORM="linux/amd64"), + capture_output=True, + encoding="utf8", ) + + if output.returncode != 0: + msg = f"Command failed with error: '{output.stderr}'" + raise PipelineRunError(msg) + print("Finished pipeline run.") def run( @@ -55,12 +64,11 @@ def run( *, extra_volumes: t.Union[t.Optional[list], t.Optional[str]] = None, build_args: t.Optional[t.List[str]] = None, - ) -> None: + ): """Run a pipeline, either from a compiled docker-compose spec or from a fondant pipeline. 
Args: input: the pipeline to compile or a path to a already compiled docker-compose spec - output_path: the path where to save the docker-compose spec extra_volumes: a list of extra volumes (using the Short syntax: https://docs.docker.com/compose/compose-file/05-services/#short-syntax-5) to mount in the docker-compose spec. @@ -258,8 +266,6 @@ def run( pipeline_name: the name of the pipeline to create role_arn: the Amazon Resource Name role to use for the processing steps, if none provided the `sagemaker.get_execution_role()` role will be used. - instance_type: the instance type to use for the processing steps - (see: https://aws.amazon.com/ec2/instance-types/ for options). """ if isinstance(input, Pipeline): os.makedirs(".fondant", exist_ok=True) diff --git a/tests/pipeline/test_runner.py b/tests/pipeline/test_runner.py index a59f63c80..be7c736af 100644 --- a/tests/pipeline/test_runner.py +++ b/tests/pipeline/test_runner.py @@ -5,6 +5,7 @@ from unittest import mock import pytest +from fondant.core.exceptions import PipelineRunError from fondant.pipeline import Pipeline from fondant.pipeline.runner import ( DockerRunner, @@ -22,11 +23,23 @@ ) -def test_docker_runner(): +@pytest.fixture() +def mock_subprocess_run(): + def _mock_subprocess_run(*args, **kwargs): + class MockCompletedProcess: + returncode = 0 + + return MockCompletedProcess() + + return _mock_subprocess_run + + +def test_docker_runner(mock_subprocess_run): """Test that the docker runner while mocking subprocess.call.""" - with mock.patch("subprocess.call") as mock_call: + with mock.patch("subprocess.run") as mock_run: + mock_run.side_effect = mock_subprocess_run DockerRunner().run("some/path") - mock_call.assert_called_once_with( + mock_run.assert_called_once_with( [ "docker", "compose", @@ -37,15 +50,19 @@ def test_docker_runner(): "--pull", "always", "--remove-orphans", + "--abort-on-container-exit", ], env=dict(os.environ, DOCKER_DEFAULT_PLATFORM="linux/amd64"), + capture_output=True, + 
encoding="utf8", ) -def test_docker_runner_from_pipeline(): - with mock.patch("subprocess.call") as mock_call: +def test_docker_runner_from_pipeline(mock_subprocess_run): + with mock.patch("subprocess.run") as mock_run: + mock_run.side_effect = mock_subprocess_run DockerRunner().run(PIPELINE) - mock_call.assert_called_once_with( + mock_run.assert_called_once_with( [ "docker", "compose", @@ -56,11 +73,25 @@ def test_docker_runner_from_pipeline(): "--pull", "always", "--remove-orphans", + "--abort-on-container-exit", ], env=dict(os.environ, DOCKER_DEFAULT_PLATFORM="linux/amd64"), + capture_output=True, + encoding="utf8", ) +def test_invalid_docker_run(): + """Test that the docker runner throws the correct error.""" + spec_path = "some/path" + resolved_spec_path = str(Path(spec_path).resolve()) + with pytest.raises( + PipelineRunError, + match=f"stat {resolved_spec_path}: no such file or directory", + ): + DockerRunner().run(spec_path) + + class MockKfpClient: def __init__(self, host): self.host = host diff --git a/tests/test_cli.py b/tests/test_cli.py index 48130982a..23a4e2d0c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -44,6 +44,17 @@ def load(self): pass +@pytest.fixture() +def mock_subprocess_run(): + def _mock_subprocess_run(*args, **kwargs): + class MockCompletedProcess: + returncode = 0 + + return MockCompletedProcess() + + return _mock_subprocess_run + + @pytest.mark.parametrize("command", commands) def test_basic_invocation(command): """Test that the CLI (sub)commands can be invoked without errors.""" @@ -262,7 +273,7 @@ def test_sagemaker_compile(tmp_path_factory): ) -def test_local_run(): +def test_local_run(mock_subprocess_run): """Test that the run command works with different arguments.""" args = argparse.Namespace( local=True, @@ -275,9 +286,11 @@ def test_local_run(): extra_volumes=[], build_arg=[], ) - with patch("subprocess.call") as mock_call: + + with patch("subprocess.run") as mock_run: + mock_run.side_effect = mock_subprocess_run 
run_local(args) - mock_call.assert_called_once_with( + mock_run.assert_called_once_with( [ "docker", "compose", @@ -288,11 +301,15 @@ def test_local_run(): "--pull", "always", "--remove-orphans", + "--abort-on-container-exit", ], env=dict(os.environ, DOCKER_DEFAULT_PLATFORM="linux/amd64"), + capture_output=True, + encoding="utf8", ) - with patch("subprocess.call") as mock_call: + with patch("subprocess.run") as mock_run: + mock_run.side_effect = mock_subprocess_run args1 = argparse.Namespace( local=True, vertex=False, @@ -306,7 +323,7 @@ def test_local_run(): credentials=None, ) run_local(args1) - mock_call.assert_called_once_with( + mock_run.assert_called_once_with( [ "docker", "compose", @@ -317,12 +334,15 @@ def test_local_run(): "--pull", "always", "--remove-orphans", + "--abort-on-container-exit", ], env=dict(os.environ, DOCKER_DEFAULT_PLATFORM="linux/amd64"), + capture_output=True, + encoding="utf8", ) -def test_local_run_cloud_credentials(): +def test_local_run_cloud_credentials(mock_subprocess_run): namespace_creds_kwargs = [ {"auth_gcp": True, "auth_azure": False, "auth_aws": False}, {"auth_gcp": False, "auth_azure": True, "auth_aws": False}, @@ -333,8 +353,10 @@ def test_local_run_cloud_credentials(): with patch( "fondant.pipeline.compiler.DockerCompiler.compile", ) as mock_compiler, patch( - "subprocess.call", + "subprocess.run", ) as mock_runner: + mock_runner.side_effect = mock_subprocess_run + args = argparse.Namespace( local=True, vertex=False, @@ -360,7 +382,6 @@ def test_local_run_cloud_credentials(): output_path=".fondant/compose.yaml", build_args=[], ) - mock_runner.assert_called_once_with( [ "docker", @@ -372,8 +393,11 @@ def test_local_run_cloud_credentials(): "--pull", "always", "--remove-orphans", + "--abort-on-container-exit", ], env=dict(os.environ, DOCKER_DEFAULT_PLATFORM="linux/amd64"), + capture_output=True, + encoding="utf8", )