diff --git a/integrations/unstructured/README.md b/integrations/unstructured/README.md index db74c5306..2fadab835 100644 --- a/integrations/unstructured/README.md +++ b/integrations/unstructured/README.md @@ -18,6 +18,14 @@ pip install unstructured-fileconverter-haystack ``` +## Usage + +You can use `UnstructuredFileConverter` and `UnstructuredLocalFileConverter` by importing them as follows: + +```python +from haystack_integrations.components.converters.unstructured import UnstructuredLocalFileConverter, UnstructuredFileConverter +``` + ## License `unstructured-fileconverter-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/unstructured/pyproject.toml b/integrations/unstructured/pyproject.toml index e199b3c3e..e362d6f88 100644 --- a/integrations/unstructured/pyproject.toml +++ b/integrations/unstructured/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "unstructured<0.11.4", # FIXME: investigate why 0.11.4 broke the tests + "unstructured[pdf]<0.11.4", # FIXME: investigate why 0.11.4 broke the tests ] [project.urls] diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py index 26f14134b..1c1c3f01d 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from .converter import UnstructuredFileConverter +from .converter import UnstructuredFileConverter, UnstructuredLocalFileConverter -__all__ = ["UnstructuredFileConverter"] +__all__ = ["UnstructuredFileConverter", "UnstructuredLocalFileConverter"] diff --git 
a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index bee1d9a7b..4fc34121a 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -14,6 +14,7 @@ from unstructured.documents.elements import Element # type: ignore[import] from unstructured.partition.api import partition_via_api # type: ignore[import] +from unstructured.partition.auto import partition # type: ignore[import] logger = logging.getLogger(__name__) @@ -21,15 +22,13 @@ @component -class UnstructuredFileConverter: +class UnstructuredLocalFileConverter: """ - Convert files to Haystack Documents using the Unstructured API (hosted or running locally). + Convert files to Haystack Documents using the locally installed Unstructured package (no API URL or API key required). """ def __init__( self, - api_url: str = UNSTRUCTURED_HOSTED_API_URL, - api_key: Optional[str] = None, document_creation_mode: Literal[ "one-doc-per-file", "one-doc-per-page", "one-doc-per-element" ] = "one-doc-per-file", @@ -38,12 +37,6 @@ def __init__( progress_bar: bool = True, # noqa: FBT001, FBT002 ): """ - :param api_url: URL of the Unstructured API. Defaults to the hosted version. - If you run the API locally, specify the URL of your local API (e.g. http://localhost:8000/general/v0/general). - See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information. - :param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key). - If you run the API locally, it is not needed. - If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY. :param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured. 
- "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field. - "one-doc-per-page": One Haystack Document per page. @@ -55,35 +48,17 @@ def __init__( See https://unstructured-io.github.io/unstructured/api.html. :param progress_bar: Show a progress bar for the conversion. Defaults to True. """ - - self.api_url = api_url self.document_creation_mode = document_creation_mode self.unstructured_kwargs = unstructured_kwargs or {} self.separator = separator self.progress_bar = progress_bar - is_hosted_api = api_url == UNSTRUCTURED_HOSTED_API_URL - - api_key = api_key or os.environ.get("UNSTRUCTURED_API_KEY") - # we check whether api_key is None or an empty string - if is_hosted_api and not api_key: - msg = ( - "To use the hosted version of Unstructured, you need to set the environment variable " - "UNSTRUCTURED_API_KEY (recommended) or explictly pass the parameter api_key." - ) - raise ValueError(msg) - - self.api_key = api_key - def to_dict(self) -> Dict[str, Any]: """ Serialize this component to a dictionary. """ - - # do not serialize api_key return default_to_dict( self, - api_url=self.api_url, document_creation_mode=self.document_creation_mode, separator=self.separator, unstructured_kwargs=self.unstructured_kwargs, @@ -182,6 +157,88 @@ def _create_documents( docs.append(doc) return docs + def _partition_file_into_elements(self, filepath: Path) -> List[Element]: + """ + Partition a file into elements using the Unstructured package. + """ + elements = [] + try: + elements = partition(filename=str(filepath), **self.unstructured_kwargs) + except Exception as e: + logger.warning("Unstructured could not process file %s. Error: %s", filepath, e) + return elements + + +class UnstructuredFileConverter(UnstructuredLocalFileConverter): + """ + Convert files to Haystack Documents using the Unstructured API (hosted or running locally). 
+ """ + + def __init__( + self, + api_url: str = UNSTRUCTURED_HOSTED_API_URL, + api_key: Optional[str] = None, + document_creation_mode: Literal[ + "one-doc-per-file", "one-doc-per-page", "one-doc-per-element" + ] = "one-doc-per-file", + separator: str = "\n\n", + unstructured_kwargs: Optional[Dict[str, Any]] = None, + progress_bar: bool = True, # noqa: FBT001, FBT002 + ): + """ + :param api_url: URL of the Unstructured API. Defaults to the hosted version. + If you run the API locally, specify the URL of your local API (e.g. http://localhost:8000/general/v0/general). + See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information. + :param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key). + If you run the API locally, it is not needed. + If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY. + :param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured. + - "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field. + - "one-doc-per-page": One Haystack Document per page. + All elements on a page are concatenated into one text field. + - "one-doc-per-element": One Haystack Document per element. + Each element is converted to a Haystack Document. + :param separator: Separator between elements when concatenating them into one text field. + :param unstructured_kwargs: Additional keyword arguments that are passed to the Unstructured API. + See https://unstructured-io.github.io/unstructured/api.html. + :param progress_bar: Show a progress bar for the conversion. Defaults to True. 
+ """ + super().__init__( + document_creation_mode=document_creation_mode, + separator=separator, + unstructured_kwargs=unstructured_kwargs, + progress_bar=progress_bar + ) + + self.api_url = api_url + is_hosted_api = api_url == UNSTRUCTURED_HOSTED_API_URL + + api_key = api_key or os.environ.get("UNSTRUCTURED_API_KEY") + # we check whether api_key is None or an empty string + if is_hosted_api and not api_key: + msg = ( + "To use the hosted version of Unstructured, you need to set the environment variable " + "UNSTRUCTURED_API_KEY (recommended) or explicitly pass the parameter api_key." + ) + raise ValueError(msg) + + self.api_key = api_key + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + + # do not serialize api_key + return default_to_dict( + self, + api_url=self.api_url, + document_creation_mode=self.document_creation_mode, + separator=self.separator, + unstructured_kwargs=self.unstructured_kwargs, + progress_bar=self.progress_bar, + ) + def _partition_file_into_elements(self, filepath: Path) -> List[Element]: """ Partition a file into elements using the Unstructured API. 
diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index d5266ac62..e5de8e50b 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -4,7 +4,10 @@ from pathlib import Path import pytest -from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter +from haystack_integrations.components.converters.unstructured import ( + UnstructuredFileConverter, + UnstructuredLocalFileConverter, +) @pytest.fixture @@ -188,3 +191,149 @@ def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): assert "category" in doc.meta assert "common_meta" in doc.meta assert doc.meta["common_meta"] == "common" + + +class TestUnstructuredLocalFileConverter: + def test_init_default(self): + converter = UnstructuredLocalFileConverter() + assert converter.document_creation_mode == "one-doc-per-file" + assert converter.separator == "\n\n" + assert converter.unstructured_kwargs == {} + assert converter.progress_bar + + def test_init_with_parameters(self): + converter = UnstructuredLocalFileConverter( + document_creation_mode="one-doc-per-element", + separator="|", + unstructured_kwargs={"foo": "bar"}, + progress_bar=False, + ) + assert converter.document_creation_mode == "one-doc-per-element" + assert converter.separator == "|" + assert converter.unstructured_kwargs == {"foo": "bar"} + assert not converter.progress_bar + + def test_to_dict(self): + converter = UnstructuredLocalFileConverter() + converter_dict = converter.to_dict() + + assert converter_dict == { + "type": "haystack_integrations.components.converters.unstructured.converter.UnstructuredLocalFileConverter", + "init_parameters": { + "document_creation_mode": "one-doc-per-file", + "separator": "\n\n", + "unstructured_kwargs": {}, + "progress_bar": True, + }, + } + + @pytest.mark.integration + def test_run_one_doc_per_file(self, samples_path): + pdf_path = 
samples_path / "sample_pdf.pdf" + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-file") + documents = local_converter.run([pdf_path])["documents"] + assert len(documents) == 1 + assert documents[0].meta == {"file_path": str(pdf_path)} + + @pytest.mark.integration + def test_run_one_doc_per_page(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-page") + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) == 4 + for i, doc in enumerate(documents, start=1): + assert doc.meta["file_path"] == str(pdf_path) + assert doc.meta["page_number"] == i + + @pytest.mark.integration + def test_run_one_doc_per_element(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element") + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) > 4 + for doc in documents: + assert doc.meta["file_path"] == str(pdf_path) + assert "page_number" in doc.meta + + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta + + @pytest.mark.integration + def test_run_one_doc_per_file_with_meta(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + meta = {"custom_meta": "foobar"} + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-file") + documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] + + assert len(documents) == 1 + assert documents[0].meta["file_path"] == str(pdf_path) + assert "custom_meta" in documents[0].meta + assert documents[0].meta["custom_meta"] == "foobar" + assert documents[0].meta == {"file_path": str(pdf_path), "custom_meta": "foobar"} + + @pytest.mark.integration + def test_run_one_doc_per_page_with_meta(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + 
meta = {"custom_meta": "foobar"} + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-page") + + documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] + + assert len(documents) == 4 + for i, doc in enumerate(documents, start=1): + assert doc.meta["file_path"] == str(pdf_path) + assert doc.meta["page_number"] == i + assert "custom_meta" in doc.meta + assert doc.meta["custom_meta"] == "foobar" + + @pytest.mark.integration + def test_run_one_doc_per_element_with_meta(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + meta = {"custom_meta": "foobar"} + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element") + documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] + + assert len(documents) > 4 + for doc in documents: + assert doc.meta["file_path"] == str(pdf_path) + assert "page_number" in doc.meta + + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta + assert "custom_meta" in doc.meta + assert doc.meta["custom_meta"] == "foobar" + + @pytest.mark.integration + def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): + pdf_path = [samples_path / "sample_pdf.pdf", samples_path / "sample_pdf2.pdf"] + meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element") + + documents = local_converter.run(paths=pdf_path, meta=meta)["documents"] + + assert len(documents) > 4 + for doc in documents: + assert "file_path" in doc.meta + assert "page_number" in doc.meta + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta + assert "common_meta" in doc.meta + assert doc.meta["common_meta"] == "common" + + @pytest.mark.integration + def test_run_one_doc_per_element_with_meta_list_folder(self, 
samples_path): + pdf_path = [samples_path] + meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] + local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element") + documents = local_converter.run(paths=pdf_path, meta=meta)["documents"] + + assert len(documents) > 4 + for doc in documents: + assert "file_path" in doc.meta + assert "page_number" in doc.meta + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta + assert "common_meta" in doc.meta + assert doc.meta["common_meta"] == "common"