Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Non-API version of Unstructured file converter #258

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions integrations/unstructured/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@
pip install unstructured-fileconverter-haystack
```

## Usage

You can use `UnstructuredFileConverter` (Unstructured API) and `UnstructuredLocalFileConverter` (local `unstructured` package) by importing them as follows:

```python
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter, UnstructuredLocalFileConverter
```

## License

`unstructured-fileconverter-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
Expand Down
2 changes: 1 addition & 1 deletion integrations/unstructured/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ classifiers = [
]
dependencies = [
"haystack-ai",
"unstructured<0.11.4", # FIXME: investigate why 0.11.4 broke the tests
"unstructured[pdf]<0.11.4", # FIXME: investigate why 0.11.4 broke the tests
]

[project.urls]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from .converter import UnstructuredFileConverter
from .converter import UnstructuredFileConverter, UnstructuredLocalFileConverter

__all__ = ["UnstructuredFileConverter"]
__all__ = ["UnstructuredFileConverter", "UnstructuredLocalFileConverter"]
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,21 @@

from unstructured.documents.elements import Element # type: ignore[import]
from unstructured.partition.api import partition_via_api # type: ignore[import]
from unstructured.partition.auto import partition # type: ignore[import]

logger = logging.getLogger(__name__)

UNSTRUCTURED_HOSTED_API_URL = "https://api.unstructured.io/general/v0/general"


@component
class UnstructuredFileConverter:
class UnstructuredLocalFileConverter:
"""
Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
Convert files to Haystack Documents using the locally installed Unstructured package (no Unstructured API needed).
"""

def __init__(
self,
api_url: str = UNSTRUCTURED_HOSTED_API_URL,
api_key: Optional[str] = None,
document_creation_mode: Literal[
"one-doc-per-file", "one-doc-per-page", "one-doc-per-element"
] = "one-doc-per-file",
Expand All @@ -38,12 +37,6 @@ def __init__(
progress_bar: bool = True, # noqa: FBT001, FBT002
):
"""
:param api_url: URL of the Unstructured API. Defaults to the hosted version.
If you run the API locally, specify the URL of your local API (e.g. http://localhost:8000/general/v0/general).
See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information.
:param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key).
If you run the API locally, it is not needed.
If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY.
:param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured.
- "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field.
- "one-doc-per-page": One Haystack Document per page.
Expand All @@ -55,35 +48,17 @@ def __init__(
See https://unstructured-io.github.io/unstructured/api.html.
:param progress_bar: Show a progress bar for the conversion. Defaults to True.
"""

self.api_url = api_url
self.document_creation_mode = document_creation_mode
self.unstructured_kwargs = unstructured_kwargs or {}
self.separator = separator
self.progress_bar = progress_bar

is_hosted_api = api_url == UNSTRUCTURED_HOSTED_API_URL

api_key = api_key or os.environ.get("UNSTRUCTURED_API_KEY")
# we check whether api_key is None or an empty string
if is_hosted_api and not api_key:
msg = (
"To use the hosted version of Unstructured, you need to set the environment variable "
"UNSTRUCTURED_API_KEY (recommended) or explictly pass the parameter api_key."
)
raise ValueError(msg)

self.api_key = api_key

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""

# do not serialize api_key
return default_to_dict(
self,
api_url=self.api_url,
document_creation_mode=self.document_creation_mode,
separator=self.separator,
unstructured_kwargs=self.unstructured_kwargs,
Expand Down Expand Up @@ -182,6 +157,88 @@ def _create_documents(
docs.append(doc)
return docs

def _partition_file_into_elements(self, filepath: Path) -> List[Element]:
    """
    Split a single file into Unstructured elements by calling `partition` in-process.

    :param filepath: Path of the file to partition.
    :returns: The elements produced by `partition`, or an empty list when partitioning
        raised an exception (a warning is logged in that case).
    """
    try:
        return partition(filename=str(filepath), **self.unstructured_kwargs)
    except Exception as error:
        # Best effort: a file that cannot be processed is logged and yields no elements.
        logger.warning("Unstructured could not process file %s. Error: %s", filepath, error)
    return []


class UnstructuredFileConverter(UnstructuredLocalFileConverter):
"""
Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
"""

def __init__(
    self,
    api_url: str = UNSTRUCTURED_HOSTED_API_URL,
    api_key: Optional[str] = None,
    document_creation_mode: Literal[
        "one-doc-per-file", "one-doc-per-page", "one-doc-per-element"
    ] = "one-doc-per-file",
    separator: str = "\n\n",
    unstructured_kwargs: Optional[Dict[str, Any]] = None,
    progress_bar: bool = True,  # noqa: FBT001, FBT002
):
    """
    Create a converter that sends files to the Unstructured API.

    :param api_url: URL of the Unstructured API. Defaults to the hosted version.
        When running the API locally, pass the URL of that instance
        (e.g. http://localhost:8000/general/v0/general).
        See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information.
    :param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key).
        Not needed when the API runs locally.
        For the hosted version it falls back to the environment variable UNSTRUCTURED_API_KEY.
    :param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured.
        - "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field.
        - "one-doc-per-page": One Haystack Document per page.
          All elements on a page are concatenated into one text field.
        - "one-doc-per-element": One Haystack Document per element.
          Each element is converted to a Haystack Document.
    :param separator: Separator between elements when concatenating them into one text field.
    :param unstructured_kwargs: Additional keyword arguments that are passed to the Unstructured API.
        See https://unstructured-io.github.io/unstructured/api.html.
    :param progress_bar: Show a progress bar for the conversion. Defaults to True.
    :raises ValueError: If the hosted API URL is used and no API key is passed or found in
        the UNSTRUCTURED_API_KEY environment variable.
    """
    # The options shared with the local converter are stored by the parent class.
    super().__init__(
        document_creation_mode=document_creation_mode,
        separator=separator,
        unstructured_kwargs=unstructured_kwargs,
        progress_bar=progress_bar,
    )
    self.api_url = api_url

    # An explicitly passed key wins; otherwise fall back to the environment.
    resolved_key = api_key or os.environ.get("UNSTRUCTURED_API_KEY")

    # `not resolved_key` covers both None and the empty string.
    if api_url == UNSTRUCTURED_HOSTED_API_URL and not resolved_key:
        raise ValueError(
            "To use the hosted version of Unstructured, you need to set the environment variable "
            "UNSTRUCTURED_API_KEY (recommended) or explicitly pass the parameter api_key."
        )

    self.api_key = resolved_key

def to_dict(self) -> Dict[str, Any]:
    """
    Serialize this component to a dictionary.

    :returns: The result of `default_to_dict` for this instance and its init parameters.
    """
    # api_key is intentionally left out so it is not serialized.
    init_parameters = {
        "api_url": self.api_url,
        "document_creation_mode": self.document_creation_mode,
        "separator": self.separator,
        "unstructured_kwargs": self.unstructured_kwargs,
        "progress_bar": self.progress_bar,
    }
    return default_to_dict(self, **init_parameters)

def _partition_file_into_elements(self, filepath: Path) -> List[Element]:
"""
Partition a file into elements using the Unstructured API.
Expand Down
151 changes: 150 additions & 1 deletion integrations/unstructured/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
from pathlib import Path

import pytest
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter
from haystack_integrations.components.converters.unstructured import (
UnstructuredFileConverter,
UnstructuredLocalFileConverter,
)


@pytest.fixture
Expand Down Expand Up @@ -188,3 +191,149 @@ def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path):
assert "category" in doc.meta
assert "common_meta" in doc.meta
assert doc.meta["common_meta"] == "common"


class TestUnstructuredLocalFileConverter:
    """Tests for `UnstructuredLocalFileConverter`."""

    def test_init_default(self):
        """Construction without arguments stores the default settings."""
        local_converter = UnstructuredLocalFileConverter()
        assert local_converter.document_creation_mode == "one-doc-per-file"
        assert local_converter.separator == "\n\n"
        assert local_converter.unstructured_kwargs == {}
        assert local_converter.progress_bar

    def test_init_with_parameters(self):
        """Explicit constructor arguments are stored on the instance."""
        local_converter = UnstructuredLocalFileConverter(
            document_creation_mode="one-doc-per-element",
            separator="|",
            unstructured_kwargs={"foo": "bar"},
            progress_bar=False,
        )
        assert local_converter.document_creation_mode == "one-doc-per-element"
        assert local_converter.separator == "|"
        assert local_converter.unstructured_kwargs == {"foo": "bar"}
        assert not local_converter.progress_bar

    def test_to_dict(self):
        """Serialization contains the type path and the init parameters."""
        serialized = UnstructuredLocalFileConverter().to_dict()

        expected = {
            "type": "haystack_integrations.components.converters.unstructured.converter.UnstructuredLocalFileConverter",
            "init_parameters": {
                "document_creation_mode": "one-doc-per-file",
                "separator": "\n\n",
                "unstructured_kwargs": {},
                "progress_bar": True,
            },
        }
        assert serialized == expected

    @pytest.mark.integration
    def test_run_one_doc_per_file(self, samples_path):
        """A single PDF yields exactly one document in one-doc-per-file mode."""
        sample_pdf = samples_path / "sample_pdf.pdf"
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-file")
        result = local_converter.run([sample_pdf])
        docs = result["documents"]
        assert len(docs) == 1
        assert docs[0].meta == {"file_path": str(sample_pdf)}

    @pytest.mark.integration
    def test_run_one_doc_per_page(self, samples_path):
        """The sample PDF yields four documents with consecutive page numbers."""
        sample_pdf = samples_path / "sample_pdf.pdf"
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-page")
        result = local_converter.run([sample_pdf])
        docs = result["documents"]

        assert len(docs) == 4
        for expected_page, doc in enumerate(docs, start=1):
            assert doc.meta["file_path"] == str(sample_pdf)
            assert doc.meta["page_number"] == expected_page

    @pytest.mark.integration
    def test_run_one_doc_per_element(self, samples_path):
        """Every element becomes its own document carrying page and category meta."""
        sample_pdf = samples_path / "sample_pdf.pdf"
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element")
        result = local_converter.run([sample_pdf])
        docs = result["documents"]

        assert len(docs) > 4
        for doc in docs:
            assert doc.meta["file_path"] == str(sample_pdf)
            assert "page_number" in doc.meta

            # elements have a category attribute that is saved in the document meta
            assert "category" in doc.meta

    @pytest.mark.integration
    def test_run_one_doc_per_file_with_meta(self, samples_path):
        """User-supplied meta is merged into the single document's meta."""
        sample_pdf = samples_path / "sample_pdf.pdf"
        extra_meta = {"custom_meta": "foobar"}
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-file")
        result = local_converter.run(paths=[sample_pdf], meta=extra_meta)
        docs = result["documents"]

        assert len(docs) == 1
        first_meta = docs[0].meta
        assert first_meta["file_path"] == str(sample_pdf)
        assert "custom_meta" in first_meta
        assert first_meta["custom_meta"] == "foobar"
        assert first_meta == {"file_path": str(sample_pdf), "custom_meta": "foobar"}

    @pytest.mark.integration
    def test_run_one_doc_per_page_with_meta(self, samples_path):
        """User-supplied meta is present on every per-page document."""
        sample_pdf = samples_path / "sample_pdf.pdf"
        extra_meta = {"custom_meta": "foobar"}
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-page")

        result = local_converter.run(paths=[sample_pdf], meta=extra_meta)
        docs = result["documents"]

        assert len(docs) == 4
        for expected_page, doc in enumerate(docs, start=1):
            assert doc.meta["file_path"] == str(sample_pdf)
            assert doc.meta["page_number"] == expected_page
            assert "custom_meta" in doc.meta
            assert doc.meta["custom_meta"] == "foobar"

    @pytest.mark.integration
    def test_run_one_doc_per_element_with_meta(self, samples_path):
        """User-supplied meta is present on every per-element document."""
        sample_pdf = samples_path / "sample_pdf.pdf"
        extra_meta = {"custom_meta": "foobar"}
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element")
        result = local_converter.run(paths=[sample_pdf], meta=extra_meta)
        docs = result["documents"]

        assert len(docs) > 4
        for doc in docs:
            assert doc.meta["file_path"] == str(sample_pdf)
            assert "page_number" in doc.meta

            # elements have a category attribute that is saved in the document meta
            assert "category" in doc.meta
            assert "custom_meta" in doc.meta
            assert doc.meta["custom_meta"] == "foobar"

    @pytest.mark.integration
    def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):
        """A list of meta dicts is accepted together with a list of two files."""
        sample_pdfs = [samples_path / "sample_pdf.pdf", samples_path / "sample_pdf2.pdf"]
        meta_list = [
            {"custom_meta": "foobar", "common_meta": "common"},
            {"other_meta": "barfoo", "common_meta": "common"},
        ]
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element")

        result = local_converter.run(paths=sample_pdfs, meta=meta_list)
        docs = result["documents"]

        assert len(docs) > 4
        for doc in docs:
            assert "file_path" in doc.meta
            assert "page_number" in doc.meta
            # elements have a category attribute that is saved in the document meta
            assert "category" in doc.meta
            assert "common_meta" in doc.meta
            assert doc.meta["common_meta"] == "common"

    @pytest.mark.integration
    def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path):
        """A list of meta dicts is accepted together with a folder path."""
        folder_paths = [samples_path]
        meta_list = [
            {"custom_meta": "foobar", "common_meta": "common"},
            {"other_meta": "barfoo", "common_meta": "common"},
        ]
        local_converter = UnstructuredLocalFileConverter(document_creation_mode="one-doc-per-element")
        result = local_converter.run(paths=folder_paths, meta=meta_list)
        docs = result["documents"]

        assert len(docs) > 4
        for doc in docs:
            assert "file_path" in doc.meta
            assert "page_number" in doc.meta
            # elements have a category attribute that is saved in the document meta
            assert "category" in doc.meta
            assert "common_meta" in doc.meta
            assert doc.meta["common_meta"] == "common"
Loading