Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add DocxToDocument converter #7838

Merged
merged 25 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
69fda9e
first fucntioning DocxFileToDocument
CarlosFerLo Jun 10, 2024
2d3d617
fix lazy import message
CarlosFerLo Jun 10, 2024
8984c48
add reno
CarlosFerLo Jun 10, 2024
d76d4cf
Add license headder
CarlosFerLo Jun 11, 2024
0f41730
change DocxFileToDocument to DocxToDocument
CarlosFerLo Jun 11, 2024
a5f2055
Merge branch 'issue-7797' of https://github.com/carlosFerLo/haystack …
CarlosFerLo Jun 11, 2024
9173433
Update library install to the maintained version
CarlosFerLo Jun 11, 2024
f391948
clan try-exvept to only take non haystack errors into account
CarlosFerLo Jun 11, 2024
de44301
Add wanring on docstring of component ignoring page brakes, mark test…
CarlosFerLo Jun 11, 2024
bb77328
make warnings lazy evaluations
CarlosFerLo Jun 11, 2024
cbd2a90
make warnings lazy evaluations
CarlosFerLo Jun 11, 2024
fe720ba
Make warnings lazy evaluated
CarlosFerLo Jun 11, 2024
224ec43
Solve f bug
CarlosFerLo Jun 11, 2024
0802e17
Get more metadata from docx files
CarlosFerLo Jun 11, 2024
40dba29
Merge branch 'main' into issue-7797
CarlosFerLo Jun 11, 2024
224c503
Merge branch 'issue-7797' of https://github.com/carlosFerLo/haystack …
CarlosFerLo Jun 11, 2024
5eddd2a
add 'python-docx' dependency and docs
CarlosFerLo Jun 11, 2024
10c796a
Change logging import
CarlosFerLo Jun 11, 2024
a35b91a
Fix typo
CarlosFerLo Jun 11, 2024
c3bd356
remake metadata extraction for docx
CarlosFerLo Jun 11, 2024
3400c7d
solve merge issues
CarlosFerLo Jun 11, 2024
9a849c9
solve bug regarding _get_docx_metadata method
CarlosFerLo Jun 11, 2024
ed37423
Update haystack/components/converters/docx.py
CarlosFerLo Jun 12, 2024
c3b06fa
Update haystack/components/converters/docx.py
CarlosFerLo Jun 12, 2024
aa33ff4
Delete unused test
CarlosFerLo Jun 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions haystack/components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.docx import DocxToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
Expand All @@ -22,4 +23,5 @@
"MarkdownToDocument",
"OpenAPIServiceToFunctions",
"OutputAdapter",
"DocxToDocument",
]
97 changes: 97 additions & 0 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

import io
CarlosFerLo marked this conversation as resolved.
Show resolved Hide resolved
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component
CarlosFerLo marked this conversation as resolved.
Show resolved Hide resolved
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install python-docx'") as docx_import:
import docx


@component
class DocxToDocument:
    """
    Converts Docx files to Documents.

    Uses `python-docx` library to convert the Docx file to a document.
    This component does not preserve page breaks in the original document.

    Usage example:
    ```python
    from datetime import datetime

    from haystack.components.converters.docx import DocxToDocument

    converter = DocxToDocument()
    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the Docx file.'
    ```
    """

    def __init__(self):
        """
        Create a DocxToDocument component.

        Checks that the lazily imported `docx` module (installed with `pip install python-docx`) is available.
        """
        docx_import.check()

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts Docx files to Documents.

        Sources that cannot be read, parsed or converted are skipped with a warning instead of
        aborting the whole run, so the output may contain fewer Documents than there are sources.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            # Each stage has its own handler so the warning tells the user which step failed.
            # Log arguments are passed lazily: the message is only formatted if the record is emitted.
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                continue

            try:
                file = docx.Document(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning("Could not read %s and convert it to a Docx Document, skipping. Error: %s", source, e)
                continue

            try:
                # One line of output per paragraph of the document.
                text = "\n".join(para.text for para in file.paragraphs)
            except Exception as e:
                logger.warning("Could not convert %s to a Document, skipping it. Error: %s", source, e)
                continue

            # Values from the `meta` argument win over the ByteStream's own metadata on key clashes.
            merged_metadata = {**bytestream.meta, **metadata}
            documents.append(Document(content=text, meta=merged_metadata))

        return {"documents": documents}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
highlights: >
Adding the `DocxToDocument` component to convert Docx files to Documents.
features:
- |
Adding the `DocxToDocument` component inside the `converters` category. It uses the `python-docx` library to convert Docx files to haystack Documents.
72 changes: 72 additions & 0 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import logging
from unittest.mock import patch

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters import DocxToDocument


@pytest.fixture
def docx_converter():
    """
    Provide a fresh DocxToDocument instance to each test that requests it.
    """
    converter = DocxToDocument()
    return converter


class TestDocxToDocument:
def test_init(self, docx_converter):
    """
    Test that the fixture yields a DocxToDocument instance.
    """
    assert isinstance(docx_converter, DocxToDocument)

@pytest.mark.integration
def test_run(self, test_files_path, docx_converter):
    """
    Test that converting the sample Docx file yields a single Document with its text.
    """
    sample_path = test_files_path / "docx" / "sample_docx_1.docx"
    documents = docx_converter.run(sources=[sample_path])["documents"]
    assert len(documents) == 1
    assert "History" in documents[0].content

@pytest.mark.skip("For now, DocxToDocument does not preserve page brakes.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's go ahead and delete this test, and instead, we can open a feature request if we like for adding page break support.

Copy link
Contributor Author

@CarlosFerLo CarlosFerLo Jun 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I will create an issue about it once this PR is resolved.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! And in the meantime, can we delete this test?

@pytest.mark.integration
def test_page_breaks_added(self, test_files_path, docx_converter):
    """
    Test that the converted content contains three form-feed characters for the sample file.
    """
    sample_path = test_files_path / "docx" / "sample_docx_1.docx"
    documents = docx_converter.run(sources=[sample_path])["documents"]
    assert len(documents) == 1
    form_feed_count = documents[0].content.count("\f")
    assert form_feed_count == 3
CarlosFerLo marked this conversation as resolved.
Show resolved Hide resolved

def test_run_with_meta(self, test_files_path, docx_converter):
    """
    Test that metadata passed through `meta` is attached to the produced Document.
    """
    # The converter is exercised for real: patching the class name in its module would not
    # affect the already-built fixture instance, so no mock is used here.
    output = docx_converter.run(
        sources=[test_files_path / "docx" / "sample_docx_1.docx"],
        meta={"language": "it", "author": "test_author"},
    )

    # check that the metadata from the bytestream is merged with that from the meta parameter
    document = output["documents"][0]
    assert document.meta["author"] == "test_author"
    assert document.meta["language"] == "it"

def test_run_error_handling(self, test_files_path, docx_converter, caplog):
    """
    Test that an unreadable source is reported through a warning log.
    """
    missing_sources = ["non_existing_file.docx"]
    with caplog.at_level(logging.WARNING):
        docx_converter.run(sources=missing_sources)
        expected_message = "Could not read non_existing_file.docx"
        assert expected_message in caplog.text

@pytest.mark.integration
def test_mixed_sources_run(self, test_files_path, docx_converter):
    """
    Test if the component runs correctly when mixed sources are provided.

    The same sample file is supplied once as a path and once as an in-memory ByteStream.
    """
    sample_path = test_files_path / "docx" / "sample_docx_1.docx"
    with open(sample_path, "rb") as f:
        sources = [sample_path, ByteStream(f.read())]

    output = docx_converter.run(sources=sources)
    docs = output["documents"]
    assert len(docs) == 2
    assert "History and standardization" in docs[0].content
    assert "History and standardization" in docs[1].content
Binary file added test/test_files/docx/sample_docx_1.docx
Binary file not shown.
Loading