deepset-ai · sjrl · Jun 12, 2024 · Jun 10, 2024 · Jun 10, 2024 · Jun 10, 2024
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
+from haystack.components.converters.docx import DocxToDocument
 from haystack.components.converters.html import HTMLToDocument
 from haystack.components.converters.markdown import MarkdownToDocument
 from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
@@ -22,4 +23,5 @@
     "MarkdownToDocument",
     "OpenAPIServiceToFunctions",
     "OutputAdapter",
+    "DocxToDocument",
 ]
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import io
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from haystack import Document, component
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
+from haystack.dataclasses import ByteStream
+from haystack.lazy_imports import LazyImport
+
+logger = logging.getLogger(__name__)
+
+with LazyImport("Run 'pip install python-docx'") as docx_import:
+    import docx
+
+
+@component
+class DocxToDocument:
+    """
+    Converts Docx files to Documents.
+
+    Uses `python-docx` library to convert the Docx file to a document.
+    This component does not preserve page brakes in the original document.
+
+    Usage example:
+    ```python
+    from haystack.components.converters.docx import DocxToDocument
+
+    converter = DocxToDocument()
+    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
+    documents = results["documents"]
+    print(documents[0].content)
+    # 'This is a text from the Docx file.'
+    ```
+    """
+
+    def __init__(self):
+        """
+        Create a DocxToDocument component.
+        """
+        docx_import.check()
+
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
+        """
+        Converts Docx files to Documents.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
+        """
+        documents = []
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
+
+        for source, metadata in zip(sources, meta_list):
+            try:
+                bytestream = get_bytestream_from_source(source)
+            except Exception as e:
+                logger.warning(f"Could not read {source}. Skipping it. Error: {e}")
+                continue
+
+            try:
+                file = docx.Document(io.BytesIO(bytestream.data))
+            except Exception as e:
+                logger.warning(f"Could not read {source} and convert it to a Docx Document, skipping. Error: {e}")
+                continue
+
+            try:
+                paragraphs = [para.text for para in file.paragraphs]
+                text = "\n".join(paragraphs)
+            except Exception as e:
+                logger.warning(f"Could not convert {source} to a Document, skipping it. Error: {e}")
+                continue
+
+            merged_metadata = {**bytestream.meta, **metadata}
+            document = Document(content=text, meta=merged_metadata)
+
+            documents.append(document)
+
+        return {"documents": documents}
@@ -0,0 +1,6 @@
+---
+highlights: >
+    Adding the `DocxToDocument` component to convert Docx files to Documents.
+features:
+  - |
+    Added the `DocxToDocument` component to the `converters` category. It uses the `python-docx` library to convert Docx files to Haystack Documents.
@@ -0,0 +1,72 @@
+import logging
+from unittest.mock import patch
+
+import pytest
+
+from haystack.dataclasses import ByteStream
+from haystack.components.converters import DocxToDocument
+
+
+@pytest.fixture
+def docx_converter():
+    return DocxToDocument()
+
+
+class TestDocxToDocument:
+    def test_init(self, docx_converter):
+        assert isinstance(docx_converter, DocxToDocument)
+
+    @pytest.mark.integration
+    def test_run(self, test_files_path, docx_converter):
+        """
+        Test if the component runs correctly
+        """
+        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
+        output = docx_converter.run(sources=paths)
+        docs = output["documents"]
+        assert len(docs) == 1
+        assert "History" in docs[0].content
+
+    @pytest.mark.skip("For now, DocxToDocument does not preserve page brakes.")
+    @pytest.mark.integration
+    def test_page_breaks_added(self, test_files_path, docx_converter):
+        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
+        output = docx_converter.run(sources=paths)
+        docs = output["documents"]
+        assert len(docs) == 1
+        assert docs[0].content.count("\f") == 3
+
+    def test_run_with_meta(self, test_files_path, docx_converter):
+        with patch("haystack.components.converters.docx.DocxToDocument"):
+            output = docx_converter.run(
+                sources=[test_files_path / "docx" / "sample_docx_1.docx"],
+                meta={"language": "it", "author": "test_author"},
+            )
+
+        # check that the metadata from the bytestream is merged with that from the meta parameter
+        assert output["documents"][0].meta["author"] == "test_author"
+        assert output["documents"][0].meta["language"] == "it"
+
+    def test_run_error_handling(self, test_files_path, docx_converter, caplog):
+        """
+        Test if the component correctly handles errors.
+        """
+        paths = ["non_existing_file.docx"]
+        with caplog.at_level(logging.WARNING):
+            docx_converter.run(sources=paths)
+            assert "Could not read non_existing_file.docx" in caplog.text
+
+    @pytest.mark.integration
+    def test_mixed_sources_run(sefl, test_files_path, docx_converter):
+        """
+        Test if the component runs correctly when mixed sources are provided.
+        """
+        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
+        with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as f:
+            paths.append(ByteStream(f.read()))
+
+        output = docx_converter.run(sources=paths)
+        docs = output["documents"]
+        assert len(docs) == 2
+        assert "History and standardization" in docs[0].content
+        assert "History and standardization" in docs[1].content