rfctr: docx partitioning (#1422)

Reviewers: I recommend reviewing commit-by-commit or just looking at the final version of `partition/docx.py` using View File. This refactor solves a few problems, but mostly it lays the groundwork for refining further aspects such as page-break detection and list-item detection, and for moving python-docx internals upstream to that library so our work doesn't depend on that domain knowledge.
Unstructured-IO · Sep 19, 2023 · b54994a · b54994a
1 parent 9a3e24f
commit b54994a
Show file tree

Hide file tree

Showing 61 changed files with 1,286 additions and 434 deletions.
diff --git a/.gitignore b/.gitignore
@@ -132,6 +132,9 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
+# pyright (Python LSP/type-checker in VSCode) config
+/pyrightconfig.json
+
 # ingest outputs
 /structured-output
 
@@ -194,4 +197,4 @@ unstructured-inference/
 example-docs/*_images
 examples/**/output/
 
-outputdiff.txt
+outputdiff.txt
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ### Enhancements
 
 * **Adds data source properties to Airtable, Confluence, Discord, Elasticsearch, Google Drive, and Wikipedia connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
+* **DOCX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except in multi-section documents containing different headers/footers for different sections. These will now emit all distinct headers and footers encountered instead of just those for the last section.
 
 ### Features
 

diff --git a/Makefile b/Makefile
@@ -324,7 +324,7 @@ check: check-src check-tests check-version
 ## check-src:               runs linters (source only, no tests)
 .PHONY: check-src
 check-src:
-	ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore PT011,PT012,SIM117
+	ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore COM812,PT011,PT012,SIM117
 	black --line-length 100 ${PACKAGE_NAME} --check
 	flake8 ${PACKAGE_NAME}
 	mypy ${PACKAGE_NAME} --ignore-missing-imports --check-untyped-defs

diff --git a/docs/source/introduction/getting_started.rst b/docs/source/introduction/getting_started.rst
@@ -34,7 +34,7 @@ After installation, confirm the setup by executing the below Python code:
 .. code-block:: python
 
    from unstructured.partition.auto import partition
-   elements = partition(filename="example-docs/fake-email.eml")
+   elements = partition(filename="example-docs/eml/fake-email.eml")
 
 If you've opted for the "local-inference" installation, you should also be able to execute:
 

diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst
@@ -26,12 +26,13 @@ Some document types support location data for the elements, usually in the form
 If it exists, an element's location data is available with ``element.metadata.coordinates``.
 
 The ``coordinates`` property of an ``ElementMetadata`` stores:
+
 * points: These specify the corners of the bounding box starting from the top left corner and
-proceeding counter-clockwise. The points represent pixels, the origin is in the top left and
-the ``y`` coordinate increases in the downward direction.
+  proceeding counter-clockwise. The points represent pixels, the origin is in the top left and
+  the ``y`` coordinate increases in the downward direction.
 * system: The points have an associated coordinate system. A typical example of a coordinate system is
-``PixelSpace``, which is used for representing the coordinates of images. The coordinate system has a
-name, orientation, layout width, and layout height.
+  ``PixelSpace``, which is used for representing the coordinates of images. The coordinate system has a
+  name, orientation, layout width, and layout height.
 
 Information about the element’s coordinates (including the coordinate system name, coordinate points,
 the layout width, and the layout height) can be accessed with `element.to_dict()["metadata"]["coordinates"]`.

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,25 @@
+[tool.black]
+line-length = 100
+
+[tool.ruff]
+line-length = 100
+select = [
+    "C4",       # -- flake8-comprehensions --
+    "COM",      # -- flake8-commas --
+    "E",        # -- pycodestyle errors --
+    "F",        # -- pyflakes --
+    "I",        # -- isort (imports) --
+    "PLR0402",  # -- Name compared with itself like `foo == foo` --
+    "PT",       # -- flake8-pytest-style --
+    "SIM",      # -- flake8-simplify --
+    "UP015",    # -- redundant `open()` mode parameter (like "r" is default) --
+    "UP018",    # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) --
+    "UP032",    # -- Use f-string instead of `.format()` call --
+    "UP034",    # -- Avoid extraneous parentheses --
+]
+ignore = [
+    "COM812",   # -- over-aggressively insists on trailing commas where not desirable --
+    "PT011",    # -- pytest.raises({exc}) too broad, use match param or more specific exception --
+    "PT012",    # -- pytest.raises() block should contain a single simple statement --
+    "SIM117",   # -- merge `with` statements for context managers that have same scope --
+]
diff --git a/scripts/collect_env.py b/scripts/collect_env.py
@@ -40,7 +40,7 @@ def get_os_version():
     return platform.platform()
 
 
-def is_python_package_installed(package_name):
+def is_python_package_installed(package_name: str):
     """
     Check if a Python package is installed
 
@@ -57,14 +57,10 @@ def is_python_package_installed(package_name):
         check=True,
     )
 
-    for line in result.stdout.splitlines():
-        if line.lower().startswith(package_name.lower()):
-            return True
-
-    return False
+    return any(line.lower().startswith(package_name.lower()) for line in result.stdout.splitlines())
 
 
-def is_brew_package_installed(package_name):
+def is_brew_package_installed(package_name: str):
     """
     Check if a Homebrew package is installed
 
@@ -95,11 +91,7 @@ def is_brew_package_installed(package_name):
         check=True,
     )
 
-    for line in result.stdout.splitlines():
-        if line.lower().startswith(package_name.lower()):
-            return True
-
-    return False
+    return any(line.lower().startswith(package_name.lower()) for line in result.stdout.splitlines())
 
 
 def get_python_package_version(package_name):
@@ -221,8 +213,7 @@ def main():
     ):
         print(
             "PaddleOCR version: ",
-            get_python_package_version("paddlepaddle")
-            or get_python_package_version("paddleocr"),
+            get_python_package_version("paddlepaddle") or get_python_package_version("paddleocr"),
         )
     else:
         print("PaddleOCR is not installed")

diff --git a/scripts/performance/run_partition.py b/scripts/performance/run_partition.py
@@ -13,11 +13,7 @@
 
     file_path = sys.argv[1]
     strategy = sys.argv[2]
-    model_name = None
-    if len(sys.argv) > 3:
-        model_name = sys.argv[3]
-    else:
-        model_name = os.environ.get("PARTITION_MODEL_NAME")
+    model_name = sys.argv[3] if len(sys.argv) > 3 else os.environ.get("PARTITION_MODEL_NAME")
     result = partition(file_path, strategy=strategy, model_name=model_name)
     # access element in the return value to make sure we got something back, otherwise error
     result[1]
diff --git a/setup.cfg b/setup.cfg
@@ -7,3 +7,5 @@ max-line-length = 100
 [tool:pytest]
 filterwarnings =
     ignore::DeprecationWarning
+python_classes = Test Describe
+python_functions = test_ it_ they_ but_ and_
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -1,5 +1,8 @@
+# pyright: reportPrivateUsage=false
+
 import os
 from tempfile import SpooledTemporaryFile
+from typing import Dict, List
 
 import docx
 import pytest
@@ -16,12 +19,7 @@
     Title,
 )
 from unstructured.partition.doc import partition_doc
-from unstructured.partition.docx import (
-    _extract_contents_and_tags,
-    _get_emphasized_texts_from_paragraph,
-    _get_emphasized_texts_from_table,
-    partition_docx,
-)
+from unstructured.partition.docx import _DocxPartitioner, partition_docx
 from unstructured.partition.json import partition_json
 from unstructured.staging.base import elements_to_json
 
@@ -316,52 +314,46 @@ def test_partition_docx_from_file_without_metadata_date(
     assert elements[0].metadata.last_modified is None
 
 
-def test_get_emphasized_texts_from_paragraph(
-    expected_emphasized_texts,
-    filename="example-docs/fake-doc-emphasized-text.docx",
-):
-    document = docx.Document(filename)
-    paragraph = document.paragraphs[1]
-    emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
+def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dict[str, str]]):
+    partitioner = _DocxPartitioner(
+        "example-docs/fake-doc-emphasized-text.docx", None, None, False, None
+    )
+    paragraph = partitioner._document.paragraphs[1]
+    emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph))
     assert paragraph.text == "I am a bold italic bold-italic text."
     assert emphasized_texts == expected_emphasized_texts
 
-    paragraph = document.paragraphs[2]
-    emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
+    paragraph = partitioner._document.paragraphs[2]
+    emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph))
     assert paragraph.text == ""
     assert emphasized_texts == []
 
-    paragraph = document.paragraphs[3]
-    emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
+    paragraph = partitioner._document.paragraphs[3]
+    emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph))
     assert paragraph.text == "I am a normal text."
     assert emphasized_texts == []
 
 
-def test_get_emphasized_texts_from_table(
-    expected_emphasized_texts,
-    filename="example-docs/fake-doc-emphasized-text.docx",
-):
-    document = docx.Document(filename)
-    table = document.tables[0]
-    emphasized_texts = _get_emphasized_texts_from_table(table)
+def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]):
+    partitioner = _DocxPartitioner(
+        "example-docs/fake-doc-emphasized-text.docx", None, None, False, None
+    )
+    table = partitioner._document.tables[0]
+    emphasized_texts = list(partitioner._iter_table_emphasis(table))
     assert emphasized_texts == expected_emphasized_texts
 
 
-def test_extract_contents_and_tags(
-    expected_emphasized_texts,
-    expected_emphasized_text_contents,
-    expected_emphasized_text_tags,
+def test_table_emphasis(
+    expected_emphasized_text_contents: List[str], expected_emphasized_text_tags: List[str]
 ):
-    emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
-        expected_emphasized_texts,
+    partitioner = _DocxPartitioner(
+        "example-docs/fake-doc-emphasized-text.docx", None, None, False, None
     )
+    table = partitioner._document.tables[0]
+    emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table)
     assert emphasized_text_contents == expected_emphasized_text_contents
     assert emphasized_text_tags == expected_emphasized_text_tags
 
-    emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags([])
-    assert emphasized_text_contents is None
-    assert emphasized_text_tags is None
-
 
 @pytest.mark.parametrize(
     ("filename", "partition_func"),

diff --git a/typings/docx/__init__.pyi b/typings/docx/__init__.pyi
@@ -0,0 +1,3 @@
+from docx.api import Document
+
+__all__ = ["Document"]
diff --git a/typings/docx/api.pyi b/typings/docx/api.pyi
@@ -0,0 +1,5 @@
+from typing import BinaryIO, Optional, Union
+
+import docx.document
+
+def Document(docx: Optional[Union[str, BinaryIO]] = None) -> docx.document.Document: ...
diff --git a/typings/docx/blkcntnr.pyi b/typings/docx/blkcntnr.pyi
@@ -0,0 +1,12 @@
+from typing import Sequence
+
+from docx.oxml.xmlchemy import BaseOxmlElement
+from docx.table import Table
+from docx.text.paragraph import Paragraph
+
+class BlockItemContainer:
+    _element: BaseOxmlElement
+    @property
+    def paragraphs(self) -> Sequence[Paragraph]: ...
+    @property
+    def tables(self) -> Sequence[Table]: ...
diff --git a/typings/docx/document.pyi b/typings/docx/document.pyi
@@ -0,0 +1,22 @@
+# pyright: reportPrivateUsage=false
+
+from typing import BinaryIO, Optional, Union
+
+from docx.blkcntnr import BlockItemContainer
+from docx.oxml.document import CT_Document
+from docx.section import Sections
+from docx.settings import Settings
+from docx.styles.style import _ParagraphStyle
+from docx.text.paragraph import Paragraph
+
+class Document(BlockItemContainer):
+    def add_paragraph(
+        self, text: str = "", style: Optional[Union[_ParagraphStyle, str]] = None
+    ) -> Paragraph: ...
+    @property
+    def element(self) -> CT_Document: ...
+    def save(self, path_or_stream: Union[str, BinaryIO]) -> None: ...
+    @property
+    def sections(self) -> Sections: ...
+    @property
+    def settings(self) -> Settings: ...
diff --git a/typings/docx/enum/section.pyi b/typings/docx/enum/section.pyi
@@ -0,0 +1,11 @@
+import enum
+
+class WD_SECTION_START(enum.Enum):
+    CONTINUOUS: enum.Enum
+    EVEN_PAGE: enum.Enum
+    NEW_COLUMN: enum.Enum
+    NEW_PAGE: enum.Enum
+    ODD_PAGE: enum.Enum
+
+# -- alias --
+WD_SECTION = WD_SECTION_START
diff --git a/typings/docx/oxml/__init__.pyi b/typings/docx/oxml/__init__.pyi
@@ -0,0 +1,7 @@
+# pyright: reportPrivateUsage=false
+
+from typing import Union
+
+from lxml import etree
+
+def parse_xml(xml: Union[str, bytes]) -> etree._Element: ...
diff --git a/typings/docx/oxml/document.pyi b/typings/docx/oxml/document.pyi
@@ -0,0 +1,10 @@
+from typing import Iterator
+
+from docx.oxml.xmlchemy import BaseOxmlElement
+
+class CT_Body(BaseOxmlElement):
+    def __iter__(self) -> Iterator[BaseOxmlElement]: ...
+
+class CT_Document(BaseOxmlElement):
+    @property
+    def body(self) -> CT_Body: ...
diff --git a/typings/docx/oxml/ns.pyi b/typings/docx/oxml/ns.pyi
@@ -0,0 +1,5 @@
+from typing import Dict
+
+nsmap: Dict[str, str]
+
+def qn(tag: str) -> str: ...
diff --git a/typings/docx/oxml/section.pyi b/typings/docx/oxml/section.pyi
@@ -0,0 +1,7 @@
+from typing import Optional
+
+from docx.oxml.xmlchemy import BaseOxmlElement
+
+class CT_SectPr(BaseOxmlElement):
+    @property
+    def preceding_sectPr(self) -> Optional[CT_SectPr]: ...
diff --git a/typings/docx/oxml/table.pyi b/typings/docx/oxml/table.pyi
@@ -0,0 +1,3 @@
+from docx.oxml.xmlchemy import BaseOxmlElement
+
+class CT_Tbl(BaseOxmlElement): ...
diff --git a/typings/docx/oxml/text/paragraph.pyi b/typings/docx/oxml/text/paragraph.pyi
@@ -0,0 +1,3 @@
+from docx.oxml.xmlchemy import BaseOxmlElement
+
+class CT_P(BaseOxmlElement): ...
diff --git a/typings/docx/oxml/text/parfmt.pyi b/typings/docx/oxml/text/parfmt.pyi
@@ -0,0 +1,3 @@
+from docx.oxml.xmlchemy import BaseOxmlElement
+
+class CT_PPr(BaseOxmlElement): ...
diff --git a/typings/docx/oxml/text/run.pyi b/typings/docx/oxml/text/run.pyi
@@ -0,0 +1,9 @@
+from typing import Optional
+
+from docx.oxml.xmlchemy import BaseOxmlElement
+
+class CT_Br(BaseOxmlElement):
+    type: Optional[str]
+    clear: Optional[str]
+
+class CT_R(BaseOxmlElement): ...
diff --git a/typings/docx/oxml/xmlchemy.pyi b/typings/docx/oxml/xmlchemy.pyi
@@ -0,0 +1,17 @@
+from typing import Any, Iterator
+
+from lxml import etree
+
+class BaseOxmlElement(etree.ElementBase):
+    def __iter__(self) -> Iterator[BaseOxmlElement]: ...
+    @property
+    def xml(self) -> str: ...
+    def xpath(self, xpath_str: str) -> Any:
+        """Return type is typically Sequence[ElementBase], but ...
+
+        lxml.etree.XPath has many possible return types including bool, (a "smart") str,
+        float. The return type can also be a list containing ElementBase, comments,
+        processing instructions, str, and tuple. So you need to cast the result based on
+        the XPath expression you use.
+        """
+        ...
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from docx.api import Document

		__all__ = ["Document"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from docx.oxml.xmlchemy import BaseOxmlElement

		class CT_Tbl(BaseOxmlElement): ...
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from docx.oxml.xmlchemy import BaseOxmlElement

		class CT_P(BaseOxmlElement): ...
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from docx.oxml.xmlchemy import BaseOxmlElement

		class CT_PPr(BaseOxmlElement): ...