From b54994ae9584e528d3d009c5d9a4870624243790 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 19 Sep 2023 15:32:46 -0700 Subject: [PATCH] rfctr: docx partitioning (#1422) Reviewers: I recommend reviewing commit-by-commit or just looking at the final version of `partition/docx.py` as View File. This refactor solves a few problems but mostly lays the groundwork to allow us to refine further aspects such as page-break detection, list-item detection, and moving python-docx internals upstream to that library so our work doesn't depend on that domain-knowledge. --- .gitignore | 5 +- CHANGELOG.md | 1 + Makefile | 2 +- docs/source/introduction/getting_started.rst | 2 +- docs/source/metadata.rst | 9 +- pyproject.toml | 25 + scripts/collect_env.py | 19 +- scripts/performance/run_partition.py | 6 +- setup.cfg | 2 + test_unstructured/partition/docx/test_docx.py | 60 +- typings/docx/__init__.pyi | 3 + typings/docx/api.pyi | 5 + typings/docx/blkcntnr.pyi | 12 + typings/docx/document.pyi | 22 + typings/docx/enum/section.pyi | 11 + typings/docx/oxml/__init__.pyi | 7 + typings/docx/oxml/document.pyi | 10 + typings/docx/oxml/ns.pyi | 5 + typings/docx/oxml/section.pyi | 7 + typings/docx/oxml/table.pyi | 3 + typings/docx/oxml/text/paragraph.pyi | 3 + typings/docx/oxml/text/parfmt.pyi | 3 + typings/docx/oxml/text/run.pyi | 9 + typings/docx/oxml/xmlchemy.pyi | 17 + typings/docx/section.pyi | 33 + typings/docx/settings.pyi | 5 + typings/docx/shared.pyi | 16 + typings/docx/styles/style.pyi | 8 + typings/docx/table.pyi | 21 + typings/docx/text/__init__.pyi | 0 typings/docx/text/paragraph.pyi | 19 + typings/docx/text/run.pyi | 13 + typings/pptx/__init__.pyi | 3 + typings/pptx/api.pyi | 5 + typings/pptx/oxml/__init__.py | 9 + typings/pptx/oxml/text.pyi | 6 + typings/pptx/oxml/xmlchemy.pyi | 17 + typings/pptx/presentation.pyi | 11 + typings/pptx/shapes/__init__.py | 2 + typings/pptx/shapes/autoshape.pyi | 6 + typings/pptx/shapes/base.pyi | 9 + typings/pptx/shapes/graphfrm.pyi | 6 + 
typings/pptx/shapes/group.pyi | 6 + typings/pptx/shapes/shapetree.pyi | 18 + typings/pptx/shared.pyi | 3 + typings/pptx/slide.pyi | 31 + typings/pptx/table.pyi | 1 + typings/pptx/text/text.pyi | 13 + typings/pptx/util.pyi | 4 + typings/pypandoc/__init__.pyi | 0 unstructured/cleaners/core.py | 2 +- unstructured/documents/elements.py | 30 +- unstructured/documents/html.py | 7 +- unstructured/partition/common.py | 11 +- unstructured/partition/docx.py | 1017 +++++++++++------ unstructured/partition/json.py | 5 +- unstructured/partition/odt.py | 6 +- unstructured/partition/pptx.py | 4 +- unstructured/partition/text_type.py | 4 +- unstructured/staging/prodigy.py | 4 +- unstructured/utils.py | 117 +- 61 files changed, 1286 insertions(+), 434 deletions(-) create mode 100644 pyproject.toml create mode 100644 typings/docx/__init__.pyi create mode 100644 typings/docx/api.pyi create mode 100644 typings/docx/blkcntnr.pyi create mode 100644 typings/docx/document.pyi create mode 100644 typings/docx/enum/section.pyi create mode 100644 typings/docx/oxml/__init__.pyi create mode 100644 typings/docx/oxml/document.pyi create mode 100644 typings/docx/oxml/ns.pyi create mode 100644 typings/docx/oxml/section.pyi create mode 100644 typings/docx/oxml/table.pyi create mode 100644 typings/docx/oxml/text/paragraph.pyi create mode 100644 typings/docx/oxml/text/parfmt.pyi create mode 100644 typings/docx/oxml/text/run.pyi create mode 100644 typings/docx/oxml/xmlchemy.pyi create mode 100644 typings/docx/section.pyi create mode 100644 typings/docx/settings.pyi create mode 100644 typings/docx/shared.pyi create mode 100644 typings/docx/styles/style.pyi create mode 100644 typings/docx/table.pyi create mode 100644 typings/docx/text/__init__.pyi create mode 100644 typings/docx/text/paragraph.pyi create mode 100644 typings/docx/text/run.pyi create mode 100644 typings/pptx/__init__.pyi create mode 100644 typings/pptx/api.pyi create mode 100644 typings/pptx/oxml/__init__.py create mode 100644 
typings/pptx/oxml/text.pyi create mode 100644 typings/pptx/oxml/xmlchemy.pyi create mode 100644 typings/pptx/presentation.pyi create mode 100644 typings/pptx/shapes/__init__.py create mode 100644 typings/pptx/shapes/autoshape.pyi create mode 100644 typings/pptx/shapes/base.pyi create mode 100644 typings/pptx/shapes/graphfrm.pyi create mode 100644 typings/pptx/shapes/group.pyi create mode 100644 typings/pptx/shapes/shapetree.pyi create mode 100644 typings/pptx/shared.pyi create mode 100644 typings/pptx/slide.pyi create mode 100644 typings/pptx/table.pyi create mode 100644 typings/pptx/text/text.pyi create mode 100644 typings/pptx/util.pyi create mode 100644 typings/pypandoc/__init__.pyi diff --git a/.gitignore b/.gitignore index 8353474c5c..f7efde4599 100644 --- a/.gitignore +++ b/.gitignore @@ -132,6 +132,9 @@ dmypy.json # Pyre type checker .pyre/ +# pyright (Python LSP/type-checker in VSCode) config +/pyrightconfig.json + # ingest outputs /structured-output @@ -194,4 +197,4 @@ unstructured-inference/ example-docs/*_images examples/**/output/ -outputdiff.txt \ No newline at end of file +outputdiff.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index e0557ab94e..b230b79587 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements * **Adds data source properties to Airtable, Confluence, Discord, Elasticsearch, Google Drive, and Wikipedia connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **DOCX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except in multi-section documents containing different headers/footers for different sections. 
These will now emit all distinct headers and footers encountered instead of just those for the last section. ### Features diff --git a/Makefile b/Makefile index 19051c435b..51ede92052 100644 --- a/Makefile +++ b/Makefile @@ -324,7 +324,7 @@ check: check-src check-tests check-version ## check-src: runs linters (source only, no tests) .PHONY: check-src check-src: - ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore PT011,PT012,SIM117 + ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore COM812,PT011,PT012,SIM117 black --line-length 100 ${PACKAGE_NAME} --check flake8 ${PACKAGE_NAME} mypy ${PACKAGE_NAME} --ignore-missing-imports --check-untyped-defs diff --git a/docs/source/introduction/getting_started.rst b/docs/source/introduction/getting_started.rst index 60e9daaddf..c659046817 100644 --- a/docs/source/introduction/getting_started.rst +++ b/docs/source/introduction/getting_started.rst @@ -34,7 +34,7 @@ After installation, confirm the setup by executing the below Python code: .. code-block:: python from unstructured.partition.auto import partition - elements = partition(filename="example-docs/fake-email.eml") + elements = partition(filename="example-docs/eml/fake-email.eml") If you've opted for the "local-inference" installation, you should also be able to execute: diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst index d114254ec5..3f406d4028 100644 --- a/docs/source/metadata.rst +++ b/docs/source/metadata.rst @@ -26,12 +26,13 @@ Some document types support location data for the elements, usually in the form If it exists, an element's location data is available with ``element.metadata.coordinates``. The ``coordinates`` property of an ``ElementMetadata`` stores: + * points: These specify the corners of the bounding box starting from the top left corner and -proceeding counter-clockwise. The points represent pixels, the origin is in the top left and -the ``y`` coordinate increases in the downward direction. 
+ proceeding counter-clockwise. The points represent pixels, the origin is in the top left and + the ``y`` coordinate increases in the downward direction. * system: The points have an associated coordinate system. A typical example of a coordinate system is -``PixelSpace``, which is used for representing the coordinates of images. The coordinate system has a -name, orientation, layout width, and layout height. + ``PixelSpace``, which is used for representing the coordinates of images. The coordinate system has a + name, orientation, layout width, and layout height. Information about the element’s coordinates (including the coordinate system name, coordinate points, the layout width, and the layout height) can be accessed with `element.to_dict()["metadata"]["coordinates"]`. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..50069ef642 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.black] +line-length = 100 + +[tool.ruff] +line-length = 100 +select = [ + "C4", # -- flake8-comprehensions -- + "COM", # -- flake8-commas -- + "E", # -- pycodestyle errors -- + "F", # -- pyflakes -- + "I", # -- isort (imports) -- + "PLR0402", # -- manual from-import, e.g. `import a.b as b` should be `from a import b` -- + "PT", # -- flake8-pytest-style -- + "SIM", # -- flake8-simplify -- + "UP015", # -- redundant `open()` mode parameter (like "r" is default) -- + "UP018", # -- Unnecessary {literal_type} call like `str("abc")`. 
(rewrite as a literal) -- + "UP032", # -- Use f-string instead of `.format()` call -- + "UP034", # -- Avoid extraneous parentheses -- +] +ignore = [ + "COM812", # -- over aggressively insists on trailing commas where not desirable -- + "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- + "PT012", # -- pytest.raises() block should contain a single simple statement -- + "SIM117", # -- merge `with` statements for context managers that have same scope -- +] diff --git a/scripts/collect_env.py b/scripts/collect_env.py index f0649147b1..6cb48e0f25 100644 --- a/scripts/collect_env.py +++ b/scripts/collect_env.py @@ -40,7 +40,7 @@ def get_os_version(): return platform.platform() -def is_python_package_installed(package_name): +def is_python_package_installed(package_name: str): """ Check if a Python package is installed @@ -57,14 +57,10 @@ def is_python_package_installed(package_name): check=True, ) - for line in result.stdout.splitlines(): - if line.lower().startswith(package_name.lower()): - return True - - return False + return any(line.lower().startswith(package_name.lower()) for line in result.stdout.splitlines()) -def is_brew_package_installed(package_name): +def is_brew_package_installed(package_name: str): """ Check if a Homebrew package is installed @@ -95,11 +91,7 @@ def is_brew_package_installed(package_name): check=True, ) - for line in result.stdout.splitlines(): - if line.lower().startswith(package_name.lower()): - return True - - return False + return any(line.lower().startswith(package_name.lower()) for line in result.stdout.splitlines()) def get_python_package_version(package_name): @@ -221,8 +213,7 @@ def main(): ): print( "PaddleOCR version: ", - get_python_package_version("paddlepaddle") - or get_python_package_version("paddleocr"), + get_python_package_version("paddlepaddle") or get_python_package_version("paddleocr"), ) else: print("PaddleOCR is not installed") diff --git a/scripts/performance/run_partition.py 
b/scripts/performance/run_partition.py index 4da380f02e..3710f02c64 100644 --- a/scripts/performance/run_partition.py +++ b/scripts/performance/run_partition.py @@ -13,11 +13,7 @@ file_path = sys.argv[1] strategy = sys.argv[2] - model_name = None - if len(sys.argv) > 3: - model_name = sys.argv[3] - else: - model_name = os.environ.get("PARTITION_MODEL_NAME") + model_name = sys.argv[3] if len(sys.argv) > 3 else os.environ.get("PARTITION_MODEL_NAME") result = partition(file_path, strategy=strategy, model_name=model_name) # access element in the return value to make sure we got something back, otherwise error result[1] diff --git a/setup.cfg b/setup.cfg index a06a3629b5..ae8174cacc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,3 +7,5 @@ max-line-length = 100 [tool:pytest] filterwarnings = ignore::DeprecationWarning +python_classes = Test Describe +python_functions = test_ it_ they_ but_ and_ diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index b73caa0e70..77d8b79751 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -1,5 +1,8 @@ +# pyright: reportPrivateUsage=false + import os from tempfile import SpooledTemporaryFile +from typing import Dict, List import docx import pytest @@ -16,12 +19,7 @@ Title, ) from unstructured.partition.doc import partition_doc -from unstructured.partition.docx import ( - _extract_contents_and_tags, - _get_emphasized_texts_from_paragraph, - _get_emphasized_texts_from_table, - partition_docx, -) +from unstructured.partition.docx import _DocxPartitioner, partition_docx from unstructured.partition.json import partition_json from unstructured.staging.base import elements_to_json @@ -316,52 +314,46 @@ def test_partition_docx_from_file_without_metadata_date( assert elements[0].metadata.last_modified is None -def test_get_emphasized_texts_from_paragraph( - expected_emphasized_texts, - 
filename="example-docs/fake-doc-emphasized-text.docx", -): - document = docx.Document(filename) - paragraph = document.paragraphs[1] - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) +def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dict[str, str]]): + partitioner = _DocxPartitioner( + "example-docs/fake-doc-emphasized-text.docx", None, None, False, None + ) + paragraph = partitioner._document.paragraphs[1] + emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "I am a bold italic bold-italic text." assert emphasized_texts == expected_emphasized_texts - paragraph = document.paragraphs[2] - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) + paragraph = partitioner._document.paragraphs[2] + emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "" assert emphasized_texts == [] - paragraph = document.paragraphs[3] - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) + paragraph = partitioner._document.paragraphs[3] + emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "I am a normal text." 
assert emphasized_texts == [] -def test_get_emphasized_texts_from_table( - expected_emphasized_texts, - filename="example-docs/fake-doc-emphasized-text.docx", -): - document = docx.Document(filename) - table = document.tables[0] - emphasized_texts = _get_emphasized_texts_from_table(table) +def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]): + partitioner = _DocxPartitioner( + "example-docs/fake-doc-emphasized-text.docx", None, None, False, None + ) + table = partitioner._document.tables[0] + emphasized_texts = list(partitioner._iter_table_emphasis(table)) assert emphasized_texts == expected_emphasized_texts -def test_extract_contents_and_tags( - expected_emphasized_texts, - expected_emphasized_text_contents, - expected_emphasized_text_tags, +def test_table_emphasis( + expected_emphasized_text_contents: List[str], expected_emphasized_text_tags: List[str] ): - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags( - expected_emphasized_texts, + partitioner = _DocxPartitioner( + "example-docs/fake-doc-emphasized-text.docx", None, None, False, None ) + table = partitioner._document.tables[0] + emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table) assert emphasized_text_contents == expected_emphasized_text_contents assert emphasized_text_tags == expected_emphasized_text_tags - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags([]) - assert emphasized_text_contents is None - assert emphasized_text_tags is None - @pytest.mark.parametrize( ("filename", "partition_func"), diff --git a/typings/docx/__init__.pyi b/typings/docx/__init__.pyi new file mode 100644 index 0000000000..a5cb78f3e1 --- /dev/null +++ b/typings/docx/__init__.pyi @@ -0,0 +1,3 @@ +from docx.api import Document + +__all__ = ["Document"] diff --git a/typings/docx/api.pyi b/typings/docx/api.pyi new file mode 100644 index 0000000000..9b3ff122d6 --- /dev/null +++ b/typings/docx/api.pyi @@ -0,0 +1,5 @@ +from 
typing import BinaryIO, Optional, Union + +import docx.document + +def Document(docx: Optional[Union[str, BinaryIO]] = None) -> docx.document.Document: ... diff --git a/typings/docx/blkcntnr.pyi b/typings/docx/blkcntnr.pyi new file mode 100644 index 0000000000..9e09ea8c2a --- /dev/null +++ b/typings/docx/blkcntnr.pyi @@ -0,0 +1,12 @@ +from typing import Sequence + +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.table import Table +from docx.text.paragraph import Paragraph + +class BlockItemContainer: + _element: BaseOxmlElement + @property + def paragraphs(self) -> Sequence[Paragraph]: ... + @property + def tables(self) -> Sequence[Table]: ... diff --git a/typings/docx/document.pyi b/typings/docx/document.pyi new file mode 100644 index 0000000000..c6d34d467f --- /dev/null +++ b/typings/docx/document.pyi @@ -0,0 +1,22 @@ +# pyright: reportPrivateUsage=false + +from typing import BinaryIO, Optional, Union + +from docx.blkcntnr import BlockItemContainer +from docx.oxml.document import CT_Document +from docx.section import Sections +from docx.settings import Settings +from docx.styles.style import _ParagraphStyle +from docx.text.paragraph import Paragraph + +class Document(BlockItemContainer): + def add_paragraph( + self, text: str = "", style: Optional[Union[_ParagraphStyle, str]] = None + ) -> Paragraph: ... + @property + def element(self) -> CT_Document: ... + def save(self, path_or_stream: Union[str, BinaryIO]) -> None: ... + @property + def sections(self) -> Sections: ... + @property + def settings(self) -> Settings: ... 
diff --git a/typings/docx/enum/section.pyi b/typings/docx/enum/section.pyi new file mode 100644 index 0000000000..b06b79d766 --- /dev/null +++ b/typings/docx/enum/section.pyi @@ -0,0 +1,11 @@ +import enum + +class WD_SECTION_START(enum.Enum): + CONTINUOUS: enum.Enum + EVEN_PAGE: enum.Enum + NEW_COLUMN: enum.Enum + NEW_PAGE: enum.Enum + ODD_PAGE: enum.Enum + +# -- alias -- +WD_SECTION = WD_SECTION_START diff --git a/typings/docx/oxml/__init__.pyi b/typings/docx/oxml/__init__.pyi new file mode 100644 index 0000000000..2634b55642 --- /dev/null +++ b/typings/docx/oxml/__init__.pyi @@ -0,0 +1,7 @@ +# pyright: reportPrivateUsage=false + +from typing import Union + +from lxml import etree + +def parse_xml(xml: Union[str, bytes]) -> etree._Element: ... diff --git a/typings/docx/oxml/document.pyi b/typings/docx/oxml/document.pyi new file mode 100644 index 0000000000..460c1c1d7a --- /dev/null +++ b/typings/docx/oxml/document.pyi @@ -0,0 +1,10 @@ +from typing import Iterator + +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_Body(BaseOxmlElement): + def __iter__(self) -> Iterator[BaseOxmlElement]: ... + +class CT_Document(BaseOxmlElement): + @property + def body(self) -> CT_Body: ... diff --git a/typings/docx/oxml/ns.pyi b/typings/docx/oxml/ns.pyi new file mode 100644 index 0000000000..3f387dd2fb --- /dev/null +++ b/typings/docx/oxml/ns.pyi @@ -0,0 +1,5 @@ +from typing import Dict + +nsmap: Dict[str, str] + +def qn(tag: str) -> str: ... diff --git a/typings/docx/oxml/section.pyi b/typings/docx/oxml/section.pyi new file mode 100644 index 0000000000..2f90109e09 --- /dev/null +++ b/typings/docx/oxml/section.pyi @@ -0,0 +1,7 @@ +from typing import Optional + +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_SectPr(BaseOxmlElement): + @property + def preceding_sectPr(self) -> Optional[CT_SectPr]: ... 
diff --git a/typings/docx/oxml/table.pyi b/typings/docx/oxml/table.pyi new file mode 100644 index 0000000000..2d96facdc3 --- /dev/null +++ b/typings/docx/oxml/table.pyi @@ -0,0 +1,3 @@ +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_Tbl(BaseOxmlElement): ... diff --git a/typings/docx/oxml/text/paragraph.pyi b/typings/docx/oxml/text/paragraph.pyi new file mode 100644 index 0000000000..d869c19c04 --- /dev/null +++ b/typings/docx/oxml/text/paragraph.pyi @@ -0,0 +1,3 @@ +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_P(BaseOxmlElement): ... diff --git a/typings/docx/oxml/text/parfmt.pyi b/typings/docx/oxml/text/parfmt.pyi new file mode 100644 index 0000000000..21c4e7d668 --- /dev/null +++ b/typings/docx/oxml/text/parfmt.pyi @@ -0,0 +1,3 @@ +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_PPr(BaseOxmlElement): ... diff --git a/typings/docx/oxml/text/run.pyi b/typings/docx/oxml/text/run.pyi new file mode 100644 index 0000000000..2964024a4a --- /dev/null +++ b/typings/docx/oxml/text/run.pyi @@ -0,0 +1,9 @@ +from typing import Optional + +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_Br(BaseOxmlElement): + type: Optional[str] + clear: Optional[str] + +class CT_R(BaseOxmlElement): ... diff --git a/typings/docx/oxml/xmlchemy.pyi b/typings/docx/oxml/xmlchemy.pyi new file mode 100644 index 0000000000..e08277ee68 --- /dev/null +++ b/typings/docx/oxml/xmlchemy.pyi @@ -0,0 +1,17 @@ +from typing import Any, Iterator + +from lxml import etree + +class BaseOxmlElement(etree.ElementBase): + def __iter__(self) -> Iterator[BaseOxmlElement]: ... + @property + def xml(self) -> str: ... + def xpath(self, xpath_str: str) -> Any: + """Return type is typically Sequence[ElementBase], but ... + + lxml.etree.XPath has many possible return types including bool, (a "smart") str, + float. The return type can also be a list containing ElementBase, comments, + processing instructions, str, and tuple. 
So you need to cast the result based on + the XPath expression you use. + """ + ... diff --git a/typings/docx/section.pyi b/typings/docx/section.pyi new file mode 100644 index 0000000000..e0c856a9da --- /dev/null +++ b/typings/docx/section.pyi @@ -0,0 +1,33 @@ +from typing import Sequence + +from docx.blkcntnr import BlockItemContainer +from docx.enum.section import WD_SECTION +from docx.oxml.section import CT_SectPr + +class Section: + _sectPr: CT_SectPr + @property + def different_first_page_header_footer(self) -> bool: ... + @property + def even_page_footer(self) -> _Footer: ... + @property + def even_page_header(self) -> _Header: ... + @property + def first_page_footer(self) -> _Footer: ... + @property + def first_page_header(self) -> _Header: ... + @property + def footer(self) -> _Footer: ... + @property + def header(self) -> _Header: ... + @property + def start_type(self) -> WD_SECTION: ... + +class Sections(Sequence[Section]): ... + +class _BaseHeaderFooter(BlockItemContainer): + @property + def is_linked_to_previous(self) -> bool: ... + +class _Footer(_BaseHeaderFooter): ... +class _Header(_BaseHeaderFooter): ... diff --git a/typings/docx/settings.pyi b/typings/docx/settings.pyi new file mode 100644 index 0000000000..d5bf481d69 --- /dev/null +++ b/typings/docx/settings.pyi @@ -0,0 +1,5 @@ +from docx.shared import ElementProxy + +class Settings(ElementProxy): + @property + def odd_and_even_pages_header_footer(self) -> bool: ... diff --git a/typings/docx/shared.pyi b/typings/docx/shared.pyi new file mode 100644 index 0000000000..2722b0b76e --- /dev/null +++ b/typings/docx/shared.pyi @@ -0,0 +1,16 @@ +from typing import Any, Callable, Generic, TypeVar + +from docx.oxml.xmlchemy import BaseOxmlElement + +_T = TypeVar("_T") + +class lazyproperty(Generic[_T]): + def __init__(self, fget: Callable[..., _T]) -> None: ... + def __get__(self, obj: Any, type: Any = None) -> _T: ... + def __set__(self, obj: Any, value: Any) -> None: ... 
+ +class ElementProxy: + @property + def element(self) -> BaseOxmlElement: ... + +class Parented: ... diff --git a/typings/docx/styles/style.pyi b/typings/docx/styles/style.pyi new file mode 100644 index 0000000000..5e79b70447 --- /dev/null +++ b/typings/docx/styles/style.pyi @@ -0,0 +1,8 @@ +from typing import Optional + +class BaseStyle: + @property + def name(self) -> Optional[str]: ... + +class _CharacterStyle(BaseStyle): ... +class _ParagraphStyle(_CharacterStyle): ... diff --git a/typings/docx/table.pyi b/typings/docx/table.pyi new file mode 100644 index 0000000000..b5757f57a5 --- /dev/null +++ b/typings/docx/table.pyi @@ -0,0 +1,21 @@ +from typing import Sequence + +from docx.blkcntnr import BlockItemContainer +from docx.oxml.table import CT_Tbl +from docx.shared import Parented +from docx.text.paragraph import Paragraph + +class _Cell: + @property + def paragraphs(self) -> Sequence[Paragraph]: ... + +class _Row: + @property + def cells(self) -> Sequence[_Cell]: ... + +class _Rows(Sequence[_Row]): ... + +class Table(Parented): + def __init__(self, tbl: CT_Tbl, parent: BlockItemContainer) -> None: ... + @property + def rows(self) -> _Rows: ... diff --git a/typings/docx/text/__init__.pyi b/typings/docx/text/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/typings/docx/text/paragraph.pyi b/typings/docx/text/paragraph.pyi new file mode 100644 index 0000000000..1a8193927d --- /dev/null +++ b/typings/docx/text/paragraph.pyi @@ -0,0 +1,19 @@ +# pyright: reportPrivateUsage = false + +from typing import Optional, Sequence + +from docx.blkcntnr import BlockItemContainer +from docx.oxml.text.paragraph import CT_P +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.styles.style import _ParagraphStyle +from docx.text.run import Run + +class Paragraph(BlockItemContainer): + _p: CT_P + def __init__(self, p: BaseOxmlElement, parent: BlockItemContainer) -> None: ... + @property + def runs(self) -> Sequence[Run]: ... 
+ @property + def style(self) -> Optional[_ParagraphStyle]: ... + @property + def text(self) -> str: ... diff --git a/typings/docx/text/run.pyi b/typings/docx/text/run.pyi new file mode 100644 index 0000000000..bc80169f4c --- /dev/null +++ b/typings/docx/text/run.pyi @@ -0,0 +1,13 @@ +from docx.oxml.text.run import CT_R +from docx.text.paragraph import Paragraph + +class Run: + _element: CT_R + _r: CT_R + def __init__(self, r: CT_R, parent: Paragraph) -> None: ... + @property + def bold(self) -> bool: ... + @property + def italic(self) -> bool: ... + @property + def text(self) -> str: ... diff --git a/typings/pptx/__init__.pyi b/typings/pptx/__init__.pyi new file mode 100644 index 0000000000..9c98df114b --- /dev/null +++ b/typings/pptx/__init__.pyi @@ -0,0 +1,3 @@ +from pptx.api import Presentation + +__all__ = ["Presentation"] diff --git a/typings/pptx/api.pyi b/typings/pptx/api.pyi new file mode 100644 index 0000000000..236122e21d --- /dev/null +++ b/typings/pptx/api.pyi @@ -0,0 +1,5 @@ +from typing import BinaryIO, Optional, Union + +import pptx.presentation + +def Presentation(pptx: Optional[Union[str, BinaryIO]] = None) -> pptx.presentation.Presentation: ... diff --git a/typings/pptx/oxml/__init__.py b/typings/pptx/oxml/__init__.py new file mode 100644 index 0000000000..734aac61d1 --- /dev/null +++ b/typings/pptx/oxml/__init__.py @@ -0,0 +1,9 @@ +# pyright: reportPrivateUsage=false + +from typing import Union + +from lxml import etree + + +def parse_xml(xml: Union[str, bytes]) -> etree._Element: + ... diff --git a/typings/pptx/oxml/text.pyi b/typings/pptx/oxml/text.pyi new file mode 100644 index 0000000000..70bfd1ca6b --- /dev/null +++ b/typings/pptx/oxml/text.pyi @@ -0,0 +1,6 @@ +from pptx.oxml.xmlchemy import BaseOxmlElement + +class CT_TextParagraph(BaseOxmlElement): + def get_or_add_pPr(self) -> CT_TextParagraphProperties: ... + +class CT_TextParagraphProperties(BaseOxmlElement): ... 
diff --git a/typings/pptx/oxml/xmlchemy.pyi b/typings/pptx/oxml/xmlchemy.pyi new file mode 100644 index 0000000000..e08277ee68 --- /dev/null +++ b/typings/pptx/oxml/xmlchemy.pyi @@ -0,0 +1,17 @@ +from typing import Any, Iterator + +from lxml import etree + +class BaseOxmlElement(etree.ElementBase): + def __iter__(self) -> Iterator[BaseOxmlElement]: ... + @property + def xml(self) -> str: ... + def xpath(self, xpath_str: str) -> Any: + """Return type is typically Sequence[ElementBase], but ... + + lxml.etree.XPath has many possible return types including bool, (a "smart") str, + float. The return type can also be a list containing ElementBase, comments, + processing instructions, str, and tuple. So you need to cast the result based on + the XPath expression you use. + """ + ... diff --git a/typings/pptx/presentation.pyi b/typings/pptx/presentation.pyi new file mode 100644 index 0000000000..3f476d1db1 --- /dev/null +++ b/typings/pptx/presentation.pyi @@ -0,0 +1,11 @@ +from typing import BinaryIO, Union + +from pptx.shared import PartElementProxy +from pptx.slide import SlideLayouts, Slides + +class Presentation(PartElementProxy): + def save(self, file: Union[str, BinaryIO]) -> None: ... + @property + def slide_layouts(self) -> SlideLayouts: ... + @property + def slides(self) -> Slides: ... diff --git a/typings/pptx/shapes/__init__.py b/typings/pptx/shapes/__init__.py new file mode 100644 index 0000000000..41be6cfc5e --- /dev/null +++ b/typings/pptx/shapes/__init__.py @@ -0,0 +1,2 @@ +class Subshape: + ... diff --git a/typings/pptx/shapes/autoshape.pyi b/typings/pptx/shapes/autoshape.pyi new file mode 100644 index 0000000000..77d6f3afa0 --- /dev/null +++ b/typings/pptx/shapes/autoshape.pyi @@ -0,0 +1,6 @@ +from pptx.shapes.base import BaseShape +from pptx.text.text import TextFrame + +class Shape(BaseShape): + @property + def text_frame(self) -> TextFrame: ... 
diff --git a/typings/pptx/shapes/base.pyi b/typings/pptx/shapes/base.pyi new file mode 100644 index 0000000000..ddafa8275a --- /dev/null +++ b/typings/pptx/shapes/base.pyi @@ -0,0 +1,9 @@ +from pptx.util import Length + +class BaseShape: + left: Length + top: Length + @property + def has_table(self) -> bool: ... + @property + def has_text_frame(self) -> bool: ... diff --git a/typings/pptx/shapes/graphfrm.pyi b/typings/pptx/shapes/graphfrm.pyi new file mode 100644 index 0000000000..970bf63760 --- /dev/null +++ b/typings/pptx/shapes/graphfrm.pyi @@ -0,0 +1,6 @@ +from pptx.shapes.base import BaseShape +from pptx.table import Table + +class GraphicFrame(BaseShape): + @property + def table(self) -> Table: ... diff --git a/typings/pptx/shapes/group.pyi b/typings/pptx/shapes/group.pyi new file mode 100644 index 0000000000..af18cf6083 --- /dev/null +++ b/typings/pptx/shapes/group.pyi @@ -0,0 +1,6 @@ +from pptx.shapes.base import BaseShape +from pptx.shapes.shapetree import GroupShapes + +class GroupShape(BaseShape): + @property + def shapes(self) -> GroupShapes: ... diff --git a/typings/pptx/shapes/shapetree.pyi b/typings/pptx/shapes/shapetree.pyi new file mode 100644 index 0000000000..ebb905c4eb --- /dev/null +++ b/typings/pptx/shapes/shapetree.pyi @@ -0,0 +1,18 @@ +from typing import Iterator + +from pptx.shapes.autoshape import Shape +from pptx.shapes.base import BaseShape +from pptx.shared import ParentedElementProxy +from pptx.util import Length + +class _BaseShapes(ParentedElementProxy): + def __iter__(self) -> Iterator[BaseShape]: ... + +class _BaseGroupShapes(_BaseShapes): + def add_textbox(self, left: Length, top: Length, width: Length, height: Length) -> Shape: ... + +class GroupShapes(_BaseGroupShapes): ... +class NotesSlideShapes(_BaseShapes): ... + +class SlideShapes(_BaseGroupShapes): + def __iter__(self) -> Iterator[BaseShape]: ... 
diff --git a/typings/pptx/shared.pyi b/typings/pptx/shared.pyi new file mode 100644 index 0000000000..48abf5a1a4 --- /dev/null +++ b/typings/pptx/shared.pyi @@ -0,0 +1,3 @@ +class ElementProxy: ... +class ParentedElementProxy(ElementProxy): ... +class PartElementProxy(ElementProxy): ... diff --git a/typings/pptx/slide.pyi b/typings/pptx/slide.pyi new file mode 100644 index 0000000000..6372ab53b4 --- /dev/null +++ b/typings/pptx/slide.pyi @@ -0,0 +1,31 @@ +from typing import Iterator, Optional + +from pptx.shapes.shapetree import SlideShapes +from pptx.shared import ParentedElementProxy, PartElementProxy +from pptx.text.text import TextFrame + +class _BaseSlide(PartElementProxy): ... + +class NotesSlide(_BaseSlide): + @property + def notes_text_frame(self) -> Optional[TextFrame]: ... + +class Slide(_BaseSlide): + @property + def has_notes_slide(self) -> bool: ... + @property + def notes_slide(self) -> NotesSlide: ... + @property + def shapes(self) -> SlideShapes: ... + +class SlideLayout(_BaseSlide): ... + +class SlideLayouts(ParentedElementProxy): + def __getitem__(self, idx: int) -> SlideLayout: ... + def __iter__(self) -> Iterator[SlideLayout]: ... + def __len__(self) -> int: ... + +class Slides(ParentedElementProxy): + def __iter__(self) -> Iterator[Slide]: ... + def __len__(self) -> int: ... + def add_slide(self, slide_layout: SlideLayout) -> Slide: ... diff --git a/typings/pptx/table.pyi b/typings/pptx/table.pyi new file mode 100644 index 0000000000..edfa21b534 --- /dev/null +++ b/typings/pptx/table.pyi @@ -0,0 +1 @@ +class Table: ... diff --git a/typings/pptx/text/text.pyi b/typings/pptx/text/text.pyi new file mode 100644 index 0000000000..3e65274de4 --- /dev/null +++ b/typings/pptx/text/text.pyi @@ -0,0 +1,13 @@ +from typing import Sequence + +from pptx.oxml.text import CT_TextParagraph +from pptx.shapes import Subshape + +class TextFrame(Subshape): + text: str + @property + def paragraphs(self) -> Sequence[_Paragraph]: ... 
+ +class _Paragraph(Subshape): + _p: CT_TextParagraph + text: str diff --git a/typings/pptx/util.pyi b/typings/pptx/util.pyi new file mode 100644 index 0000000000..c27d7d311d --- /dev/null +++ b/typings/pptx/util.pyi @@ -0,0 +1,4 @@ +class Length(int): ... + +class Inches(Length): + def __init__(self, inches: float) -> None: ... diff --git a/typings/pypandoc/__init__.pyi b/typings/pypandoc/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 70682af42c..088cfa170e 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -29,7 +29,7 @@ def clean_non_ascii_chars(text) -> str: return en.decode() -def clean_bullets(text) -> str: +def clean_bullets(text: str) -> str: """Cleans unicode bullets from a section of text. Example diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 98b7be7433..907cacf8cd 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -1,17 +1,19 @@ from __future__ import annotations +import abc +import copy +import dataclasses as dc import datetime +import functools import hashlib import inspect import os import pathlib import re import uuid -from abc import ABC -from copy import deepcopy -from dataclasses import dataclass -from functools import wraps -from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast + +from typing_extensions import Self, TypedDict from unstructured.documents.coordinates import ( TYPE_TO_COORDINATE_SYSTEM_MAP, @@ -20,19 +22,19 @@ ) -class NoID(ABC): +class NoID(abc.ABC): """Class to indicate that an element do not have an ID.""" pass -class UUID(ABC): +class UUID(abc.ABC): """Class to indicate that an element should have a UUID.""" pass -@dataclass +@dc.dataclass class DataSourceMetadata: """Metadata fields that pertain to the data 
source of the document.""" @@ -47,7 +49,7 @@ def to_dict(self): return {key: value for key, value in self.__dict__.items() if value is not None} -@dataclass +@dc.dataclass class CoordinatesMetadata: """Metadata fields that pertain to the coordinates of the element.""" @@ -125,7 +127,7 @@ class Link(TypedDict): url: str -@dataclass +@dc.dataclass class ElementMetadata: coordinates: Optional[CoordinatesMetadata] = None data_source: Optional[DataSourceMetadata] = None @@ -192,8 +194,8 @@ def to_dict(self): return _dict @classmethod - def from_dict(cls, input_dict): - constructor_args = deepcopy(input_dict) + def from_dict(cls, input_dict: Dict[str, Any]) -> Self: + constructor_args = copy.deepcopy(input_dict) if constructor_args.get("coordinates", None) is not None: constructor_args["coordinates"] = CoordinatesMetadata.from_dict( constructor_args["coordinates"], @@ -237,7 +239,7 @@ def decorator(func: Callable): attribute on the elements in the output.""" ) - @wraps(func) + @functools.wraps(func) def wrapper(*args, **kwargs): elements = func(*args, **kwargs) sig = inspect.signature(func) @@ -293,7 +295,7 @@ def _add_regex_metadata( return elements -class Element(ABC): +class Element(abc.ABC): """An element is a section of a page in the document.""" def __init__( diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index ff32ebc5a1..75299fe898 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -444,11 +444,8 @@ def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) -> return text.strip() -def _has_break_tags(tag_elem: etree.Element) -> bool: - for descendant in tag_elem.iterdescendants(): - if descendant.tag in TEXTBREAK_TAGS: - return True - return False +def _has_break_tags(tag_elem: etree._Element) -> bool: # pyright: ignore[reportPrivateUsage] + return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants()) def _unfurl_break_tags(tag_elem: etree.Element) -> 
List[etree.Element]: diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index f161d89e32..1f10fdd7bc 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -83,7 +83,7 @@ def get_last_modified_date(filename: str) -> Union[str, None]: def get_last_modified_date_from_file( - file: Union[IO[bytes], SpooledTemporaryFile, BinaryIO, bytes], + file: Union[IO[bytes], SpooledTemporaryFile[bytes], BinaryIO, bytes], ) -> Union[str, None]: filename = None if hasattr(file, "name"): @@ -405,7 +405,7 @@ def convert_office_doc( logger.error(error.decode().strip()) -def exactly_one(**kwargs) -> None: +def exactly_one(**kwargs: Any) -> None: """ Verify arguments; exactly one of all keyword arguments must not be None. @@ -422,7 +422,7 @@ def exactly_one(**kwargs) -> None: def spooled_to_bytes_io_if_needed( - file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]], + file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile[bytes]]], ) -> Optional[Union[bytes, BinaryIO]]: if isinstance(file_obj, SpooledTemporaryFile): file_obj.seek(0) @@ -453,10 +453,7 @@ def convert_to_bytes( return f_bytes -def convert_ms_office_table_to_text( - table: "docxtable.Table", - as_html: bool = True, -) -> str: +def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = True) -> str: """ Convert a table object from a Word document to an HTML table string using the tabulate library.
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index fbc7c8190f..24c872ac69 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -1,13 +1,42 @@ +# pyright: reportPrivateUsage=false + +from __future__ import annotations + +import io +import itertools import os import tempfile from tempfile import SpooledTemporaryFile -from typing import IO, BinaryIO, List, Optional, Tuple, Union, cast +from typing import ( + Any, + BinaryIO, + Dict, + Iterator, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, +) +# -- CT_* stands for "complex-type", an XML element type in docx parlance -- import docx -from docx.oxml.shared import qn +from docx.document import Document +from docx.enum.section import WD_SECTION_START +from docx.oxml.ns import nsmap, qn +from docx.oxml.section import CT_SectPr +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +from docx.oxml.text.run import CT_R +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.section import Section, _Footer, _Header from docx.table import Table as DocxTable from docx.text.paragraph import Paragraph from docx.text.run import Run +from lxml import etree +from typing_extensions import TypeAlias from unstructured.chunking.title import add_chunking_strategy from unstructured.cleaners.core import clean_bullets @@ -32,7 +61,6 @@ exactly_one, get_last_modified_date, get_last_modified_date_from_file, - spooled_to_bytes_io_if_needed, ) from unstructured.partition.text_type import ( is_bulleted_text, @@ -41,75 +69,82 @@ is_possible_title, is_us_city_state_zip, ) -from unstructured.utils import dependency_exists +from unstructured.utils import dependency_exists, lazyproperty, requires_dependencies if dependency_exists("pypandoc"): import pypandoc -# NOTE(robinson) - documentation on built in styles can be found at the link below -# ref: https://python-docx.readthedocs.io/en/latest/user/ -# 
styles-understanding.html#paragraph-styles-in-default-template -STYLE_TO_ELEMENT_MAPPING = { - "Caption": Text, # TODO(robinson) - add caption element type - "Heading 1": Title, - "Heading 2": Title, - "Heading 3": Title, - "Heading 4": Title, - "Heading 5": Title, - "Heading 6": Title, - "Heading 7": Title, - "Heading 8": Title, - "Heading 9": Title, - "Intense Quote": Text, # TODO(robinson) - add quote element type - "List": ListItem, - "List 2": ListItem, - "List 3": ListItem, - "List Bullet": ListItem, - "List Bullet 2": ListItem, - "List Bullet 3": ListItem, - "List Continue": ListItem, - "List Continue 2": ListItem, - "List Continue 3": ListItem, - "List Number": ListItem, - "List Number 2": ListItem, - "List Number 3": ListItem, - "List Paragraph": ListItem, - "Macro Text": Text, - "No Spacing": Text, - "Quote": Text, # TODO(robinson) - add quote element type - "Subtitle": Title, - "TOCHeading": Title, - "Title": Title, -} - - -def _get_paragraph_runs(paragraph): - """ - Get hyperlink text from a paragraph object. - Without this, the default runs function skips over hyperlinks. - Args: - paragraph (Paragraph): A Paragraph object. +BlockElement: TypeAlias = Union[CT_P, CT_Tbl] +BlockItem: TypeAlias = Union[Paragraph, DocxTable] - Returns: - list: A list of Run objects. + +@requires_dependencies("pypandoc") +def convert_and_partition_docx( + source_format: str, + filename: Optional[str] = None, + file: Optional[BinaryIO] = None, + include_metadata: bool = True, + metadata_filename: Optional[str] = None, + metadata_last_modified: Optional[str] = None, +) -> List[Element]: + """Converts a document to DOCX and then partitions it using partition_docx. + + Works with any file format support by pandoc. + + Parameters + ---------- + source_format + The format of the source document, .e.g. odt + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). 
+ include_metadata + Determines whether or not metadata is included in the metadata attribute on the elements in + the output. """ + exactly_one(filename=filename, file=file) - # Recursively get runs. - def _get_runs(node, parent): - for child in node: - # If the child is a run, yield a Run object - if child.tag == qn("w:r"): - yield Run(child, parent) - # If the child is a hyperlink, search for runs within it recursively - if child.tag == qn("w:hyperlink"): - yield from _get_runs(child, parent) + def validate_filename(filename: str) -> str: + """Return path to a file confirmed to exist on the filesystem.""" + if not os.path.exists(filename): + raise ValueError(f"The file {filename} does not exist.") + return filename - return list(_get_runs(paragraph._element, paragraph)) + def copy_to_tempfile(file: BinaryIO) -> str: + """Return path to temporary copy of file to be converted.""" + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp.write(file.read()) + return tmp.name + + def extract_docx_filename(file_path: str) -> str: + """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """ + # -- a/b/foo.odt -> foo.odt -- + filename = os.path.basename(file_path) + # -- foo.odt -> foo -- + root_name, _ = os.path.splitext(filename) + # -- foo -> foo.docx -- + return f"{root_name}.docx" + file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(BinaryIO, file)) + + with tempfile.TemporaryDirectory() as tmpdir: + docx_path = os.path.join(tmpdir, extract_docx_filename(file_path)) + pypandoc.convert_file( # pyright: ignore + file_path, + "docx", + format=source_format, + outputfile=docx_path, + ) + elements = partition_docx( + filename=docx_path, + metadata_filename=metadata_filename, + include_metadata=include_metadata, + metadata_last_modified=metadata_last_modified, + ) -# Add the runs property to the Paragraph class -Paragraph.runs = property(lambda self: _get_paragraph_runs(self)) + return elements @process_metadata() @@ -117,13
+152,13 @@ def _get_runs(node, parent): @add_chunking_strategy() def partition_docx( filename: Optional[str] = None, - file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, + file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]] = None, metadata_filename: Optional[str] = None, include_page_breaks: bool = True, include_metadata: bool = True, metadata_last_modified: Optional[str] = None, chunking_strategy: Optional[str] = None, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Partitions Microsoft Word Documents in .docx format into its document elements. @@ -134,305 +169,625 @@ def partition_docx( file A file-like object using "rb" mode --> open(filename, "rb"). metadata_filename - The filename to use for the metadata. Relevant because partition_doc converts the - document to .docx before partition. We want the original source filename in the - metadata. + The filename to use for the metadata. Relevant because partition_doc converts the document + to .docx before partition. We want the original source filename in the metadata. metadata_last_modified The last modified date for the document. 
""" - - # Verify that only one of the arguments was provided + # -- verify that only one file-specifier argument was provided -- exactly_one(filename=filename, file=file) - last_modification_date = None - if filename is not None: - if not filename.startswith("/tmp"): - last_modification_date = get_last_modified_date(filename) - - document = docx.Document(filename) - elif file is not None: - last_modification_date = get_last_modified_date_from_file(file) - - document = docx.Document( - spooled_to_bytes_io_if_needed( - cast(Union[BinaryIO, SpooledTemporaryFile], file), - ), + return list( + _DocxPartitioner.iter_document_elements( + filename, + file, + metadata_filename, + include_page_breaks, + metadata_last_modified, ) + ) - elements: List[Element] = [] - table_index = 0 - - headers_and_footers = _get_headers_and_footers(document, metadata_filename) - if len(headers_and_footers) > 0: - elements.extend(headers_and_footers[0][0]) - - document_contains_pagebreaks = _element_contains_pagebreak(document._element) - page_number = 1 if document_contains_pagebreaks else None - section = 0 - is_list = False - for element_item in document.element.body: - if element_item.tag.endswith("tbl"): - table = document.tables[table_index] - emphasized_texts = _get_emphasized_texts_from_table(table) - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags( - emphasized_texts, - ) - html_table = convert_ms_office_table_to_text(table, as_html=True) - text_table = convert_ms_office_table_to_text(table, as_html=False) - element = Table(text_table) - if element is not None: - element.metadata = ElementMetadata( - text_as_html=html_table, - filename=metadata_filename, - page_number=page_number, - last_modified=metadata_last_modified or last_modification_date, - emphasized_text_contents=emphasized_text_contents, - emphasized_text_tags=emphasized_text_tags, - ) - elements.append(element) - table_index += 1 - elif element_item.tag.endswith("p"): - if "" in element_item.xml: 
- is_list = True - paragraph = docx.text.paragraph.Paragraph(element_item, document) - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags( - emphasized_texts, + +class _DocxPartitioner: + """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" + + # TODO: I think we can do better on metadata.filename. Should that only be populated when a + # `metadata_filename` argument was provided to `partition_docx()`? What about when not but + # we do get a `filename` arg or a `file` arg that has a `.name` attribute? + # TODO: get last-modified date from document-properties (stored in docx package) rather than + # relying on last filesystem-write date; maybe fall-back to filesystem-date. + # TODO: improve `._element_contains_pagebreak()`. It uses substring matching on the rendered + # XML text which is error-prone and not performant. Use XPath instead with the specific + # locations a page-break can be located. Also, there can be more than one, so return a + # count instead of a boolean. + # TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for + # `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like + # the "_element_contains_pagebreak()" function. + # TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is + # a list-item is encapsulated in a single place rather than distributed around the code. + # TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead + # of a substring match on the rendered XML. Include all permutations of how a numbered + # list can be manually applied (as opposed to by using a style). + # TODO: Move _SectBlockIterator upstream into `python-docx`. It requires too much + # domain-specific knowledge to comfortable here and is of general use so welcome in the + # library. 
+ # TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`. + + def __init__( + self, + filename: Optional[str], + file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]], + metadata_filename: Optional[str], + include_page_breaks: bool, + metadata_last_modified: Optional[str], + ) -> None: + self._filename = filename + self._file = file + self._metadata_filename = metadata_filename + self._include_page_breaks = include_page_breaks + self._metadata_last_modified = metadata_last_modified + self._page_counter: int = 1 + + @classmethod + def iter_document_elements( + cls, + filename: Optional[str] = None, + file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]] = None, + metadata_filename: Optional[str] = None, + include_page_breaks: bool = True, + metadata_last_modified: Optional[str] = None, + ) -> Iterator[Element]: + """Partition MS Word documents (.docx format) into its document elements.""" + return cls( + filename, + file, + metadata_filename, + include_page_breaks, + metadata_last_modified, + )._iter_document_elements() + + def _iter_document_elements(self) -> Iterator[Element]: + """Generate each document-element in (docx) `document` in document order.""" + for section_idx, section in enumerate(self._document.sections): + yield from self._iter_section_page_breaks(section_idx, section) + yield from self._iter_section_headers(section) + + for block_item in _SectBlockItemIterator.iter_sect_block_items(section, self._document): + # -- a block-item can only be a Paragraph ... -- + if isinstance(block_item, Paragraph): + yield from self._iter_paragraph_elements(block_item) + # -- a paragraph can contain a page-break -- + yield from self._iter_maybe_paragraph_page_breaks(block_item) + # -- ... 
or a Table -- + else: + yield from self._iter_table_element(block_item) + + yield from self._iter_section_footers(section) + + @lazyproperty + def _document(self) -> Document: + """The python-docx `Document` object loaded from file or filename.""" + filename, file = self._filename, self._file + + if filename is not None: + return docx.Document(filename) + + assert file is not None + if isinstance(file, SpooledTemporaryFile): + file.seek(0) + file = io.BytesIO(file.read()) + return docx.Document(file) + + @lazyproperty + def _document_contains_pagebreaks(self) -> bool: + """True when there is at least one page-break detected in the document.""" + return self._element_contains_pagebreak(self._document._element) + + def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool: + """True when `element` contains a page break. + + Checks for both "hard" page breaks (page breaks explicitly inserted by the user) + and "soft" page breaks, which are sometimes inserted by the MS Word renderer. + Note that soft page breaks aren't always present. Whether or not pages are + tracked may depend on your Word renderer. 
+ """ + page_break_indicators = [ + ["w:br", 'type="page"'], # "Hard" page break inserted by user + ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer + ] + if hasattr(element, "xml"): + for indicators in page_break_indicators: + if all(indicator in element.xml for indicator in indicators): + return True + return False + + def _increment_page_number(self) -> Iterator[PageBreak]: + """Increment page-number by 1 and generate a PageBreak element if enabled.""" + self._page_counter += 1 + if self._include_page_breaks: + yield PageBreak("") + + def _is_list_item(self, paragraph: Paragraph) -> bool: + """True when `paragraph` can be identified as a list-item.""" + if is_bulleted_text(paragraph.text): + return True + + return "" in paragraph._p.xml + + def _iter_paragraph_elements(self, paragraph: Paragraph) -> Iterator[Element]: + """Generate zero-or-one document element for `paragraph`. + + In Word, an empty paragraph is commonly used for inter-paragraph spacing. An empty paragraph + does not contribute to the document-element stream and will not cause an element to be + emitted. + """ + text = paragraph.text + + # -- blank paragraphs are commonly used for spacing between paragraphs and + # -- do not contribute to the document-element stream. 
+ if not text.strip(): + return + + metadata = self._paragraph_metadata(paragraph) + + # -- a list gets some special treatment -- + if self._is_list_item(paragraph): + clean_text = clean_bullets(text).strip() + if clean_text: + yield ListItem(text=clean_text, metadata=metadata) + return + + # -- determine element-type from an explicit Word paragraph-style if possible -- + TextSubCls = self._style_based_element_type(paragraph) + if TextSubCls: + yield TextSubCls(text=text, metadata=metadata) + return + + # -- try to recognize the element type by parsing its text -- + TextSubCls = self._parse_paragraph_text_for_element_type(paragraph) + if TextSubCls: + yield TextSubCls(text=text, metadata=metadata) + return + + # -- if all that fails we give it the default `Text` element-type -- + yield Text(text, metadata=metadata) + + def _iter_maybe_paragraph_page_breaks(self, paragraph: Paragraph) -> Iterator[PageBreak]: + """Generate a `PageBreak` document element for each page-break in `paragraph`. + + Checks for both "hard" page breaks (page breaks explicitly inserted by the user) + and "soft" page breaks, which are sometimes inserted by the MS Word renderer. + Note that soft page breaks aren't always present. Whether or not pages are + tracked may depend on your Word renderer. + """ + + def has_page_break_implementation_we_have_so_far() -> bool: + """Needs to become more sophisticated.""" + page_break_indicators = [ + ["w:br", 'type="page"'], # "Hard" page break inserted by user + ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer + ] + for indicators in page_break_indicators: + if all(indicator in paragraph._p.xml for indicator in indicators): + return True + return False + + if not has_page_break_implementation_we_have_so_far(): + return + + yield from self._increment_page_number() + + def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[Dict[str, str]]: + """Generate e.g. 
{"text": "MUST", "tag": "b"} for each emphasis in `paragraph`.""" + for run in paragraph.runs: + text = run.text.strip() if run.text else "" + if not text: + continue + if run.bold: + yield {"text": text, "tag": "b"} + if run.italic: + yield {"text": text, "tag": "i"} + + def _iter_section_footers(self, section: Section) -> Iterator[Footer]: + """Generate any `Footer` elements defined for this section. + + A Word document has up to three header and footer definition pairs for each document + section, a primary, first-page, and even-page header and footer. The first-page pair + applies only to the first page of the section (perhaps a title page or chapter start). The + even-page pair is used in book-bound documents where there are both recto and verso pages + (it is applied to verso (even-numbered) pages). A page where neither more specialized + footer applies uses the primary footer. + """ + + def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]: + """Generate zero-or-one Footer elements for `footer`.""" + if footer.is_linked_to_previous: + return + text = "\n".join([p.text for p in footer.paragraphs]) + if not text: + return + yield Footer( + text=text, + metadata=ElementMetadata( + filename=self._metadata_filename, + header_footer_type=header_footer_type, + ), ) - para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list) - if para_element is not None: - para_element.metadata = ElementMetadata( - filename=metadata_filename, - page_number=page_number, - last_modified=metadata_last_modified or last_modification_date, - emphasized_text_contents=emphasized_text_contents, - emphasized_text_tags=emphasized_text_tags, - ) - elements.append(para_element) - is_list = False - elif element_item.tag.endswith("sectPr"): - if len(headers_and_footers) > section: - footers = headers_and_footers[section][1] - elements.extend(footers) - - section += 1 - if len(headers_and_footers) > section: - headers = headers_and_footers[section][0] - 
elements.extend(headers) - - if page_number is not None and _element_contains_pagebreak(element_item): - page_number += 1 - if include_page_breaks: - elements.append(PageBreak(text="")) - return elements + yield from iter_footer(section.footer, "primary") + if section.different_first_page_header_footer: + yield from iter_footer(section.first_page_footer, "first_page") + if self._document.settings.odd_and_even_pages_header_footer: + yield from iter_footer(section.even_page_footer, "even_page") + + def _iter_section_headers(self, section: Section) -> Iterator[Header]: + """Generate `Header` elements for this section if it has them. + + See `._iter_section_footers()` docstring for more on docx headers and footers. + """ + + def iter_header(header: _Header, header_footer_type: str) -> Iterator[Header]: + """Generate zero-or-one Header elements for `header`.""" + if header.is_linked_to_previous: + return + text = "\n".join([p.text for p in header.paragraphs]) + if not text: + return + yield Header( + text=text, + metadata=ElementMetadata( + filename=self._metadata_filename, + header_footer_type=header_footer_type, + ), + ) + yield from iter_header(section.header, "primary") + if section.different_first_page_header_footer: + yield from iter_header(section.first_page_header, "first_page") + if self._document.settings.odd_and_even_pages_header_footer: + yield from iter_header(section.even_page_header, "even_page") + + def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Iterator[PageBreak]: + """Generate zero-or-one `PageBreak` document elements for `section`. + + A docx section has a "start" type which can be "continuous" (no page-break), "nextPage", + "evenPage", or "oddPage". For the next, even, and odd varieties, a `w:renderedPageBreak` + element signals one page break. Here we only need to handle the case where we need to add + another, for example to go from one odd page to another odd page and we need a total of + two page-breaks. 
+ """ + + def page_is_odd() -> bool: + return self._page_counter % 2 == 1 + + start_type = section.start_type + + # -- This method is called upon entering a new section, which happens before any paragraphs + # -- in that section are partitioned. A rendered page-break due to a section-start occurs + # -- in the first paragraph of the section and so occurs _later_ in the proces. Here we + # -- predict when two page breaks will be needed and emit one of them. The second will be + # -- emitted by the rendered page-break to follow. + + if start_type == WD_SECTION_START.EVEN_PAGE: + # -- on an even page we need two total, add one to supplement the rendered page break + # -- to follow. There is no "first-document-page" special case because 1 is odd. + if not page_is_odd(): + yield from self._increment_page_number() + + elif start_type == WD_SECTION_START.ODD_PAGE: + # -- the first page of the document is an implicit "new" odd-page, so no page-break -- + if section_idx == 0: + return + if page_is_odd(): + yield from self._increment_page_number() + + # -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of + # -- which need our help to get the page-breaks right. + return + + def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: + """Generate zero-or-one Table element for a DOCX `w:tbl` XML element.""" + # -- at present, we always generate exactly one Table element, but we might want + # -- to skip, for example, an empty table, or accommodate nested tables. 
+ + html_table = convert_ms_office_table_to_text(table, as_html=True) + text_table = convert_ms_office_table_to_text(table, as_html=False) + emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table) + + yield Table( + text_table, + metadata=ElementMetadata( + text_as_html=html_table, + filename=self._metadata_filename, + page_number=self._page_number, + last_modified=self._last_modified, + emphasized_text_contents=emphasized_text_contents or None, + emphasized_text_tags=emphasized_text_tags or None, + ), + ) -def _paragraph_to_element( - paragraph: docx.text.paragraph.Paragraph, - is_list=False, -) -> Optional[Text]: - """Converts a docx Paragraph object into the appropriate unstructured document element. - If the paragraph style is "Normal" or unknown, we try to predict the element type from the - raw text.""" - text = paragraph.text - style_name = paragraph.style and paragraph.style.name # .style can be None + def _iter_table_emphasis(self, table: DocxTable) -> Iterator[Dict[str, str]]: + """Generate e.g. {"text": "word", "tag": "b"} for each emphasis in `table`.""" + for row in table.rows: + for cell in row.cells: + for paragraph in cell.paragraphs: + yield from self._iter_paragraph_emphasis(paragraph) + + @lazyproperty + def _last_modified(self) -> Optional[str]: + """Last-modified date suitable for use in element metadata.""" + # -- if this file was converted from another format, any last-modified date for the file + # -- will be today, so we get it from the conversion step in `._metadata_last_modified`. + if self._metadata_last_modified: + return self._metadata_last_modified + + file_path, file = self._filename, self._file + + # -- if the file is on the filesystem, get its date from there -- + if file_path is not None: + return None if file_path.startswith("/tmp") else get_last_modified_date(file_path) + + # -- otherwise try getting it from the file-like object (unlikely since BytesIO and its + # -- brethren have no such metadata). 
+ assert file is not None + return get_last_modified_date_from_file(file) + + @property + def _page_number(self) -> Optional[int]: + """The current page number, or None if we can't really tell. + + Page numbers are not added to element metadata if we can't find any page-breaks in the + document (which may be a common case). + + In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual + page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the + target device. Explicit (hard) page-breaks are always recorded in the docx file but the + rendered page-breaks are only added optionally. + """ + return self._page_counter if self._document_contains_pagebreaks else None + + def _paragraph_emphasis(self, paragraph: Paragraph) -> Tuple[List[str], List[str]]: + """[contents, tags] pair describing emphasized text in `paragraph`.""" + iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph)) + return ([e["text"] for e in iter_p_emph], [e["tag"] for e in iter_p_emph_2]) + + def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata: + """ElementMetadata object describing `paragraph`.""" + emphasized_text_contents, emphasized_text_tags = self._paragraph_emphasis(paragraph) + return ElementMetadata( + filename=self._metadata_filename, + page_number=self._page_number, + last_modified=self._last_modified, + emphasized_text_contents=emphasized_text_contents or None, + emphasized_text_tags=emphasized_text_tags or None, + ) - if len(text.strip()) == 0: - return None + def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]: + """Attempt to differentiate the element-type by inspecting the raw text.""" + text = paragraph.text.strip() + + if len(text) < 2: + return None + if is_us_city_state_zip(text): + return Address + if is_email_address(text): + return EmailAddress + if is_possible_narrative_text(text): + return NarrativeText + if 
is_possible_title(text): + return Title - element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name) - - # NOTE(robinson) - The "Normal" style name will return None since it's in the mapping. - # Unknown style names will also return None - if is_list: - return _text_to_element(text, is_list) - elif element_class is None: - return _text_to_element(text) - else: - return element_class(text) - - -def _element_contains_pagebreak(element) -> bool: - """Detects if an element contains a page break. Checks for both "hard" page breaks - (page breaks inserted by the user) and "soft" page breaks, which are sometimes - inserted by the MS Word renderer. Note that soft page breaks aren't always present. - Whether or not pages are tracked may depend on your Word renderer.""" - page_break_indicators = [ - ["w:br", 'type="page"'], # "Hard" page break inserted by user - ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer - ] - if hasattr(element, "xml"): - for indicators in page_break_indicators: - if all(indicator in element.xml for indicator in indicators): - return True - return False - - -def _text_to_element(text: str, is_list=False) -> Optional[Text]: - """Converts raw text into an unstructured Text element.""" - if is_bulleted_text(text) or is_list: - clean_text = clean_bullets(text).strip() - return ListItem(text=clean_bullets(text)) if clean_text else None - - elif is_us_city_state_zip(text): - return Address(text=text) - elif is_email_address(text): - return EmailAddress(text=text) - if len(text) < 2: return None - elif is_possible_narrative_text(text): - return NarrativeText(text) - elif is_possible_title(text): - return Title(text) - else: - return Text(text) + def _style_based_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]: + """Element-type for `paragraph` based on its paragraph-style. + + Returns `None` when the style doesn't tell us anything useful, including when it + is the default "Normal" style. 
+ """ + # NOTE(robinson) - documentation on built-in styles at the link below: + # https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html \ + # #paragraph-styles-in-default-template + STYLE_TO_ELEMENT_MAPPING = { + "Caption": Text, # TODO(robinson) - add caption element type + "Heading 1": Title, + "Heading 2": Title, + "Heading 3": Title, + "Heading 4": Title, + "Heading 5": Title, + "Heading 6": Title, + "Heading 7": Title, + "Heading 8": Title, + "Heading 9": Title, + "Intense Quote": Text, # TODO(robinson) - add quote element type + "List": ListItem, + "List 2": ListItem, + "List 3": ListItem, + "List Bullet": ListItem, + "List Bullet 2": ListItem, + "List Bullet 3": ListItem, + "List Continue": ListItem, + "List Continue 2": ListItem, + "List Continue 3": ListItem, + "List Number": ListItem, + "List Number 2": ListItem, + "List Number 3": ListItem, + "List Paragraph": ListItem, + "Macro Text": Text, + "No Spacing": Text, + "Quote": Text, # TODO(robinson) - add quote element type + "Subtitle": Title, + "TOCHeading": Title, + "Title": Title, + } + + # -- paragraph.style can be None in rare cases, so can style.name. That's going + # -- to mean default style which is equivalent to "Normal" for our purposes. + style_name = (paragraph.style and paragraph.style.name) or "Normal" + + # NOTE(robinson) - The "Normal" style name will return None since it's not + # in the mapping. Unknown style names will also return None. + return STYLE_TO_ELEMENT_MAPPING.get(style_name) + + def _table_emphasis(self, table: DocxTable) -> Tuple[List[str], List[str]]: + """[contents, tags] pair describing emphasized text in `table`.""" + iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table)) + return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2]) + + +class _SectBlockItemIterator: + """Generates the block-items in a section. + + A block item is a docx Paragraph or Table. 
This small class is separated from + `_SectBlockElementIterator` because these two aspects will live in different places upstream. + This makes them easier to transplant, which we expect to do soon. + """ -def _join_paragraphs(paragraphs: List[docx.text.paragraph.Paragraph]) -> Optional[str]: - return "\n".join([paragraph.text for paragraph in paragraphs]) + @classmethod + def iter_sect_block_items(cls, section: Section, document: Document) -> Iterator[BlockItem]: + """Generate each Paragraph or Table object in `section`.""" + for element in _SectBlockElementIterator.iter_sect_block_elements(section._sectPr): + yield ( + Paragraph(element, document) + if isinstance(element, CT_P) + else DocxTable(element, document) + ) -def _get_headers_and_footers( - document: docx.document.Document, - metadata_filename: Optional[str], -) -> List[Tuple[List[Header], List[Footer]]]: - headers_and_footers = [] - attr_prefixes = ["", "first_page_", "even_page_"] +class _SectBlockElementIterator: + """Generates the block-item XML elements in a section. - for section in document.sections: - headers = [] - footers = [] + A block-item element is a `CT_P` (paragraph) or a `CT_Tbl` (table). 
+ """ - for _type in ["header", "footer"]: - for prefix in attr_prefixes: - _elem = getattr(section, f"{prefix}{_type}", None) - if _elem is None: - continue + _compiled_blocks_xpath: Optional[etree.XPath] = None + _compiled_count_xpath: Optional[etree.XPath] = None + + def __init__(self, sectPr: CT_SectPr): + self._sectPr = sectPr + + @classmethod + def iter_sect_block_elements(cls, sectPr: CT_SectPr) -> Iterator[BlockElement]: + """Generate each CT_P or CT_Tbl element within the extents governed by `sectPr`.""" + return cls(sectPr)._iter_sect_block_elements() + + def _iter_sect_block_elements(self) -> Iterator[BlockElement]: + """Generate each CT_P or CT_Tbl element in section.""" + # -- General strategy is to get all block ( and ) elements from start of doc + # -- to and including this section, then compute the count of those elements that came + # -- from prior sections and skip that many to leave only the ones in this section. It's + # -- possible to express this "between here and there" (end of prior section and end of + # -- this one) concept in XPath, but it would be harder to follow because there are + # -- special cases (e.g. no prior section) and the boundary expressions are fairly hairy. + # -- I also believe it would be computationally more expensive than doing it this + # -- straighforward albeit (theoretically) slightly wasteful way. 
+ + sectPr, sectPrs = self._sectPr, self._sectPrs + sectPr_idx = sectPrs.index(sectPr) + + # -- count block items belonging to prior sections -- + n_blks_to_skip = ( + 0 + if sectPr_idx == 0 + else self._count_of_blocks_in_and_above_section(sectPrs[sectPr_idx - 1]) + ) - text = _join_paragraphs(_elem.paragraphs) - if text: - header_footer_type = prefix[:-1] or "primary" - metadata = ElementMetadata( - filename=metadata_filename, - header_footer_type=header_footer_type, - ) + # -- and skip those in set of all blks from doc start to end of this section -- + for element in self._blocks_in_and_above_section(sectPr)[n_blks_to_skip:]: + yield element + + def _blocks_in_and_above_section(self, sectPr: CT_SectPr) -> Sequence[BlockElement]: + """All ps and tbls in section defined by `sectPr` and all prior sections.""" + if self._compiled_blocks_xpath is None: + self._compiled_blocks_xpath = etree.XPath( + self._blocks_in_and_above_section_xpath, + namespaces=nsmap, + regexp=False, + ) + xpath = self._compiled_blocks_xpath + # -- XPath callable results are Any (basically), so need a cast -- + return cast(Sequence[BlockElement], xpath(sectPr)) + + @lazyproperty + def _blocks_in_and_above_section_xpath(self) -> str: + """XPath expr for ps and tbls in context of a sectPr and all prior sectPrs.""" + # -- "p_sect" is a section with sectPr located at w:p/w:pPr/w:sectPr. "body_sect" is a + # -- section with sectPr located at w:body/w:sectPr. The last section in the document is a + # -- "body_sect". All others are of the "p_sect" variety. "term" means "terminal", like + # -- the last p or tbl in the section. "pred" means "predecessor", like a preceding p or + # -- tbl in the section. 
+ + # -- the terminal block in a p-based sect is the p the sectPr appears in -- + p_sect_term_block = "./parent::w:pPr/parent::w:p" + # -- the terminus of a body-based sect is the sectPr itself (not a block) -- + body_sect_term = "self::w:sectPr[parent::w:body]" + # -- all the ps and tbls preceding (but not including) the context node -- + pred_ps_and_tbls = "preceding-sibling::*[self::w:p | self::w:tbl]" + + # -- p_sect_term_block and body_sect_term(inus) are mutually exclusive. So the result is + # -- either the union of nodes found by the first two selectors or the nodes found by the + # -- last selector, never both. + return ( + # -- include the p containing a sectPr -- + f"{p_sect_term_block}" + # -- along with all the blocks that precede it -- + f" | {p_sect_term_block}/{pred_ps_and_tbls}" + # -- or all the preceding blocks if sectPr is body-based (last sectPr) -- + f" | {body_sect_term}/{pred_ps_and_tbls}" + ) - if _type == "header": - headers.append(Header(text=text, metadata=metadata)) - elif _type == "footer": - footers.append(Footer(text=text, metadata=metadata)) + def _count_of_blocks_in_and_above_section(self, sectPr: CT_SectPr) -> int: + """All ps and tbls in section defined by `sectPr` and all prior sections.""" + if self._compiled_count_xpath is None: + self._compiled_count_xpath = etree.XPath( + f"count({self._blocks_in_and_above_section_xpath})", + namespaces=nsmap, + regexp=False, + ) + xpath = self._compiled_count_xpath + # -- numeric XPath results are always float, so need an int() conversion -- + return int(cast(float, xpath(sectPr))) + + @lazyproperty + def _sectPrs(self) -> Sequence[CT_SectPr]: + """All w:sectPr elements in document, in document-order.""" + return self._sectPr.xpath( + "/w:document/w:body/w:p/w:pPr/w:sectPr | /w:document/w:body/w:sectPr" + ) - headers_and_footers.append((headers, footers)) - return headers_and_footers +# == monkey-patch docx.text.Paragraph.runs =========================================== -def 
convert_and_partition_docx( - source_format: str, - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - include_metadata: bool = True, - metadata_filename: Optional[str] = None, - metadata_last_modified: Optional[str] = None, -) -> List[Element]: - """Converts a document to DOCX and then partitions it using partition_docx. Works with - any file format support by pandoc. + def _get_paragraph_runs(paragraph: Paragraph) -> Sequence[Run]: + """Gets all runs in paragraph, including hyperlinks python-docx skips. - Parameters - ---------- - source_format - The format of the source document, .e.g. odt - filename - A string defining the target filename path. - file - A file-like object using "rb" mode --> open(filename, "rb"). - include_metadata - Determines whether or not metadata is included in the metadata attribute on the - elements in the output. - """ - if filename is None: - filename = "" - exactly_one(filename=filename, file=file) - - if len(filename) > 0: - _, filename_no_path = os.path.split(os.path.abspath(filename)) - base_filename, _ = os.path.splitext(filename_no_path) - if not os.path.exists(filename): - raise ValueError(f"The file {filename} does not exist.") - elif file is not None: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.write(file.read()) - tmp.close() - filename = tmp.name - _, filename_no_path = os.path.split(os.path.abspath(tmp.name)) - - base_filename, _ = os.path.splitext(filename_no_path) + Without this, the default runs function skips over hyperlinks. - with tempfile.TemporaryDirectory() as tmpdir: - docx_filename = os.path.join(tmpdir, f"{base_filename}.docx") - pypandoc.convert_file( - filename, - "docx", - format=source_format, - outputfile=docx_filename, - ) - elements = partition_docx( - filename=docx_filename, - metadata_filename=metadata_filename, - include_metadata=include_metadata, - metadata_last_modified=metadata_last_modified, - ) + Args: + paragraph (Paragraph): A Paragraph object. 
- return elements + Returns: + list: A list of Run objects. + """ + def _get_runs(node: BaseOxmlElement, parent: Paragraph) -> Iterator[Run]: + """Recursively get runs.""" + for child in node: + # -- the Paragraph has runs as direct children -- + if child.tag == qn("w:r"): + yield Run(cast(CT_R, child), parent) + continue + # -- but it also has hyperlink children that themselves contain runs, so + # -- recurse into those + if child.tag == qn("w:hyperlink"): + yield from _get_runs(child, parent) -def _get_emphasized_texts_from_paragraph(paragraph: Paragraph) -> List[dict]: - """Get emphasized texts with bold/italic formatting from a paragraph in MS Word""" - emphasized_texts = [] - for run in paragraph.runs: - text = run.text.strip() if run.text else None - if not text: - continue - if run.bold: - emphasized_texts.append({"text": text, "tag": "b"}) - if run.italic: - emphasized_texts.append({"text": text, "tag": "i"}) - return emphasized_texts - - -def _get_emphasized_texts_from_table(table: DocxTable) -> List[dict]: - emphasized_texts = [] - for row in table.rows: - for cell in row.cells: - for paragraph in cell.paragraphs: - _emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) - emphasized_texts += _emphasized_texts - return emphasized_texts - - -def _extract_contents_and_tags( - emphasized_texts: List[dict], -) -> Tuple[Optional[List[str]], Optional[List[str]]]: - """ - Extract the text contents and tags from a list of dictionaries containing emphasized texts. + return list(_get_runs(paragraph._element, paragraph)) - Args: - - emphasized_texts (List[dict]): A list containing dictionaries with keys "text" and "tag". - Returns: - - Tuple[List[str], List[str]]: A tuple containing two lists - - one for text contents and one for tags extracted from the input. 
- """ - emphasized_text_contents = ( - [emphasized_text["text"] for emphasized_text in emphasized_texts] - if emphasized_texts - else None - ) - emphasized_text_tags = ( - [emphasized_text["tag"] for emphasized_text in emphasized_texts] - if emphasized_texts - else None - ) +Paragraph.runs = property( # pyright: ignore[reportGeneralTypeIssues] + lambda self: _get_paragraph_runs(self) +) - return emphasized_text_contents, emphasized_text_tags +# ==================================================================================== diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index f4771da7d5..33d7d7a469 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -54,10 +54,7 @@ def partition_json( last_modification_date = get_last_modified_date_from_file(file) file_content = file.read() - if isinstance(file_content, str): - file_text = file_content - else: - file_text = file_content.decode() + file_text = file_content if isinstance(file_content, str) else file_content.decode() file.seek(0) elif text is not None: diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index 37b00457bd..5794d934d5 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -1,4 +1,4 @@ -from typing import IO, List, Optional +from typing import Any, BinaryIO, List, Optional from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata @@ -15,12 +15,12 @@ @add_chunking_strategy() def partition_odt( filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, + file: Optional[BinaryIO] = None, include_metadata: bool = True, metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, chunking_strategy: Optional[str] = None, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Partitions Open Office Documents in .odt format into its document elements. 
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 9c011a7226..8461e128eb 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -104,8 +104,8 @@ def partition_pptx( if shape.has_table: table: pptx.table.Table = shape.table html_table = convert_ms_office_table_to_text(table, as_html=True) - text_table = convert_ms_office_table_to_text(table, as_html=False) - if (text_table := text_table.strip()) != "": + text_table = convert_ms_office_table_to_text(table, as_html=False).strip() + if text_table: metadata = ElementMetadata( filename=metadata_filename or filename, text_as_html=html_table, diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 87fbe9e3ba..820e8ac946 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -295,7 +295,7 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool: return ratio > threshold -def is_us_city_state_zip(text) -> bool: +def is_us_city_state_zip(text: str) -> bool: """Checks if the given text is in the format of US city/state/zip code. 
Examples @@ -307,7 +307,7 @@ def is_us_city_state_zip(text) -> bool: return US_CITY_STATE_ZIP_RE.match(text.strip()) is not None -def is_email_address(text) -> bool: +def is_email_address(text: str) -> bool: """Check if the given text is the email address""" return EMAIL_ADDRESS_PATTERN_RE.match(text.strip()) is not None diff --git a/unstructured/staging/prodigy.py b/unstructured/staging/prodigy.py index e4d5a99ca9..ba822ce423 100644 --- a/unstructured/staging/prodigy.py +++ b/unstructured/staging/prodigy.py @@ -28,9 +28,7 @@ def _validate_prodigy_metadata( ) if isinstance(id_error_index, int): raise ValueError( - 'The key "id" is not allowed with metadata parameter at index: {index}'.format( - index=id_error_index, - ), + f'The key "id" is not allowed with metadata parameter at index: {id_error_index}' ) validated_metadata = metadata else: diff --git a/unstructured/utils.py b/unstructured/utils.py index 5a10af77ce..6a5fa83f9a 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -1,12 +1,125 @@ +import functools import importlib import json from datetime import datetime from functools import wraps -from typing import Dict, List, Optional, Union +from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union, cast DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z") +_T = TypeVar("_T") + + +class lazyproperty(Generic[_T]): + """Decorator like @property, but evaluated only on first access. + + Like @property, this can only be used to decorate methods having only a `self` parameter, and + is accessed like an attribute on an instance, i.e. trailing parentheses are not used. Unlike + @property, the decorated method is only evaluated on first access; the resulting value is + cached and that same value returned on second and later access without re-evaluation of the + method. 
+ + Like @property, this class produces a *data descriptor* object, which is stored in the __dict__ + of the *class* under the name of the decorated method ('fget' nominally). The cached value is + stored in the __dict__ of the *instance* under that same name. + + Because it is a data descriptor (as opposed to a *non-data descriptor*), its `__get__()` method + is executed on each access of the decorated attribute; the __dict__ item of the same name is + "shadowed" by the descriptor. + + While this may represent a performance improvement over a property, its greater benefit may be + its other characteristics. One common use is to construct collaborator objects, removing that + "real work" from the constructor, while still only executing once. It also de-couples client + code from any sequencing considerations; if it's accessed from more than one location, it's + assured it will be ready whenever needed. + + Loosely based on: https://stackoverflow.com/a/6849299/1902513. + + A lazyproperty is read-only. There is no counterpart to the optional "setter" (or deleter) + behavior of an @property. This is critically important to maintaining its immutability and + idempotence guarantees. Attempting to assign to a lazyproperty raises AttributeError + unconditionally. + + The parameter names in the methods below correspond to this usage example:: + + class Obj(object) + + @lazyproperty + def fget(self): + return 'some result' + + obj = Obj() + + Not suitable for wrapping a function (as opposed to a method) because it is not callable. + """ + + def __init__(self, fget: Callable[..., _T]) -> None: + """*fget* is the decorated method (a "getter" function). + + A lazyproperty is read-only, so there is only an *fget* function (a regular + @property can also have an fset and fdel function). This name was chosen for + consistency with Python's `property` class which uses this name for the + corresponding parameter. 
+ """ + # --- maintain a reference to the wrapped getter method + self._fget = fget + # --- and store the name of that decorated method + self._name = fget.__name__ + # --- adopt fget's __name__, __doc__, and other attributes + functools.update_wrapper(self, fget) # pyright: ignore + + def __get__(self, obj: Any, type: Any = None) -> _T: + """Called on each access of 'fget' attribute on class or instance. + + *self* is this instance of a lazyproperty descriptor "wrapping" the property + method it decorates (`fget`, nominally). + + *obj* is the "host" object instance when the attribute is accessed from an + object instance, e.g. `obj = Obj(); obj.fget`. *obj* is None when accessed on + the class, e.g. `Obj.fget`. + + *type* is the class hosting the decorated getter method (`fget`) on both class + and instance attribute access. + """ + # --- when accessed on class, e.g. Obj.fget, just return this descriptor + # --- instance (patched above to look like fget). + if obj is None: + return self # type: ignore + + # --- when accessed on instance, start by checking instance __dict__ for + # --- item with key matching the wrapped function's name + value = obj.__dict__.get(self._name) + if value is None: + # --- on first access, the __dict__ item will be absent. Evaluate fget() + # --- and store that value in the (otherwise unused) host-object + # --- __dict__ value of same name ('fget' nominally) + value = self._fget(obj) + obj.__dict__[self._name] = value + return cast(_T, value) + + def __set__(self, obj: Any, value: Any) -> None: + """Raises unconditionally, to preserve read-only behavior. + + This decorator is intended to implement immutable (and idempotent) object + attributes. For that reason, assignment to this property must be explicitly + prevented. + + If this __set__ method was not present, this descriptor would become a + *non-data descriptor*. 
That would be nice because the cached value would be + accessed directly once set (__dict__ attrs have precedence over non-data + descriptors on instance attribute lookup). The problem is, there would be + nothing to stop assignment to the cached value, which would overwrite the result + of `fget()` and break both the immutability and idempotence guarantees of this + decorator. + + The performance with this __set__() method in place was roughly 0.4 usec per + access when measured on a 2.8GHz development machine; so quite snappy and + probably not a rich target for optimization efforts. + """ + raise AttributeError("can't set attribute") + + def save_as_jsonl(data: List[Dict], filename: str) -> None: with open(filename, "w+") as output_file: output_file.writelines(json.dumps(datum) + "\n" for datum in data) @@ -47,7 +160,7 @@ def wrapper(*args, **kwargs): return decorator -def dependency_exists(dependency): +def dependency_exists(dependency: str): try: importlib.import_module(dependency) except ImportError as e: