From b54994ae9584e528d3d009c5d9a4870624243790 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 19 Sep 2023 15:32:46 -0700 Subject: [PATCH] rfctr: docx partitioning (#1422) Reviewers: I recommend reviewing commit-by-commit or just looking at the final version of `partition/docx.py` as View File. This refactor solves a few problems but mostly lays the groundwork to allow us to refine further aspects such as page-break detection, list-item detection, and moving python-docx internals upstream to that library so our work doesn't depend on that domain-knowledge. --- .gitignore | 5 +- CHANGELOG.md | 1 + Makefile | 2 +- docs/source/introduction/getting_started.rst | 2 +- docs/source/metadata.rst | 9 +- pyproject.toml | 25 + scripts/collect_env.py | 19 +- scripts/performance/run_partition.py | 6 +- setup.cfg | 2 + test_unstructured/partition/docx/test_docx.py | 60 +- typings/docx/__init__.pyi | 3 + typings/docx/api.pyi | 5 + typings/docx/blkcntnr.pyi | 12 + typings/docx/document.pyi | 22 + typings/docx/enum/section.pyi | 11 + typings/docx/oxml/__init__.pyi | 7 + typings/docx/oxml/document.pyi | 10 + typings/docx/oxml/ns.pyi | 5 + typings/docx/oxml/section.pyi | 7 + typings/docx/oxml/table.pyi | 3 + typings/docx/oxml/text/paragraph.pyi | 3 + typings/docx/oxml/text/parfmt.pyi | 3 + typings/docx/oxml/text/run.pyi | 9 + typings/docx/oxml/xmlchemy.pyi | 17 + typings/docx/section.pyi | 33 + typings/docx/settings.pyi | 5 + typings/docx/shared.pyi | 16 + typings/docx/styles/style.pyi | 8 + typings/docx/table.pyi | 21 + typings/docx/text/__init__.pyi | 0 typings/docx/text/paragraph.pyi | 19 + typings/docx/text/run.pyi | 13 + typings/pptx/__init__.pyi | 3 + typings/pptx/api.pyi | 5 + typings/pptx/oxml/__init__.py | 9 + typings/pptx/oxml/text.pyi | 6 + typings/pptx/oxml/xmlchemy.pyi | 17 + typings/pptx/presentation.pyi | 11 + typings/pptx/shapes/__init__.py | 2 + typings/pptx/shapes/autoshape.pyi | 6 + typings/pptx/shapes/base.pyi | 9 + typings/pptx/shapes/graphfrm.pyi | 6 + 
typings/pptx/shapes/group.pyi | 6 + typings/pptx/shapes/shapetree.pyi | 18 + typings/pptx/shared.pyi | 3 + typings/pptx/slide.pyi | 31 + typings/pptx/table.pyi | 1 + typings/pptx/text/text.pyi | 13 + typings/pptx/util.pyi | 4 + typings/pypandoc/__init__.pyi | 0 unstructured/cleaners/core.py | 2 +- unstructured/documents/elements.py | 30 +- unstructured/documents/html.py | 7 +- unstructured/partition/common.py | 11 +- unstructured/partition/docx.py | 1017 +++++++++++------ unstructured/partition/json.py | 5 +- unstructured/partition/odt.py | 6 +- unstructured/partition/pptx.py | 4 +- unstructured/partition/text_type.py | 4 +- unstructured/staging/prodigy.py | 4 +- unstructured/utils.py | 117 +- 61 files changed, 1286 insertions(+), 434 deletions(-) create mode 100644 pyproject.toml create mode 100644 typings/docx/__init__.pyi create mode 100644 typings/docx/api.pyi create mode 100644 typings/docx/blkcntnr.pyi create mode 100644 typings/docx/document.pyi create mode 100644 typings/docx/enum/section.pyi create mode 100644 typings/docx/oxml/__init__.pyi create mode 100644 typings/docx/oxml/document.pyi create mode 100644 typings/docx/oxml/ns.pyi create mode 100644 typings/docx/oxml/section.pyi create mode 100644 typings/docx/oxml/table.pyi create mode 100644 typings/docx/oxml/text/paragraph.pyi create mode 100644 typings/docx/oxml/text/parfmt.pyi create mode 100644 typings/docx/oxml/text/run.pyi create mode 100644 typings/docx/oxml/xmlchemy.pyi create mode 100644 typings/docx/section.pyi create mode 100644 typings/docx/settings.pyi create mode 100644 typings/docx/shared.pyi create mode 100644 typings/docx/styles/style.pyi create mode 100644 typings/docx/table.pyi create mode 100644 typings/docx/text/__init__.pyi create mode 100644 typings/docx/text/paragraph.pyi create mode 100644 typings/docx/text/run.pyi create mode 100644 typings/pptx/__init__.pyi create mode 100644 typings/pptx/api.pyi create mode 100644 typings/pptx/oxml/__init__.py create mode 100644 
typings/pptx/oxml/text.pyi create mode 100644 typings/pptx/oxml/xmlchemy.pyi create mode 100644 typings/pptx/presentation.pyi create mode 100644 typings/pptx/shapes/__init__.py create mode 100644 typings/pptx/shapes/autoshape.pyi create mode 100644 typings/pptx/shapes/base.pyi create mode 100644 typings/pptx/shapes/graphfrm.pyi create mode 100644 typings/pptx/shapes/group.pyi create mode 100644 typings/pptx/shapes/shapetree.pyi create mode 100644 typings/pptx/shared.pyi create mode 100644 typings/pptx/slide.pyi create mode 100644 typings/pptx/table.pyi create mode 100644 typings/pptx/text/text.pyi create mode 100644 typings/pptx/util.pyi create mode 100644 typings/pypandoc/__init__.pyi diff --git a/.gitignore b/.gitignore index 8353474c5c..f7efde4599 100644 --- a/.gitignore +++ b/.gitignore @@ -132,6 +132,9 @@ dmypy.json # Pyre type checker .pyre/ +# pyright (Python LSP/type-checker in VSCode) config +/pyrightconfig.json + # ingest outputs /structured-output @@ -194,4 +197,4 @@ unstructured-inference/ example-docs/*_images examples/**/output/ -outputdiff.txt \ No newline at end of file +outputdiff.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index e0557ab94e..b230b79587 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements * **Adds data source properties to Airtable, Confluence, Discord, Elasticsearch, Google Drive, and Wikipedia connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **DOCX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except in multi-section documents containing different headers/footers for different sections. 
These will now emit all distinct headers and footers encountered instead of just those for the last section. ### Features diff --git a/Makefile b/Makefile index 19051c435b..51ede92052 100644 --- a/Makefile +++ b/Makefile @@ -324,7 +324,7 @@ check: check-src check-tests check-version ## check-src: runs linters (source only, no tests) .PHONY: check-src check-src: - ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore PT011,PT012,SIM117 + ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore COM812,PT011,PT012,SIM117 black --line-length 100 ${PACKAGE_NAME} --check flake8 ${PACKAGE_NAME} mypy ${PACKAGE_NAME} --ignore-missing-imports --check-untyped-defs diff --git a/docs/source/introduction/getting_started.rst b/docs/source/introduction/getting_started.rst index 60e9daaddf..c659046817 100644 --- a/docs/source/introduction/getting_started.rst +++ b/docs/source/introduction/getting_started.rst @@ -34,7 +34,7 @@ After installation, confirm the setup by executing the below Python code: .. code-block:: python from unstructured.partition.auto import partition - elements = partition(filename="example-docs/fake-email.eml") + elements = partition(filename="example-docs/eml/fake-email.eml") If you've opted for the "local-inference" installation, you should also be able to execute: diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst index d114254ec5..3f406d4028 100644 --- a/docs/source/metadata.rst +++ b/docs/source/metadata.rst @@ -26,12 +26,13 @@ Some document types support location data for the elements, usually in the form If it exists, an element's location data is available with ``element.metadata.coordinates``. The ``coordinates`` property of an ``ElementMetadata`` stores: + * points: These specify the corners of the bounding box starting from the top left corner and -proceeding counter-clockwise. The points represent pixels, the origin is in the top left and -the ``y`` coordinate increases in the downward direction. 
+ proceeding counter-clockwise. The points represent pixels, the origin is in the top left and + the ``y`` coordinate increases in the downward direction. * system: The points have an associated coordinate system. A typical example of a coordinate system is -``PixelSpace``, which is used for representing the coordinates of images. The coordinate system has a -name, orientation, layout width, and layout height. + ``PixelSpace``, which is used for representing the coordinates of images. The coordinate system has a + name, orientation, layout width, and layout height. Information about the element’s coordinates (including the coordinate system name, coordinate points, the layout width, and the layout height) can be accessed with `element.to_dict()["metadata"]["coordinates"]`. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..50069ef642 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.black] +line-length = 100 + +[tool.ruff] +line-length = 100 +select = [ + "C4", # -- flake8-comprehensions -- + "COM", # -- flake8-commas -- + "E", # -- pycodestyle errors -- + "F", # -- pyflakes -- + "I", # -- isort (imports) -- + "PLR0402", # -- manual from-import, e.g. `import a.b as b` should be `from a import b` -- + "PT", # -- flake8-pytest-style -- + "SIM", # -- flake8-simplify -- + "UP015", # -- redundant `open()` mode parameter (like "r" is default) -- + "UP018", # -- Unnecessary {literal_type} call like `str("abc")`. 
(rewrite as a literal) -- + "UP032", # -- Use f-string instead of `.format()` call -- + "UP034", # -- Avoid extraneous parentheses -- +] +ignore = [ + "COM812", # -- over aggressively insists on trailing commas where not desirable -- + "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- + "PT012", # -- pytest.raises() block should contain a single simple statement -- + "SIM117", # -- merge `with` statements for context managers that have same scope -- +] diff --git a/scripts/collect_env.py b/scripts/collect_env.py index f0649147b1..6cb48e0f25 100644 --- a/scripts/collect_env.py +++ b/scripts/collect_env.py @@ -40,7 +40,7 @@ def get_os_version(): return platform.platform() -def is_python_package_installed(package_name): +def is_python_package_installed(package_name: str): """ Check if a Python package is installed @@ -57,14 +57,10 @@ def is_python_package_installed(package_name): check=True, ) - for line in result.stdout.splitlines(): - if line.lower().startswith(package_name.lower()): - return True - - return False + return any(line.lower().startswith(package_name.lower()) for line in result.stdout.splitlines()) -def is_brew_package_installed(package_name): +def is_brew_package_installed(package_name: str): """ Check if a Homebrew package is installed @@ -95,11 +91,7 @@ def is_brew_package_installed(package_name): check=True, ) - for line in result.stdout.splitlines(): - if line.lower().startswith(package_name.lower()): - return True - - return False + return any(line.lower().startswith(package_name.lower()) for line in result.stdout.splitlines()) def get_python_package_version(package_name): @@ -221,8 +213,7 @@ def main(): ): print( "PaddleOCR version: ", - get_python_package_version("paddlepaddle") - or get_python_package_version("paddleocr"), + get_python_package_version("paddlepaddle") or get_python_package_version("paddleocr"), ) else: print("PaddleOCR is not installed") diff --git a/scripts/performance/run_partition.py 
b/scripts/performance/run_partition.py index 4da380f02e..3710f02c64 100644 --- a/scripts/performance/run_partition.py +++ b/scripts/performance/run_partition.py @@ -13,11 +13,7 @@ file_path = sys.argv[1] strategy = sys.argv[2] - model_name = None - if len(sys.argv) > 3: - model_name = sys.argv[3] - else: - model_name = os.environ.get("PARTITION_MODEL_NAME") + model_name = sys.argv[3] if len(sys.argv) > 3 else os.environ.get("PARTITION_MODEL_NAME") result = partition(file_path, strategy=strategy, model_name=model_name) # access element in the return value to make sure we got something back, otherwise error result[1] diff --git a/setup.cfg b/setup.cfg index a06a3629b5..ae8174cacc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,3 +7,5 @@ max-line-length = 100 [tool:pytest] filterwarnings = ignore::DeprecationWarning +python_classes = Test Describe +python_functions = test_ it_ they_ but_ and_ diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index b73caa0e70..77d8b79751 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -1,5 +1,8 @@ +# pyright: reportPrivateUsage=false + import os from tempfile import SpooledTemporaryFile +from typing import Dict, List import docx import pytest @@ -16,12 +19,7 @@ Title, ) from unstructured.partition.doc import partition_doc -from unstructured.partition.docx import ( - _extract_contents_and_tags, - _get_emphasized_texts_from_paragraph, - _get_emphasized_texts_from_table, - partition_docx, -) +from unstructured.partition.docx import _DocxPartitioner, partition_docx from unstructured.partition.json import partition_json from unstructured.staging.base import elements_to_json @@ -316,52 +314,46 @@ def test_partition_docx_from_file_without_metadata_date( assert elements[0].metadata.last_modified is None -def test_get_emphasized_texts_from_paragraph( - expected_emphasized_texts, - 
filename="example-docs/fake-doc-emphasized-text.docx", -): - document = docx.Document(filename) - paragraph = document.paragraphs[1] - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) +def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dict[str, str]]): + partitioner = _DocxPartitioner( + "example-docs/fake-doc-emphasized-text.docx", None, None, False, None + ) + paragraph = partitioner._document.paragraphs[1] + emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "I am a bold italic bold-italic text." assert emphasized_texts == expected_emphasized_texts - paragraph = document.paragraphs[2] - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) + paragraph = partitioner._document.paragraphs[2] + emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "" assert emphasized_texts == [] - paragraph = document.paragraphs[3] - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) + paragraph = partitioner._document.paragraphs[3] + emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "I am a normal text." 
assert emphasized_texts == [] -def test_get_emphasized_texts_from_table( - expected_emphasized_texts, - filename="example-docs/fake-doc-emphasized-text.docx", -): - document = docx.Document(filename) - table = document.tables[0] - emphasized_texts = _get_emphasized_texts_from_table(table) +def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]): + partitioner = _DocxPartitioner( + "example-docs/fake-doc-emphasized-text.docx", None, None, False, None + ) + table = partitioner._document.tables[0] + emphasized_texts = list(partitioner._iter_table_emphasis(table)) assert emphasized_texts == expected_emphasized_texts -def test_extract_contents_and_tags( - expected_emphasized_texts, - expected_emphasized_text_contents, - expected_emphasized_text_tags, +def test_table_emphasis( + expected_emphasized_text_contents: List[str], expected_emphasized_text_tags: List[str] ): - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags( - expected_emphasized_texts, + partitioner = _DocxPartitioner( + "example-docs/fake-doc-emphasized-text.docx", None, None, False, None ) + table = partitioner._document.tables[0] + emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table) assert emphasized_text_contents == expected_emphasized_text_contents assert emphasized_text_tags == expected_emphasized_text_tags - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags([]) - assert emphasized_text_contents is None - assert emphasized_text_tags is None - @pytest.mark.parametrize( ("filename", "partition_func"), diff --git a/typings/docx/__init__.pyi b/typings/docx/__init__.pyi new file mode 100644 index 0000000000..a5cb78f3e1 --- /dev/null +++ b/typings/docx/__init__.pyi @@ -0,0 +1,3 @@ +from docx.api import Document + +__all__ = ["Document"] diff --git a/typings/docx/api.pyi b/typings/docx/api.pyi new file mode 100644 index 0000000000..9b3ff122d6 --- /dev/null +++ b/typings/docx/api.pyi @@ -0,0 +1,5 @@ +from 
typing import BinaryIO, Optional, Union + +import docx.document + +def Document(docx: Optional[Union[str, BinaryIO]] = None) -> docx.document.Document: ... diff --git a/typings/docx/blkcntnr.pyi b/typings/docx/blkcntnr.pyi new file mode 100644 index 0000000000..9e09ea8c2a --- /dev/null +++ b/typings/docx/blkcntnr.pyi @@ -0,0 +1,12 @@ +from typing import Sequence + +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.table import Table +from docx.text.paragraph import Paragraph + +class BlockItemContainer: + _element: BaseOxmlElement + @property + def paragraphs(self) -> Sequence[Paragraph]: ... + @property + def tables(self) -> Sequence[Table]: ... diff --git a/typings/docx/document.pyi b/typings/docx/document.pyi new file mode 100644 index 0000000000..c6d34d467f --- /dev/null +++ b/typings/docx/document.pyi @@ -0,0 +1,22 @@ +# pyright: reportPrivateUsage=false + +from typing import BinaryIO, Optional, Union + +from docx.blkcntnr import BlockItemContainer +from docx.oxml.document import CT_Document +from docx.section import Sections +from docx.settings import Settings +from docx.styles.style import _ParagraphStyle +from docx.text.paragraph import Paragraph + +class Document(BlockItemContainer): + def add_paragraph( + self, text: str = "", style: Optional[Union[_ParagraphStyle, str]] = None + ) -> Paragraph: ... + @property + def element(self) -> CT_Document: ... + def save(self, path_or_stream: Union[str, BinaryIO]) -> None: ... + @property + def sections(self) -> Sections: ... + @property + def settings(self) -> Settings: ... 
diff --git a/typings/docx/enum/section.pyi b/typings/docx/enum/section.pyi new file mode 100644 index 0000000000..b06b79d766 --- /dev/null +++ b/typings/docx/enum/section.pyi @@ -0,0 +1,11 @@ +import enum + +class WD_SECTION_START(enum.Enum): + CONTINUOUS: enum.Enum + EVEN_PAGE: enum.Enum + NEW_COLUMN: enum.Enum + NEW_PAGE: enum.Enum + ODD_PAGE: enum.Enum + +# -- alias -- +WD_SECTION = WD_SECTION_START diff --git a/typings/docx/oxml/__init__.pyi b/typings/docx/oxml/__init__.pyi new file mode 100644 index 0000000000..2634b55642 --- /dev/null +++ b/typings/docx/oxml/__init__.pyi @@ -0,0 +1,7 @@ +# pyright: reportPrivateUsage=false + +from typing import Union + +from lxml import etree + +def parse_xml(xml: Union[str, bytes]) -> etree._Element: ... diff --git a/typings/docx/oxml/document.pyi b/typings/docx/oxml/document.pyi new file mode 100644 index 0000000000..460c1c1d7a --- /dev/null +++ b/typings/docx/oxml/document.pyi @@ -0,0 +1,10 @@ +from typing import Iterator + +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_Body(BaseOxmlElement): + def __iter__(self) -> Iterator[BaseOxmlElement]: ... + +class CT_Document(BaseOxmlElement): + @property + def body(self) -> CT_Body: ... diff --git a/typings/docx/oxml/ns.pyi b/typings/docx/oxml/ns.pyi new file mode 100644 index 0000000000..3f387dd2fb --- /dev/null +++ b/typings/docx/oxml/ns.pyi @@ -0,0 +1,5 @@ +from typing import Dict + +nsmap: Dict[str, str] + +def qn(tag: str) -> str: ... diff --git a/typings/docx/oxml/section.pyi b/typings/docx/oxml/section.pyi new file mode 100644 index 0000000000..2f90109e09 --- /dev/null +++ b/typings/docx/oxml/section.pyi @@ -0,0 +1,7 @@ +from typing import Optional + +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_SectPr(BaseOxmlElement): + @property + def preceding_sectPr(self) -> Optional[CT_SectPr]: ... 
diff --git a/typings/docx/oxml/table.pyi b/typings/docx/oxml/table.pyi new file mode 100644 index 0000000000..2d96facdc3 --- /dev/null +++ b/typings/docx/oxml/table.pyi @@ -0,0 +1,3 @@ +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_Tbl(BaseOxmlElement): ... diff --git a/typings/docx/oxml/text/paragraph.pyi b/typings/docx/oxml/text/paragraph.pyi new file mode 100644 index 0000000000..d869c19c04 --- /dev/null +++ b/typings/docx/oxml/text/paragraph.pyi @@ -0,0 +1,3 @@ +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_P(BaseOxmlElement): ... diff --git a/typings/docx/oxml/text/parfmt.pyi b/typings/docx/oxml/text/parfmt.pyi new file mode 100644 index 0000000000..21c4e7d668 --- /dev/null +++ b/typings/docx/oxml/text/parfmt.pyi @@ -0,0 +1,3 @@ +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_PPr(BaseOxmlElement): ... diff --git a/typings/docx/oxml/text/run.pyi b/typings/docx/oxml/text/run.pyi new file mode 100644 index 0000000000..2964024a4a --- /dev/null +++ b/typings/docx/oxml/text/run.pyi @@ -0,0 +1,9 @@ +from typing import Optional + +from docx.oxml.xmlchemy import BaseOxmlElement + +class CT_Br(BaseOxmlElement): + type: Optional[str] + clear: Optional[str] + +class CT_R(BaseOxmlElement): ... diff --git a/typings/docx/oxml/xmlchemy.pyi b/typings/docx/oxml/xmlchemy.pyi new file mode 100644 index 0000000000..e08277ee68 --- /dev/null +++ b/typings/docx/oxml/xmlchemy.pyi @@ -0,0 +1,17 @@ +from typing import Any, Iterator + +from lxml import etree + +class BaseOxmlElement(etree.ElementBase): + def __iter__(self) -> Iterator[BaseOxmlElement]: ... + @property + def xml(self) -> str: ... + def xpath(self, xpath_str: str) -> Any: + """Return type is typically Sequence[ElementBase], but ... + + lxml.etree.XPath has many possible return types including bool, (a "smart") str, + float. The return type can also be a list containing ElementBase, comments, + processing instructions, str, and tuple. 
So you need to cast the result based on + the XPath expression you use. + """ + ... diff --git a/typings/docx/section.pyi b/typings/docx/section.pyi new file mode 100644 index 0000000000..e0c856a9da --- /dev/null +++ b/typings/docx/section.pyi @@ -0,0 +1,33 @@ +from typing import Sequence + +from docx.blkcntnr import BlockItemContainer +from docx.enum.section import WD_SECTION +from docx.oxml.section import CT_SectPr + +class Section: + _sectPr: CT_SectPr + @property + def different_first_page_header_footer(self) -> bool: ... + @property + def even_page_footer(self) -> _Footer: ... + @property + def even_page_header(self) -> _Header: ... + @property + def first_page_footer(self) -> _Footer: ... + @property + def first_page_header(self) -> _Header: ... + @property + def footer(self) -> _Footer: ... + @property + def header(self) -> _Header: ... + @property + def start_type(self) -> WD_SECTION: ... + +class Sections(Sequence[Section]): ... + +class _BaseHeaderFooter(BlockItemContainer): + @property + def is_linked_to_previous(self) -> bool: ... + +class _Footer(_BaseHeaderFooter): ... +class _Header(_BaseHeaderFooter): ... diff --git a/typings/docx/settings.pyi b/typings/docx/settings.pyi new file mode 100644 index 0000000000..d5bf481d69 --- /dev/null +++ b/typings/docx/settings.pyi @@ -0,0 +1,5 @@ +from docx.shared import ElementProxy + +class Settings(ElementProxy): + @property + def odd_and_even_pages_header_footer(self) -> bool: ... diff --git a/typings/docx/shared.pyi b/typings/docx/shared.pyi new file mode 100644 index 0000000000..2722b0b76e --- /dev/null +++ b/typings/docx/shared.pyi @@ -0,0 +1,16 @@ +from typing import Any, Callable, Generic, TypeVar + +from docx.oxml.xmlchemy import BaseOxmlElement + +_T = TypeVar("_T") + +class lazyproperty(Generic[_T]): + def __init__(self, fget: Callable[..., _T]) -> None: ... + def __get__(self, obj: Any, type: Any = None) -> _T: ... + def __set__(self, obj: Any, value: Any) -> None: ... 
+ +class ElementProxy: + @property + def element(self) -> BaseOxmlElement: ... + +class Parented: ... diff --git a/typings/docx/styles/style.pyi b/typings/docx/styles/style.pyi new file mode 100644 index 0000000000..5e79b70447 --- /dev/null +++ b/typings/docx/styles/style.pyi @@ -0,0 +1,8 @@ +from typing import Optional + +class BaseStyle: + @property + def name(self) -> Optional[str]: ... + +class _CharacterStyle(BaseStyle): ... +class _ParagraphStyle(_CharacterStyle): ... diff --git a/typings/docx/table.pyi b/typings/docx/table.pyi new file mode 100644 index 0000000000..b5757f57a5 --- /dev/null +++ b/typings/docx/table.pyi @@ -0,0 +1,21 @@ +from typing import Sequence + +from docx.blkcntnr import BlockItemContainer +from docx.oxml.table import CT_Tbl +from docx.shared import Parented +from docx.text.paragraph import Paragraph + +class _Cell: + @property + def paragraphs(self) -> Sequence[Paragraph]: ... + +class _Row: + @property + def cells(self) -> Sequence[_Cell]: ... + +class _Rows(Sequence[_Row]): ... + +class Table(Parented): + def __init__(self, tbl: CT_Tbl, parent: BlockItemContainer) -> None: ... + @property + def rows(self) -> _Rows: ... diff --git a/typings/docx/text/__init__.pyi b/typings/docx/text/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/typings/docx/text/paragraph.pyi b/typings/docx/text/paragraph.pyi new file mode 100644 index 0000000000..1a8193927d --- /dev/null +++ b/typings/docx/text/paragraph.pyi @@ -0,0 +1,19 @@ +# pyright: reportPrivateUsage = false + +from typing import Optional, Sequence + +from docx.blkcntnr import BlockItemContainer +from docx.oxml.text.paragraph import CT_P +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.styles.style import _ParagraphStyle +from docx.text.run import Run + +class Paragraph(BlockItemContainer): + _p: CT_P + def __init__(self, p: BaseOxmlElement, parent: BlockItemContainer) -> None: ... + @property + def runs(self) -> Sequence[Run]: ... 
+ @property + def style(self) -> Optional[_ParagraphStyle]: ... + @property + def text(self) -> str: ... diff --git a/typings/docx/text/run.pyi b/typings/docx/text/run.pyi new file mode 100644 index 0000000000..bc80169f4c --- /dev/null +++ b/typings/docx/text/run.pyi @@ -0,0 +1,13 @@ +from docx.oxml.text.run import CT_R +from docx.text.paragraph import Paragraph + +class Run: + _element: CT_R + _r: CT_R + def __init__(self, r: CT_R, parent: Paragraph) -> None: ... + @property + def bold(self) -> bool: ... + @property + def italic(self) -> bool: ... + @property + def text(self) -> str: ... diff --git a/typings/pptx/__init__.pyi b/typings/pptx/__init__.pyi new file mode 100644 index 0000000000..9c98df114b --- /dev/null +++ b/typings/pptx/__init__.pyi @@ -0,0 +1,3 @@ +from pptx.api import Presentation + +__all__ = ["Presentation"] diff --git a/typings/pptx/api.pyi b/typings/pptx/api.pyi new file mode 100644 index 0000000000..236122e21d --- /dev/null +++ b/typings/pptx/api.pyi @@ -0,0 +1,5 @@ +from typing import BinaryIO, Optional, Union + +import pptx.presentation + +def Presentation(pptx: Optional[Union[str, BinaryIO]] = None) -> pptx.presentation.Presentation: ... diff --git a/typings/pptx/oxml/__init__.py b/typings/pptx/oxml/__init__.py new file mode 100644 index 0000000000..734aac61d1 --- /dev/null +++ b/typings/pptx/oxml/__init__.py @@ -0,0 +1,9 @@ +# pyright: reportPrivateUsage=false + +from typing import Union + +from lxml import etree + + +def parse_xml(xml: Union[str, bytes]) -> etree._Element: + ... diff --git a/typings/pptx/oxml/text.pyi b/typings/pptx/oxml/text.pyi new file mode 100644 index 0000000000..70bfd1ca6b --- /dev/null +++ b/typings/pptx/oxml/text.pyi @@ -0,0 +1,6 @@ +from pptx.oxml.xmlchemy import BaseOxmlElement + +class CT_TextParagraph(BaseOxmlElement): + def get_or_add_pPr(self) -> CT_TextParagraphProperties: ... + +class CT_TextParagraphProperties(BaseOxmlElement): ... 
diff --git a/typings/pptx/oxml/xmlchemy.pyi b/typings/pptx/oxml/xmlchemy.pyi new file mode 100644 index 0000000000..e08277ee68 --- /dev/null +++ b/typings/pptx/oxml/xmlchemy.pyi @@ -0,0 +1,17 @@ +from typing import Any, Iterator + +from lxml import etree + +class BaseOxmlElement(etree.ElementBase): + def __iter__(self) -> Iterator[BaseOxmlElement]: ... + @property + def xml(self) -> str: ... + def xpath(self, xpath_str: str) -> Any: + """Return type is typically Sequence[ElementBase], but ... + + lxml.etree.XPath has many possible return types including bool, (a "smart") str, + float. The return type can also be a list containing ElementBase, comments, + processing instructions, str, and tuple. So you need to cast the result based on + the XPath expression you use. + """ + ... diff --git a/typings/pptx/presentation.pyi b/typings/pptx/presentation.pyi new file mode 100644 index 0000000000..3f476d1db1 --- /dev/null +++ b/typings/pptx/presentation.pyi @@ -0,0 +1,11 @@ +from typing import BinaryIO, Union + +from pptx.shared import PartElementProxy +from pptx.slide import SlideLayouts, Slides + +class Presentation(PartElementProxy): + def save(self, file: Union[str, BinaryIO]) -> None: ... + @property + def slide_layouts(self) -> SlideLayouts: ... + @property + def slides(self) -> Slides: ... diff --git a/typings/pptx/shapes/__init__.py b/typings/pptx/shapes/__init__.py new file mode 100644 index 0000000000..41be6cfc5e --- /dev/null +++ b/typings/pptx/shapes/__init__.py @@ -0,0 +1,2 @@ +class Subshape: + ... diff --git a/typings/pptx/shapes/autoshape.pyi b/typings/pptx/shapes/autoshape.pyi new file mode 100644 index 0000000000..77d6f3afa0 --- /dev/null +++ b/typings/pptx/shapes/autoshape.pyi @@ -0,0 +1,6 @@ +from pptx.shapes.base import BaseShape +from pptx.text.text import TextFrame + +class Shape(BaseShape): + @property + def text_frame(self) -> TextFrame: ... 
diff --git a/typings/pptx/shapes/base.pyi b/typings/pptx/shapes/base.pyi new file mode 100644 index 0000000000..ddafa8275a --- /dev/null +++ b/typings/pptx/shapes/base.pyi @@ -0,0 +1,9 @@ +from pptx.util import Length + +class BaseShape: + left: Length + top: Length + @property + def has_table(self) -> bool: ... + @property + def has_text_frame(self) -> bool: ... diff --git a/typings/pptx/shapes/graphfrm.pyi b/typings/pptx/shapes/graphfrm.pyi new file mode 100644 index 0000000000..970bf63760 --- /dev/null +++ b/typings/pptx/shapes/graphfrm.pyi @@ -0,0 +1,6 @@ +from pptx.shapes.base import BaseShape +from pptx.table import Table + +class GraphicFrame(BaseShape): + @property + def table(self) -> Table: ... diff --git a/typings/pptx/shapes/group.pyi b/typings/pptx/shapes/group.pyi new file mode 100644 index 0000000000..af18cf6083 --- /dev/null +++ b/typings/pptx/shapes/group.pyi @@ -0,0 +1,6 @@ +from pptx.shapes.base import BaseShape +from pptx.shapes.shapetree import GroupShapes + +class GroupShape(BaseShape): + @property + def shapes(self) -> GroupShapes: ... diff --git a/typings/pptx/shapes/shapetree.pyi b/typings/pptx/shapes/shapetree.pyi new file mode 100644 index 0000000000..ebb905c4eb --- /dev/null +++ b/typings/pptx/shapes/shapetree.pyi @@ -0,0 +1,18 @@ +from typing import Iterator + +from pptx.shapes.autoshape import Shape +from pptx.shapes.base import BaseShape +from pptx.shared import ParentedElementProxy +from pptx.util import Length + +class _BaseShapes(ParentedElementProxy): + def __iter__(self) -> Iterator[BaseShape]: ... + +class _BaseGroupShapes(_BaseShapes): + def add_textbox(self, left: Length, top: Length, width: Length, height: Length) -> Shape: ... + +class GroupShapes(_BaseGroupShapes): ... +class NotesSlideShapes(_BaseShapes): ... + +class SlideShapes(_BaseGroupShapes): + def __iter__(self) -> Iterator[BaseShape]: ... 
diff --git a/typings/pptx/shared.pyi b/typings/pptx/shared.pyi new file mode 100644 index 0000000000..48abf5a1a4 --- /dev/null +++ b/typings/pptx/shared.pyi @@ -0,0 +1,3 @@ +class ElementProxy: ... +class ParentedElementProxy(ElementProxy): ... +class PartElementProxy(ElementProxy): ... diff --git a/typings/pptx/slide.pyi b/typings/pptx/slide.pyi new file mode 100644 index 0000000000..6372ab53b4 --- /dev/null +++ b/typings/pptx/slide.pyi @@ -0,0 +1,31 @@ +from typing import Iterator, Optional + +from pptx.shapes.shapetree import SlideShapes +from pptx.shared import ParentedElementProxy, PartElementProxy +from pptx.text.text import TextFrame + +class _BaseSlide(PartElementProxy): ... + +class NotesSlide(_BaseSlide): + @property + def notes_text_frame(self) -> Optional[TextFrame]: ... + +class Slide(_BaseSlide): + @property + def has_notes_slide(self) -> bool: ... + @property + def notes_slide(self) -> NotesSlide: ... + @property + def shapes(self) -> SlideShapes: ... + +class SlideLayout(_BaseSlide): ... + +class SlideLayouts(ParentedElementProxy): + def __getitem__(self, idx: int) -> SlideLayout: ... + def __iter__(self) -> Iterator[SlideLayout]: ... + def __len__(self) -> int: ... + +class Slides(ParentedElementProxy): + def __iter__(self) -> Iterator[Slide]: ... + def __len__(self) -> int: ... + def add_slide(self, slide_layout: SlideLayout) -> Slide: ... diff --git a/typings/pptx/table.pyi b/typings/pptx/table.pyi new file mode 100644 index 0000000000..edfa21b534 --- /dev/null +++ b/typings/pptx/table.pyi @@ -0,0 +1 @@ +class Table: ... diff --git a/typings/pptx/text/text.pyi b/typings/pptx/text/text.pyi new file mode 100644 index 0000000000..3e65274de4 --- /dev/null +++ b/typings/pptx/text/text.pyi @@ -0,0 +1,13 @@ +from typing import Sequence + +from pptx.oxml.text import CT_TextParagraph +from pptx.shapes import Subshape + +class TextFrame(Subshape): + text: str + @property + def paragraphs(self) -> Sequence[_Paragraph]: ... 
+ +class _Paragraph(Subshape): + _p: CT_TextParagraph + text: str diff --git a/typings/pptx/util.pyi b/typings/pptx/util.pyi new file mode 100644 index 0000000000..c27d7d311d --- /dev/null +++ b/typings/pptx/util.pyi @@ -0,0 +1,4 @@ +class Length(int): ... + +class Inches(Length): + def __init__(self, inches: float) -> None: ... diff --git a/typings/pypandoc/__init__.pyi b/typings/pypandoc/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 70682af42c..088cfa170e 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -29,7 +29,7 @@ def clean_non_ascii_chars(text) -> str: return en.decode() -def clean_bullets(text) -> str: +def clean_bullets(text: str) -> str: """Cleans unicode bullets from a section of text. Example diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 98b7be7433..907cacf8cd 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -1,17 +1,19 @@ from __future__ import annotations +import abc +import copy +import dataclasses as dc import datetime +import functools import hashlib import inspect import os import pathlib import re import uuid -from abc import ABC -from copy import deepcopy -from dataclasses import dataclass -from functools import wraps -from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast + +from typing_extensions import Self, TypedDict from unstructured.documents.coordinates import ( TYPE_TO_COORDINATE_SYSTEM_MAP, @@ -20,19 +22,19 @@ ) -class NoID(ABC): +class NoID(abc.ABC): """Class to indicate that an element do not have an ID.""" pass -class UUID(ABC): +class UUID(abc.ABC): """Class to indicate that an element should have a UUID.""" pass -@dataclass +@dc.dataclass class DataSourceMetadata: """Metadata fields that pertain to the data 
source of the document.""" @@ -47,7 +49,7 @@ def to_dict(self): return {key: value for key, value in self.__dict__.items() if value is not None} -@dataclass +@dc.dataclass class CoordinatesMetadata: """Metadata fields that pertain to the coordinates of the element.""" @@ -125,7 +127,7 @@ class Link(TypedDict): url: str -@dataclass +@dc.dataclass class ElementMetadata: coordinates: Optional[CoordinatesMetadata] = None data_source: Optional[DataSourceMetadata] = None @@ -192,8 +194,8 @@ def to_dict(self): return _dict @classmethod - def from_dict(cls, input_dict): - constructor_args = deepcopy(input_dict) + def from_dict(cls, input_dict: Dict[str, Any]) -> Self: + constructor_args = copy.deepcopy(input_dict) if constructor_args.get("coordinates", None) is not None: constructor_args["coordinates"] = CoordinatesMetadata.from_dict( constructor_args["coordinates"], @@ -237,7 +239,7 @@ def decorator(func: Callable): attribute on the elements in the output.""" ) - @wraps(func) + @functools.wraps(func) def wrapper(*args, **kwargs): elements = func(*args, **kwargs) sig = inspect.signature(func) @@ -293,7 +295,7 @@ def _add_regex_metadata( return elements -class Element(ABC): +class Element(abc.ABC): """An element is a section of a page in the document.""" def __init__( diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index ff32ebc5a1..75299fe898 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -444,11 +444,8 @@ def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) -> return text.strip() -def _has_break_tags(tag_elem: etree.Element) -> bool: - for descendant in tag_elem.iterdescendants(): - if descendant.tag in TEXTBREAK_TAGS: - return True - return False +def _has_break_tags(tag_elem: etree._Element) -> bool: # pyright: ignore[reportPrivateUsage] + return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants()) def _unfurl_break_tags(tag_elem: etree.Element) -> 
List[etree.Element]: diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index f161d89e32..1f10fdd7bc 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -83,7 +83,7 @@ def get_last_modified_date(filename: str) -> Union[str, None]: def get_last_modified_date_from_file( - file: Union[IO[bytes], SpooledTemporaryFile, BinaryIO, bytes], + file: Union[IO[bytes], SpooledTemporaryFile[bytes], BinaryIO, bytes], ) -> Union[str, None]: filename = None if hasattr(file, "name"): @@ -405,7 +405,7 @@ def convert_office_doc( logger.error(error.decode().strip()) -def exactly_one(**kwargs) -> None: +def exactly_one(**kwargs: Any) -> None: """ Verify arguments; exactly one of all keyword arguments must not be None. @@ -422,7 +422,7 @@ def exactly_one(**kwargs) -> None: def spooled_to_bytes_io_if_needed( - file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]], + file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile[bytes]]], ) -> Optional[Union[bytes, BinaryIO]]: if isinstance(file_obj, SpooledTemporaryFile): file_obj.seek(0) @@ -453,10 +453,7 @@ def convert_to_bytes( return f_bytes -def convert_ms_office_table_to_text( - table: "docxtable.Table", - as_html: bool = True, -) -> str: +def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = True) -> str: """ Convert a table object from a Word document to an HTML table string using the tabulate library.
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index fbc7c8190f..24c872ac69 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -1,13 +1,42 @@ +# pyright: reportPrivateUsage=false + +from __future__ import annotations + +import io +import itertools import os import tempfile from tempfile import SpooledTemporaryFile -from typing import IO, BinaryIO, List, Optional, Tuple, Union, cast +from typing import ( + Any, + BinaryIO, + Dict, + Iterator, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, +) +# -- CT_* stands for "complex-type", an XML element type in docx parlance -- import docx -from docx.oxml.shared import qn +from docx.document import Document +from docx.enum.section import WD_SECTION_START +from docx.oxml.ns import nsmap, qn +from docx.oxml.section import CT_SectPr +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +from docx.oxml.text.run import CT_R +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.section import Section, _Footer, _Header from docx.table import Table as DocxTable from docx.text.paragraph import Paragraph from docx.text.run import Run +from lxml import etree +from typing_extensions import TypeAlias from unstructured.chunking.title import add_chunking_strategy from unstructured.cleaners.core import clean_bullets @@ -32,7 +61,6 @@ exactly_one, get_last_modified_date, get_last_modified_date_from_file, - spooled_to_bytes_io_if_needed, ) from unstructured.partition.text_type import ( is_bulleted_text, @@ -41,75 +69,82 @@ is_possible_title, is_us_city_state_zip, ) -from unstructured.utils import dependency_exists +from unstructured.utils import dependency_exists, lazyproperty, requires_dependencies if dependency_exists("pypandoc"): import pypandoc -# NOTE(robinson) - documentation on built in styles can be found at the link below -# ref: https://python-docx.readthedocs.io/en/latest/user/ -# 
styles-understanding.html#paragraph-styles-in-default-template -STYLE_TO_ELEMENT_MAPPING = { - "Caption": Text, # TODO(robinson) - add caption element type - "Heading 1": Title, - "Heading 2": Title, - "Heading 3": Title, - "Heading 4": Title, - "Heading 5": Title, - "Heading 6": Title, - "Heading 7": Title, - "Heading 8": Title, - "Heading 9": Title, - "Intense Quote": Text, # TODO(robinson) - add quote element type - "List": ListItem, - "List 2": ListItem, - "List 3": ListItem, - "List Bullet": ListItem, - "List Bullet 2": ListItem, - "List Bullet 3": ListItem, - "List Continue": ListItem, - "List Continue 2": ListItem, - "List Continue 3": ListItem, - "List Number": ListItem, - "List Number 2": ListItem, - "List Number 3": ListItem, - "List Paragraph": ListItem, - "Macro Text": Text, - "No Spacing": Text, - "Quote": Text, # TODO(robinson) - add quote element type - "Subtitle": Title, - "TOCHeading": Title, - "Title": Title, -} - - -def _get_paragraph_runs(paragraph): - """ - Get hyperlink text from a paragraph object. - Without this, the default runs function skips over hyperlinks. - Args: - paragraph (Paragraph): A Paragraph object. +BlockElement: TypeAlias = Union[CT_P, CT_Tbl] +BlockItem: TypeAlias = Union[Paragraph, DocxTable] - Returns: - list: A list of Run objects. + +@requires_dependencies("pypandoc") +def convert_and_partition_docx( + source_format: str, + filename: Optional[str] = None, + file: Optional[BinaryIO] = None, + include_metadata: bool = True, + metadata_filename: Optional[str] = None, + metadata_last_modified: Optional[str] = None, +) -> List[Element]: + """Converts a document to DOCX and then partitions it using partition_docx. + + Works with any file format support by pandoc. + + Parameters + ---------- + source_format + The format of the source document, .e.g. odt + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). 
+ include_metadata + Determines whether or not metadata is included in the metadata attribute on the elements in + the output. """ + exactly_one(filename=filename, file=file) - # Recursively get runs. - def _get_runs(node, parent): - for child in node: - # If the child is a run, yield a Run object - if child.tag == qn("w:r"): - yield Run(child, parent) - # If the child is a hyperlink, search for runs within it recursively - if child.tag == qn("w:hyperlink"): - yield from _get_runs(child, parent) + def validate_filename(filename: str) -> str: + """Return path to a file confirmed to exist on the filesystem.""" + if not os.path.exists(filename): + raise ValueError(f"The file {filename} does not exist.") + return filename - return list(_get_runs(paragraph._element, paragraph)) + def copy_to_tempfile(file: BinaryIO) -> str: + """Return path to temporary copy of file to be converted.""" + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp.write(file.read()) + return tmp.name + + def extract_docx_filename(file_path: str) -> str: + """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """ + # -- a/b/foo.odt -> foo.odt -- + filename = os.path.basename(file_path) + # -- foo.odt -> foo -- + root_name, _ = os.path.splitext(filename) + # -- foo -> foo.docx -- + return f"{root_name}.docx" + file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(BinaryIO, file)) + + with tempfile.TemporaryDirectory() as tmpdir: + docx_path = os.path.join(tmpdir, extract_docx_filename(file_path)) + pypandoc.convert_file( # pyright: ignore + file_path, + "docx", + format=source_format, + outputfile=docx_path, + ) + elements = partition_docx( + filename=docx_path, + metadata_filename=metadata_filename, + include_metadata=include_metadata, + metadata_last_modified=metadata_last_modified, + ) -# Add the runs property to the Paragraph class -Paragraph.runs = property(lambda self: _get_paragraph_runs(self)) + return elements @process_metadata() @@ -117,13
+152,13 @@ def _get_runs(node, parent): @add_chunking_strategy() def partition_docx( filename: Optional[str] = None, - file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, + file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]] = None, metadata_filename: Optional[str] = None, include_page_breaks: bool = True, include_metadata: bool = True, metadata_last_modified: Optional[str] = None, chunking_strategy: Optional[str] = None, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Partitions Microsoft Word Documents in .docx format into its document elements. @@ -134,305 +169,625 @@ def partition_docx( file A file-like object using "rb" mode --> open(filename, "rb"). metadata_filename - The filename to use for the metadata. Relevant because partition_doc converts the - document to .docx before partition. We want the original source filename in the - metadata. + The filename to use for the metadata. Relevant because partition_doc converts the document + to .docx before partition. We want the original source filename in the metadata. metadata_last_modified The last modified date for the document. 
""" - - # Verify that only one of the arguments was provided + # -- verify that only one file-specifier argument was provided -- exactly_one(filename=filename, file=file) - last_modification_date = None - if filename is not None: - if not filename.startswith("/tmp"): - last_modification_date = get_last_modified_date(filename) - - document = docx.Document(filename) - elif file is not None: - last_modification_date = get_last_modified_date_from_file(file) - - document = docx.Document( - spooled_to_bytes_io_if_needed( - cast(Union[BinaryIO, SpooledTemporaryFile], file), - ), + return list( + _DocxPartitioner.iter_document_elements( + filename, + file, + metadata_filename, + include_page_breaks, + metadata_last_modified, ) + ) - elements: List[Element] = [] - table_index = 0 - - headers_and_footers = _get_headers_and_footers(document, metadata_filename) - if len(headers_and_footers) > 0: - elements.extend(headers_and_footers[0][0]) - - document_contains_pagebreaks = _element_contains_pagebreak(document._element) - page_number = 1 if document_contains_pagebreaks else None - section = 0 - is_list = False - for element_item in document.element.body: - if element_item.tag.endswith("tbl"): - table = document.tables[table_index] - emphasized_texts = _get_emphasized_texts_from_table(table) - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags( - emphasized_texts, - ) - html_table = convert_ms_office_table_to_text(table, as_html=True) - text_table = convert_ms_office_table_to_text(table, as_html=False) - element = Table(text_table) - if element is not None: - element.metadata = ElementMetadata( - text_as_html=html_table, - filename=metadata_filename, - page_number=page_number, - last_modified=metadata_last_modified or last_modification_date, - emphasized_text_contents=emphasized_text_contents, - emphasized_text_tags=emphasized_text_tags, - ) - elements.append(element) - table_index += 1 - elif element_item.tag.endswith("p"): - if "" in element_item.xml: 
- is_list = True - paragraph = docx.text.paragraph.Paragraph(element_item, document) - emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) - emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags( - emphasized_texts, + +class _DocxPartitioner: + """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" + + # TODO: I think we can do better on metadata.filename. Should that only be populated when a + # `metadata_filename` argument was provided to `partition_docx()`? What about when not but + # we do get a `filename` arg or a `file` arg that has a `.name` attribute? + # TODO: get last-modified date from document-properties (stored in docx package) rather than + # relying on last filesystem-write date; maybe fall-back to filesystem-date. + # TODO: improve `._element_contains_pagebreak()`. It uses substring matching on the rendered + # XML text which is error-prone and not performant. Use XPath instead with the specific + # locations a page-break can be located. Also, there can be more than one, so return a + # count instead of a boolean. + # TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for + # `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like + # the "_element_contains_pagebreak()" function. + # TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is + # a list-item is encapsulated in a single place rather than distributed around the code. + # TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead + # of a substring match on the rendered XML. Include all permutations of how a numbered + # list can be manually applied (as opposed to by using a style). + # TODO: Move _SectBlockIterator upstream into `python-docx`. It requires too much + # domain-specific knowledge to comfortable here and is of general use so welcome in the + # library. 
+ # TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`. + + def __init__( + self, + filename: Optional[str], + file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]], + metadata_filename: Optional[str], + include_page_breaks: bool, + metadata_last_modified: Optional[str], + ) -> None: + self._filename = filename + self._file = file + self._metadata_filename = metadata_filename + self._include_page_breaks = include_page_breaks + self._metadata_last_modified = metadata_last_modified + self._page_counter: int = 1 + + @classmethod + def iter_document_elements( + cls, + filename: Optional[str] = None, + file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]] = None, + metadata_filename: Optional[str] = None, + include_page_breaks: bool = True, + metadata_last_modified: Optional[str] = None, + ) -> Iterator[Element]: + """Partition MS Word documents (.docx format) into its document elements.""" + return cls( + filename, + file, + metadata_filename, + include_page_breaks, + metadata_last_modified, + )._iter_document_elements() + + def _iter_document_elements(self) -> Iterator[Element]: + """Generate each document-element in (docx) `document` in document order.""" + for section_idx, section in enumerate(self._document.sections): + yield from self._iter_section_page_breaks(section_idx, section) + yield from self._iter_section_headers(section) + + for block_item in _SectBlockItemIterator.iter_sect_block_items(section, self._document): + # -- a block-item can only be a Paragraph ... -- + if isinstance(block_item, Paragraph): + yield from self._iter_paragraph_elements(block_item) + # -- a paragraph can contain a page-break -- + yield from self._iter_maybe_paragraph_page_breaks(block_item) + # -- ... 
or a Table -- + else: + yield from self._iter_table_element(block_item) + + yield from self._iter_section_footers(section) + + @lazyproperty + def _document(self) -> Document: + """The python-docx `Document` object loaded from file or filename.""" + filename, file = self._filename, self._file + + if filename is not None: + return docx.Document(filename) + + assert file is not None + if isinstance(file, SpooledTemporaryFile): + file.seek(0) + file = io.BytesIO(file.read()) + return docx.Document(file) + + @lazyproperty + def _document_contains_pagebreaks(self) -> bool: + """True when there is at least one page-break detected in the document.""" + return self._element_contains_pagebreak(self._document._element) + + def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool: + """True when `element` contains a page break. + + Checks for both "hard" page breaks (page breaks explicitly inserted by the user) + and "soft" page breaks, which are sometimes inserted by the MS Word renderer. + Note that soft page breaks aren't always present. Whether or not pages are + tracked may depend on your Word renderer. 
+ """ + page_break_indicators = [ + ["w:br", 'type="page"'], # "Hard" page break inserted by user + ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer + ] + if hasattr(element, "xml"): + for indicators in page_break_indicators: + if all(indicator in element.xml for indicator in indicators): + return True + return False + + def _increment_page_number(self) -> Iterator[PageBreak]: + """Increment page-number by 1 and generate a PageBreak element if enabled.""" + self._page_counter += 1 + if self._include_page_breaks: + yield PageBreak("") + + def _is_list_item(self, paragraph: Paragraph) -> bool: + """True when `paragraph` can be identified as a list-item.""" + if is_bulleted_text(paragraph.text): + return True + + return "" in paragraph._p.xml + + def _iter_paragraph_elements(self, paragraph: Paragraph) -> Iterator[Element]: + """Generate zero-or-one document element for `paragraph`. + + In Word, an empty paragraph is commonly used for inter-paragraph spacing. An empty paragraph + does not contribute to the document-element stream and will not cause an element to be + emitted. + """ + text = paragraph.text + + # -- blank paragraphs are commonly used for spacing between paragraphs and + # -- do not contribute to the document-element stream. 
+ if not text.strip(): + return + + metadata = self._paragraph_metadata(paragraph) + + # -- a list gets some special treatment -- + if self._is_list_item(paragraph): + clean_text = clean_bullets(text).strip() + if clean_text: + yield ListItem(text=clean_text, metadata=metadata) + return + + # -- determine element-type from an explicit Word paragraph-style if possible -- + TextSubCls = self._style_based_element_type(paragraph) + if TextSubCls: + yield TextSubCls(text=text, metadata=metadata) + return + + # -- try to recognize the element type by parsing its text -- + TextSubCls = self._parse_paragraph_text_for_element_type(paragraph) + if TextSubCls: + yield TextSubCls(text=text, metadata=metadata) + return + + # -- if all that fails we give it the default `Text` element-type -- + yield Text(text, metadata=metadata) + + def _iter_maybe_paragraph_page_breaks(self, paragraph: Paragraph) -> Iterator[PageBreak]: + """Generate a `PageBreak` document element for each page-break in `paragraph`. + + Checks for both "hard" page breaks (page breaks explicitly inserted by the user) + and "soft" page breaks, which are sometimes inserted by the MS Word renderer. + Note that soft page breaks aren't always present. Whether or not pages are + tracked may depend on your Word renderer. + """ + + def has_page_break_implementation_we_have_so_far() -> bool: + """Needs to become more sophisticated.""" + page_break_indicators = [ + ["w:br", 'type="page"'], # "Hard" page break inserted by user + ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer + ] + for indicators in page_break_indicators: + if all(indicator in paragraph._p.xml for indicator in indicators): + return True + return False + + if not has_page_break_implementation_we_have_so_far(): + return + + yield from self._increment_page_number() + + def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[Dict[str, str]]: + """Generate e.g. 
{"text": "MUST", "tag": "b"} for each emphasis in `paragraph`.""" + for run in paragraph.runs: + text = run.text.strip() if run.text else "" + if not text: + continue + if run.bold: + yield {"text": text, "tag": "b"} + if run.italic: + yield {"text": text, "tag": "i"} + + def _iter_section_footers(self, section: Section) -> Iterator[Footer]: + """Generate any `Footer` elements defined for this section. + + A Word document has up to three header and footer definition pairs for each document + section, a primary, first-page, and even-page header and footer. The first-page pair + applies only to the first page of the section (perhaps a title page or chapter start). The + even-page pair is used in book-bound documents where there are both recto and verso pages + (it is applied to verso (even-numbered) pages). A page where neither more specialized + footer applies uses the primary footer. + """ + + def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]: + """Generate zero-or-one Footer elements for `footer`.""" + if footer.is_linked_to_previous: + return + text = "\n".join([p.text for p in footer.paragraphs]) + if not text: + return + yield Footer( + text=text, + metadata=ElementMetadata( + filename=self._metadata_filename, + header_footer_type=header_footer_type, + ), ) - para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list) - if para_element is not None: - para_element.metadata = ElementMetadata( - filename=metadata_filename, - page_number=page_number, - last_modified=metadata_last_modified or last_modification_date, - emphasized_text_contents=emphasized_text_contents, - emphasized_text_tags=emphasized_text_tags, - ) - elements.append(para_element) - is_list = False - elif element_item.tag.endswith("sectPr"): - if len(headers_and_footers) > section: - footers = headers_and_footers[section][1] - elements.extend(footers) - - section += 1 - if len(headers_and_footers) > section: - headers = headers_and_footers[section][0] - 
elements.extend(headers) - - if page_number is not None and _element_contains_pagebreak(element_item): - page_number += 1 - if include_page_breaks: - elements.append(PageBreak(text="")) - return elements + yield from iter_footer(section.footer, "primary") + if section.different_first_page_header_footer: + yield from iter_footer(section.first_page_footer, "first_page") + if self._document.settings.odd_and_even_pages_header_footer: + yield from iter_footer(section.even_page_footer, "even_page") + + def _iter_section_headers(self, section: Section) -> Iterator[Header]: + """Generate `Header` elements for this section if it has them. + + See `._iter_section_footers()` docstring for more on docx headers and footers. + """ + + def iter_header(header: _Header, header_footer_type: str) -> Iterator[Header]: + """Generate zero-or-one Header elements for `header`.""" + if header.is_linked_to_previous: + return + text = "\n".join([p.text for p in header.paragraphs]) + if not text: + return + yield Header( + text=text, + metadata=ElementMetadata( + filename=self._metadata_filename, + header_footer_type=header_footer_type, + ), + ) + yield from iter_header(section.header, "primary") + if section.different_first_page_header_footer: + yield from iter_header(section.first_page_header, "first_page") + if self._document.settings.odd_and_even_pages_header_footer: + yield from iter_header(section.even_page_header, "even_page") + + def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Iterator[PageBreak]: + """Generate zero-or-one `PageBreak` document elements for `section`. + + A docx section has a "start" type which can be "continuous" (no page-break), "nextPage", + "evenPage", or "oddPage". For the next, even, and odd varieties, a `w:renderedPageBreak` + element signals one page break. Here we only need to handle the case where we need to add + another, for example to go from one odd page to another odd page and we need a total of + two page-breaks. 
+ """ + + def page_is_odd() -> bool: + return self._page_counter % 2 == 1 + + start_type = section.start_type + + # -- This method is called upon entering a new section, which happens before any paragraphs + # -- in that section are partitioned. A rendered page-break due to a section-start occurs + # -- in the first paragraph of the section and so occurs _later_ in the proces. Here we + # -- predict when two page breaks will be needed and emit one of them. The second will be + # -- emitted by the rendered page-break to follow. + + if start_type == WD_SECTION_START.EVEN_PAGE: + # -- on an even page we need two total, add one to supplement the rendered page break + # -- to follow. There is no "first-document-page" special case because 1 is odd. + if not page_is_odd(): + yield from self._increment_page_number() + + elif start_type == WD_SECTION_START.ODD_PAGE: + # -- the first page of the document is an implicit "new" odd-page, so no page-break -- + if section_idx == 0: + return + if page_is_odd(): + yield from self._increment_page_number() + + # -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of + # -- which need our help to get the page-breaks right. + return + + def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: + """Generate zero-or-one Table element for a DOCX `w:tbl` XML element.""" + # -- at present, we always generate exactly one Table element, but we might want + # -- to skip, for example, an empty table, or accommodate nested tables. 
+ + html_table = convert_ms_office_table_to_text(table, as_html=True) + text_table = convert_ms_office_table_to_text(table, as_html=False) + emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table) + + yield Table( + text_table, + metadata=ElementMetadata( + text_as_html=html_table, + filename=self._metadata_filename, + page_number=self._page_number, + last_modified=self._last_modified, + emphasized_text_contents=emphasized_text_contents or None, + emphasized_text_tags=emphasized_text_tags or None, + ), + ) -def _paragraph_to_element( - paragraph: docx.text.paragraph.Paragraph, - is_list=False, -) -> Optional[Text]: - """Converts a docx Paragraph object into the appropriate unstructured document element. - If the paragraph style is "Normal" or unknown, we try to predict the element type from the - raw text.""" - text = paragraph.text - style_name = paragraph.style and paragraph.style.name # .style can be None + def _iter_table_emphasis(self, table: DocxTable) -> Iterator[Dict[str, str]]: + """Generate e.g. {"text": "word", "tag": "b"} for each emphasis in `table`.""" + for row in table.rows: + for cell in row.cells: + for paragraph in cell.paragraphs: + yield from self._iter_paragraph_emphasis(paragraph) + + @lazyproperty + def _last_modified(self) -> Optional[str]: + """Last-modified date suitable for use in element metadata.""" + # -- if this file was converted from another format, any last-modified date for the file + # -- will be today, so we get it from the conversion step in `._metadata_last_modified`. + if self._metadata_last_modified: + return self._metadata_last_modified + + file_path, file = self._filename, self._file + + # -- if the file is on the filesystem, get its date from there -- + if file_path is not None: + return None if file_path.startswith("/tmp") else get_last_modified_date(file_path) + + # -- otherwise try getting it from the file-like object (unlikely since BytesIO and its + # -- brethren have no such metadata). 
+ assert file is not None + return get_last_modified_date_from_file(file) + + @property + def _page_number(self) -> Optional[int]: + """The current page number, or None if we can't really tell. + + Page numbers are not added to element metadata if we can't find any page-breaks in the + document (which may be a common case). + + In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual + page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the + target device. Explicit (hard) page-breaks are always recorded in the docx file but the + rendered page-breaks are only added optionally. + """ + return self._page_counter if self._document_contains_pagebreaks else None + + def _paragraph_emphasis(self, paragraph: Paragraph) -> Tuple[List[str], List[str]]: + """[contents, tags] pair describing emphasized text in `paragraph`.""" + iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph)) + return ([e["text"] for e in iter_p_emph], [e["tag"] for e in iter_p_emph_2]) + + def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata: + """ElementMetadata object describing `paragraph`.""" + emphasized_text_contents, emphasized_text_tags = self._paragraph_emphasis(paragraph) + return ElementMetadata( + filename=self._metadata_filename, + page_number=self._page_number, + last_modified=self._last_modified, + emphasized_text_contents=emphasized_text_contents or None, + emphasized_text_tags=emphasized_text_tags or None, + ) - if len(text.strip()) == 0: - return None + def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]: + """Attempt to differentiate the element-type by inspecting the raw text.""" + text = paragraph.text.strip() + + if len(text) < 2: + return None + if is_us_city_state_zip(text): + return Address + if is_email_address(text): + return EmailAddress + if is_possible_narrative_text(text): + return NarrativeText + if 
is_possible_title(text): + return Title - element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name) - - # NOTE(robinson) - The "Normal" style name will return None since it's in the mapping. - # Unknown style names will also return None - if is_list: - return _text_to_element(text, is_list) - elif element_class is None: - return _text_to_element(text) - else: - return element_class(text) - - -def _element_contains_pagebreak(element) -> bool: - """Detects if an element contains a page break. Checks for both "hard" page breaks - (page breaks inserted by the user) and "soft" page breaks, which are sometimes - inserted by the MS Word renderer. Note that soft page breaks aren't always present. - Whether or not pages are tracked may depend on your Word renderer.""" - page_break_indicators = [ - ["w:br", 'type="page"'], # "Hard" page break inserted by user - ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer - ] - if hasattr(element, "xml"): - for indicators in page_break_indicators: - if all(indicator in element.xml for indicator in indicators): - return True - return False - - -def _text_to_element(text: str, is_list=False) -> Optional[Text]: - """Converts raw text into an unstructured Text element.""" - if is_bulleted_text(text) or is_list: - clean_text = clean_bullets(text).strip() - return ListItem(text=clean_bullets(text)) if clean_text else None - - elif is_us_city_state_zip(text): - return Address(text=text) - elif is_email_address(text): - return EmailAddress(text=text) - if len(text) < 2: return None - elif is_possible_narrative_text(text): - return NarrativeText(text) - elif is_possible_title(text): - return Title(text) - else: - return Text(text) + def _style_based_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]: + """Element-type for `paragraph` based on its paragraph-style. + + Returns `None` when the style doesn't tell us anything useful, including when it + is the default "Normal" style. 
+ """ + # NOTE(robinson) - documentation on built-in styles at the link below: + # https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html \ + # #paragraph-styles-in-default-template + STYLE_TO_ELEMENT_MAPPING = { + "Caption": Text, # TODO(robinson) - add caption element type + "Heading 1": Title, + "Heading 2": Title, + "Heading 3": Title, + "Heading 4": Title, + "Heading 5": Title, + "Heading 6": Title, + "Heading 7": Title, + "Heading 8": Title, + "Heading 9": Title, + "Intense Quote": Text, # TODO(robinson) - add quote element type + "List": ListItem, + "List 2": ListItem, + "List 3": ListItem, + "List Bullet": ListItem, + "List Bullet 2": ListItem, + "List Bullet 3": ListItem, + "List Continue": ListItem, + "List Continue 2": ListItem, + "List Continue 3": ListItem, + "List Number": ListItem, + "List Number 2": ListItem, + "List Number 3": ListItem, + "List Paragraph": ListItem, + "Macro Text": Text, + "No Spacing": Text, + "Quote": Text, # TODO(robinson) - add quote element type + "Subtitle": Title, + "TOCHeading": Title, + "Title": Title, + } + + # -- paragraph.style can be None in rare cases, so can style.name. That's going + # -- to mean default style which is equivalent to "Normal" for our purposes. + style_name = (paragraph.style and paragraph.style.name) or "Normal" + + # NOTE(robinson) - The "Normal" style name will return None since it's not + # in the mapping. Unknown style names will also return None. + return STYLE_TO_ELEMENT_MAPPING.get(style_name) + + def _table_emphasis(self, table: DocxTable) -> Tuple[List[str], List[str]]: + """[contents, tags] pair describing emphasized text in `table`.""" + iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table)) + return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2]) + + +class _SectBlockItemIterator: + """Generates the block-items in a section. + + A block item is a docx Paragraph or Table. 
This small class is separated from + `_SectBlockElementIterator` because these two aspects will live in different places upstream. + This makes them easier to transplant, which we expect to do soon. + """ -def _join_paragraphs(paragraphs: List[docx.text.paragraph.Paragraph]) -> Optional[str]: - return "\n".join([paragraph.text for paragraph in paragraphs]) + @classmethod + def iter_sect_block_items(cls, section: Section, document: Document) -> Iterator[BlockItem]: + """Generate each Paragraph or Table object in `section`.""" + for element in _SectBlockElementIterator.iter_sect_block_elements(section._sectPr): + yield ( + Paragraph(element, document) + if isinstance(element, CT_P) + else DocxTable(element, document) + ) -def _get_headers_and_footers( - document: docx.document.Document, - metadata_filename: Optional[str], -) -> List[Tuple[List[Header], List[Footer]]]: - headers_and_footers = [] - attr_prefixes = ["", "first_page_", "even_page_"] +class _SectBlockElementIterator: + """Generates the block-item XML elements in a section. - for section in document.sections: - headers = [] - footers = [] + A block-item element is a `CT_P` (paragraph) or a `CT_Tbl` (table). 
+ """ - for _type in ["header", "footer"]: - for prefix in attr_prefixes: - _elem = getattr(section, f"{prefix}{_type}", None) - if _elem is None: - continue + _compiled_blocks_xpath: Optional[etree.XPath] = None + _compiled_count_xpath: Optional[etree.XPath] = None + + def __init__(self, sectPr: CT_SectPr): + self._sectPr = sectPr + + @classmethod + def iter_sect_block_elements(cls, sectPr: CT_SectPr) -> Iterator[BlockElement]: + """Generate each CT_P or CT_Tbl element within the extents governed by `sectPr`.""" + return cls(sectPr)._iter_sect_block_elements() + + def _iter_sect_block_elements(self) -> Iterator[BlockElement]: + """Generate each CT_P or CT_Tbl element in section.""" + # -- General strategy is to get all block ( and ) elements from start of doc + # -- to and including this section, then compute the count of those elements that came + # -- from prior sections and skip that many to leave only the ones in this section. It's + # -- possible to express this "between here and there" (end of prior section and end of + # -- this one) concept in XPath, but it would be harder to follow because there are + # -- special cases (e.g. no prior section) and the boundary expressions are fairly hairy. + # -- I also believe it would be computationally more expensive than doing it this + # -- straighforward albeit (theoretically) slightly wasteful way. 
+ + sectPr, sectPrs = self._sectPr, self._sectPrs + sectPr_idx = sectPrs.index(sectPr) + + # -- count block items belonging to prior sections -- + n_blks_to_skip = ( + 0 + if sectPr_idx == 0 + else self._count_of_blocks_in_and_above_section(sectPrs[sectPr_idx - 1]) + ) - text = _join_paragraphs(_elem.paragraphs) - if text: - header_footer_type = prefix[:-1] or "primary" - metadata = ElementMetadata( - filename=metadata_filename, - header_footer_type=header_footer_type, - ) + # -- and skip those in set of all blks from doc start to end of this section -- + for element in self._blocks_in_and_above_section(sectPr)[n_blks_to_skip:]: + yield element + + def _blocks_in_and_above_section(self, sectPr: CT_SectPr) -> Sequence[BlockElement]: + """All ps and tbls in section defined by `sectPr` and all prior sections.""" + if self._compiled_blocks_xpath is None: + self._compiled_blocks_xpath = etree.XPath( + self._blocks_in_and_above_section_xpath, + namespaces=nsmap, + regexp=False, + ) + xpath = self._compiled_blocks_xpath + # -- XPath callable results are Any (basically), so need a cast -- + return cast(Sequence[BlockElement], xpath(sectPr)) + + @lazyproperty + def _blocks_in_and_above_section_xpath(self) -> str: + """XPath expr for ps and tbls in context of a sectPr and all prior sectPrs.""" + # -- "p_sect" is a section with sectPr located at w:p/w:pPr/w:sectPr. "body_sect" is a + # -- section with sectPr located at w:body/w:sectPr. The last section in the document is a + # -- "body_sect". All others are of the "p_sect" variety. "term" means "terminal", like + # -- the last p or tbl in the section. "pred" means "predecessor", like a preceding p or + # -- tbl in the section. 
+ + # -- the terminal block in a p-based sect is the p the sectPr appears in -- + p_sect_term_block = "./parent::w:pPr/parent::w:p" + # -- the terminus of a body-based sect is the sectPr itself (not a block) -- + body_sect_term = "self::w:sectPr[parent::w:body]" + # -- all the ps and tbls preceding (but not including) the context node -- + pred_ps_and_tbls = "preceding-sibling::*[self::w:p | self::w:tbl]" + + # -- p_sect_term_block and body_sect_term(inus) are mutually exclusive. So the result is + # -- either the union of nodes found by the first two selectors or the nodes found by the + # -- last selector, never both. + return ( + # -- include the p containing a sectPr -- + f"{p_sect_term_block}" + # -- along with all the blocks that precede it -- + f" | {p_sect_term_block}/{pred_ps_and_tbls}" + # -- or all the preceding blocks if sectPr is body-based (last sectPr) -- + f" | {body_sect_term}/{pred_ps_and_tbls}" + ) - if _type == "header": - headers.append(Header(text=text, metadata=metadata)) - elif _type == "footer": - footers.append(Footer(text=text, metadata=metadata)) + def _count_of_blocks_in_and_above_section(self, sectPr: CT_SectPr) -> int: + """All ps and tbls in section defined by `sectPr` and all prior sections.""" + if self._compiled_count_xpath is None: + self._compiled_count_xpath = etree.XPath( + f"count({self._blocks_in_and_above_section_xpath})", + namespaces=nsmap, + regexp=False, + ) + xpath = self._compiled_count_xpath + # -- numeric XPath results are always float, so need an int() conversion -- + return int(cast(float, xpath(sectPr))) + + @lazyproperty + def _sectPrs(self) -> Sequence[CT_SectPr]: + """All w:sectPr elements in document, in document-order.""" + return self._sectPr.xpath( + "/w:document/w:body/w:p/w:pPr/w:sectPr | /w:document/w:body/w:sectPr" + ) - headers_and_footers.append((headers, footers)) - return headers_and_footers +# == monkey-patch docx.text.Paragraph.runs =========================================== -def 
convert_and_partition_docx( - source_format: str, - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - include_metadata: bool = True, - metadata_filename: Optional[str] = None, - metadata_last_modified: Optional[str] = None, -) -> List[Element]: - """Converts a document to DOCX and then partitions it using partition_docx. Works with - any file format support by pandoc. + def _get_paragraph_runs(paragraph: Paragraph) -> Sequence[Run]: + """Gets all runs in paragraph, including hyperlinks python-docx skips. - Parameters - ---------- - source_format - The format of the source document, .e.g. odt - filename - A string defining the target filename path. - file - A file-like object using "rb" mode --> open(filename, "rb"). - include_metadata - Determines whether or not metadata is included in the metadata attribute on the - elements in the output. - """ - if filename is None: - filename = "" - exactly_one(filename=filename, file=file) - - if len(filename) > 0: - _, filename_no_path = os.path.split(os.path.abspath(filename)) - base_filename, _ = os.path.splitext(filename_no_path) - if not os.path.exists(filename): - raise ValueError(f"The file {filename} does not exist.") - elif file is not None: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.write(file.read()) - tmp.close() - filename = tmp.name - _, filename_no_path = os.path.split(os.path.abspath(tmp.name)) - - base_filename, _ = os.path.splitext(filename_no_path) + Without this, the default runs function skips over hyperlinks. - with tempfile.TemporaryDirectory() as tmpdir: - docx_filename = os.path.join(tmpdir, f"{base_filename}.docx") - pypandoc.convert_file( - filename, - "docx", - format=source_format, - outputfile=docx_filename, - ) - elements = partition_docx( - filename=docx_filename, - metadata_filename=metadata_filename, - include_metadata=include_metadata, - metadata_last_modified=metadata_last_modified, - ) + Args: + paragraph (Paragraph): A Paragraph object. 
- return elements + Returns: + list: A list of Run objects. + """ + def _get_runs(node: BaseOxmlElement, parent: Paragraph) -> Iterator[Run]: + """Recursively get runs.""" + for child in node: + # -- the Paragraph has runs as direct children -- + if child.tag == qn("w:r"): + yield Run(cast(CT_R, child), parent) + continue + # -- but it also has hyperlink children that themselves contain runs, so + # -- recurse into those + if child.tag == qn("w:hyperlink"): + yield from _get_runs(child, parent) -def _get_emphasized_texts_from_paragraph(paragraph: Paragraph) -> List[dict]: - """Get emphasized texts with bold/italic formatting from a paragraph in MS Word""" - emphasized_texts = [] - for run in paragraph.runs: - text = run.text.strip() if run.text else None - if not text: - continue - if run.bold: - emphasized_texts.append({"text": text, "tag": "b"}) - if run.italic: - emphasized_texts.append({"text": text, "tag": "i"}) - return emphasized_texts - - -def _get_emphasized_texts_from_table(table: DocxTable) -> List[dict]: - emphasized_texts = [] - for row in table.rows: - for cell in row.cells: - for paragraph in cell.paragraphs: - _emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph) - emphasized_texts += _emphasized_texts - return emphasized_texts - - -def _extract_contents_and_tags( - emphasized_texts: List[dict], -) -> Tuple[Optional[List[str]], Optional[List[str]]]: - """ - Extract the text contents and tags from a list of dictionaries containing emphasized texts. + return list(_get_runs(paragraph._element, paragraph)) - Args: - - emphasized_texts (List[dict]): A list containing dictionaries with keys "text" and "tag". - Returns: - - Tuple[List[str], List[str]]: A tuple containing two lists - - one for text contents and one for tags extracted from the input. 
- """ - emphasized_text_contents = ( - [emphasized_text["text"] for emphasized_text in emphasized_texts] - if emphasized_texts - else None - ) - emphasized_text_tags = ( - [emphasized_text["tag"] for emphasized_text in emphasized_texts] - if emphasized_texts - else None - ) +Paragraph.runs = property( # pyright: ignore[reportGeneralTypeIssues] + lambda self: _get_paragraph_runs(self) +) - return emphasized_text_contents, emphasized_text_tags +# ==================================================================================== diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index f4771da7d5..33d7d7a469 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -54,10 +54,7 @@ def partition_json( last_modification_date = get_last_modified_date_from_file(file) file_content = file.read() - if isinstance(file_content, str): - file_text = file_content - else: - file_text = file_content.decode() + file_text = file_content if isinstance(file_content, str) else file_content.decode() file.seek(0) elif text is not None: diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index 37b00457bd..5794d934d5 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -1,4 +1,4 @@ -from typing import IO, List, Optional +from typing import Any, BinaryIO, List, Optional from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata @@ -15,12 +15,12 @@ @add_chunking_strategy() def partition_odt( filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, + file: Optional[BinaryIO] = None, include_metadata: bool = True, metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, chunking_strategy: Optional[str] = None, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Partitions Open Office Documents in .odt format into its document elements. 
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 9c011a7226..8461e128eb 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -104,8 +104,8 @@ def partition_pptx( if shape.has_table: table: pptx.table.Table = shape.table html_table = convert_ms_office_table_to_text(table, as_html=True) - text_table = convert_ms_office_table_to_text(table, as_html=False) - if (text_table := text_table.strip()) != "": + text_table = convert_ms_office_table_to_text(table, as_html=False).strip() + if text_table: metadata = ElementMetadata( filename=metadata_filename or filename, text_as_html=html_table, diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 87fbe9e3ba..820e8ac946 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -295,7 +295,7 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool: return ratio > threshold -def is_us_city_state_zip(text) -> bool: +def is_us_city_state_zip(text: str) -> bool: """Checks if the given text is in the format of US city/state/zip code. 
Examples @@ -307,7 +307,7 @@ def is_us_city_state_zip(text) -> bool: return US_CITY_STATE_ZIP_RE.match(text.strip()) is not None -def is_email_address(text) -> bool: +def is_email_address(text: str) -> bool: """Check if the given text is the email address""" return EMAIL_ADDRESS_PATTERN_RE.match(text.strip()) is not None diff --git a/unstructured/staging/prodigy.py b/unstructured/staging/prodigy.py index e4d5a99ca9..ba822ce423 100644 --- a/unstructured/staging/prodigy.py +++ b/unstructured/staging/prodigy.py @@ -28,9 +28,7 @@ def _validate_prodigy_metadata( ) if isinstance(id_error_index, int): raise ValueError( - 'The key "id" is not allowed with metadata parameter at index: {index}'.format( - index=id_error_index, - ), + f'The key "id" is not allowed with metadata parameter at index: {id_error_index}' ) validated_metadata = metadata else: diff --git a/unstructured/utils.py b/unstructured/utils.py index 5a10af77ce..6a5fa83f9a 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -1,12 +1,125 @@ +import functools import importlib import json from datetime import datetime from functools import wraps -from typing import Dict, List, Optional, Union +from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union, cast DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z") +_T = TypeVar("_T") + + +class lazyproperty(Generic[_T]): + """Decorator like @property, but evaluated only on first access. + + Like @property, this can only be used to decorate methods having only a `self` parameter, and + is accessed like an attribute on an instance, i.e. trailing parentheses are not used. Unlike + @property, the decorated method is only evaluated on first access; the resulting value is + cached and that same value returned on second and later access without re-evaluation of the + method. 
+ + Like @property, this class produces a *data descriptor* object, which is stored in the __dict__ + of the *class* under the name of the decorated method ('fget' nominally). The cached value is + stored in the __dict__ of the *instance* under that same name. + + Because it is a data descriptor (as opposed to a *non-data descriptor*), its `__get__()` method + is executed on each access of the decorated attribute; the __dict__ item of the same name is + "shadowed" by the descriptor. + + While this may represent a performance improvement over a property, its greater benefit may be + its other characteristics. One common use is to construct collaborator objects, removing that + "real work" from the constructor, while still only executing once. It also de-couples client + code from any sequencing considerations; if it's accessed from more than one location, it's + assured it will be ready whenever needed. + + Loosely based on: https://stackoverflow.com/a/6849299/1902513. + + A lazyproperty is read-only. There is no counterpart to the optional "setter" (or deleter) + behavior of an @property. This is critically important to maintaining its immutability and + idempotence guarantees. Attempting to assign to a lazyproperty raises AttributeError + unconditionally. + + The parameter names in the methods below correspond to this usage example:: + + class Obj(object) + + @lazyproperty + def fget(self): + return 'some result' + + obj = Obj() + + Not suitable for wrapping a function (as opposed to a method) because it is not callable. + """ + + def __init__(self, fget: Callable[..., _T]) -> None: + """*fget* is the decorated method (a "getter" function). + + A lazyproperty is read-only, so there is only an *fget* function (a regular + @property can also have an fset and fdel function). This name was chosen for + consistency with Python's `property` class which uses this name for the + corresponding parameter. 
+ """ + # --- maintain a reference to the wrapped getter method + self._fget = fget + # --- and store the name of that decorated method + self._name = fget.__name__ + # --- adopt fget's __name__, __doc__, and other attributes + functools.update_wrapper(self, fget) # pyright: ignore + + def __get__(self, obj: Any, type: Any = None) -> _T: + """Called on each access of 'fget' attribute on class or instance. + + *self* is this instance of a lazyproperty descriptor "wrapping" the property + method it decorates (`fget`, nominally). + + *obj* is the "host" object instance when the attribute is accessed from an + object instance, e.g. `obj = Obj(); obj.fget`. *obj* is None when accessed on + the class, e.g. `Obj.fget`. + + *type* is the class hosting the decorated getter method (`fget`) on both class + and instance attribute access. + """ + # --- when accessed on class, e.g. Obj.fget, just return this descriptor + # --- instance (patched above to look like fget). + if obj is None: + return self # type: ignore + + # --- when accessed on instance, start by checking instance __dict__ for + # --- item with key matching the wrapped function's name + value = obj.__dict__.get(self._name) + if value is None: + # --- on first access, the __dict__ item will be absent. Evaluate fget() + # --- and store that value in the (otherwise unused) host-object + # --- __dict__ value of same name ('fget' nominally) + value = self._fget(obj) + obj.__dict__[self._name] = value + return cast(_T, value) + + def __set__(self, obj: Any, value: Any) -> None: + """Raises unconditionally, to preserve read-only behavior. + + This decorator is intended to implement immutable (and idempotent) object + attributes. For that reason, assignment to this property must be explicitly + prevented. + + If this __set__ method was not present, this descriptor would become a + *non-data descriptor*. 
That would be nice because the cached value would be + accessed directly once set (__dict__ attrs have precedence over non-data + descriptors on instance attribute lookup). The problem is, there would be + nothing to stop assignment to the cached value, which would overwrite the result + of `fget()` and break both the immutability and idempotence guarantees of this + decorator. + + The performance with this __set__() method in place was roughly 0.4 usec per + access when measured on a 2.8GHz development machine; so quite snappy and + probably not a rich target for optimization efforts. + """ + raise AttributeError("can't set attribute") + + def save_as_jsonl(data: List[Dict], filename: str) -> None: with open(filename, "w+") as output_file: output_file.writelines(json.dumps(datum) + "\n" for datum in data) @@ -47,7 +160,7 @@ def wrapper(*args, **kwargs): return decorator -def dependency_exists(dependency): +def dependency_exists(dependency: str): try: importlib.import_module(dependency) except ImportError as e: