rfctr: improve convert_and_partition_docx()

* Handle the case where `pypandoc` is not installed by raising an `ImportError` with a specific message, rather than failing with what would likely be an obscure error.
* Clarify the logic for computing `filename_no_path` and resolve the "filename_no_path is possibly unbound" lint error.
Unstructured-IO · Sep 14, 2023 · f9f0d34 · f9f0d34
1 parent c15a56d
commit f9f0d34
Showing 1 changed file with 17 additions and 16 deletions.
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -88,22 +88,26 @@ def convert_and_partition_docx(
         Determines whether or not metadata is included in the metadata attribute on the elements in
         the output.
     """
-    if filename is None:
-        filename = ""
+    if "pypandoc" not in globals():
+        raise ImportError("package 'pypandoc' required for this operation but not installed")
+
     exactly_one(filename=filename, file=file)
 
-    filename_no_path = ""
-    if len(filename) > 0:
-        _, filename_no_path = os.path.split(os.path.abspath(filename))
-        base_filename, _ = os.path.splitext(filename_no_path)
+    def validate_filename(filename: str) -> str:
         if not os.path.exists(filename):
             raise ValueError(f"The file {filename} does not exist.")
-    elif file is not None:
-        tmp = tempfile.NamedTemporaryFile(delete=False)
-        tmp.write(file.read())
-        tmp.close()
-        filename = tmp.name
-        _, filename_no_path = os.path.split(os.path.abspath(tmp.name))
+        _, filename_no_path = os.path.split(os.path.abspath(filename))
+        return filename_no_path
+
+    def write_to_tempfile(file: BinaryIO) -> str:
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            tmp.write(file.read())
+            _, filename_no_path = os.path.split(os.path.abspath(tmp.name))
+        return filename_no_path
+
+    filename_no_path = (
+        validate_filename(filename) if filename else write_to_tempfile(cast(BinaryIO, file))
+    )
 
     base_filename, _ = os.path.splitext(filename_no_path)
 
@@ -181,8 +185,6 @@ class _DocxPartitioner:
     # TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for
     #       `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like
     #        the "_element_contains_pagebreak()" function.
-    # TODO: Improve ._is_list_item() to detect manually-applied bullets (which do not appear in the
-    #       paragraph text so are missed by `is_bulleted_text()`) using XPath.
     # TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is
     #       a list-item is encapsulated in a single place rather than distributed around the code.
     # TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead
@@ -191,8 +193,7 @@ class _DocxPartitioner:
     # TODO: Move _SectBlockIterator upstream into `python-docx`. It requires too much
     #       domain-specific knowledge to comfortable here and is of general use so welcome in the
     #       library.
-    # DONE: A section can give rise to one or two page breaks, like an "odd-page" section start
-    #       from an odd current-page produces two. Add page-break detection on section as well.
+    # TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`.
 
     def __init__(
         self,