From 6d0842654064aa4242b1bfa82426c582f80322c6 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 12 Sep 2023 22:12:26 -0700 Subject: [PATCH] rfctr: improve convert_and_partition_docx() * Handle the situation where `pypandoc` is not installed by raising a specific error message rather than failing with an obscure one. * Clarify logic for getting `filename_no_path` and resolve "filename_no_path is possibly unbound" lint error. --- unstructured/partition/docx.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 4fa95bab80..f2c1a3520f 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -88,22 +88,26 @@ def convert_and_partition_docx( Determines whether or not metadata is included in the metadata attribute on the elements in the output. """ - if filename is None: - filename = "" + if "pypandoc" not in globals(): + raise ImportError("package 'pypandoc' required for this operation but not installed") + exactly_one(filename=filename, file=file) - filename_no_path = "" - if len(filename) > 0: - _, filename_no_path = os.path.split(os.path.abspath(filename)) - base_filename, _ = os.path.splitext(filename_no_path) + def validate_filename(filename: str) -> str: if not os.path.exists(filename): raise ValueError(f"The file {filename} does not exist.") - elif file is not None: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.write(file.read()) - tmp.close() - filename = tmp.name - _, filename_no_path = os.path.split(os.path.abspath(tmp.name)) + _, filename_no_path = os.path.split(os.path.abspath(filename)) + return filename_no_path + + def write_to_tempfile(file: BinaryIO) -> str: + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp.write(file.read()) + _, filename_no_path = os.path.split(os.path.abspath(tmp.name)) + return filename_no_path + + filename_no_path = ( + validate_filename(filename) if filename else
write_to_tempfile(cast(BinaryIO, file)) + ) base_filename, _ = os.path.splitext(filename_no_path) @@ -181,8 +185,6 @@ class _DocxPartitioner: # TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for # `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like # the "_element_contains_pagebreak()" function. - # TODO: Improve ._is_list_item() to detect manually-applied bullets (which do not appear in the - # paragraph text so are missed by `is_bulleted_text()`) using XPath. # TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is # a list-item is encapsulated in a single place rather than distributed around the code. # TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead @@ -191,8 +193,7 @@ class _DocxPartitioner: # TODO: Move _SectBlockIterator upstream into `python-docx`. It requires too much # domain-specific knowledge to comfortable here and is of general use so welcome in the # library. - # DONE: A section can give rise to one or two page breaks, like an "odd-page" section start - # from an odd current-page produces two. Add page-break detection on section as well. + # TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`. def __init__( self,