Skip to content

Commit

Permalink
rfctr: improve convert_and_partition_docx()
Browse files Browse the repository at this point in the history
* Handle the situation where `pypandoc` is not installed by raising a specific
  error message rather than what I expect would be an obscure failure.
* Clarify logic for getting `filename_no_path` and resolve
  "filename_no_path is possibly unbound" lint error.
  • Loading branch information
scanny committed Sep 14, 2023
1 parent c15a56d commit f9f0d34
Showing 1 changed file with 17 additions and 16 deletions.
33 changes: 17 additions & 16 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,22 +88,26 @@ def convert_and_partition_docx(
Determines whether or not metadata is included in the metadata attribute on the elements in
the output.
"""
if filename is None:
filename = ""
if "pypandoc" not in globals():
raise ImportError("package 'pypandoc' required for this operation but not installed")

exactly_one(filename=filename, file=file)

filename_no_path = ""
if len(filename) > 0:
_, filename_no_path = os.path.split(os.path.abspath(filename))
base_filename, _ = os.path.splitext(filename_no_path)
def validate_filename(filename: str) -> str:
if not os.path.exists(filename):
raise ValueError(f"The file {filename} does not exist.")
elif file is not None:
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.write(file.read())
tmp.close()
filename = tmp.name
_, filename_no_path = os.path.split(os.path.abspath(tmp.name))
_, filename_no_path = os.path.split(os.path.abspath(filename))
return filename_no_path

def write_to_tempfile(file: BinaryIO) -> str:
with tempfile.NamedTemporaryFile(delete=False) as tmp:
tmp.write(file.read())
_, filename_no_path = os.path.split(os.path.abspath(tmp.name))
return filename_no_path

filename_no_path = (
validate_filename(filename) if filename else write_to_tempfile(cast(BinaryIO, file))
)

base_filename, _ = os.path.splitext(filename_no_path)

Expand Down Expand Up @@ -181,8 +185,6 @@ class _DocxPartitioner:
# TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for
# `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like
# the "_element_contains_pagebreak()" function.
# TODO: Improve ._is_list_item() to detect manually-applied bullets (which do not appear in the
# paragraph text so are missed by `is_bulleted_text()`) using XPath.
# TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is
# a list-item is encapsulated in a single place rather than distributed around the code.
# TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead
Expand All @@ -191,8 +193,7 @@ class _DocxPartitioner:
# TODO: Move _SectBlockIterator upstream into `python-docx`. It requires too much
# domain-specific knowledge to be comfortable here and is of general use so welcome in the
# library.
# DONE: A section can give rise to one or two page breaks, like an "odd-page" section start
# from an odd current-page produces two. Add page-break detection on section as well.
# TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`.

def __init__(
self,
Expand Down

0 comments on commit f9f0d34

Please sign in to comment.