Skip to content

Commit

Permalink
rfctr: improve convert_and_partition_docx()
Browse files Browse the repository at this point in the history
* Handle the situation where `pypandoc` is not installed by raising a specific
  error message rather than failing with what I expect is an obscure one.
* Clarify logic for getting `filename_no_path` and resolve
  "filename_no_path is possibly unbound" lint error.
  • Loading branch information
scanny committed Sep 19, 2023
1 parent 51690b4 commit 8ea8034
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 25 deletions.
48 changes: 26 additions & 22 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
is_possible_title,
is_us_city_state_zip,
)
from unstructured.utils import dependency_exists, lazyproperty
from unstructured.utils import dependency_exists, lazyproperty, requires_dependencies

if dependency_exists("pypandoc"):
import pypandoc
Expand All @@ -79,6 +79,7 @@
BlockItem: TypeAlias = Union[Paragraph, DocxTable]


@requires_dependencies("pypandoc")
def convert_and_partition_docx(
source_format: str,
filename: Optional[str] = None,
Expand All @@ -103,35 +104,41 @@ def convert_and_partition_docx(
Determines whether or not metadata is included in the metadata attribute on the elements in
the output.
"""
if filename is None:
filename = ""
exactly_one(filename=filename, file=file)

filename_no_path = ""
if len(filename) > 0:
_, filename_no_path = os.path.split(os.path.abspath(filename))
base_filename, _ = os.path.splitext(filename_no_path)
def validate_filename(filename: str) -> str:
"""Return path to a file confirmed to exist on the filesystem."""
if not os.path.exists(filename):
raise ValueError(f"The file {filename} does not exist.")
elif file is not None:
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.write(file.read())
tmp.close()
filename = tmp.name
_, filename_no_path = os.path.split(os.path.abspath(tmp.name))
return filename

base_filename, _ = os.path.splitext(filename_no_path)
def copy_to_tempfile(file: BinaryIO) -> str:
"""Return path to temporary copy of file to be converted."""
with tempfile.NamedTemporaryFile(delete=False) as tmp:
tmp.write(file.read())
return tmp.name

def extract_docx_filename(file_path: str) -> str:
"""Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
# -- a/b/foo.odt -> foo.odt --
filename = os.path.basename(file_path)
# -- foo.odt -> foo --
root_name, _ = os.path.splitext(filename)
# -- foo -> foo.docx --
return f"{root_name}.docx"

file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(BinaryIO, file))

with tempfile.TemporaryDirectory() as tmpdir:
docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
pypandoc.convert_file( # pyright: ignore
filename,
file_path,
"docx",
format=source_format,
outputfile=docx_filename,
outputfile=docx_path,
)
elements = partition_docx(
filename=docx_filename,
filename=docx_path,
metadata_filename=metadata_filename,
include_metadata=include_metadata,
metadata_last_modified=metadata_last_modified,
Expand Down Expand Up @@ -196,8 +203,6 @@ class _DocxPartitioner:
# TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for
# `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like
# the "_element_contains_pagebreak()" function.
# TODO: Improve ._is_list_item() to detect manually-applied bullets (which do not appear in the
# paragraph text so are missed by `is_bulleted_text()`) using XPath.
# TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is
# a list-item is encapsulated in a single place rather than distributed around the code.
# TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead
Expand All @@ -206,8 +211,7 @@ class _DocxPartitioner:
# TODO: Move _SectBlockIterator upstream into `python-docx`. It requires too much
# domain-specific knowledge to be comfortable here and is of general use, so it would be
# welcome in the library.
# DONE: A section can give rise to one or two page breaks, like an "odd-page" section start
# from an odd current-page produces two. Add page-break detection on section as well.
# TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`.

def __init__(
self,
Expand Down
6 changes: 3 additions & 3 deletions unstructured/partition/odt.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import IO, List, Optional
from typing import Any, BinaryIO, List, Optional

from unstructured.chunking.title import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
Expand All @@ -15,12 +15,12 @@
@add_chunking_strategy()
def partition_odt(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
file: Optional[BinaryIO] = None,
include_metadata: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
**kwargs,
**kwargs: Any,
) -> List[Element]:
"""Partitions Open Office documents in .odt format into their document elements.
Expand Down

0 comments on commit 8ea8034

Please sign in to comment.