diff --git a/CHANGELOG.md b/CHANGELOG.md index 074f65d1ff..bcff921784 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.8-dev14 +## 0.13.8-dev15 ### Enhancements @@ -20,6 +20,7 @@ * **Improve CSV delimeter detection.** `partition_csv()` would raise on CSV files with very long lines. * **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`. * **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+. +* **Fix disk-space leak in `partition_odt()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_odt()`. ## 0.13.7 diff --git a/typings/pypandoc/__init__.pyi b/typings/pypandoc/__init__.pyi index e69de29bb2..3de23217ad 100644 --- a/typings/pypandoc/__init__.pyi +++ b/typings/pypandoc/__init__.pyi @@ -0,0 +1,5 @@ +import pathlib + +def convert_file( + source_file: str, to: str, format: str | None, outputfile: str | pathlib.Path | None +) -> str: ... diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 602e99f504..10a704ee04 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev14" # pragma: no cover +__version__ = "0.13.8-dev15" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 19f874a338..e29a7089f2 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -5,9 +5,8 @@ import html import io import itertools -import os import tempfile -from typing import IO, Any, Iterator, Optional, Type, cast +from typing import IO, Any, Iterator, Optional, Type # -- CT_* stands for "complex-type", an XML element type in docx parlance -- import docx @@ -45,7 +44,6 @@ ) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import ( - exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) @@ -58,114 +56,13 @@ is_us_city_state_zip, ) from unstructured.partition.utils.constants import PartitionStrategy -from unstructured.utils import ( - dependency_exists, - is_temp_file_path, - lazyproperty, - requires_dependencies, -) - -if dependency_exists("pypandoc"): - import pypandoc +from unstructured.utils import is_temp_file_path, lazyproperty DETECTION_ORIGIN: str = "docx" BlockElement: TypeAlias = "CT_P | CT_Tbl" BlockItem: TypeAlias = "Paragraph | DocxTable" -@requires_dependencies("pypandoc") -def convert_and_partition_docx( - source_format: str, - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - include_metadata: bool = True, - infer_table_structure: bool = True, - metadata_filename: Optional[str] = None, - metadata_last_modified: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - starting_page_number: int = 1, -) -> list[Element]: - """Converts a document to DOCX and then partitions it using partition_docx. - - Works with any file format support by pandoc. - - Parameters - ---------- - source_format - The format of the source document, .e.g. odt - filename - A string defining the target filename path. - file - A file-like object using "rb" mode --> open(filename, "rb"). - include_metadata - Determines whether or not metadata is included in the metadata attribute on the elements in - the output. - infer_table_structure - If True, any Table elements that are extracted will also have a metadata field - named "text_as_html" where the table's text content is rendered into an html string. - I.e., rows and cells are preserved. - Whether True or False, the "text" field is always present in any Table element - and is the text content of the table (no structure). - languages - User defined value for `metadata.languages` if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. - starting_page_number - Indicates what page number should be assigned to the first page in the document. - This information will be reflected in elements' metadata and can be be especially - useful when partitioning a document that is part of a larger document. - """ - exactly_one(filename=filename, file=file) - - def validate_filename(filename: str) -> str: - """Return path to a file confirmed to exist on the filesystem.""" - if not os.path.exists(filename): - raise ValueError(f"The file {filename} does not exist.") - return filename - - def copy_to_tempfile(file: IO[bytes]) -> str: - """Return path to temporary copy of file to be converted.""" - with tempfile.NamedTemporaryFile(delete=False) as tmp: - tmp.write(file.read()) - return tmp.name - - def extract_docx_filename(file_path: str) -> str: - """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """ - # -- a/b/foo.odt -> foo.odt -- - filename = os.path.basename(file_path) - # -- foo.odt -> foo -- - root_name, _ = os.path.splitext(filename) - # -- foo -> foo.docx -- - return f"{root_name}.docx" - - file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file)) - - with tempfile.TemporaryDirectory() as tmpdir: - docx_path = os.path.join(tmpdir, extract_docx_filename(file_path)) - pypandoc.convert_file( # pyright: ignore - file_path, - "docx", - format=source_format, - outputfile=docx_path, - ) - elements = partition_docx( - filename=docx_path, - metadata_filename=metadata_filename, - include_metadata=include_metadata, - infer_table_structure=infer_table_structure, - metadata_last_modified=metadata_last_modified, - languages=languages, - detect_language_per_element=detect_language_per_element, - starting_page_number=starting_page_number, - ) - - return elements - - @process_metadata() @add_metadata_with_filetype(FileType.DOCX) @add_chunking_strategy diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index ca8fea3af8..922e5032b3 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -1,12 +1,19 @@ from __future__ import annotations -from typing import IO, Any, Optional +import os +import tempfile +from typing import IO, Any, Optional, cast from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file -from unstructured.partition.docx import convert_and_partition_docx +from unstructured.partition.common import ( + exactly_one, + get_last_modified_date, + get_last_modified_date_from_file, +) +from unstructured.partition.docx import partition_docx +from unstructured.utils import requires_dependencies @process_metadata() @@ -14,15 +21,14 @@ @add_chunking_strategy def partition_odt( filename: Optional[str] = None, + *, + date_from_file_object: bool = False, + detect_language_per_element: bool = False, file: Optional[IO[bytes]] = None, - include_metadata: bool = True, infer_table_structure: bool = True, + languages: Optional[list[str]] = ["auto"], metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, - chunking_strategy: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - date_from_file_object: bool = False, starting_page_number: int = 1, **kwargs: Any, ) -> list[Element]: @@ -51,25 +57,79 @@ def partition_odt( Detect language per element instead of at the document level. date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt - infer last_modified metadata from bytes, otherwise set it to None. + infer last_modified metadata from the file-like object, otherwise set it to None. """ - last_modification_date = None - if filename: - last_modification_date = get_last_modified_date(filename) - elif file: - last_modification_date = ( - get_last_modified_date_from_file(file) if date_from_file_object else None + last_modification_date = ( + get_last_modified_date(filename) + if filename + else get_last_modified_date_from_file(file) if file and date_from_file_object else None + ) + + with tempfile.TemporaryDirectory() as target_dir: + docx_path = _convert_odt_to_docx(target_dir, filename, file) + elements = partition_docx( + filename=docx_path, + detect_language_per_element=detect_language_per_element, + infer_table_structure=infer_table_structure, + languages=languages, + metadata_filename=metadata_filename, + metadata_last_modified=metadata_last_modified or last_modification_date, + starting_page_number=starting_page_number, ) - return convert_and_partition_docx( - source_format="odt", - filename=filename, - file=file, - infer_table_structure=infer_table_structure, - metadata_filename=metadata_filename, - metadata_last_modified=metadata_last_modified or last_modification_date, - languages=languages, - detect_language_per_element=detect_language_per_element, - starting_page_number=starting_page_number, + return elements + + +@requires_dependencies("pypandoc") +def _convert_odt_to_docx( + target_dir: str, filename: Optional[str], file: Optional[IO[bytes]] +) -> str: + """Convert ODT document to DOCX returning the new .docx file's path. + + Parameters + ---------- + target_dir + The str directory-path to use for conversion purposes. The new DOCX file is written to this + directory. When passed as a file-like object, a copy of the source file is written here as + well. It is the caller's responsibility to remove this directory and its contents when + they are no longer needed. + filename + A str file-path specifying the location of the source ODT file on the local filesystem. + file + A file-like object open for reading in binary mode ("rb" mode). + """ + exactly_one(filename=filename, file=file) + + # -- validate file-path when provided so we can provide a more meaningful error than whatever + # -- would come from pandoc. + if filename is not None and not os.path.exists(filename): + raise ValueError(f"The file {filename} does not exist.") + + # -- Pandoc is a command-line program running in its own memory-space. It can therefore only + # -- operate on files on the filesystem. If the source document was passed as `file`, write + # -- it to `target_dir/document.odt` and use that path as the source-path. + source_file_path = f"{target_dir}/document.odt" if file is not None else cast(str, filename) + if file is not None: + with open(source_file_path, "wb") as f: + f.write(file.read()) + + # -- Compute the path of the resulting .docx document. We want its file-name to be preserved + # -- if the source-document was provided as `filename`. + # -- a/b/foo.odt -> foo.odt -- + file_name = os.path.basename(source_file_path) + # -- foo.odt -> foo -- + base_name, _ = os.path.splitext(file_name) + # -- foo -> foo.docx -- + target_docx_path = os.path.join(target_dir, f"{base_name}.docx") + + import pypandoc + + pypandoc.convert_file( + source_file_path, + "docx", + format="odt", + outputfile=target_docx_path, ) + + return target_docx_path