From 1bd650b42bfb76497e2d9d0564c904876bc6bd5b Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 13 May 2024 17:30:08 -0700 Subject: [PATCH] fix(docx): fix disk-space leak `partition_doc()` creates a temporary file in which it writes each source-document provided as a file-like object. This file is not deleted and disk consumption grows without bound. `convert_office_doc()` uses a command-line program provided with LibreOffice to convert from DOC -> DOCX. Because this command-line program operates in a different memory space, the source file cannot be passed as an in-memory object and needs to be on the filesystem. Fix this by writing the temporary source file in the TemporaryDirectory already being used to write the conversion-target DOCX file. That directory is automatically removed when `partition_doc()` completes. --- CHANGELOG.md | 3 +- unstructured/__version__.py | 2 +- unstructured/partition/doc.py | 86 +++++++++++++++++++++------------- unstructured/partition/pptx.py | 2 +- 4 files changed, 58 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0496479af1..5589dcb291 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.8-dev8 +## 0.13.8-dev9 ### Enhancements @@ -15,6 +15,7 @@ * **Fix type hint for paragraph_grouper param** `paragraph_grouper` can be set to `False`, but the type hint did not not reflect this previously. * **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a paramter in partition_pdf. * **Improve CSV delimeter detection.** `partition_csv()` would raise on CSV files with very long lines. +* **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`. ## 0.13.7 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 83c1597480..1bcdeee8f7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev8" # pragma: no cover +__version__ = "0.13.8-dev9" # pragma: no cover diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py index aa01d4d518..223d7c1042 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -63,52 +63,74 @@ def partition_doc( This information will be reflected in elements' metadata and can be be especially useful when partitioning a document that is part of a larger document. """ - # Verify that only one of the arguments was provided - if filename is None: - filename = "" exactly_one(filename=filename, file=file) - if len(filename) > 0: - _, filename_no_path = os.path.split(os.path.abspath(filename)) - base_filename, _ = os.path.splitext(filename_no_path) - if not os.path.exists(filename): - raise ValueError(f"The file {filename} does not exist.") + last_modified = get_last_modified(filename, file, date_from_file_object) - last_modification_date = get_last_modified_date(filename) + # -- validate file-path when provided so we can provide a more meaningful error -- + if filename is not None and not os.path.exists(filename): + raise ValueError(f"The file {filename} does not exist.") - elif file is not None: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.write(file.read()) - tmp.close() - filename = tmp.name - _, filename_no_path = os.path.split(os.path.abspath(tmp.name)) - base_filename, _ = os.path.splitext(filename_no_path) + # -- `convert_office_doc` uses a command-line program that ships with LibreOffice to convert + # -- from DOC -> DOCX. So both the source and the target need to be file-system files. Put + # -- transient files in a temporary directory that is automatically removed so they don't + # -- pile up. + with tempfile.TemporaryDirectory() as target_dir: - last_modification_date = ( - get_last_modified_date_from_file(file) if date_from_file_object else None - ) + source_file_path = f"{target_dir}/document.doc" if file is not None else filename + assert source_file_path is not None - with tempfile.TemporaryDirectory() as tmpdir: + # -- when source is a file-like object, write it to the filesystem so the command-line + # -- process can access it (CLI executes in different memory-space). + if file is not None: + with open(source_file_path, "wb") as f: + f.write(file.read()) + + # -- convert the .doc file to .docx. The resulting file takes the same base-name as the + # -- source file and is written to `target_dir`. convert_office_doc( - filename, - tmpdir, + source_file_path, + target_dir, target_format="docx", target_filter=libre_office_filter, ) - docx_filename = os.path.join(tmpdir, f"{base_filename}.docx") + + # -- compute the path of the resulting .docx document -- + _, filename_no_path = os.path.split(os.path.abspath(source_file_path)) + base_filename, _ = os.path.splitext(filename_no_path) + target_file_path = os.path.join(target_dir, f"{base_filename}.docx") + + # -- and partition it. Note that `kwargs` is not passed which is a sketchy way to partially + # -- disable post-partitioning processing (what the decorators do) so for example the + # -- resulting elements are not double-chunked. elements = partition_docx( - filename=docx_filename, - metadata_filename=metadata_filename, - include_page_breaks=include_page_breaks, + filename=target_file_path, + detect_language_per_element=detect_language_per_element, include_metadata=include_metadata, - metadata_last_modified=metadata_last_modified or last_modification_date, + include_page_breaks=include_page_breaks, languages=languages, - detect_language_per_element=detect_language_per_element, + metadata_filename=metadata_filename, + metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, ) - # remove tmp.name from filename if parsing file - if file: - for element in elements: - element.metadata.filename = metadata_filename + + # -- Remove temporary document.docx path from metadata when necessary. Note `metadata_filename` + # -- defaults to `None` but that's better than a meaningless temporary filename. + if file: + for element in elements: + element.metadata.filename = metadata_filename return elements + + +def get_last_modified( + filename: str | None, file: IO[bytes] | None, date_from_file_object: bool +) -> str | None: + """Determine best available last-modified date from partitioner document-source.""" + if filename is not None: + return get_last_modified_date(filename) + + if file is not None: + return get_last_modified_date_from_file(file) if date_from_file_object else None + + return None diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 7cc1924fd1..1a78e4f4a2 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -97,8 +97,8 @@ def partition_pptx( languages: Optional[list[str]] = ["auto"], metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, - strategy: str = PartitionStrategy.FAST, starting_page_number: int = 1, + strategy: str = PartitionStrategy.FAST, **kwargs: Any, ) -> list[Element]: """Partition PowerPoint document in .pptx format into its document elements.