Merge branch 'fix/only-amd-build' of github.com:Unstructured-IO/unstructured into fix/only-amd-build
Unstructured-IO · May 16, 2024 · 565b1f9 · 565b1f9
2 parents f750249 + fc3d16f
commit 565b1f9
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 132 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.8-dev14
+## 0.13.8-dev15
 
 ### Enhancements
 
@@ -20,6 +20,7 @@
 * **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
 * **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
 * **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
+* **Fix disk-space leak in `partition_odt()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_odt()`.
 
 ## 0.13.7
 

diff --git a/typings/pypandoc/__init__.pyi b/typings/pypandoc/__init__.pyi
@@ -0,0 +1,5 @@
+import pathlib
+
+def convert_file(
+    source_file: str, to: str, format: str | None, outputfile: str | pathlib.Path | None
+) -> str: ...
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.8-dev14"  # pragma: no cover
+__version__ = "0.13.8-dev15"  # pragma: no cover
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -5,9 +5,8 @@
 import html
 import io
 import itertools
-import os
 import tempfile
-from typing import IO, Any, Iterator, Optional, Type, cast
+from typing import IO, Any, Iterator, Optional, Type
 
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 import docx
@@ -45,7 +44,6 @@
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
-    exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
@@ -58,114 +56,13 @@
     is_us_city_state_zip,
 )
 from unstructured.partition.utils.constants import PartitionStrategy
-from unstructured.utils import (
-    dependency_exists,
-    is_temp_file_path,
-    lazyproperty,
-    requires_dependencies,
-)
-
-if dependency_exists("pypandoc"):
-    import pypandoc
+from unstructured.utils import is_temp_file_path, lazyproperty
 
 DETECTION_ORIGIN: str = "docx"
 BlockElement: TypeAlias = "CT_P | CT_Tbl"
 BlockItem: TypeAlias = "Paragraph | DocxTable"
 
 
-@requires_dependencies("pypandoc")
-def convert_and_partition_docx(
-    source_format: str,
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
-    infer_table_structure: bool = True,
-    metadata_filename: Optional[str] = None,
-    metadata_last_modified: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    starting_page_number: int = 1,
-) -> list[Element]:
-    """Converts a document to DOCX and then partitions it using partition_docx.
-
-    Works with any file format supported by pandoc.
-
-    Parameters
-    ----------
-    source_format
-        The format of the source document, e.g. odt
-    filename
-        A string defining the target filename path.
-    file
-        A file-like object using "rb" mode --> open(filename, "rb").
-    include_metadata
-        Determines whether or not metadata is included in the metadata attribute on the elements in
-        the output.
-    infer_table_structure
-        If True, any Table elements that are extracted will also have a metadata field
-        named "text_as_html" where the table's text content is rendered into an html string.
-        I.e., rows and cells are preserved.
-        Whether True or False, the "text" field is always present in any Table element
-        and is the text content of the table (no structure).
-    languages
-        User defined value for `metadata.languages` if provided. Otherwise language is detected
-        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
-        in either language.
-        Additional Parameters:
-            detect_language_per_element
-                Detect language per element instead of at the document level.
-    starting_page_number
-        Indicates what page number should be assigned to the first page in the document.
-        This information will be reflected in elements' metadata and can be especially
-        useful when partitioning a document that is part of a larger document.
-    """
-    exactly_one(filename=filename, file=file)
-
-    def validate_filename(filename: str) -> str:
-        """Return path to a file confirmed to exist on the filesystem."""
-        if not os.path.exists(filename):
-            raise ValueError(f"The file {filename} does not exist.")
-        return filename
-
-    def copy_to_tempfile(file: IO[bytes]) -> str:
-        """Return path to temporary copy of file to be converted."""
-        with tempfile.NamedTemporaryFile(delete=False) as tmp:
-            tmp.write(file.read())
-            return tmp.name
-
-    def extract_docx_filename(file_path: str) -> str:
-        """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
-        # -- a/b/foo.odt -> foo.odt --
-        filename = os.path.basename(file_path)
-        # -- foo.odt -> foo --
-        root_name, _ = os.path.splitext(filename)
-        # -- foo -> foo.docx --
-        return f"{root_name}.docx"
-
-    file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file))
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
-        pypandoc.convert_file(  # pyright: ignore
-            file_path,
-            "docx",
-            format=source_format,
-            outputfile=docx_path,
-        )
-        elements = partition_docx(
-            filename=docx_path,
-            metadata_filename=metadata_filename,
-            include_metadata=include_metadata,
-            infer_table_structure=infer_table_structure,
-            metadata_last_modified=metadata_last_modified,
-            languages=languages,
-            detect_language_per_element=detect_language_per_element,
-            starting_page_number=starting_page_number,
-        )
-
-    return elements
-
-
 @process_metadata()
 @add_metadata_with_filetype(FileType.DOCX)
 @add_chunking_strategy

diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py
@@ -1,28 +1,34 @@
 from __future__ import annotations
 
-from typing import IO, Any, Optional
+import os
+import tempfile
+from typing import IO, Any, Optional, cast
 
 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
-from unstructured.partition.docx import convert_and_partition_docx
+from unstructured.partition.common import (
+    exactly_one,
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
+from unstructured.partition.docx import partition_docx
+from unstructured.utils import requires_dependencies
 
 
 @process_metadata()
 @add_metadata_with_filetype(FileType.ODT)
 @add_chunking_strategy
 def partition_odt(
     filename: Optional[str] = None,
+    *,
+    date_from_file_object: bool = False,
+    detect_language_per_element: bool = False,
     file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
     infer_table_structure: bool = True,
+    languages: Optional[list[str]] = ["auto"],
     metadata_filename: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
-    chunking_strategy: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    date_from_file_object: bool = False,
     starting_page_number: int = 1,
     **kwargs: Any,
 ) -> list[Element]:
@@ -51,25 +57,79 @@ def partition_odt(
                 Detect language per element instead of at the document level.
     date_from_file_object
         Applies only when providing file via `file` parameter. If this option is True, attempt
-        infer last_modified metadata from bytes, otherwise set it to None.
+        to infer last_modified metadata from the file-like object, otherwise set it to None.
     """
 
-    last_modification_date = None
-    if filename:
-        last_modification_date = get_last_modified_date(filename)
-    elif file:
-        last_modification_date = (
-            get_last_modified_date_from_file(file) if date_from_file_object else None
+    last_modification_date = (
+        get_last_modified_date(filename)
+        if filename
+        else get_last_modified_date_from_file(file) if file and date_from_file_object else None
+    )
+
+    with tempfile.TemporaryDirectory() as target_dir:
+        docx_path = _convert_odt_to_docx(target_dir, filename, file)
+        elements = partition_docx(
+            filename=docx_path,
+            detect_language_per_element=detect_language_per_element,
+            infer_table_structure=infer_table_structure,
+            languages=languages,
+            metadata_filename=metadata_filename,
+            metadata_last_modified=metadata_last_modified or last_modification_date,
+            starting_page_number=starting_page_number,
         )
 
-    return convert_and_partition_docx(
-        source_format="odt",
-        filename=filename,
-        file=file,
-        infer_table_structure=infer_table_structure,
-        metadata_filename=metadata_filename,
-        metadata_last_modified=metadata_last_modified or last_modification_date,
-        languages=languages,
-        detect_language_per_element=detect_language_per_element,
-        starting_page_number=starting_page_number,
+    return elements
+
+
+@requires_dependencies("pypandoc")
+def _convert_odt_to_docx(
+    target_dir: str, filename: Optional[str], file: Optional[IO[bytes]]
+) -> str:
+    """Convert ODT document to DOCX returning the new .docx file's path.
+
+    Parameters
+    ----------
+    target_dir
+        The str directory-path to use for conversion purposes. The new DOCX file is written to this
+        directory. When the source is passed as a file-like object, a copy of it is written here as
+        well. It is the caller's responsibility to remove this directory and its contents when
+        they are no longer needed.
+    filename
+        A str file-path specifying the location of the source ODT file on the local filesystem.
+    file
+        A file-like object open for reading in binary mode ("rb" mode).
+    """
+    exactly_one(filename=filename, file=file)
+
+    # -- validate file-path when provided so we can provide a more meaningful error than whatever
+    # -- would come from pandoc.
+    if filename is not None and not os.path.exists(filename):
+        raise ValueError(f"The file {filename} does not exist.")
+
+    # -- Pandoc is a command-line program running in its own memory-space. It can therefore only
+    # -- operate on files on the filesystem. If the source document was passed as `file`, write
+    # -- it to `target_dir/document.odt` and use that path as the source-path.
+    source_file_path = f"{target_dir}/document.odt" if file is not None else cast(str, filename)
+    if file is not None:
+        with open(source_file_path, "wb") as f:
+            f.write(file.read())
+
+    # -- Compute the path of the resulting .docx document. We want its file-name to be preserved
+    # -- if the source-document was provided as `filename`.
+    # -- a/b/foo.odt -> foo.odt --
+    file_name = os.path.basename(source_file_path)
+    # -- foo.odt -> foo --
+    base_name, _ = os.path.splitext(file_name)
+    # -- foo -> foo.docx --
+    target_docx_path = os.path.join(target_dir, f"{base_name}.docx")
+
+    import pypandoc
+
+    pypandoc.convert_file(
+        source_file_path,
+        "docx",
+        format="odt",
+        outputfile=target_docx_path,
     )
+
+    return target_docx_path
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.13.8-dev14" # pragma: no cover
		+__version__ = "0.13.8-dev15" # pragma: no cover