Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(doc): fix disk-space leak #3019

Merged
merged 1 commit into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.8-dev8
## 0.13.8-dev9

### Enhancements

Expand All @@ -15,6 +15,7 @@
* **Fix type hint for paragraph_grouper param** `paragraph_grouper` can be set to `False`, but the type hint did not not reflect this previously.
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a paramter in partition_pdf.
* **Improve CSV delimeter detection.** `partition_csv()` would raise on CSV files with very long lines.
* **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.

## 0.13.7

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev8" # pragma: no cover
__version__ = "0.13.8-dev9" # pragma: no cover
86 changes: 54 additions & 32 deletions unstructured/partition/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,52 +63,74 @@ def partition_doc(
This information will be reflected in elements' metadata and can be be especially
useful when partitioning a document that is part of a larger document.
"""
# Verify that only one of the arguments was provided
if filename is None:
filename = ""
Coniferish marked this conversation as resolved.
Show resolved Hide resolved
exactly_one(filename=filename, file=file)

if len(filename) > 0:
_, filename_no_path = os.path.split(os.path.abspath(filename))
base_filename, _ = os.path.splitext(filename_no_path)
Coniferish marked this conversation as resolved.
Show resolved Hide resolved
if not os.path.exists(filename):
raise ValueError(f"The file {filename} does not exist.")
last_modified = get_last_modified(filename, file, date_from_file_object)
Coniferish marked this conversation as resolved.
Show resolved Hide resolved

last_modification_date = get_last_modified_date(filename)
# -- validate file-path when provided so we can provide a more meaningful error --
if filename is not None and not os.path.exists(filename):
raise ValueError(f"The file {filename} does not exist.")

elif file is not None:
tmp = tempfile.NamedTemporaryFile(delete=False)
Coniferish marked this conversation as resolved.
Show resolved Hide resolved
tmp.write(file.read())
tmp.close()
filename = tmp.name
_, filename_no_path = os.path.split(os.path.abspath(tmp.name))
base_filename, _ = os.path.splitext(filename_no_path)
# -- `convert_office_doc` uses a command-line program that ships with LibreOffice to convert
# -- from DOC -> DOCX. So both the source and the target need to be file-system files. Put
# -- transient files in a temporary directory that is automatically removed so they don't
# -- pile up.
with tempfile.TemporaryDirectory() as target_dir:

last_modification_date = (
get_last_modified_date_from_file(file) if date_from_file_object else None
)
source_file_path = f"{target_dir}/document.doc" if file is not None else filename
assert source_file_path is not None
Coniferish marked this conversation as resolved.
Show resolved Hide resolved

with tempfile.TemporaryDirectory() as tmpdir:
# -- when source is a file-like object, write it to the filesystem so the command-line
# -- process can access it (CLI executes in different memory-space).
if file is not None:
with open(source_file_path, "wb") as f:
f.write(file.read())

# -- convert the .doc file to .docx. The resulting file takes the same base-name as the
# -- source file and is written to `target_dir`.
convert_office_doc(
filename,
tmpdir,
source_file_path,
target_dir,
target_format="docx",
target_filter=libre_office_filter,
)
docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")

# -- compute the path of the resulting .docx document --
_, filename_no_path = os.path.split(os.path.abspath(source_file_path))
base_filename, _ = os.path.splitext(filename_no_path)
target_file_path = os.path.join(target_dir, f"{base_filename}.docx")

# -- and partition it. Note that `kwargs` is not passed which is a sketchy way to partially
# -- disable post-partitioning processing (what the decorators do) so for example the
# -- resulting elements are not double-chunked.
elements = partition_docx(
filename=docx_filename,
metadata_filename=metadata_filename,
include_page_breaks=include_page_breaks,
filename=target_file_path,
detect_language_per_element=detect_language_per_element,
include_metadata=include_metadata,
metadata_last_modified=metadata_last_modified or last_modification_date,
include_page_breaks=include_page_breaks,
languages=languages,
detect_language_per_element=detect_language_per_element,
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
)
# remove tmp.name from filename if parsing file
if file:
for element in elements:
element.metadata.filename = metadata_filename
Coniferish marked this conversation as resolved.
Show resolved Hide resolved

# -- Remove temporary document.docx path from metadata when necessary. Note `metadata_filename`
# -- defaults to `None` but that's better than a meaningless temporary filename.
if file:
for element in elements:
element.metadata.filename = metadata_filename

return elements


def get_last_modified(
filename: str | None, file: IO[bytes] | None, date_from_file_object: bool
) -> str | None:
"""Determine best available last-modified date from partitioner document-source."""
if filename is not None:
return get_last_modified_date(filename)

if file is not None:
return get_last_modified_date_from_file(file) if date_from_file_object else None

return None
2 changes: 1 addition & 1 deletion unstructured/partition/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def partition_pptx(
languages: Optional[list[str]] = ["auto"],
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
strategy: str = PartitionStrategy.FAST,
starting_page_number: int = 1,
strategy: str = PartitionStrategy.FAST,
**kwargs: Any,
) -> list[Element]:
"""Partition PowerPoint document in .pptx format into its document elements.
Expand Down
Loading