Skip to content

Commit

Permalink
[issue 1247] fix element and bbox mismatch bug (#1250)
Browse files Browse the repository at this point in the history
This PR resolves #1247 by using the matching elements and bbox for
coordinate computation.

This PR also updates the example doc
`example-docs/layout-parser-paper-fast.pdf` so that it includes a true
blank page and a page with text "this page is intentionally left blank".
This change helps us test:
- differences between fast and hi_res
- the code handling empty pages between pages with content (which
triggers the bug found in #1247)

Lastly, this PR updates the names of the variables inside
`_partition_pdf_or_image_with_ocr` so that matching inputs all start
with `_`, like `_elements`, `_text`, and `_bboxes`, to improve
readability.

This change also improves partition performance for multi-page pdfs as
it reduces the number of iterations inside
`add_pytesseract_bbox_to_elements`. Testing locally on m2 mac + Rocky
docker shows it reduces partition time for DA-619p.pdf file from around
1min to around 23s.
  • Loading branch information
badGarnet authored Aug 30, 2023
1 parent c49df62 commit 2777313
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 48 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

### Fixes

- Fix a bug where mismatched `elements` and `bboxes` are passed into `add_pytesseract_bbox_to_elements`

## 0.10.9

### Enhancements
Expand Down
Binary file modified example-docs/layout-parser-paper-fast.pdf
Binary file not shown.
69 changes: 26 additions & 43 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,57 +110,40 @@ def test_partition_pdf_local_raises_with_no_filename():
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
"strategy",
["fast", "hi_res", "ocr_only"],
("strategy", "expected"),
# fast: can't capture the "intentionally left blank page" page
# others: will ignore the actual blank page
[("fast", {1, 4}), ("hi_res", {1, 3, 4}), ("ocr_only", {1, 3, 4})],
)
def test_partition_pdf_with_filename(
def test_partition_pdf(
file_mode,
strategy,
expected,
filename="example-docs/layout-parser-paper-fast.pdf",
):
# Test that the partition_pdf function can handle filename
result = pdf.partition_pdf(filename=filename, strategy=strategy)
# validate that the result is a non-empty list of dicts
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == {1, 2}


@pytest.mark.parametrize(
"strategy",
["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_file_rb(
strategy,
filename="example-docs/layout-parser-paper-fast.pdf",
):
# Test that the partition_pdf function can handle BufferedReader
with open(filename, "rb") as f:
result = pdf.partition_pdf(file=f, strategy=strategy)
def _test(result):
# validate that the result is a non-empty list of dicts
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == {1, 2}


@pytest.mark.parametrize(
"strategy",
["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_spooled_file(
strategy,
filename="example-docs/layout-parser-paper-fast.pdf",
):
# Test that the partition_pdf function can handle a SpooledTemporaryFile
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
# validate that the result is a non-empty list of dicts
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == {1, 2}
assert {element.metadata.page_number for element in result} == expected

if file_mode == "filename":
result = pdf.partition_pdf(filename=filename, strategy=strategy)
_test(result)
elif file_mode == "rb":
with open(filename, "rb") as f:
result = pdf.partition_pdf(file=f, strategy=strategy)
_test(result)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
_test(result)


@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
Expand Down Expand Up @@ -236,7 +219,7 @@ def test_partition_pdf_with_fast_strategy(
elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
assert len(elements) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in elements} == {1, 2}
assert {element.metadata.page_number for element in elements} == {1, 4}
for element in elements:
assert element.metadata.filename == "layout-parser-paper-fast.pdf"

Expand Down
12 changes: 7 additions & 5 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,21 +568,23 @@ def _partition_pdf_or_image_with_ocr(
page_number=page_number,
last_modified=metadata_last_modified,
)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
_text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
_bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
width, height = image.size

_elements = partition_text(
text=text,
text=_text,
max_partition=max_partition,
min_partition=min_partition,
)

# FIXME (yao): do not save duplicated info?
for element in _elements:
element.metadata = metadata
elements.append(element)

add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
add_pytesseract_bbox_to_elements(_elements, _bboxes, width, height)

elements.extend(_elements)
if include_page_breaks:
elements.append(PageBreak(text=""))
return elements

0 comments on commit 2777313

Please sign in to comment.