[issue 1247] fix element and bbox mismatch bug (#1250)

This PR resolves #1247 by using the matching elements and bboxes for coordinate computation. This PR also updates the example doc `example-docs/layout-parser-paper-fast.pdf` so that it includes a true blank page and a page with the text "this page is intentionally left blank". This change helps us test: (1) differences between fast and hi_res; (2) code handling empty pages in between pages with content (which triggers the bug found in #1247). Lastly, this PR renames the variables inside `_partition_pdf_or_image_with_ocr` so that the matching per-page inputs all start with `_`, like `_elements`, `_text`, and `_bboxes`, to improve readability. This change also improves partition performance for multi-page PDFs, as it reduces the number of iterations inside `add_pytesseract_bbox_to_elements`. Testing locally on an M2 Mac + Rocky docker shows it reduces partition time for the DA-619p.pdf file from around 1 min to around 23 s.
Unstructured-IO · Aug 30, 2023 · 2777313 · 2777313
1 parent c49df62
commit 2777313
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,8 @@
 
 ### Fixes
 
+- Fix a bug where mismatched `elements` and `bboxes` are passed into `add_pytesseract_bbox_to_elements`
+
 ## 0.10.9
 
 ### Enhancements

diff --git a/example-docs/layout-parser-paper-fast.pdf b/example-docs/layout-parser-paper-fast.pdf
diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -110,57 +110,40 @@ def test_partition_pdf_local_raises_with_no_filename():
         pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
 
 
+@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
 @pytest.mark.parametrize(
-    "strategy",
-    ["fast", "hi_res", "ocr_only"],
+    ("strategy", "expected"),
+    # fast: cannot capture the "this page is intentionally left blank" page (page 3)
+    # others: skip the truly blank page (page 2)
+    [("fast", {1, 4}), ("hi_res", {1, 3, 4}), ("ocr_only", {1, 3, 4})],
 )
-def test_partition_pdf_with_filename(
+def test_partition_pdf(
+    file_mode,
     strategy,
+    expected,
     filename="example-docs/layout-parser-paper-fast.pdf",
 ):
     # Test that the partition_pdf function can handle filename
-    result = pdf.partition_pdf(filename=filename, strategy=strategy)
-    # validate that the result is a non-empty list of dicts
-    assert len(result) > 10
-    # check that the pdf has multiple different page numbers
-    assert {element.metadata.page_number for element in result} == {1, 2}
-
-
-@pytest.mark.parametrize(
-    "strategy",
-    ["fast", "hi_res", "ocr_only"],
-)
-def test_partition_pdf_with_file_rb(
-    strategy,
-    filename="example-docs/layout-parser-paper-fast.pdf",
-):
-    # Test that the partition_pdf function can handle BufferedReader
-    with open(filename, "rb") as f:
-        result = pdf.partition_pdf(file=f, strategy=strategy)
+    def _test(result):
         # validate that the result is a non-empty list of dicts
         assert len(result) > 10
         # check that the pdf has multiple different page numbers
-        assert {element.metadata.page_number for element in result} == {1, 2}
-
-
-@pytest.mark.parametrize(
-    "strategy",
-    ["fast", "hi_res", "ocr_only"],
-)
-def test_partition_pdf_with_spooled_file(
-    strategy,
-    filename="example-docs/layout-parser-paper-fast.pdf",
-):
-    # Test that the partition_pdf function can handle a SpooledTemporaryFile
-    with open(filename, "rb") as test_file:
-        spooled_temp_file = SpooledTemporaryFile()
-        spooled_temp_file.write(test_file.read())
-        spooled_temp_file.seek(0)
-        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
-        # validate that the result is a non-empty list of dicts
-        assert len(result) > 10
-        # check that the pdf has multiple different page numbers
-        assert {element.metadata.page_number for element in result} == {1, 2}
+        assert {element.metadata.page_number for element in result} == expected
+
+    if file_mode == "filename":
+        result = pdf.partition_pdf(filename=filename, strategy=strategy)
+        _test(result)
+    elif file_mode == "rb":
+        with open(filename, "rb") as f:
+            result = pdf.partition_pdf(file=f, strategy=strategy)
+            _test(result)
+    else:
+        with open(filename, "rb") as test_file:
+            spooled_temp_file = SpooledTemporaryFile()
+            spooled_temp_file.write(test_file.read())
+            spooled_temp_file.seek(0)
+            result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
+            _test(result)
 
 
 @mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
@@ -236,7 +219,7 @@ def test_partition_pdf_with_fast_strategy(
     elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
     assert len(elements) > 10
     # check that the pdf has multiple different page numbers
-    assert {element.metadata.page_number for element in elements} == {1, 2}
+    assert {element.metadata.page_number for element in elements} == {1, 4}
     for element in elements:
         assert element.metadata.filename == "layout-parser-paper-fast.pdf"
 

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -568,21 +568,23 @@ def _partition_pdf_or_image_with_ocr(
                 page_number=page_number,
                 last_modified=metadata_last_modified,
             )
-            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
-            bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
+            _text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
+            _bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
             width, height = image.size
 
             _elements = partition_text(
-                text=text,
+                text=_text,
                 max_partition=max_partition,
                 min_partition=min_partition,
             )
+
+            # FIXME (yao): do not save duplicated info?
             for element in _elements:
                 element.metadata = metadata
-                elements.append(element)
 
-            add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
+            add_pytesseract_bbox_to_elements(_elements, _bboxes, width, height)
 
+            elements.extend(_elements)
             if include_page_breaks:
                 elements.append(PageBreak(text=""))
     return elements