Unstructured-IO · cragwolfe · Oct 6, 2023 · Sep 22, 2023 · Sep 25, 2023 · Sep 26, 2023
diff --git a/.coveragerc b/.coveragerc
@@ -1,3 +1,5 @@
 [run]
 omit =
     unstructured/ingest/*
+    # TODO(yuming): remove the paddle_ocr.py omit entry below once tests for paddle are added (CORE-1886)
+    unstructured/partition/utils/ocr_models/paddle_ocr.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.10.20-dev5
+## 0.10.20-dev6
 
 ### Enhancements
 
+* **Refactor OCR code** The OCR code for entire-page processing is moved from unstructured-inference to unstructured. In addition to continued support for the OCR language parameter, two OCR processing modes are now supported: "entire_page" and "individual_blocks".
 * **Align to top left when shrinking bounding boxes for `xy-cut` sorting:** Update `shrink_bbox()` to keep top left rather than center.
 * **Add visualization script to annotate elements** This script is often used to analyze/visualize elements with coordinates (e.g. partition_pdf()).
 * **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.

diff --git a/requirements/constraints.in b/requirements/constraints.in
@@ -45,3 +45,4 @@ anyio<4.0
 opencv-python==4.8.0.76
 opencv-contrib-python==4.8.0.76
 onnxruntime==1.15.1
+platformdirs==3.10.0
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -8,6 +8,10 @@ anyio==3.7.1
     # via
     #   -c requirements/constraints.in
     #   jupyter-server
+appdirs==1.4.4
+    # via
+    #   -c requirements/test.txt
+    #   virtualenv
 appnope==0.1.3
     # via
     #   ipykernel
@@ -34,7 +38,7 @@ beautifulsoup4==4.12.2
     # via
     #   -c requirements/base.txt
     #   nbconvert
-bleach==6.0.0
+bleach==6.1.0
     # via nbconvert
 build==1.0.3
     # via pip-tools
@@ -153,7 +157,7 @@ jupyter-client==8.3.1
     #   qtconsole
 jupyter-console==6.6.3
     # via jupyter
-jupyter-core==5.3.2
+jupyter-core==4.12.0
     # via
     #   -c requirements/constraints.in
     #   ipykernel
@@ -245,12 +249,7 @@ pip-tools==7.3.0
     # via -r requirements/dev.in
 pkgutil-resolve-name==1.3.10
     # via jsonschema
-platformdirs==3.11.0
-    # via
-    #   -c requirements/test.txt
-    #   jupyter-core
-    #   virtualenv
-pre-commit==3.4.0
+pre-commit==2.20.0
     # via -r requirements/dev.in
 prometheus-client==0.17.1
     # via jupyter-server
@@ -333,6 +332,7 @@ six==1.16.0
     #   bleach
     #   python-dateutil
     #   rfc3339-validator
+    #   virtualenv
 sniffio==1.3.0
     # via anyio
 soupsieve==2.5
@@ -347,6 +347,8 @@ terminado==0.17.1
     #   jupyter-server-terminals
 tinycss2==1.2.1
     # via nbconvert
+toml==0.10.2
+    # via pre-commit
 tomli==2.0.1
     # via
     #   -c requirements/test.txt
@@ -395,7 +397,7 @@ urllib3==1.26.17
     #   -c requirements/constraints.in
     #   -c requirements/test.txt
     #   requests
-virtualenv==20.24.5
+virtualenv==20.4.7
     # via pre-commit
 wcwidth==0.2.8
     # via prompt-toolkit

diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt
@@ -6,7 +6,7 @@
 #
 importlib-metadata==6.8.0
     # via markdown
-markdown==3.4.4
+markdown==3.5
     # via -r requirements/extra-markdown.in
 zipp==3.17.0
     # via importlib-metadata
diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
@@ -45,7 +45,7 @@ flask==3.0.0
     #   visualdl
 flask-babel==4.0.0
     # via visualdl
-fonttools==4.43.0
+fonttools==4.43.1
     # via matplotlib
 future==0.18.3
     # via bce-python-sdk

diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
@@ -5,7 +5,7 @@ pdf2image
 pdfminer.six
 # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.6.6
+unstructured-inference==0.7.2
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -35,14 +35,14 @@ filelock==3.12.4
     #   transformers
 flatbuffers==23.5.26
     # via onnxruntime
-fonttools==4.43.0
+fonttools==4.43.1
     # via matplotlib
 fsspec==2023.9.1
     # via
     #   -c requirements/constraints.in
     #   huggingface-hub
     #   torch
-huggingface-hub==0.16.4
+huggingface-hub==0.17.3
     # via
     #   timm
     #   tokenizers
@@ -199,7 +199,7 @@ sympy==1.12
     #   torch
 timm==0.9.7
     # via effdet
-tokenizers==0.14.0
+tokenizers==0.14.1
     # via transformers
 torch==2.1.0
     # via
@@ -229,7 +229,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.6.6
+unstructured-inference==0.7.2
     # via -r requirements/extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via

diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
@@ -27,7 +27,7 @@ fsspec==2023.9.1
     #   -c requirements/constraints.in
     #   huggingface-hub
     #   torch
-huggingface-hub==0.16.4
+huggingface-hub==0.17.3
     # via
     #   tokenizers
     #   transformers
@@ -90,7 +90,7 @@ six==1.16.0
     #   sacremoses
 sympy==1.12
     # via torch
-tokenizers==0.14.0
+tokenizers==0.14.1
     # via transformers
 torch==2.1.0
     # via -r requirements/huggingface.in

diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt
@@ -40,6 +40,8 @@ frozenlist==1.4.0
     # via
     #   aiohttp
     #   aiosignal
+greenlet==3.0.0
+    # via sqlalchemy
 idna==3.4
     # via
     #   -c requirements/base.txt
@@ -50,9 +52,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.309
+langchain==0.0.310
     # via -r requirements/ingest-openai.in
-langsmith==0.0.42
+langsmith==0.0.43
     # via langchain
 marshmallow==3.20.1
     # via

diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt
@@ -33,8 +33,10 @@ more-itertools==10.1.0
     # via simple-salesforce
 pendulum==2.1.2
     # via simple-salesforce
-platformdirs==3.11.0
-    # via zeep
+platformdirs==3.10.0
+    # via
+    #   -c requirements/constraints.in
+    #   zeep
 pycparser==2.21
     # via cffi
 pyjwt==2.8.0

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -68,8 +68,10 @@ packaging==23.2
     #   pytest
 pathspec==0.11.2
     # via black
-platformdirs==3.11.0
-    # via black
+platformdirs==3.10.0
+    # via
+    #   -c requirements/constraints.in
+    #   black
 pluggy==1.3.0
     # via pytest
 pycodestyle==2.11.0

diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
@@ -8,7 +8,7 @@
 from unstructured_inference.inference import layout
 
 from unstructured.chunking.title import chunk_by_title
-from unstructured.partition import image, pdf
+from unstructured.partition import image, ocr, pdf
 from unstructured.partition.json import partition_json
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
@@ -84,7 +84,10 @@ def pages(self):
 
 @pytest.mark.parametrize(
     ("filename", "file"),
-    [("example-docs/example.jpg", None), (None, b"0000")],
+    [
+        ("example-docs/example.jpg", None),
+        (None, b"0000"),
+    ],
 )
 def test_partition_image_local(monkeypatch, filename, file):
     monkeypatch.setattr(
@@ -97,6 +100,16 @@ def test_partition_image_local(monkeypatch, filename, file):
         "process_file_with_model",
         lambda *args, **kwargs: MockDocumentLayout(),
     )
+    monkeypatch.setattr(
+        ocr,
+        "process_data_with_ocr",
+        lambda *args, **kwargs: MockDocumentLayout(),
+    )
+    monkeypatch.setattr(
+        ocr,
+        "process_file_with_ocr",
+        lambda *args, **kwargs: MockDocumentLayout(),
+    )
 
     partition_image_response = pdf._partition_pdf_or_image_local(
         filename,
@@ -146,8 +159,8 @@ def test_partition_image_with_multipage_tiff(
 
 def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
     with mock.patch.object(
-        layout,
-        "process_file_with_model",
+        ocr,
+        "process_file_with_ocr",
         mock.MagicMock(),
     ) as mock_partition:
         image.partition_image(
@@ -163,8 +176,8 @@ def test_partition_image_from_file_with_language_passed(
     filename="example-docs/example.jpg",
 ):
     with mock.patch.object(
-        layout,
-        "process_data_with_model",
+        ocr,
+        "process_data_with_ocr",
         mock.MagicMock(),
     ) as mock_partition, open(filename, "rb") as f:
         image.partition_image(file=f, strategy="hi_res", ocr_languages="eng+swe")
@@ -437,16 +450,13 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
 
 def test_partition_image_formats_languages_for_tesseract():
     filename = "example-docs/jpn-vert.jpeg"
-    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
+    with mock.patch(
+        "unstructured.partition.ocr.process_file_with_ocr",
+    ) as mock_process_file_with_ocr:
         image.partition_image(filename=filename, strategy="hi_res", languages=["jpn_vert"])
-        mock_process.assert_called_once_with(
-            filename,
-            is_image=True,
-            ocr_languages="jpn_vert",
-            ocr_mode="entire_page",
-            extract_tables=False,
-            model_name=pdf.default_hi_res_model(),
-        )
+        _, kwargs = mock_process_file_with_ocr.call_args_list[0]
+        assert "ocr_languages" in kwargs
+        assert kwargs["ocr_languages"] == "jpn_vert"
 
 
 def test_partition_image_warns_with_ocr_languages(caplog):
@@ -493,3 +503,24 @@ def test_partition_image_uses_model_name():
         print(mockpartition.call_args)
         assert "model_name" in mockpartition.call_args.kwargs
         assert mockpartition.call_args.kwargs["model_name"]
+
+
+@pytest.mark.parametrize(
+    ("ocr_mode", "idx_title_element"),
+    [
+        ("entire_page", 2),
+        ("individual_blocks", 1),
+    ],
+)
+def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
+    filename = "example-docs/layout-parser-paper-fast.jpg"
+    elements = image.partition_image(filename=filename, ocr_mode=ocr_mode, strategy="hi_res")
+    first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    # Note(yuming): idx_title_element is different based on xy-cut and ocr mode
+    assert elements[idx_title_element].text == first_line
+
+
+def test_partition_image_hi_res_invalid_ocr_mode():
+    filename = "example-docs/layout-parser-paper-fast.jpg"
+    with pytest.raises(ValueError):
+        _ = image.partition_image(filename=filename, ocr_mode="invalid_ocr_mode", strategy="hi_res")