Refactor: Remove OCR related code for entire page OCR (#231)

## Summary
One part of the OCR refactor, which moves OCR from the inference repo to the unstructured repo. This PR removes all OCR-related code for entire-page OCR, which means all table-related OCR code still remains the same (it will be moved after the table refactor to accept preprocessed OCR data).

## Test
Please see the test description in Unstructured-IO/unstructured#1579, since the two PRs need to work together.

## Note
The ingest test won't pass until we merge the unstructured refactor PR.

---------

Co-authored-by: christinestraub <[email protected]>
Unstructured-IO · Oct 5, 2023 · ffb1f0b · ffb1f0b
1 parent cf15726
commit ffb1f0b
Show file tree

Hide file tree

Showing 17 changed files with 30 additions and 1,024 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.0
+
+* Remove all OCR-related code except the table OCR code
+
 ## 0.6.6
 
 * Stop passing ocr_languages parameter into paddle to avoid invalid paddle language code error, this will be fixed until

diff --git a/Dockerfile b/Dockerfile
@@ -20,7 +20,6 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \
   pip install --no-cache -r requirements/base.txt && \
   pip install --no-cache -r requirements/test.txt && \
   pip install --no-cache -r requirements/dev.txt && \
-  pip install "unstructured.PaddleOCR" && \
   dnf -y groupremove "Development Tools" && \
   dnf clean all
 

diff --git a/examples/layout_analysis/visualization.py b/examples/layout_analysis/visualization.py
@@ -14,7 +14,6 @@ def run(f_path, scope):
         "final": None,
         "extracted": {"layout": {"color": "green", "width": 2}},
         "inferred": {"inferred_layout": {"color": "blue", "width": 2}},
-        "ocr": {"ocr_layout": {"color": "yellow", "width": 2}},
     }
 
     f_basename = os.path.splitext(os.path.basename(f_path))[0]
@@ -47,8 +46,7 @@ def run(f_path, scope):
             write_image(img, output_f_path)
 
         print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: "
-              f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)} - "
-              f"n_ocr_elements: {len(page.ocr_layout)}")
+              f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)}")
 
 
 if __name__ == '__main__':

diff --git a/test_unstructured_inference/conftest.py b/test_unstructured_inference/conftest.py
@@ -107,15 +107,6 @@ def mock_embedded_text_regions():
     ]
 
 
-@pytest.fixture()
-def mock_ocr_regions():
-    return [
-        EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None),
-        EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None),
-        EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None),
-    ]
-
-
 # TODO(alan): Make a better test layout
 @pytest.fixture()
 def mock_layout(mock_embedded_text_regions):
@@ -130,19 +121,3 @@ def mock_layout(mock_embedded_text_regions):
         )
         for r in mock_embedded_text_regions
     ]
-
-
-@pytest.fixture()
-def mock_inferred_layout(mock_embedded_text_regions):
-    return [
-        LayoutElement(
-            r.x1,
-            r.y1,
-            r.x2,
-            r.y2,
-            text=None,
-            source=None,
-            type="Text",
-        )
-        for r in mock_embedded_text_regions
-    ]