Skip to content

Commit

Permalink
Refactor: Remove OCR related code for entire page OCR (#231)
Browse files Browse the repository at this point in the history
## Summary
This is one part of the OCR refactor that moves OCR from the inference
repo to the unstructured repo. This PR removes all OCR-related code for
entire-page OCR; all table-related OCR code remains unchanged (it will be
moved once the table code is refactored to accept preprocessed OCR data).

## Test
Please see the test description in
Unstructured-IO/unstructured#1579, since the
two PRs need to work together.

## Note
The ingest test won't pass until we merge the unstructured refactor PR

---------

Co-authored-by: christinestraub <[email protected]>
  • Loading branch information
yuming-long and christinestraub authored Oct 5, 2023
1 parent cf15726 commit ffb1f0b
Show file tree
Hide file tree
Showing 17 changed files with 30 additions and 1,024 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.7.0

* Remove all OCR related code except the table OCR code

## 0.6.6

* Stop passing ocr_languages parameter into paddle to avoid invalid paddle language code error, this will be fixed until
Expand Down
1 change: 0 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \
pip install --no-cache -r requirements/base.txt && \
pip install --no-cache -r requirements/test.txt && \
pip install --no-cache -r requirements/dev.txt && \
pip install "unstructured.PaddleOCR" && \
dnf -y groupremove "Development Tools" && \
dnf clean all

Expand Down
4 changes: 1 addition & 3 deletions examples/layout_analysis/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ def run(f_path, scope):
"final": None,
"extracted": {"layout": {"color": "green", "width": 2}},
"inferred": {"inferred_layout": {"color": "blue", "width": 2}},
"ocr": {"ocr_layout": {"color": "yellow", "width": 2}},
}

f_basename = os.path.splitext(os.path.basename(f_path))[0]
Expand Down Expand Up @@ -47,8 +46,7 @@ def run(f_path, scope):
write_image(img, output_f_path)

print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: "
f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)} - "
f"n_ocr_elements: {len(page.ocr_layout)}")
f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)}")


if __name__ == '__main__':
Expand Down
25 changes: 0 additions & 25 deletions test_unstructured_inference/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,6 @@ def mock_embedded_text_regions():
]


@pytest.fixture()
def mock_ocr_regions():
return [
EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None),
EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None),
EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None),
]


# TODO(alan): Make a better test layout
@pytest.fixture()
def mock_layout(mock_embedded_text_regions):
Expand All @@ -130,19 +121,3 @@ def mock_layout(mock_embedded_text_regions):
)
for r in mock_embedded_text_regions
]


@pytest.fixture()
def mock_inferred_layout(mock_embedded_text_regions):
return [
LayoutElement(
r.x1,
r.y1,
r.x2,
r.y2,
text=None,
source=None,
type="Text",
)
for r in mock_embedded_text_regions
]
Loading

0 comments on commit ffb1f0b

Please sign in to comment.