From e2a6757b307cf45cce5d7d08257e8f547700a630 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Tue, 25 Jun 2024 10:40:28 -0700 Subject: [PATCH] Refactor: remove analysis scripts (#305) ### Summary This PR is the first part of the "layout analysis" refactor, which moves that code from the unstructured-inference repo to the unstructured repo. This PR removes "layout analysis" related code from the unstructured-inference repo and works together with the unstructured refactor PR - https://github.com/Unstructured-IO/unstructured/pull/2273. This PR also adds a few more test cases for `layoutelement.py` to bring coverage to over 95%. --- CHANGELOG.md | 4 + examples/layout_analysis/README.md | 22 --- examples/layout_analysis/requirements.txt | 1 - examples/layout_analysis/visualization.ipynb | 94 ------------- examples/layout_analysis/visualization.py | 66 --------- examples/ocr_layout_supplement/README.md | 19 --- .../ocr_layout_supplement.ipynb | 126 ------------------ .../ocr_layout_supplement.py | 59 -------- test_unstructured_inference/test_elements.py | 14 +- test_unstructured_inference/test_utils.py | 69 ---------- unstructured_inference/__version__.py | 2 +- unstructured_inference/constants.py | 5 - unstructured_inference/utils.py | 82 +----------- 13 files changed, 19 insertions(+), 544 deletions(-) delete mode 100644 examples/layout_analysis/README.md delete mode 100644 examples/layout_analysis/requirements.txt delete mode 100644 examples/layout_analysis/visualization.ipynb delete mode 100644 examples/layout_analysis/visualization.py delete mode 100644 examples/ocr_layout_supplement/README.md delete mode 100644 examples/ocr_layout_supplement/ocr_layout_supplement.ipynb delete mode 100644 examples/ocr_layout_supplement/ocr_layout_supplement.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8835e9e7..944f1b27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.7.37-dev0 + +* refactor: remove layout analysis related code + ## 0.7.36 fix: add input parameter validation to 
`fill_cells()` when converting cells to html diff --git a/examples/layout_analysis/README.md b/examples/layout_analysis/README.md deleted file mode 100644 index 77e0e964..00000000 --- a/examples/layout_analysis/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Analyzing Layout Elements - -This directory contains examples of how to analyze layout elements. - -## How to run - -Run `pip install -r requirements.txt` to install the Python dependencies. - -### Visualization -- Python script (visualization.py) -``` -$ PYTHONPATH=. python examples/layout_analysis/visualization.py -``` -The scope can be `image_only` to show only image elements or `all` to show all elements. For example, -``` -$ PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf all -// or -$ PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf image_oly -``` -- Jupyter Notebook (visualization.ipynb) - - Run `jupyter-notebook` to start. - - Run the `visualization.ipynb` notebook. 
diff --git a/examples/layout_analysis/requirements.txt b/examples/layout_analysis/requirements.txt deleted file mode 100644 index 0d7e9b7d..00000000 --- a/examples/layout_analysis/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unstructured-inference \ No newline at end of file diff --git a/examples/layout_analysis/visualization.ipynb b/examples/layout_analysis/visualization.ipynb deleted file mode 100644 index d1a221c9..00000000 --- a/examples/layout_analysis/visualization.ipynb +++ /dev/null @@ -1,94 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from unstructured_inference.inference.layout import process_file_with_model\n", - "from unstructured_inference.visualize import show_plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "annotation_data_map = {\n", - " \"final\": None,\n", - " \"extracted\": {\"layout\": {\"color\": \"green\", \"width\": 2}},\n", - " \"inferred\": {\"inferred_layout\": {\"color\": \"blue\", \"width\": 2}},\n", - " \"ocr\": {\"ocr_layout\": {\"color\": \"yellow\", \"width\": 2}},\n", - "}" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f_path = \"../../sample-docs/loremipsum.pdf\"\n", - "f_name = os.path.basename(f_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "doc = process_file_with_model(\n", - " f_path,\n", - " model_name=None,\n", - " analysis=True,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for idx, page in enumerate(doc.pages):\n", - " for action_type, action_value in annotation_data_map.items():\n", - " img = page.annotate(annotation_data=action_value)\n", - " if action_value is None:\n", - " n_layout_elements = 
len(page.elements)\n", - " else:\n", - " attribute = list(action_value.keys())[0]\n", - " n_layout_elements = len(getattr(page, attribute))\n", - " print(f\"Filename: {f_name} - Page: {idx+1} - Layout: {action_type} - n_layout_elements: {n_layout_elements}\")\n", - " show_plot(img, desired_width=14)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/examples/layout_analysis/visualization.py b/examples/layout_analysis/visualization.py deleted file mode 100644 index 07bc5f77..00000000 --- a/examples/layout_analysis/visualization.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import pathlib -import sys - -from unstructured_inference.inference.elements import ImageTextRegion -from unstructured_inference.inference.layout import process_file_with_model -from unstructured_inference.utils import write_image - -CUR_DIR = pathlib.Path(__file__).parent.resolve() - - -def run(f_path, scope): - annotation_data_map = { - "final": None, - "extracted": {"layout": {"color": "green", "width": 2}}, - "inferred": {"inferred_layout": {"color": "blue", "width": 2}}, - } - - f_basename = os.path.splitext(os.path.basename(f_path))[0] - output_dir_path = os.path.join(output_basedir_path, f_basename) - os.makedirs(output_dir_path, exist_ok=True) - - doc = process_file_with_model( - f_path, - model_name=None, - ) - - for idx, page in enumerate(doc.pages): - if scope == "image_only": - embedded_image_elements = [ - el for el in page.layout if isinstance(el, ImageTextRegion) - ] - inferred_image_elements = [ - el for el in page.inferred_layout if el.type == "Figure" - ] - final_image_elements = [el 
for el in page.elements if el.type == "Image"] - - page.layout = embedded_image_elements - page.inferred_layout = inferred_image_elements - page.elements = final_image_elements - - for action_type, action_value in annotation_data_map.items(): - img = page.annotate(annotation_data=action_value) - output_f_path = os.path.join(output_dir_path, f"{f_basename}_{idx+1}_{action_type}.jpg") - write_image(img, output_f_path) - - print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: " - f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)}") - - -if __name__ == '__main__': - if len(sys.argv) < 3: - print( - "Please provide the path to the file name as the first argument and the scope as the " - "second argument.", - ) - sys.exit(1) - - if sys.argv[2] not in ["all", "image_only"]: - print("Invalid scope") - sys.exit(1) - - output_basedir_path = os.path.join(CUR_DIR, "output") - os.makedirs(output_basedir_path, exist_ok=True) - - run(f_path=sys.argv[1], scope=sys.argv[2]) diff --git a/examples/ocr_layout_supplement/README.md b/examples/ocr_layout_supplement/README.md deleted file mode 100644 index 9fbf2f31..00000000 --- a/examples/ocr_layout_supplement/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Supplementing detected layout with elements from the full-page OCR - -This directory contains examples of how to analyze layout elements. - -## Running the example - -Run `pip install -r requirements.txt` to install the Python dependencies. - -### Running python script -``` -PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py -``` -For example, -``` -PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py sample-docs/patent-1p.pdf pdf -``` -### Running jupyter notebook - - Run `jupyter-notebook` to start. - - Run the `visualization.ipynb` notebook. 
diff --git a/examples/ocr_layout_supplement/ocr_layout_supplement.ipynb b/examples/ocr_layout_supplement/ocr_layout_supplement.ipynb deleted file mode 100644 index b14648c2..00000000 --- a/examples/ocr_layout_supplement/ocr_layout_supplement.ipynb +++ /dev/null @@ -1,126 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "from unstructured_inference.constants import AnnotationResult\n", - "from unstructured_inference.inference.layout import process_file_with_model\n", - "from unstructured_inference.utils import annotate_layout_elements" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "output_basedir_path = \"output\"" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "def run(f_path, f_type):\n", - " if f_type == \"pdf\":\n", - " is_image = False\n", - " elif f_type == \"image\":\n", - " is_image = True\n", - " else:\n", - " print(\"Invalid file type.\")\n", - " sys.exit(1)\n", - "\n", - " annotation_data_map = {\n", - " \"final\": None,\n", - " }\n", - " actions = [False, True]\n", - " for action in actions:\n", - " _f_basename = os.path.splitext(os.path.basename(f_path))[0]\n", - " output_dir_path = os.path.join(output_basedir_path, f\"{_f_basename}_{file_type}\")\n", - " os.makedirs(output_dir_path, exist_ok=True)\n", - "\n", - " f_basename = f\"updated_{_f_basename}\" if action else f\"original_{_f_basename}\"\n", - "\n", - " label = \"Updated Results: \" if action else \"Original Results: \"\n", - " print(label)\n", - "\n", - " doc = process_file_with_model(\n", - " f_path,\n", - " is_image=is_image,\n", - " model_name=None,\n", - " supplement_with_ocr_elements=action,\n", - " analysis=True,\n", - " )\n", - "\n", - " annotate_layout_elements(doc, annotation_data_map, output_dir_path, 
f_basename, AnnotationResult.PLOT)" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "file_path = \"../../sample-docs/patent-1p.pdf\"\n", - "file_type = \"pdf\"\n", - "f_name = os.path.basename(file_path)\n", - "print(f\"file_name: {f_name} - file_type: {file_type}\")\n", - "\n", - "run(file_path, file_type)" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "file_path = \"../../sample-docs/layout-parser-paper-fast.jpg\"\n", - "file_type = \"image\"\n", - "f_name = os.path.basename(file_path)\n", - "print(f\"file_name: {f_name} - file_type: {file_type}\")\n", - "\n", - "run(file_path, file_type)" - ], - "metadata": { - "collapsed": false - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/examples/ocr_layout_supplement/ocr_layout_supplement.py b/examples/ocr_layout_supplement/ocr_layout_supplement.py deleted file mode 100644 index 6f674b6e..00000000 --- a/examples/ocr_layout_supplement/ocr_layout_supplement.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import pathlib -import sys - -from unstructured_inference.constants import AnnotationResult -from unstructured_inference.inference.layout import process_file_with_model -from unstructured_inference.utils import annotate_layout_elements - -CUR_DIR = pathlib.Path(__file__).parent.resolve() - - -def run(f_path, file_type): - print(">>> Start...") - print(f">>> file_path: {f_path} - file_type: {file_type}") - - if file_type == "pdf": - is_image = False - elif 
file_type == "image": - is_image = True - else: - print("Invalid file type.") - sys.exit(1) - - annotation_data_map = { - "final": None, - } - - actions = [False, True] - for action in actions: - _f_basename = os.path.splitext(os.path.basename(f_path))[0] - output_dir_path = os.path.join(output_basedir_path, f"{_f_basename}_{file_type}") - os.makedirs(output_dir_path, exist_ok=True) - - f_basename = f"updated_{_f_basename}" if action else f"original_{_f_basename}" - - doc = process_file_with_model( - f_path, - is_image=is_image, - model_name=None, - supplement_with_ocr_elements=action, - ) - - annotate_layout_elements(doc, annotation_data_map, output_dir_path, f_basename, AnnotationResult.IMAGE) - - print("<<< Finished") - - -if __name__ == '__main__': - if len(sys.argv) < 3: - print( - "Please provide the path to the file name as the first argument and the strategy as the " - "second argument.", - ) - sys.exit(1) - - output_basedir_path = os.path.join(CUR_DIR, "output") - os.makedirs(output_basedir_path, exist_ok=True) - - run(f_path=sys.argv[1], file_type=sys.argv[2]) diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 9e49accd..f6f5b568 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -6,7 +6,7 @@ from unstructured_inference.constants import ElementType from unstructured_inference.inference import elements -from unstructured_inference.inference.elements import Rectangle, TextRegion +from unstructured_inference.inference.elements import Rectangle, TextRegion, ImageTextRegion from unstructured_inference.inference.layoutelement import ( LayoutElement, merge_inferred_layout_with_extracted_layout, @@ -263,6 +263,10 @@ def test_merge_inferred_layout_with_extracted_layout(): TextRegion.from_coords(377, 469, 1335, 535, text="Example Title"), ] + extracted_layout_with_full_page_image = [ + ImageTextRegion.from_coords(0, 0, 1700, 2200, text="Example 
Section Header"), + ] + merged_layout = merge_inferred_layout_with_extracted_layout( inferred_layout=inferred_layout, extracted_layout=extracted_layout, @@ -272,3 +276,11 @@ def test_merge_inferred_layout_with_extracted_layout(): assert merged_layout[0].text == "Example Section Header" assert merged_layout[1].type == ElementType.TEXT assert merged_layout[1].text == "Example Title" + + # case: extracted layout with a full page image + merged_layout = merge_inferred_layout_with_extracted_layout( + inferred_layout=inferred_layout, + extracted_layout=extracted_layout_with_full_page_image, + page_image_size=(1700, 2200), + ) + assert merged_layout == inferred_layout diff --git a/test_unstructured_inference/test_utils.py b/test_unstructured_inference/test_utils.py index 7c745c9c..399ca739 100644 --- a/test_unstructured_inference/test_utils.py +++ b/test_unstructured_inference/test_utils.py @@ -1,21 +1,12 @@ -import os -import tempfile -from unittest.mock import patch - import numpy as np import pytest -from PIL import Image -from unstructured_inference import utils -from unstructured_inference.constants import AnnotationResult from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.utils import ( LazyDict, LazyEvaluateInfo, - annotate_layout_elements, pad_image_with_background_color, strip_tags, - write_image, ) @@ -73,66 +64,6 @@ def func(x): assert called == expected -@pytest.mark.parametrize("image_type", ["pil", "numpy_array"]) -def test_write_image(image_type, mock_pil_image, mock_numpy_image): - image_map = { - "pil": mock_pil_image, - "numpy_array": mock_numpy_image, - } - image = image_map[image_type] - - with tempfile.TemporaryDirectory() as tmpdir: - output_image_path = os.path.join(tmpdir, "test_image.jpg") - write_image(image, output_image_path) - assert os.path.exists(output_image_path) - - # Additional check to see if the written image can be read - read_image = Image.open(output_image_path) - assert read_image is not 
None - - -def test_write_image_raises_error(): - with pytest.raises(ValueError): - write_image("invalid_type", "test_image.jpg") - - -def test_annotate_layout_elements_with_image_result(): - mock_doc = MockDocumentLayout() - annotation_data_map = {"final": None} - output_dir_path = "test_output_dir" - output_f_basename = "test_output" - - with patch.object(utils, "write_image") as mock_write_image: - annotate_layout_elements( - mock_doc, - annotation_data_map, - output_dir_path, - output_f_basename, - result=AnnotationResult.IMAGE, - ) - - expected_output_f_path = os.path.join(output_dir_path, "test_output_2_final.jpg") - mock_write_image.assert_called_with("mock_image", expected_output_f_path) - - -def test_annotate_layout_elements_with_plot_result(): - mock_doc = MockDocumentLayout() - annotation_data_map = {"final": None} - output_dir_path = "test_output_dir" - output_f_basename = "test_output" - - with patch.object(utils, "show_plot") as mock_show_plot: - annotate_layout_elements( - mock_doc, - annotation_data_map, - output_dir_path, - output_f_basename, - result=AnnotationResult.PLOT, - ) - - mock_show_plot.assert_called_with("mock_image", desired_width=14) - - def test_pad_image_with_background_color(mock_pil_image): pad = 10 height, width = mock_pil_image.size diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index a98a6b84..33c5779d 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.36" # pragma: no cover +__version__ = "0.7.37-dev0" # pragma: no cover diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index a3341075..173e37b4 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -1,11 +1,6 @@ from enum import Enum -class AnnotationResult(Enum): - IMAGE = "image" - PLOT = "plot" - - class Source(Enum): YOLOX = "yolox" DETECTRON2_ONNX = "detectron2_onnx" diff --git 
a/unstructured_inference/utils.py b/unstructured_inference/utils.py index c8ea035e..696a2e8a 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -2,19 +2,12 @@ from collections.abc import Mapping from html.parser import HTMLParser from io import StringIO -from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterable, Iterator, Union +from typing import Any, Callable, Hashable, Iterable, Iterator, Union -import cv2 -import numpy as np from huggingface_hub import hf_hub_download from PIL import Image -from unstructured_inference.constants import AnnotationResult from unstructured_inference.inference.layoutelement import LayoutElement -from unstructured_inference.visualize import show_plot - -if TYPE_CHECKING: - from unstructured_inference.inference.layout import DocumentLayout class LazyEvaluateInfo: @@ -56,79 +49,6 @@ def __len__(self) -> int: return len(self._raw_dict) -def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str): - """ - Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats. - - Parameters: - - image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image - format or a numpy ndarray format. - - output_image_path (str): The path to which the image will be written. - - Raises: - - ValueError: If the provided image type is neither PIL Image nor numpy ndarray. - - Returns: - - None: The function writes the image to the specified path but does not return any value. 
- """ - - if isinstance(image, Image.Image): - image.save(output_image_path) - elif isinstance(image, np.ndarray): - cv2.imwrite(output_image_path, image) - else: - raise ValueError("Unsupported Image Type") - - -def annotate_layout_elements( - doc: "DocumentLayout", - annotation_data_map: dict, - output_dir_path: str, - output_f_basename: str, - result: AnnotationResult = AnnotationResult.IMAGE, - plot_desired_width: int = 14, -): - """ - Annotates layout elements on each page of the document and saves or displays the result. - - This function iterates through each page of the given document and applies annotations based on - the given action type and action value in the annotation_data_map. The annotated images are then - either saved to the disk or displayed as plots. - - Parameters: - - doc (DocumentLayout): The document layout object containing the pages to annotate. - - annotation_data_map (dict): A mapping from action types to action values defining the - annotations to be applied. - - output_dir_path (str): The directory path where the annotated images will be saved. - - output_f_basename (str): The base name to use for the output image files. - - result (str, optional): Specifies the result type. Can be either - 'ANNOTATION_RESULT_WITH_IMAGE' for saving the annotated images - or 'ANNOTATION_RESULT_WITH_PLOT' for displaying them as plots. - Default is 'ANNOTATION_RESULT_WITH_IMAGE'. - - plot_desired_width (int, optional): The desired width for the plot when result is set to - 'ANNOTATION_RESULT_WITH_PLOT'. Default is 14. - - Note: - - If the 'result' parameter is set to 'ANNOTATION_RESULT_WITH_IMAGE', the annotated images will - be saved in the directory specified by 'output_dir_path'. - - If the 'result' parameter is set to 'ANNOTATION_RESULT_WITH_PLOT', the annotated images will - be displayed as plots and not saved. 
- """ - - for idx, page in enumerate(doc.pages): - for action_type, action_value in annotation_data_map.items(): - img = page.annotate(annotation_data=action_value) - output_f_path = os.path.join( - output_dir_path, - f"{output_f_basename}_{idx + 1}_{action_type}.jpg", - ) - if result == AnnotationResult.IMAGE: - write_image(img, output_f_path) - print(f"wrote {output_f_path}") - elif result == AnnotationResult.PLOT: - show_plot(img, desired_width=plot_desired_width) - - def tag(elements: Iterable[LayoutElement]): """Asign an numeric id to the elements in the list. Useful for debugging"""