From f673ea4c40f6b438a0ff2569670d9d060db16951 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 5 Oct 2023 12:53:16 -0700 Subject: [PATCH] enhancement: add visualization script to annotate elements (#1613) This PR was initially created to close GitHub Issue #1604 (Synchronizing the default layout model), but since it was already resolved in PR [#1607](https://github.com/Unstructured-IO/unstructured/pull/1607), this PR now only adds the visualization script used to investigate the issue. ### Summary - add python script to annotate elements PDF: [references.pdf](https://github.com/Unstructured-IO/unstructured/files/12778270/references.pdf) ### Evaluation ``` PYTHONPATH=. python examples/layout-analysis/visualization.py references.pdf hi_res ``` --- CHANGELOG.md | 3 +- examples/layout-analysis/README.md | 17 +++++ examples/layout-analysis/requirements.txt | 1 + examples/layout-analysis/visualization.py | 82 +++++++++++++++++++++++ unstructured/__version__.py | 2 +- 5 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 examples/layout-analysis/README.md create mode 100644 examples/layout-analysis/requirements.txt create mode 100644 examples/layout-analysis/visualization.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d5d1843b2..fe638b4aff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.10.20-dev2 +## 0.10.20-dev3 ### Enhancements +* **Add visualization script to annotate elements** This script is often used to analyze/visualize elements with coordinates (e.g. partition_pdf()). * **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. 
* **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles. * **Improve hierarchy detection in pptx documents** List items, and other slide text are properly nested under the slide title. This will enable better chunking of pptx documents. diff --git a/examples/layout-analysis/README.md b/examples/layout-analysis/README.md new file mode 100644 index 0000000000..ad2f43bd37 --- /dev/null +++ b/examples/layout-analysis/README.md @@ -0,0 +1,17 @@ +# Analyzing Layout Elements + +This directory contains examples of how to analyze layout elements. + +## How to run + +Run `pip install -r requirements.txt` to install the Python dependencies. + +### Visualization +- Python script (visualization.py) +``` +$ PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> +``` +The strategy can be one of "auto", "hi_res", "ocr_only", or "fast". For example, +``` +$ PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/loremipsum.pdf hi_res +``` \ No newline at end of file diff --git a/examples/layout-analysis/requirements.txt b/examples/layout-analysis/requirements.txt new file mode 100644 index 0000000000..0d7e9b7d61 --- /dev/null +++ b/examples/layout-analysis/requirements.txt @@ -0,0 +1 @@ +unstructured-inference \ No newline at end of file diff --git a/examples/layout-analysis/visualization.py b/examples/layout-analysis/visualization.py new file mode 100644 index 0000000000..9302494a10 --- /dev/null +++ b/examples/layout-analysis/visualization.py @@ -0,0 +1,82 @@ +import os +import pathlib +import sys + +import pdf2image +from unstructured_inference.inference.elements import Rectangle +from unstructured_inference.visualize import draw_bbox + +from unstructured.documents.elements import PageBreak +from unstructured.partition.pdf import partition_pdf + +CUR_DIR = pathlib.Path(__file__).parent.resolve() + + +def extract_element_coordinates(elements): + elements_coordinates = [] + 
page_elements_coordinates = [] + + for el in elements: + if isinstance(el, PageBreak): + if page_elements_coordinates: + elements_coordinates.append(page_elements_coordinates) + page_elements_coordinates = [] + else: + page_elements_coordinates.append(el.metadata.coordinates) + + if page_elements_coordinates: + elements_coordinates.append(page_elements_coordinates) + + return elements_coordinates + + +def run_partition_pdf(f_path, strategy, images, output_dir): + + elements = partition_pdf( + f_path, + strategy=strategy, + include_page_breaks=True, + ) + + elements_coordinates = extract_element_coordinates(elements) + assert len(images) == len(elements_coordinates) + + for idx, (img, coords_per_page) in enumerate(zip(images, elements_coordinates)): + for coordinate in coords_per_page: + points = coordinate.points + x1, y1 = points[0] + x2, y2 = points[2] + rect = Rectangle(x1, y1, x2, y2) + img = draw_bbox(img, rect, color="red") + + output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg") + print(f"output_image_path: {output_image_path}") + + img.save(output_image_path) + + +def run(f_path, strategy): + f_basename = os.path.splitext(os.path.basename(f_path))[0] + output_dir_path = os.path.join(output_basedir_path, f_basename) + os.makedirs(output_dir_path, exist_ok=True) + + images = pdf2image.convert_from_path(f_path) + run_partition_pdf(f_path, strategy, images, output_dir_path) + + +if __name__ == '__main__': + if len(sys.argv) < 3: + print( + "Please provide the path to the file name as the first argument and the strategy as the " + "second argument.", + ) + sys.exit(1) + + if sys.argv[2] not in ["auto", "hi_res", "ocr_only", "fast"]: + print("Invalid strategy") + sys.exit(1) + + output_basedir_path = os.path.join(CUR_DIR, "output") + os.makedirs(output_basedir_path, exist_ok=True) + + run(f_path=sys.argv[1], strategy=sys.argv[2]) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 680eaf3a9a..57943dc290 100644 --- 
a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.20-dev2" # pragma: no cover +__version__ = "0.10.20-dev3" # pragma: no cover