From f673ea4c40f6b438a0ff2569670d9d060db16951 Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Thu, 5 Oct 2023 12:53:16 -0700
Subject: [PATCH] enhancement: add visualization script to annotate elements
 (#1613)

This PR was initially created to close GitHub Issue #1604 (Synchronizing the default
layout model), but since it was already resolved in PR
[#1607](https://github.com/Unstructured-IO/unstructured/pull/1607), this
PR now only adds the visualization script used to investigate the issue.

### Summary
- add python script to annotate elements

PDF:
[references.pdf](https://github.com/Unstructured-IO/unstructured/files/12778270/references.pdf)

### Evaluation
```
PYTHONPATH=. python examples/layout-analysis/visualization.py references.pdf hi_res
```
---
 CHANGELOG.md                              |  3 +-
 examples/layout-analysis/README.md        | 17 +++++
 examples/layout-analysis/requirements.txt |  1 +
 examples/layout-analysis/visualization.py | 82 +++++++++++++++++++++++
 unstructured/__version__.py               |  2 +-
 5 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 examples/layout-analysis/README.md
 create mode 100644 examples/layout-analysis/requirements.txt
 create mode 100644 examples/layout-analysis/visualization.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7d5d1843b2..fe638b4aff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.10.20-dev2
+## 0.10.20-dev3
 
 ### Enhancements
 
+* **Add visualization script to annotate elements** This script draws the bounding boxes of partitioned elements onto page images, which helps analyze/visualize elements that carry coordinates (e.g. the output of partition_pdf()).
 * **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
 * **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles.
 * **Improve hierarchy detection in pptx documents** List items, and other slide text are properly nested under the slide title. This will enable better chunking of pptx documents.
diff --git a/examples/layout-analysis/README.md b/examples/layout-analysis/README.md
new file mode 100644
index 0000000000..ad2f43bd37
--- /dev/null
+++ b/examples/layout-analysis/README.md
@@ -0,0 +1,17 @@
+# Analyzing Layout Elements
+
+This directory contains examples of how to analyze layout elements.
+
+## How to run
+
+From the repository root, run `pip install -r examples/layout-analysis/requirements.txt` to install the Python dependencies.
+
+### Visualization
+- Python script (visualization.py)
+```
+$ PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy>
+```
+The strategy can be one of "auto", "hi_res", "ocr_only", or "fast". For example,
+```
+$ PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/loremipsum.pdf hi_res
+```
\ No newline at end of file
diff --git a/examples/layout-analysis/requirements.txt b/examples/layout-analysis/requirements.txt
new file mode 100644
index 0000000000..0d7e9b7d61
--- /dev/null
+++ b/examples/layout-analysis/requirements.txt
@@ -0,0 +1 @@
+unstructured-inference
\ No newline at end of file
diff --git a/examples/layout-analysis/visualization.py b/examples/layout-analysis/visualization.py
new file mode 100644
index 0000000000..9302494a10
--- /dev/null
+++ b/examples/layout-analysis/visualization.py
@@ -0,0 +1,82 @@
+import os
+import pathlib
+import sys
+
+import pdf2image
+from unstructured_inference.inference.elements import Rectangle
+from unstructured_inference.visualize import draw_bbox
+
+from unstructured.documents.elements import PageBreak
+from unstructured.partition.pdf import partition_pdf
+
+CUR_DIR = pathlib.Path(__file__).parent.resolve()  # resolved directory containing this script
+
+
+def extract_element_coordinates(elements):
+    """Collect element coordinates page by page.
+
+    A PageBreak closes the current page. Pages that collected nothing are not
+    emitted, so the result holds one list per non-empty page, in document order.
+    """
+    pages, current = [], []
+    for element in elements:
+        if not isinstance(element, PageBreak):
+            current.append(element.metadata.coordinates)
+        elif current:
+            pages.append(current)
+            current = []
+    if current:
+        pages.append(current)
+    return pages
+
+
+def run_partition_pdf(f_path, strategy, images, output_dir):
+    """Partition the PDF, draw a box per element on its page image, and save each page.
+
+    `images` must hold one rendered page per entry of the per-page coordinate list built by
+    extract_element_coordinates(); otherwise ValueError is raised. Output files are named
+    "<strategy>-<page number>.jpg" (1-based) inside `output_dir`.
+    """
+    elements = partition_pdf(f_path, strategy=strategy, include_page_breaks=True)
+    elements_coordinates = extract_element_coordinates(elements)
+    # Raise explicitly rather than assert: asserts are stripped under `python -O`.
+    if len(images) != len(elements_coordinates):
+        raise ValueError(
+            f"{len(images)} page images but {len(elements_coordinates)} pages of elements",
+        )
+    for idx, (img, coords_per_page) in enumerate(zip(images, elements_coordinates), start=1):
+        for coordinate in coords_per_page:
+            if coordinate is None:  # element carries no coordinates; nothing to draw
+                continue
+            (x1, y1), (x2, y2) = coordinate.points[0], coordinate.points[2]
+            img = draw_bbox(img, Rectangle(x1, y1, x2, y2), color="red")
+        output_image_path = os.path.join(output_dir, f"{strategy}-{idx}.jpg")
+        print(f"output_image_path: {output_image_path}")
+        img.save(output_image_path)
+
+
+def run(f_path, strategy):
+    """Annotate each page of f_path; results go to CUR_DIR/output/<file stem>/."""
+    f_basename = os.path.splitext(os.path.basename(f_path))[0]
+    # Built from CUR_DIR directly: run() must not rely on a global set only under __main__.
+    output_dir_path = os.path.join(CUR_DIR, "output", f_basename)
+    os.makedirs(output_dir_path, exist_ok=True)  # also creates the parent "output" dir
+    run_partition_pdf(f_path, strategy, pdf2image.convert_from_path(f_path), output_dir_path)
+
+
+if __name__ == "__main__":
+    # Usage: visualization.py <file_path> <strategy>
+    cli_args = sys.argv[1:]
+    if len(cli_args) < 2:
+        print(
+            "Please provide the path to the file name as the first argument and the strategy as the "
+            "second argument.",
+        )
+        sys.exit(1)
+    pdf_path, chosen_strategy = cli_args[0], cli_args[1]
+    if chosen_strategy not in ("auto", "hi_res", "ocr_only", "fast"):
+        print("Invalid strategy")
+        sys.exit(1)
+    output_basedir_path = os.path.join(CUR_DIR, "output")  # base directory for annotated images
+    os.makedirs(output_basedir_path, exist_ok=True)
+    run(f_path=pdf_path, strategy=chosen_strategy)
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 680eaf3a9a..57943dc290 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.20-dev2"  # pragma: no cover
+__version__ = "0.10.20-dev3"  # pragma: no cover