From 794f38b27e5ed6325e930e1b5b87410a7bf53620 Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Mon, 4 Dec 2023 23:41:09 -0800
Subject: [PATCH] Refactor: remove image extraction related code (#299)

### Summary
This PR is the first part of the "image extraction" refactor, which moves
the image extraction functionality from the unstructured-inference repo to
the unstructured repo. This PR removes all "image extraction" related code
from the unstructured-inference repo and works together with the companion
unstructured refactor PR:
https://github.com/Unstructured-IO/unstructured/pull/2201.

### Note
The ingest test will not pass until the unstructured refactor PR is merged:
https://github.com/Unstructured-IO/unstructured/pull/2201.
---
 CHANGELOG.md                                  |  4 +
 examples/image_extraction/README.md           | 21 -----
 .../embedded-image-extraction.py              | 94 -------------------
 examples/image_extraction/requirements.txt    |  3 -
 .../inference/test_layout.py                  | 20 +---
 unstructured_inference/__version__.py         |  2 +-
 unstructured_inference/inference/layout.py    | 39 --------
 7 files changed, 6 insertions(+), 177 deletions(-)
 delete mode 100644 examples/image_extraction/README.md
 delete mode 100644 examples/image_extraction/embedded-image-extraction.py
 delete mode 100644 examples/image_extraction/requirements.txt
diff --git a/CHANGELOG.md b/CHANGELOG.md
index db6a973a..884b7684 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.18
+
+* refactor: remove all image extraction related code
+
 ## 0.7.17
 
 * refactor: remove all `pdfminer` related code
diff --git a/examples/image_extraction/README.md b/examples/image_extraction/README.md
deleted file mode 100644
index cd3461d7..00000000
--- a/examples/image_extraction/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Extracting Images
-
-This directory contains examples of how to extract images in PDF's separately as images.
-
-## How to run
-
-Run `pip install -r requirements.txt` to install the Python dependencies.
-
-### Extracting Embedded Images
-- Python script (embedded-image-extraction.py)
-```
- $ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py <file_path> <library>
-```
-The library can be  `unstructured`, `pymupdf`, and `pypdf2`. For example,
-```
-$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf unstructured
-// or
-$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pymupdf
-// or
-$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pypdf2
-```
diff --git a/examples/image_extraction/embedded-image-extraction.py b/examples/image_extraction/embedded-image-extraction.py
deleted file mode 100644
index 2443c04a..00000000
--- a/examples/image_extraction/embedded-image-extraction.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import io
-import os.path
-import pathlib
-import sys
-
-import fitz  # PyMuPDF
-from PIL import Image
-from PyPDF2 import PdfReader
-
-from unstructured_inference.inference.layout import DocumentLayout
-
-CUR_DIR = pathlib.Path(__file__).parent.resolve()
-
-
-def print_result(images, page_index):
-    if images:
-        print(f"[+] Found a total of {len(images)} images in page {page_index}")
-    else:
-        print(f"[!] No images found on page {page_index}")
-
-
-def run_with_unstructured(f_path, output_dir_path):
-    doc = DocumentLayout.from_file(
-        filename=f_path,
-        extract_images_in_pdf=True,
-        image_output_dir_path=output_dir_path,
-    )
-
-    for page_index, page in enumerate(doc.pages, start=1):
-        image_elements = [el for el in page.elements if el.type == "Image"]
-        print_result(image_elements, page_index)
-
-
-def run_with_pymupdf(f_path, output_dir_path):
-    doc = fitz.open(f_path)
-    for page_index, page in enumerate(doc, start=1):
-        image_list = page.get_images(full=True)
-        print_result(image_list, page_index)
-
-        for image_index, img in enumerate(image_list, start=1):
-            # Get the XREF of the image
-            xref = img[0]
-            # Extract the image bytes
-            base_image = doc.extract_image(xref)
-            image_bytes = base_image["image"]
-            # Get the image extension
-            image_ext = base_image["ext"]
-            # Load it to PIL
-            image = Image.open(io.BytesIO(image_bytes))
-            output_f_path = os.path.join(output_dir_path, f"image_{page_index}_{image_index}.{image_ext}")
-            image.save(output_f_path)
-
-
-def run_with_pypdf2(f_path, output_dir_path):
-    reader = PdfReader(f_path)
-    for page_index, page in enumerate(reader.pages, start=1):
-        images = page.images
-        print_result(images, page_index)
-
-        for image_file_object in images:
-            output_f_path = os.path.join(output_dir_path, f"figure_{page_index}_{image_file_object.name}")
-            with open(output_f_path, "wb") as fp:
-                fp.write(image_file_object.data)
-
-
-def run(f_path, library):
-    f_basename = os.path.splitext(os.path.basename(f_path))[0]
-    output_dir_path = os.path.join(output_basedir_path, library, f_basename)
-    os.makedirs(output_dir_path, exist_ok=True)
-
-    if library == "unstructured":
-        run_with_unstructured(f_path, output_dir_path)
-    elif library == "pymupdf":
-        run_with_pymupdf(f_path, output_dir_path)
-    elif library == "pypdf2":
-        run_with_pypdf2(f_path, output_dir_path)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3:
-        print(
-            "Please provide the path to the file name as the first argument and the image "
-            "extraction library as the second argument.",
-        )
-        sys.exit(1)
-
-    if sys.argv[2] not in ["unstructured", "pymupdf", "pypdf2"]:
-        print("Invalid pdf library")
-        sys.exit(1)
-
-    output_basedir_path = os.path.join(CUR_DIR, "output")
-    os.makedirs(output_basedir_path, exist_ok=True)
-
-    run(f_path=sys.argv[1], library=sys.argv[2])
diff --git a/examples/image_extraction/requirements.txt b/examples/image_extraction/requirements.txt
deleted file mode 100644
index 351b4120..00000000
--- a/examples/image_extraction/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-unstructured-inference
-pymupdf
-pypdf2
\ No newline at end of file
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
index 04680aa5..564a5cbb 100644
--- a/test_unstructured_inference/inference/test_layout.py
+++ b/test_unstructured_inference/inference/test_layout.py
@@ -1,7 +1,7 @@
 import os
 import os.path
 import tempfile
-from unittest.mock import ANY, mock_open, patch
+from unittest.mock import mock_open, patch
 
 import numpy as np
 import pytest
@@ -557,22 +557,6 @@ def test_from_image(
         assert mock_detection.called == detection_model_called
 
 
-def test_extract_images(mock_pil_image):
-    page = MockPageLayout(image=mock_pil_image)
-    page.elements = [
-        layoutelement.LayoutElement.from_coords(1, 1, 10, 10, text=None, type="Image"),
-        layoutelement.LayoutElement.from_coords(11, 11, 20, 20, text=None, type="Image"),
-    ]
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        page.extract_images(output_dir_path=str(tmpdir))
-
-        for i, el in enumerate(page.elements):
-            expected_image_path = os.path.join(str(tmpdir), f"figure-{page.number}-{i + 1}.jpg")
-            assert os.path.isfile(el.image_path)
-            assert el.image_path == expected_image_path
-
-
 class MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel):
     def initialize(self, *args, **kwargs):
         return super().initialize(*args, **kwargs)
@@ -614,8 +598,6 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
             fixed_layouts=None,
             extract_tables=False,
             pdf_image_dpi=200,
-            extract_images_in_pdf=ANY,
-            image_output_dir_path=ANY,
         )
 
 
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index a0548a92..9d99e964 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.17"  # pragma: no cover
+__version__ = "0.7.18"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index e5583ece..172f377a 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -9,7 +9,6 @@
 import pdf2image
 from PIL import Image, ImageSequence
 
-from unstructured_inference.constants import ElementType
 from unstructured_inference.inference.elements import (
     TextRegion,
 )
@@ -24,7 +23,6 @@
     UnstructuredElementExtractionModel,
     UnstructuredObjectDetectionModel,
 )
-from unstructured_inference.utils import write_image
 from unstructured_inference.visualize import draw_bbox
 
 
@@ -230,34 +228,6 @@ def get_elements_from_layout(
         ]
         return elements
 
-    def extract_images(self, output_dir_path: Optional[str] = None):
-        """
-        Extract and save images from the page. This method iterates through the layout elements
-        of the page, identifies image regions, and extracts and saves them as separate image files.
-        """
-
-        if not output_dir_path:
-            output_dir_path = os.path.join(os.getcwd(), "figures")
-        os.makedirs(output_dir_path, exist_ok=True)
-
-        figure_number = 0
-        image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
-        for el in self.elements:
-            if (el.bbox is None) or (el.type not in image_element_types):
-                continue
-
-            figure_number += 1
-            try:
-                output_f_path = os.path.join(
-                    output_dir_path,
-                    f"figure-{self.number}-{figure_number}.jpg",
-                )
-                cropped_image = self.image.crop((el.bbox.x1, el.bbox.y1, el.bbox.x2, el.bbox.y2))
-                write_image(cropped_image, output_f_path)
-                el.image_path = output_f_path
-            except (ValueError, IOError):
-                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
-
     def _get_image_array(self) -> Union[np.ndarray, None]:
         """Converts the raw image into a numpy array."""
         if self.image_array is None:
@@ -350,8 +320,6 @@ def from_image(
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
         extract_tables: bool = False,
         fixed_layout: Optional[List[TextRegion]] = None,
-        extract_images_in_pdf: bool = False,
-        image_output_dir_path: Optional[str] = None,
     ):
         """Creates a PageLayout from an already-loaded PIL Image."""
 
@@ -378,9 +346,6 @@ def from_image(
         page.image_path = os.path.abspath(image_path) if image_path else None
         page.document_filename = os.path.abspath(document_filename) if document_filename else None
 
-        if extract_images_in_pdf:
-            page.extract_images(image_output_dir_path)
-
         # Clear the image to save memory
         page.image = None
 
@@ -413,8 +378,6 @@ def process_file_with_model(
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     extract_tables: bool = False,
     pdf_image_dpi: int = 200,
-    extract_images_in_pdf: bool = False,
-    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> DocumentLayout:
     """Processes pdf file with name filename into a DocumentLayout by using a model identified by
@@ -445,8 +408,6 @@ def process_file_with_model(
             fixed_layouts=fixed_layouts,
             extract_tables=extract_tables,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
             **kwargs,
         )
     )