Refactor: remove image extraction related code (#299)

### Summary

This PR is the first part of the "image extraction" refactor, which moves image extraction from the unstructured-inference repo to the unstructured repo. It removes all "image extraction" related code from the unstructured-inference repo and works together with the companion unstructured refactor PR, Unstructured-IO/unstructured#2201.

### Note

The ingest test won't pass until the unstructured refactor PR (Unstructured-IO/unstructured#2201) is merged.
Unstructured-IO · Dec 5, 2023 · 794f38b · 794f38b
1 parent 2b29254
commit 794f38b
Show file tree

Hide file tree

Showing 7 changed files with 6 additions and 177 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.18
+
+* refactor: remove all image extraction related code
+
 ## 0.7.17
 
 * refactor: remove all `pdfminer` related code

diff --git a/examples/image_extraction/README.md b/examples/image_extraction/README.md
diff --git a/examples/image_extraction/embedded-image-extraction.py b/examples/image_extraction/embedded-image-extraction.py
diff --git a/examples/image_extraction/requirements.txt b/examples/image_extraction/requirements.txt
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -1,7 +1,7 @@
 import os
 import os.path
 import tempfile
-from unittest.mock import ANY, mock_open, patch
+from unittest.mock import mock_open, patch
 
 import numpy as np
 import pytest
@@ -557,22 +557,6 @@ def test_from_image(
         assert mock_detection.called == detection_model_called
 
 
-def test_extract_images(mock_pil_image):
-    page = MockPageLayout(image=mock_pil_image)
-    page.elements = [
-        layoutelement.LayoutElement.from_coords(1, 1, 10, 10, text=None, type="Image"),
-        layoutelement.LayoutElement.from_coords(11, 11, 20, 20, text=None, type="Image"),
-    ]
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        page.extract_images(output_dir_path=str(tmpdir))
-
-        for i, el in enumerate(page.elements):
-            expected_image_path = os.path.join(str(tmpdir), f"figure-{page.number}-{i + 1}.jpg")
-            assert os.path.isfile(el.image_path)
-            assert el.image_path == expected_image_path
-
-
 class MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel):
     def initialize(self, *args, **kwargs):
         return super().initialize(*args, **kwargs)
@@ -614,8 +598,6 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
             fixed_layouts=None,
             extract_tables=False,
             pdf_image_dpi=200,
-            extract_images_in_pdf=ANY,
-            image_output_dir_path=ANY,
         )
 
 

diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.17"  # pragma: no cover
+__version__ = "0.7.18"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -9,7 +9,6 @@
 import pdf2image
 from PIL import Image, ImageSequence
 
-from unstructured_inference.constants import ElementType
 from unstructured_inference.inference.elements import (
     TextRegion,
 )
@@ -24,7 +23,6 @@
     UnstructuredElementExtractionModel,
     UnstructuredObjectDetectionModel,
 )
-from unstructured_inference.utils import write_image
 from unstructured_inference.visualize import draw_bbox
 
 
@@ -230,34 +228,6 @@ def get_elements_from_layout(
         ]
         return elements
 
-    def extract_images(self, output_dir_path: Optional[str] = None):
-        """
-        Extract and save images from the page. This method iterates through the layout elements
-        of the page, identifies image regions, and extracts and saves them as separate image files.
-        """
-
-        if not output_dir_path:
-            output_dir_path = os.path.join(os.getcwd(), "figures")
-        os.makedirs(output_dir_path, exist_ok=True)
-
-        figure_number = 0
-        image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
-        for el in self.elements:
-            if (el.bbox is None) or (el.type not in image_element_types):
-                continue
-
-            figure_number += 1
-            try:
-                output_f_path = os.path.join(
-                    output_dir_path,
-                    f"figure-{self.number}-{figure_number}.jpg",
-                )
-                cropped_image = self.image.crop((el.bbox.x1, el.bbox.y1, el.bbox.x2, el.bbox.y2))
-                write_image(cropped_image, output_f_path)
-                el.image_path = output_f_path
-            except (ValueError, IOError):
-                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
-
     def _get_image_array(self) -> Union[np.ndarray, None]:
         """Converts the raw image into a numpy array."""
         if self.image_array is None:
@@ -350,8 +320,6 @@ def from_image(
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
         extract_tables: bool = False,
         fixed_layout: Optional[List[TextRegion]] = None,
-        extract_images_in_pdf: bool = False,
-        image_output_dir_path: Optional[str] = None,
     ):
         """Creates a PageLayout from an already-loaded PIL Image."""
 
@@ -378,9 +346,6 @@ def from_image(
         page.image_path = os.path.abspath(image_path) if image_path else None
         page.document_filename = os.path.abspath(document_filename) if document_filename else None
 
-        if extract_images_in_pdf:
-            page.extract_images(image_output_dir_path)
-
         # Clear the image to save memory
         page.image = None
 
@@ -413,8 +378,6 @@ def process_file_with_model(
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     extract_tables: bool = False,
     pdf_image_dpi: int = 200,
-    extract_images_in_pdf: bool = False,
-    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> DocumentLayout:
     """Processes pdf file with name filename into a DocumentLayout by using a model identified by
@@ -445,8 +408,6 @@ def process_file_with_model(
             fixed_layouts=fixed_layouts,
             extract_tables=extract_tables,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
             **kwargs,
         )
     )
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.7.17"  # pragma: no cover
		+__version__ = "0.7.18"  # pragma: no cover