Skip to content

Commit

Permalink
Refactor: remove image extraction related code (#299)
Browse files Browse the repository at this point in the history
### Summary
This PR is the first part of the "image extraction" refactor, which moves
the image extraction functionality from the unstructured-inference repo to
the unstructured repo. This PR removes all "image extraction" related code
from the unstructured-inference repo and works together with the companion
unstructured refactor PR - Unstructured-IO/unstructured#2201.

### Note
The ingest test won't pass until we merge the unstructured refactor PR -
Unstructured-IO/unstructured#2201.
  • Loading branch information
christinestraub authored Dec 5, 2023
1 parent 2b29254 commit 794f38b
Show file tree
Hide file tree
Showing 7 changed files with 6 additions and 177 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.7.18

* refactor: remove all image extraction related code

## 0.7.17

* refactor: remove all `pdfminer` related code
Expand Down
21 changes: 0 additions & 21 deletions examples/image_extraction/README.md

This file was deleted.

94 changes: 0 additions & 94 deletions examples/image_extraction/embedded-image-extraction.py

This file was deleted.

3 changes: 0 additions & 3 deletions examples/image_extraction/requirements.txt

This file was deleted.

20 changes: 1 addition & 19 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import os.path
import tempfile
from unittest.mock import ANY, mock_open, patch
from unittest.mock import mock_open, patch

import numpy as np
import pytest
Expand Down Expand Up @@ -557,22 +557,6 @@ def test_from_image(
assert mock_detection.called == detection_model_called


def test_extract_images(mock_pil_image):
    """Check that extract_images writes one file per image element.

    Two "Image" elements are placed on a mock page, the images are extracted
    into a temporary directory, and each element's ``image_path`` is expected
    to point at an existing ``figure-<page number>-<n>.jpg`` file, with ``n``
    counted from 1 in element order.
    """
    page = MockPageLayout(image=mock_pil_image)
    image_boxes = [(1, 1, 10, 10), (11, 11, 20, 20)]
    page.elements = [
        layoutelement.LayoutElement.from_coords(*box, text=None, type="Image")
        for box in image_boxes
    ]

    with tempfile.TemporaryDirectory() as tmpdir:
        out_dir = str(tmpdir)
        page.extract_images(output_dir_path=out_dir)

        # The checks must run while the temporary directory still exists.
        for index, element in enumerate(page.elements, start=1):
            expected_image_path = os.path.join(out_dir, f"figure-{page.number}-{index}.jpg")
            assert os.path.isfile(element.image_path)
            assert element.image_path == expected_image_path


class MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel):
def initialize(self, *args, **kwargs):
return super().initialize(*args, **kwargs)
Expand Down Expand Up @@ -614,8 +598,6 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
fixed_layouts=None,
extract_tables=False,
pdf_image_dpi=200,
extract_images_in_pdf=ANY,
image_output_dir_path=ANY,
)


Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.17" # pragma: no cover
__version__ = "0.7.18" # pragma: no cover
39 changes: 0 additions & 39 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pdf2image
from PIL import Image, ImageSequence

from unstructured_inference.constants import ElementType
from unstructured_inference.inference.elements import (
TextRegion,
)
Expand All @@ -24,7 +23,6 @@
UnstructuredElementExtractionModel,
UnstructuredObjectDetectionModel,
)
from unstructured_inference.utils import write_image
from unstructured_inference.visualize import draw_bbox


Expand Down Expand Up @@ -230,34 +228,6 @@ def get_elements_from_layout(
]
return elements

def extract_images(self, output_dir_path: Optional[str] = None):
    """Crop the image-like elements out of the page image and save them to disk.

    Elements whose type is IMAGE, PICTURE or FIGURE and that have a bounding
    box are cropped from ``self.image`` and written as
    ``figure-<page number>-<figure number>.jpg``. On success the element's
    ``image_path`` is set to the written file.

    Args:
        output_dir_path: Directory to write into. When empty or None, a
            ``figures`` directory under the current working directory is
            used. The directory is created if it does not exist.
    """
    target_dir = output_dir_path or os.path.join(os.getcwd(), "figures")
    os.makedirs(target_dir, exist_ok=True)

    image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
    image_elements = (
        candidate
        for candidate in self.elements
        if candidate.bbox is not None and candidate.type in image_element_types
    )

    # Numbering starts at 1 and advances for every candidate, so a figure
    # number is consumed even when saving that figure fails.
    for figure_number, el in enumerate(image_elements, start=1):
        try:
            destination = os.path.join(
                target_dir,
                f"figure-{self.number}-{figure_number}.jpg",
            )
            crop_box = (el.bbox.x1, el.bbox.y1, el.bbox.x2, el.bbox.y2)
            write_image(self.image.crop(crop_box), destination)
            el.image_path = destination
        except (ValueError, IOError):
            # Best effort: a failed crop or write skips this figure only.
            logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)

def _get_image_array(self) -> Union[np.ndarray, None]:
"""Converts the raw image into a numpy array."""
if self.image_array is None:
Expand Down Expand Up @@ -350,8 +320,6 @@ def from_image(
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
extract_tables: bool = False,
fixed_layout: Optional[List[TextRegion]] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
):
"""Creates a PageLayout from an already-loaded PIL Image."""

Expand All @@ -378,9 +346,6 @@ def from_image(
page.image_path = os.path.abspath(image_path) if image_path else None
page.document_filename = os.path.abspath(document_filename) if document_filename else None

if extract_images_in_pdf:
page.extract_images(image_output_dir_path)

# Clear the image to save memory
page.image = None

Expand Down Expand Up @@ -413,8 +378,6 @@ def process_file_with_model(
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
extract_tables: bool = False,
pdf_image_dpi: int = 200,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs,
) -> DocumentLayout:
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
Expand Down Expand Up @@ -445,8 +408,6 @@ def process_file_with_model(
fixed_layouts=fixed_layouts,
extract_tables=extract_tables,
pdf_image_dpi=pdf_image_dpi,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs,
)
)
Expand Down

0 comments on commit 794f38b

Please sign in to comment.