From 794f38b27e5ed6325e930e1b5b87410a7bf53620 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Mon, 4 Dec 2023 23:41:09 -0800 Subject: [PATCH] Refactor: remove image extraction related code (#299) ### Summary This PR is the first part of the "image extraction" refactor, which moves image extraction from the unstructured-inference repo to the unstructured repo. This PR removes all "image extraction" related code from the unstructured-inference repo and works together with the unstructured refactor PR - https://github.com/Unstructured-IO/unstructured/pull/2201. ### Note The ingest test won't pass until the unstructured refactor PR is merged - https://github.com/Unstructured-IO/unstructured/pull/2201. --- CHANGELOG.md | 4 + examples/image_extraction/README.md | 21 ----- .../embedded-image-extraction.py | 94 ------------------- examples/image_extraction/requirements.txt | 3 - .../inference/test_layout.py | 20 +--- unstructured_inference/__version__.py | 2 +- unstructured_inference/inference/layout.py | 39 -------- 7 files changed, 6 insertions(+), 177 deletions(-) delete mode 100644 examples/image_extraction/README.md delete mode 100644 examples/image_extraction/embedded-image-extraction.py delete mode 100644 examples/image_extraction/requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index db6a973a..884b7684 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.7.18 + +* refactor: remove all image extraction related code + ## 0.7.17 * refactor: remove all `pdfminer` related code diff --git a/examples/image_extraction/README.md b/examples/image_extraction/README.md deleted file mode 100644 index cd3461d7..00000000 --- a/examples/image_extraction/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Extracting Images - -This directory contains examples of how to extract images in PDF's separately as images. - -## How to run - -Run `pip install -r requirements.txt` to install the Python dependencies. 
- -### Extracting Embedded Images -- Python script (embedded-image-extraction.py) -``` - $ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py -``` -The library can be `unstructured`, `pymupdf`, and `pypdf2`. For example, -``` -$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf unstructured -// or -$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pymupdf -// or -$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pypdf2 -``` diff --git a/examples/image_extraction/embedded-image-extraction.py b/examples/image_extraction/embedded-image-extraction.py deleted file mode 100644 index 2443c04a..00000000 --- a/examples/image_extraction/embedded-image-extraction.py +++ /dev/null @@ -1,94 +0,0 @@ -import io -import os.path -import pathlib -import sys - -import fitz # PyMuPDF -from PIL import Image -from PyPDF2 import PdfReader - -from unstructured_inference.inference.layout import DocumentLayout - -CUR_DIR = pathlib.Path(__file__).parent.resolve() - - -def print_result(images, page_index): - if images: - print(f"[+] Found a total of {len(images)} images in page {page_index}") - else: - print(f"[!] 
No images found on page {page_index}") - - -def run_with_unstructured(f_path, output_dir_path): - doc = DocumentLayout.from_file( - filename=f_path, - extract_images_in_pdf=True, - image_output_dir_path=output_dir_path, - ) - - for page_index, page in enumerate(doc.pages, start=1): - image_elements = [el for el in page.elements if el.type == "Image"] - print_result(image_elements, page_index) - - -def run_with_pymupdf(f_path, output_dir_path): - doc = fitz.open(f_path) - for page_index, page in enumerate(doc, start=1): - image_list = page.get_images(full=True) - print_result(image_list, page_index) - - for image_index, img in enumerate(image_list, start=1): - # Get the XREF of the image - xref = img[0] - # Extract the image bytes - base_image = doc.extract_image(xref) - image_bytes = base_image["image"] - # Get the image extension - image_ext = base_image["ext"] - # Load it to PIL - image = Image.open(io.BytesIO(image_bytes)) - output_f_path = os.path.join(output_dir_path, f"image_{page_index}_{image_index}.{image_ext}") - image.save(output_f_path) - - -def run_with_pypdf2(f_path, output_dir_path): - reader = PdfReader(f_path) - for page_index, page in enumerate(reader.pages, start=1): - images = page.images - print_result(images, page_index) - - for image_file_object in images: - output_f_path = os.path.join(output_dir_path, f"figure_{page_index}_{image_file_object.name}") - with open(output_f_path, "wb") as fp: - fp.write(image_file_object.data) - - -def run(f_path, library): - f_basename = os.path.splitext(os.path.basename(f_path))[0] - output_dir_path = os.path.join(output_basedir_path, library, f_basename) - os.makedirs(output_dir_path, exist_ok=True) - - if library == "unstructured": - run_with_unstructured(f_path, output_dir_path) - elif library == "pymupdf": - run_with_pymupdf(f_path, output_dir_path) - elif library == "pypdf2": - run_with_pypdf2(f_path, output_dir_path) - - -if __name__ == '__main__': - if len(sys.argv) < 3: - print( - "Please provide the 
path to the file name as the first argument and the image " - "extraction library as the second argument.", - ) - sys.exit(1) - - if sys.argv[2] not in ["unstructured", "pymupdf", "pypdf2"]: - print("Invalid pdf library") - sys.exit(1) - - output_basedir_path = os.path.join(CUR_DIR, "output") - os.makedirs(output_basedir_path, exist_ok=True) - - run(f_path=sys.argv[1], library=sys.argv[2]) diff --git a/examples/image_extraction/requirements.txt b/examples/image_extraction/requirements.txt deleted file mode 100644 index 351b4120..00000000 --- a/examples/image_extraction/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -unstructured-inference -pymupdf -pypdf2 \ No newline at end of file diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 04680aa5..564a5cbb 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -1,7 +1,7 @@ import os import os.path import tempfile -from unittest.mock import ANY, mock_open, patch +from unittest.mock import mock_open, patch import numpy as np import pytest @@ -557,22 +557,6 @@ def test_from_image( assert mock_detection.called == detection_model_called -def test_extract_images(mock_pil_image): - page = MockPageLayout(image=mock_pil_image) - page.elements = [ - layoutelement.LayoutElement.from_coords(1, 1, 10, 10, text=None, type="Image"), - layoutelement.LayoutElement.from_coords(11, 11, 20, 20, text=None, type="Image"), - ] - - with tempfile.TemporaryDirectory() as tmpdir: - page.extract_images(output_dir_path=str(tmpdir)) - - for i, el in enumerate(page.elements): - expected_image_path = os.path.join(str(tmpdir), f"figure-{page.number}-{i + 1}.jpg") - assert os.path.isfile(el.image_path) - assert el.image_path == expected_image_path - - class MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel): def initialize(self, *args, **kwargs): return super().initialize(*args, 
**kwargs) @@ -614,8 +598,6 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m fixed_layouts=None, extract_tables=False, pdf_image_dpi=200, - extract_images_in_pdf=ANY, - image_output_dir_path=ANY, ) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index a0548a92..9d99e964 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.17" # pragma: no cover +__version__ = "0.7.18" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index e5583ece..172f377a 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -9,7 +9,6 @@ import pdf2image from PIL import Image, ImageSequence -from unstructured_inference.constants import ElementType from unstructured_inference.inference.elements import ( TextRegion, ) @@ -24,7 +23,6 @@ UnstructuredElementExtractionModel, UnstructuredObjectDetectionModel, ) -from unstructured_inference.utils import write_image from unstructured_inference.visualize import draw_bbox @@ -230,34 +228,6 @@ def get_elements_from_layout( ] return elements - def extract_images(self, output_dir_path: Optional[str] = None): - """ - Extract and save images from the page. This method iterates through the layout elements - of the page, identifies image regions, and extracts and saves them as separate image files. 
- """ - - if not output_dir_path: - output_dir_path = os.path.join(os.getcwd(), "figures") - os.makedirs(output_dir_path, exist_ok=True) - - figure_number = 0 - image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE] - for el in self.elements: - if (el.bbox is None) or (el.type not in image_element_types): - continue - - figure_number += 1 - try: - output_f_path = os.path.join( - output_dir_path, - f"figure-{self.number}-{figure_number}.jpg", - ) - cropped_image = self.image.crop((el.bbox.x1, el.bbox.y1, el.bbox.x2, el.bbox.y2)) - write_image(cropped_image, output_f_path) - el.image_path = output_f_path - except (ValueError, IOError): - logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True) - def _get_image_array(self) -> Union[np.ndarray, None]: """Converts the raw image into a numpy array.""" if self.image_array is None: @@ -350,8 +320,6 @@ def from_image( element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, extract_tables: bool = False, fixed_layout: Optional[List[TextRegion]] = None, - extract_images_in_pdf: bool = False, - image_output_dir_path: Optional[str] = None, ): """Creates a PageLayout from an already-loaded PIL Image.""" @@ -378,9 +346,6 @@ def from_image( page.image_path = os.path.abspath(image_path) if image_path else None page.document_filename = os.path.abspath(document_filename) if document_filename else None - if extract_images_in_pdf: - page.extract_images(image_output_dir_path) - # Clear the image to save memory page.image = None @@ -413,8 +378,6 @@ def process_file_with_model( fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, pdf_image_dpi: int = 200, - extract_images_in_pdf: bool = False, - image_output_dir_path: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by @@ -445,8 +408,6 @@ def process_file_with_model( 
fixed_layouts=fixed_layouts, extract_tables=extract_tables, pdf_image_dpi=pdf_image_dpi, - extract_images_in_pdf=extract_images_in_pdf, - image_output_dir_path=image_output_dir_path, **kwargs, ) )