Python SDK: Feat. process specific pages, Other fixes and improvements (

#39) * Python SDK Changes: - Feature added: now specific pages can be processed with the python sdk using "select_pages" param. Incorporates #23, #24 for python sdk - workflow for the above feature: create a new temporary pdf in the tempdir if select_pages is specified and follow the rest of the process as usual and finally map the page number in the formatted markdown to get the actual number instead of index. - raise warning when both select_pages and maintain_format are used. - required adaptations and updates in messages, exceptions, types, processor, utils etc Fixes/improvements: - memory efficient pdf to image conversion, utilizing paths only option to directly get sorted image paths from pdf2image api Misc: - Bump the version tag - documentation updated * Minor update in README.md --------- Co-authored-by: Pradyumna Singh Rathore <[email protected]>
getomni-ai · Sep 17, 2024 · eaf13b2 · eaf13b2
1 parent b570756
commit eaf13b2
Show file tree

Hide file tree

Showing 13 changed files with 187 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -144,9 +144,12 @@ Request #3 => page_2_markdown + page_3_image
 ### Installation:
 
 - Install **poppler-utils** on the system, it should be available in path variable
-- Install py-zerox: `pip install py-zerox`
+- Install py-zerox:
+```sh
+pip install py-zerox
+```
 
-The `zerox` function is an asynchronous API that performs OCR (Optical Character Recognition) to markdown using vision models. It processes PDF files and converts them into markdown format. Make sure to set up the environment variables for the model and the model provider before using this API.
+The `pyzerox.zerox` function is an asynchronous API that performs OCR (Optical Character Recognition) to markdown using vision models. It processes PDF files and converts them into markdown format. Make sure to set up the environment variables for the model and the model provider before using this API.
 
 Refer to the [LiteLLM Documentation](https://docs.litellm.ai/docs/providers) for setting up the environment and passing the correct model name.
 
@@ -215,8 +218,13 @@ kwargs = {"vertex_credentials": vertex_credentials}
 # Define main async entrypoint
 async def main():
     file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported
-    output_dir = "./output_test"
-    result = await zerox(file_path=file_path, model=model, output_dir=output_dir,custom_system_prompt=custom_system_prompt, **kwargs)
+
+    ## process only some pages or all
+    select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)
+
+    output_dir = "./output_test" ## directory to save the consolidated markdown file
+    result = await zerox(file_path=file_path, model=model, output_dir=output_dir,
+                        custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs)
     return result
 
 
@@ -239,6 +247,7 @@ async def zerox(
     output_dir: Optional[str] = None,
     temp_dir: Optional[str] = None,
     custom_system_prompt: Optional[str] = None,
+    select_pages: Optional[Union[int, Iterable[int]]] = None,
     **kwargs
 ) -> ZeroxOutput:
   ...
@@ -262,8 +271,9 @@ Parameters
 - **temp_dir** (str, optional):
   The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted before zerox uses it.
 - **custom_system_prompt** (str, optional):
-  The system prompt to use for the model, this overrides the default system prompt of zerox. Defaults to None.
-  Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning.
+  The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. Defaults to None.
+- **select_pages** (Optional[Union[int, Iterable[int]]], optional):
+  Pages to process; can be a single page number or an iterable of page numbers (1-indexed). Defaults to None.
 - **kwargs** (dict, optional):
   Additional keyword arguments to pass to the litellm.completion method.
   Refer to the LiteLLM Documentation and Completion Input for details.

diff --git a/poetry.lock b/poetry.lock
diff --git a/py_zerox/pyzerox/constants/messages.py b/py_zerox/pyzerox/constants/messages.py
@@ -19,6 +19,14 @@ class Messages:
     Custom system prompt was provided which overrides the default system prompt. We assume that you know what you are doing.  
     """
 
+    MAINTAIN_FORMAT_SELECTED_PAGES_WARNING = """
+    The maintain_format flag is set to True in conjunction with select_pages input given. This may result in unexpected behavior.
+    """
+
+    PAGE_NUMBER_OUT_OF_BOUND_ERROR = """
+    The page number(s) provided is out of bound. Please provide a valid page number(s).
+    """
+
     NON_200_RESPONSE = """
     Model API returned status code {status_code}: {data}
 

diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Any
+from typing import List, Optional, Dict, Any, Union, Iterable
 from dataclasses import dataclass, field
 
 
@@ -16,6 +16,7 @@ class ZeroxArgs:
     output_dir: Optional[str] = None
     temp_dir: Optional[str] = None
     custom_system_prompt: Optional[str] = None
+    select_pages: Optional[Union[int, Iterable[int]]] = None
     kwargs: Dict[str, Any] = field(default_factory=dict)
 
 @dataclass

diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py
@@ -1,20 +1,23 @@
 import os
 import aioshutil as async_shutil
 import tempfile
-from typing import List, Optional
+import warnings
+from typing import List, Optional, Union, Iterable
 from datetime import datetime
 import aiofiles
 import aiofiles.os as async_os
+import asyncio
 
 # Package Imports
 from ..processor import (
     convert_pdf_to_images,
     download_file,
     process_page,
     process_pages_in_batches,
-    sorted_nicely,
+    create_selected_pages_pdf,
 )
 from ..errors import FileUnavailable
+from ..constants.messages import Messages
 from ..models import litellmmodel
 from .types import Page, ZeroxOutput
 
@@ -28,6 +31,7 @@ async def zerox(
     output_dir: Optional[str] = None,
     temp_dir: Optional[str] = None,
     custom_system_prompt: Optional[str] = None,
+    select_pages: Optional[Union[int, Iterable[int]]] = None,
     **kwargs
 ) -> ZeroxOutput:
     """
@@ -38,7 +42,7 @@ async def zerox(
     :type cleanup: bool, optional
     :param concurrency: The number of concurrent processes to run, defaults to 10
     :type concurrency: int, optional
-    :param file_path: The path to the PDF file to process
+    :param file_path: The path or URL to the PDF file to process.
     :type file_path: str, optional
     :param maintain_format: Whether to maintain the format from the previous page, defaults to False
     :type maintain_format: bool, optional
@@ -50,6 +54,8 @@ async def zerox(
     :type temp_dir: str, optional
     :param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning, defaults to None
     :type custom_system_prompt: str, optional
+    :param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None
+    :type select_pages: int or Iterable[int], optional
 
     :param kwargs: Additional keyword arguments to pass to the model.completion -> litellm.completion method. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input
     :return: The markdown content generated by the model.
@@ -65,6 +71,24 @@ async def zerox(
     # File Path Validators
     if not file_path:
         raise FileUnavailable()
+
+    # Create an instance of the litellm model interface
+    vision_model = litellmmodel(model=model,**kwargs)
+
+    # override the system prompt if a custom prompt is provided
+    if custom_system_prompt:
+        vision_model.system_prompt = custom_system_prompt
+
+    # Check if both maintain_format and select_pages are provided
+    if maintain_format and select_pages is not None:
+        warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING)
+
+    # If select_pages is a single integer, convert it to a list for consistency
+    if isinstance(select_pages, int):
+        select_pages = [select_pages]
+
+    # Sort the pages to maintain consistency
+    select_pages = sorted(select_pages)
 
     # Ensure the output directory exists
     if output_dir:
@@ -91,26 +115,19 @@ async def zerox(
         local_path = await download_file(file_path=file_path, temp_dir=temp_directory)
         if not local_path:
             raise FileUnavailable()
-
+        
         raw_file_name = os.path.splitext(os.path.basename(local_path))[0]
         file_name = "".join(c.lower() if c.isalnum() else "_" for c in raw_file_name)
+
+        # create a subset pdf in temp dir with only the requested pages if select_pages is provided
+        if select_pages is not None:
+            subset_pdf_create_kwargs = {"original_pdf_path":local_path, "select_pages":select_pages, 
+                                    "save_directory":temp_directory, "suffix":"_selected_pages"}
+            local_path = await asyncio.to_thread(create_selected_pages_pdf, 
+                                                 **subset_pdf_create_kwargs)
 
-        # Convert the file to a series of images
-        await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory)
-
-        # Get a list of sorted converted images (alphanumeric human sorting)
-        images = list(sorted_nicely([
-            f"{temp_directory}/{f}"
-            for f in await async_os.listdir(temp_directory)
-            if f.endswith(".png")
-        ]))
-
-        # Create an instance of the litellm model interface
-        vision_model = litellmmodel(model=model,**kwargs)
-
-        # override the system prompt if a custom prompt is provided
-        if custom_system_prompt:
-            vision_model.system_prompt = custom_system_prompt
+        # Convert the file to a series of images, below function returns a list of image paths in page order
+        images = await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory)
 
         if maintain_format:
             for image in images:
@@ -155,10 +172,20 @@ async def zerox(
         # Format JSON response
         end_time = datetime.now()
         completion_time = (end_time - start_time).total_seconds() * 1000
-        formatted_pages = [
-            Page(content=content, page=i + 1, content_length=len(content))
-            for i, content in enumerate(aggregated_markdown)
-        ]
+
+        # Adjusting the formatted_pages logic to account for select_pages to output the correct page numbers
+        if select_pages is not None:
+            # Map aggregated markdown to the selected pages
+            formatted_pages = [
+                        Page(content=content, page=select_pages[i], content_length=len(content))
+                        for i, content in enumerate(aggregated_markdown)
+                    ]
+        else:
+            # Default behavior when no select_pages is provided
+            formatted_pages = [
+                        Page(content=content, page=i + 1, content_length=len(content))
+                        for i, content in enumerate(aggregated_markdown)
+                    ]
 
         return ZeroxOutput(
             completion_time=completion_time,

diff --git a/py_zerox/pyzerox/errors/__init__.py b/py_zerox/pyzerox/errors/__init__.py
@@ -1,6 +1,7 @@
 from .exceptions import (
     NotAVisionModel,
     ModelAccessError,
+    PageNumberOutOfBoundError,
     MissingEnvironmentVariables,
     ResourceUnreachableException,
     FileUnavailable,
@@ -11,6 +12,7 @@
 __all__ = [
     "NotAVisionModel",
     "ModelAccessError",
+    "PageNumberOutOfBoundError",
     "MissingEnvironmentVariables",
     "ResourceUnreachableException",
     "FileUnavailable",

diff --git a/py_zerox/pyzerox/errors/exceptions.py b/py_zerox/pyzerox/errors/exceptions.py
@@ -35,6 +35,16 @@ def __init__(
     ):
         super().__init__(message, extra_info)
 
+class PageNumberOutOfBoundError(CustomException):
+    """Exception raised when invalid page number(s) are provided."""
+
+    def __init__(
+        self,
+        message: str = Messages.PAGE_NUMBER_OUT_OF_BOUND_ERROR,
+        extra_info: Optional[Dict] = None,
+    ):
+        super().__init__(message, extra_info)
+
 class ResourceUnreachableException(CustomException):
     """Exception raised when a resource is unreachable."""
 

diff --git a/py_zerox/pyzerox/processor/__init__.py b/py_zerox/pyzerox/processor/__init__.py
@@ -5,7 +5,7 @@
     process_pages_in_batches,
 )
 from .text import format_markdown
-from .utils import download_file, sorted_nicely
+from .utils import download_file, create_selected_pages_pdf
 
 __all__ = [
     "save_image",
@@ -15,5 +15,5 @@
     "download_file",
     "process_page",
     "process_pages_in_batches",
-    "sorted_nicely",
+    "create_selected_pages_pdf",
 ]
diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py
@@ -11,27 +11,24 @@
 from ..models import litellmmodel
 
 
-async def convert_pdf_to_images(local_path: str, temp_dir: str):
-    """Converts a PDF file to a series of images."""
+async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]:
+    """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order."""
     options = {
+        "pdf_path": local_path,
+        "output_folder": temp_dir,
         "dpi": PDFConversionDefaultOptions.DPI,
         "fmt": PDFConversionDefaultOptions.FORMAT,
         "size": PDFConversionDefaultOptions.SIZE,
         "thread_count": PDFConversionDefaultOptions.THREAD_COUNT,
         "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO,
+        "paths_only": True,
     }
-    file_name = os.path.splitext(os.path.basename(local_path))[0]
 
     try:
-        images = await asyncio.to_thread(
-            convert_from_path, local_path, **options
+        image_paths = await asyncio.to_thread(
+            convert_from_path, **options
         )
-        tasks = []
-        for i, image in enumerate(images, start=1):
-            image_path = os.path.join(temp_dir, f"{file_name}_page_{i}.png")
-            tasks.append(save_image(image, image_path))
-        await asyncio.gather(*tasks)
-        return images
+        return image_paths
     except Exception as err:
         logging.error(f"Error converting PDF to images: {err}")