Skip to content

Commit

Permalink
Python SDK: Feat. process specific pages, Other fixes and improvements (
Browse files Browse the repository at this point in the history
#39)

* Python SDK

Changes:
- Feature added: now specific pages can be processed with the python sdk using "select_pages" param. Incorporates #23, #24 for python sdk
- Workflow for the above feature: if select_pages is specified, create a new temporary PDF in the temp dir containing only those pages, follow the rest of the process as usual, and finally map each page index in the formatted markdown back to the actual page number.
- Raise a warning when both select_pages and maintain_format are used.
- required adaptations and updates in messages, exceptions, types, processor, utils etc

Fixes/improvements:
- Memory-efficient PDF-to-image conversion, using the paths_only option to get the sorted image paths directly from the pdf2image API

Misc:
- Bump the version tag
- documentation updated

* Minor update in README.md

---------

Co-authored-by: Pradyumna Singh Rathore <[email protected]>
  • Loading branch information
pradhyumna85 and Pradyumna Singh Rathore authored Sep 17, 2024
1 parent b570756 commit eaf13b2
Show file tree
Hide file tree
Showing 13 changed files with 187 additions and 55 deletions.
22 changes: 16 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,12 @@ Request #3 => page_2_markdown + page_3_image
### Installation:

- Install **poppler-utils** on the system, it should be available in path variable
- Install py-zerox: `pip install py-zerox`
- Install py-zerox:
```sh
pip install py-zerox
```

The `zerox` function is an asynchronous API that performs OCR (Optical Character Recognition) to markdown using vision models. It processes PDF files and converts them into markdown format. Make sure to set up the environment variables for the model and the model provider before using this API.
The `pyzerox.zerox` function is an asynchronous API that performs OCR (Optical Character Recognition) to markdown using vision models. It processes PDF files and converts them into markdown format. Make sure to set up the environment variables for the model and the model provider before using this API.

Refer to the [LiteLLM Documentation](https://docs.litellm.ai/docs/providers) for setting up the environment and passing the correct model name.

Expand Down Expand Up @@ -215,8 +218,13 @@ kwargs = {"vertex_credentials": vertex_credentials}
# Define main async entrypoint
async def main():
file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported
output_dir = "./output_test"
result = await zerox(file_path=file_path, model=model, output_dir=output_dir,custom_system_prompt=custom_system_prompt, **kwargs)

## process only some pages or all
select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)

output_dir = "./output_test" ## directory to save the consolidated markdown file
result = await zerox(file_path=file_path, model=model, output_dir=output_dir,
custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs)
return result


Expand All @@ -239,6 +247,7 @@ async def zerox(
output_dir: Optional[str] = None,
temp_dir: Optional[str] = None,
custom_system_prompt: Optional[str] = None,
select_pages: Optional[Union[int, Iterable[int]]] = None,
**kwargs
) -> ZeroxOutput:
...
Expand All @@ -262,8 +271,9 @@ Parameters
- **temp_dir** (str, optional):
The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted before zerox uses it.
- **custom_system_prompt** (str, optional):
The system prompt to use for the model, this overrides the default system prompt of zerox. Defaults to None.
Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning.
The system prompt to use for the model; this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. Defaults to None.
- **select_pages** (Optional[Union[int, Iterable[int]]], optional):
Pages to process; can be a single page number or an iterable of page numbers (1-indexed). Defaults to None.
- **kwargs** (dict, optional):
Additional keyword arguments to pass to the litellm.completion method.
Refer to the LiteLLM Documentation and Completion Input for details.
Expand Down
20 changes: 19 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions py_zerox/pyzerox/constants/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ class Messages:
Custom system prompt was provided which overrides the default system prompt. We assume that you know what you are doing.
"""

MAINTAIN_FORMAT_SELECTED_PAGES_WARNING = """
The maintain_format flag is set to True in conjunction with select_pages input given. This may result in unexpected behavior.
"""

PAGE_NUMBER_OUT_OF_BOUND_ERROR = """
The page number(s) provided is out of bound. Please provide a valid page number(s).
"""

NON_200_RESPONSE = """
Model API returned status code {status_code}: {data}
Expand Down
3 changes: 2 additions & 1 deletion py_zerox/pyzerox/core/types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional, Dict, Any
from typing import List, Optional, Dict, Any, Union, Iterable
from dataclasses import dataclass, field


Expand All @@ -16,6 +16,7 @@ class ZeroxArgs:
output_dir: Optional[str] = None
temp_dir: Optional[str] = None
custom_system_prompt: Optional[str] = None
select_pages: Optional[Union[int, Iterable[int]]] = None
kwargs: Dict[str, Any] = field(default_factory=dict)

@dataclass
Expand Down
75 changes: 51 additions & 24 deletions py_zerox/pyzerox/core/zerox.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
import os
import aioshutil as async_shutil
import tempfile
from typing import List, Optional
import warnings
from typing import List, Optional, Union, Iterable
from datetime import datetime
import aiofiles
import aiofiles.os as async_os
import asyncio

# Package Imports
from ..processor import (
convert_pdf_to_images,
download_file,
process_page,
process_pages_in_batches,
sorted_nicely,
create_selected_pages_pdf,
)
from ..errors import FileUnavailable
from ..constants.messages import Messages
from ..models import litellmmodel
from .types import Page, ZeroxOutput

Expand All @@ -28,6 +31,7 @@ async def zerox(
output_dir: Optional[str] = None,
temp_dir: Optional[str] = None,
custom_system_prompt: Optional[str] = None,
select_pages: Optional[Union[int, Iterable[int]]] = None,
**kwargs
) -> ZeroxOutput:
"""
Expand All @@ -38,7 +42,7 @@ async def zerox(
:type cleanup: bool, optional
:param concurrency: The number of concurrent processes to run, defaults to 10
:type concurrency: int, optional
:param file_path: The path to the PDF file to process
:param file_path: The path or URL to the PDF file to process.
:type file_path: str, optional
:param maintain_format: Whether to maintain the format from the previous page, defaults to False
:type maintain_format: bool, optional
Expand All @@ -50,6 +54,8 @@ async def zerox(
:type temp_dir: str, optional
:param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning, defaults to None
:type custom_system_prompt: str, optional
:param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None
:type select_pages: int or Iterable[int], optional
:param kwargs: Additional keyword arguments to pass to the model.completion -> litellm.completion method. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input
:return: The markdown content generated by the model.
Expand All @@ -65,6 +71,24 @@ async def zerox(
# File Path Validators
if not file_path:
raise FileUnavailable()

# Create an instance of the litellm model interface
vision_model = litellmmodel(model=model,**kwargs)

# override the system prompt if a custom prompt is provided
if custom_system_prompt:
vision_model.system_prompt = custom_system_prompt

# Check if both maintain_format and select_pages are provided
if maintain_format and select_pages is not None:
warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING)

# If select_pages is a single integer, convert it to a list for consistency
if isinstance(select_pages, int):
select_pages = [select_pages]

# Sort the pages to maintain consistency
select_pages = sorted(select_pages)

# Ensure the output directory exists
if output_dir:
Expand All @@ -91,26 +115,19 @@ async def zerox(
local_path = await download_file(file_path=file_path, temp_dir=temp_directory)
if not local_path:
raise FileUnavailable()

raw_file_name = os.path.splitext(os.path.basename(local_path))[0]
file_name = "".join(c.lower() if c.isalnum() else "_" for c in raw_file_name)

# create a subset pdf in temp dir with only the requested pages if select_pages is provided
if select_pages is not None:
subset_pdf_create_kwargs = {"original_pdf_path":local_path, "select_pages":select_pages,
"save_directory":temp_directory, "suffix":"_selected_pages"}
local_path = await asyncio.to_thread(create_selected_pages_pdf,
**subset_pdf_create_kwargs)

# Convert the file to a series of images
await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory)

# Get a list of sorted converted images (alphanumeric human sorting)
images = list(sorted_nicely([
f"{temp_directory}/{f}"
for f in await async_os.listdir(temp_directory)
if f.endswith(".png")
]))

# Create an instance of the litellm model interface
vision_model = litellmmodel(model=model,**kwargs)

# override the system prompt if a custom prompt is provided
if custom_system_prompt:
vision_model.system_prompt = custom_system_prompt
# Convert the file to a series of images, below function returns a list of image paths in page order
images = await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory)

if maintain_format:
for image in images:
Expand Down Expand Up @@ -155,10 +172,20 @@ async def zerox(
# Format JSON response
end_time = datetime.now()
completion_time = (end_time - start_time).total_seconds() * 1000
formatted_pages = [
Page(content=content, page=i + 1, content_length=len(content))
for i, content in enumerate(aggregated_markdown)
]

# Adjusting the formatted_pages logic to account for select_pages to output the correct page numbers
if select_pages is not None:
# Map aggregated markdown to the selected pages
formatted_pages = [
Page(content=content, page=select_pages[i], content_length=len(content))
for i, content in enumerate(aggregated_markdown)
]
else:
# Default behavior when no select_pages is provided
formatted_pages = [
Page(content=content, page=i + 1, content_length=len(content))
for i, content in enumerate(aggregated_markdown)
]

return ZeroxOutput(
completion_time=completion_time,
Expand Down
2 changes: 2 additions & 0 deletions py_zerox/pyzerox/errors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .exceptions import (
NotAVisionModel,
ModelAccessError,
PageNumberOutOfBoundError,
MissingEnvironmentVariables,
ResourceUnreachableException,
FileUnavailable,
Expand All @@ -11,6 +12,7 @@
__all__ = [
"NotAVisionModel",
"ModelAccessError",
"PageNumberOutOfBoundError",
"MissingEnvironmentVariables",
"ResourceUnreachableException",
"FileUnavailable",
Expand Down
10 changes: 10 additions & 0 deletions py_zerox/pyzerox/errors/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ def __init__(
):
super().__init__(message, extra_info)

class PageNumberOutOfBoundError(CustomException):
"""Exception raised when the provided page number(s) are out of bounds."""

def __init__(
self,
message: str = Messages.PAGE_NUMBER_OUT_OF_BOUND_ERROR,
extra_info: Optional[Dict] = None,
):
# Both arguments are forwarded unchanged to CustomException.
super().__init__(message, extra_info)

class ResourceUnreachableException(CustomException):
"""Exception raised when a resource is unreachable."""

Expand Down
4 changes: 2 additions & 2 deletions py_zerox/pyzerox/processor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
process_pages_in_batches,
)
from .text import format_markdown
from .utils import download_file, sorted_nicely
from .utils import download_file, create_selected_pages_pdf

__all__ = [
"save_image",
Expand All @@ -15,5 +15,5 @@
"download_file",
"process_page",
"process_pages_in_batches",
"sorted_nicely",
"create_selected_pages_pdf",
]
19 changes: 8 additions & 11 deletions py_zerox/pyzerox/processor/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,24 @@
from ..models import litellmmodel


async def convert_pdf_to_images(local_path: str, temp_dir: str):
"""Converts a PDF file to a series of images."""
async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]:
"""Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order."""
options = {
"pdf_path": local_path,
"output_folder": temp_dir,
"dpi": PDFConversionDefaultOptions.DPI,
"fmt": PDFConversionDefaultOptions.FORMAT,
"size": PDFConversionDefaultOptions.SIZE,
"thread_count": PDFConversionDefaultOptions.THREAD_COUNT,
"use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO,
"paths_only": True,
}
file_name = os.path.splitext(os.path.basename(local_path))[0]

try:
images = await asyncio.to_thread(
convert_from_path, local_path, **options
image_paths = await asyncio.to_thread(
convert_from_path, **options
)
tasks = []
for i, image in enumerate(images, start=1):
image_path = os.path.join(temp_dir, f"{file_name}_page_{i}.png")
tasks.append(save_image(image, image_path))
await asyncio.gather(*tasks)
return images
return image_paths
except Exception as err:
logging.error(f"Error converting PDF to images: {err}")

Expand Down
Loading

0 comments on commit eaf13b2

Please sign in to comment.