Skip to content

Commit

Permalink
Write PDF incrementally
Browse files Browse the repository at this point in the history
  • Loading branch information
daniel-va committed Oct 15, 2024
1 parent 474e787 commit ed75e64
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
15 changes: 15 additions & 0 deletions ocr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import fitz
from mypy_boto3_textract import TextractClient as Textractor
from pymupdf.mupdf import PDF_ENCRYPT_KEEP

from ocr.crop import crop_images
from ocr.resize import resize_page
Expand Down Expand Up @@ -58,12 +59,19 @@ def process_pdf(
confidence_threshold: float,
use_aggressive_strategy: bool,
):
tmp_out_path = os.path.join(tmp_dir, f"output.pdf")

in_doc = fitz.open(in_path)
out_doc = fitz.open(in_path)

os.makedirs(tmp_dir, exist_ok=True)

in_page_count = in_doc.page_count
print(f"{in_page_count} pages")

out_doc.save(tmp_out_path, garbage=3, deflate=True)
out_doc.close()
out_doc = fitz.open(tmp_out_path)
for page_index, new_page in enumerate(iter(in_doc)):
page_number = page_index + 1
print(f"Page {page_number}")
Expand All @@ -82,7 +90,13 @@ def process_pdf(
text_layer_path = os.path.join(tmp_dir, f"page{page_number}.pdf")
lines_to_draw = process_page(out_doc, new_page, textractor, tmp_path_prefix, confidence_threshold, ignore_rects)
draw_ocr_text_page(new_page, text_layer_path, lines_to_draw)
out_doc.save(tmp_out_path, incremental=True, encryption=PDF_ENCRYPT_KEEP)

out_doc.close()
out_doc = fitz.open(tmp_out_path)
out_doc.save(out_path, garbage=3, deflate=True)
in_doc.close()
out_doc.close()

# Verify that the written document can be read back and that it still has the same number of pages. Some corrupt input
# documents can produce an empty or corrupt output document, sometimes without raising any error. (See
Expand All @@ -93,3 +107,4 @@ def process_pdf(
raise ValueError(
"Output document contains {} pages instead of {}".format(out_page_count, in_page_count)
)
doc.close()
6 changes: 1 addition & 5 deletions ocr/util.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import gc
import os
from time import sleep

import fitz
from mypy_boto3_textract import TextractClient as Textractor
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfgen import canvas
from reportlab.pdfgen.textobject import PDFTextObject
from mypy_boto3_textract import TextractClient as Textractor

from ocr.applyocr import OCR
from ocr.readingorder import TextLine, TextWord
Expand Down

0 comments on commit ed75e64

Please sign in to comment.