Skip to content

Commit

Permalink
Merge pull request #381 from alexk307/pdf-ocr
Browse files: browse the repository at this point in the history
Convert PDF to PNG for OCR
  • Loading branch information
phutelmyer authored Jul 10, 2023
2 parents ba48eac + 93474eb commit 0d8cdae
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 0 deletions.
3 changes: 3 additions & 0 deletions configs/python/backend/backend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -445,11 +445,14 @@ scanners:
- 'image/x-ms-bmp'
- 'bmp_file'
- 'image/webp'
- 'application/pdf'
- 'pdf_file'
priority: 5
options:
extract_text: False
split_words: True
tmp_directory: '/dev/shm/'
pdf_to_png: True
'ScanOle':
- positive:
flavors:
Expand Down
7 changes: 7 additions & 0 deletions src/python/strelka/scanners/scan_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import subprocess
import tempfile

import fitz

from strelka import strelka


Expand All @@ -20,6 +22,11 @@ def scan(self, data, file, options, expire_at):
extract_text = options.get("extract_text", False)
split_words = options.get("split_words", True)
tmp_directory = options.get("tmp_directory", "/tmp/")
pdf_to_png = options.get("pdf_to_png", False)

if pdf_to_png and "application/pdf" in file.flavors.get("mime", []):
doc = fitz.open(stream=data, filetype="pdf")
data = doc.get_page_pixmap(0).tobytes("png")

with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
tmp_data.write(data)
Expand Down

0 comments on commit 0d8cdae

Please sign in to comment.