Skip to content

Commit

Permalink
Fix #392 Split out IO logic from high level functions (#393)
Browse files Browse the repository at this point in the history
* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
  • Loading branch information
jstockwin authored Mar 26, 2020
1 parent 1cc1b96 commit 1a4a06d
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 11 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))

### Added
- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))

### Changed
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))

Expand Down
11 changes: 7 additions & 4 deletions pdfminer/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .pdfdevice import TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
from .utils import open_filename


def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
Expand Down Expand Up @@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, codec='utf-8', laparams=None):
"""Parse and return the text contained in a PDF file.
:param pdf_file: Path to the PDF file to be worked on
:param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse
Expand All @@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
if laparams is None:
laparams = LAParams()

with open(pdf_file, "rb") as fp, StringIO() as output_string:
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, codec=codec,
laparams=laparams)
Expand All @@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, laparams=None):
"""Extract and yield LTPage objects
:param pdf_file: Path to the PDF file to be worked on
:param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse
Expand All @@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
if laparams is None:
laparams = LAParams()

with open(pdf_file, "rb") as fp:
with open_filename(pdf_file, "rb") as fp:
resource_manager = PDFResourceManager()
device = PDFPageAggregator(resource_manager, laparams=laparams)
interpreter = PDFPageInterpreter(resource_manager, device)
Expand Down
22 changes: 22 additions & 0 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,28 @@
INF = (1 << 31) - 1


class open_filename(object):
    """Context manager that opens a path, or passes a file-like object through.

    When ``filename`` is a ``str`` or a ``pathlib`` path, the file is opened
    with the builtin ``open`` (forwarding ``*args`` and ``**kwargs``) and is
    closed again on exit, just like ``with open(...)``. Any other object is
    assumed to already be a file-like object owned by the caller: it is
    returned unchanged from ``__enter__`` and is NOT closed on exit.

    :param filename: A path (``str`` or ``pathlib.PurePath``) or a file-like
        object.
    :param args: Positional arguments forwarded to ``open`` for paths.
    :param kwargs: Keyword arguments forwarded to ``open`` for paths.
    """

    def __init__(self, filename, *args, **kwargs):
        # Imported locally so this class does not depend on the module's
        # import block.
        import pathlib

        # A pathlib path is a filename, not a file-like object; without this
        # conversion it would be handed back unopened and fail downstream.
        if isinstance(filename, pathlib.PurePath):
            filename = str(filename)

        if isinstance(filename, str):
            self.file_handler = open(filename, *args, **kwargs)
            self.closing = True
        else:
            # Caller-owned file-like object: use as-is, never close it.
            self.file_handler = filename
            self.closing = False

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.closing:
            self.file_handler.close()
        # Returning False lets any exception from the with-body propagate.
        return False


def make_compat_bytes(in_str):
"Converts to bytes, encoding to unicode."
assert isinstance(in_str, str), str(type(in_str))
Expand Down
36 changes: 29 additions & 7 deletions tests/test_highlevel_extracttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,19 @@
from pdfminer.high_level import extract_text


def run(sample_path):
def run_with_string(sample_path):
    """Run ``extract_text`` on a sample, passing the resolved path itself."""
    return extract_text(absolute_sample_path(sample_path))


def run_with_file(sample_path):
    """Run ``extract_text`` on a sample, passing an open binary file object."""
    pdf_path = absolute_sample_path(sample_path)
    with open(pdf_path, "rb") as pdf_stream:
        # The text is fully extracted before the with-block closes the file.
        return extract_text(pdf_stream)


test_strings = {
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
"H e l l o \n\nW o r l d\n\n"
Expand All @@ -20,19 +27,34 @@ def run(sample_path):


class TestExtractText(unittest.TestCase):
def test_simple1(self):
def test_simple1_with_string(self):
test_file = "simple1.pdf"
s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])

def test_simple2_with_string(self):
test_file = "simple2.pdf"
s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])

def test_simple3_with_string(self):
test_file = "simple3.pdf"
s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])

def test_simple1_with_file(self):
test_file = "simple1.pdf"
s = run(test_file)
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])

def test_simple2(self):
def test_simple2_with_file(self):
test_file = "simple2.pdf"
s = run(test_file)
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])

def test_simple3(self):
def test_simple3_with_file(self):
test_file = "simple3.pdf"
s = run(test_file)
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])


Expand Down

0 comments on commit 1a4a06d

Please sign in to comment.