From d70338eddfdb30cb82c71ad7ae6be8297cab81b4 Mon Sep 17 00:00:00 2001 From: Jake Stockwin Date: Thu, 26 Mar 2020 21:52:00 +0000 Subject: [PATCH] Fix #392 Split out IO logic from high level functions (#393) * Allow file-like inputs to high level functions (#392) * PR Review - move open_filename to utils (cherry picked from commit 1a4a06da9fe295920e23311e12f22a37a2799899) --- CHANGELOG.md | 3 +++ pdfminer/high_level.py | 11 +++++---- pdfminer/utils.py | 22 ++++++++++++++++++ tests/test_highlevel_extracttext.py | 36 +++++++++++++++++++++++------ 4 files changed, 61 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0a901f0..812fa56c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## Restored by Reverting Removal - Support for Python 2 ([#346](https://github.com/pdfminer/pdfminer.six/pull/346)) +### Added +- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392)) + ## [20200121] - 2020-01-21 ### Fixed diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 705979dd..e8605c11 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -12,6 +12,7 @@ from .pdfdevice import TagExtractor from .pdfinterp import PDFResourceManager, PDFPageInterpreter from .pdfpage import PDFPage +from .utils import open_filename # Conditional import because python 2 is stupid if sys.version_info > (3, 0): @@ -108,7 +109,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, caching=True, codec='utf-8', laparams=None): """Parse and return the text contained in a PDF file. - :param pdf_file: Path to the PDF file to be worked on + :param pdf_file: Either a file path or a file-like object for the PDF file + to be worked on. :param password: For encrypted PDFs, the password to decrypt. 
class open_filename(object):
    """
    Context manager that opens a path and closes it on exit (just like
    `open`), but passes an already-open file-like object through untouched.

    A path may be given as a text string, a byte string, or any
    ``os.PathLike`` object (for example ``pathlib.Path``). Anything else is
    assumed to be a file-like object owned by the caller: it is returned
    as-is from ``__enter__`` and is *not* closed on exit.

    :param filename: Path to open, or an already-open file-like object.
    :param args: Positional arguments forwarded to `open` (e.g. the mode).
        Ignored when a file-like object is passed.
    :param kwargs: Keyword arguments forwarded to `open`. Ignored when a
        file-like object is passed.
    :raises: Whatever `open` raises (e.g. ``IOError``/``OSError``) when a
        path is given and cannot be opened.
    """

    # Text and byte strings on both Python 2 and 3: `type(u'')` is `unicode`
    # on Python 2 (where `str` is the byte string) and `str` on Python 3.
    _STRING_PATH_TYPES = (str, bytes, type(u''))

    def __init__(self, filename, *args, **kwargs):
        # Function-scope import so the class does not depend on the
        # surrounding module already importing `os`.
        import os

        # `os.fspath` / `os.PathLike` only exist on Python >= 3.6. Older
        # interpreters have no path-like protocol, so only string paths can
        # be recognised there.
        fspath = getattr(os, 'fspath', None)
        if fspath is not None and isinstance(filename, os.PathLike):
            filename = fspath(filename)

        if isinstance(filename, self._STRING_PATH_TYPES):
            # We opened the file, so we are responsible for closing it.
            self.file_handler = open(filename, *args, **kwargs)
            self.closing = True
        else:
            # Caller-owned file-like object: hand it back untouched.
            self.file_handler = filename
            self.closing = False

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.closing:
            self.file_handler.close()
        # Never suppress an exception raised inside the `with` body.
        return False
test_simple2_with_file(self): test_file = "simple2.pdf" - s = run(test_file) + s = run_with_file(test_file) self.assertEqual(s, test_strings[test_file]) - def test_simple3(self): + def test_simple3_with_file(self): test_file = "simple3.pdf" - s = run(test_file) + s = run_with_file(test_file) self.assertEqual(s, test_strings[test_file])