Fix pdfminer#392 Split out IO logic from high level functions (pdfminer#393)

* Allow file-like inputs to high level functions (pdfminer#392)
* PR Review - move open_filename to utils

(cherry picked from commit 1a4a06d)
j5int · Sep 15, 2020 · d70338e · d70338e
1 parent 1acb8cf
commit d70338e
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 11 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ## Restored by Reverting Removal
 - Support for Python 2 ([#346](https://github.com/pdfminer/pdfminer.six/pull/346))
 
+### Added
+- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))
+
 ## [20200121] - 2020-01-21
 
 ### Fixed

diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
@@ -12,6 +12,7 @@
 from .pdfdevice import TagExtractor
 from .pdfinterp import PDFResourceManager, PDFPageInterpreter
 from .pdfpage import PDFPage
+from .utils import open_filename
 
 # Conditional import because python 2 is stupid
 if sys.version_info > (3, 0):
@@ -108,7 +109,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, codec='utf-8', laparams=None):
     """Parse and return the text contained in a PDF file.
 
-    :param pdf_file: Path to the PDF file to be worked on
+    :param pdf_file: Either a file path or a file-like object for the PDF file
+        to be worked on.
     :param password: For encrypted PDFs, the password to decrypt.
     :param page_numbers: List of zero-indexed page numbers to extract.
     :param maxpages: The maximum number of pages to parse
@@ -121,7 +123,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
     if laparams is None:
         laparams = LAParams()
 
-    with open(pdf_file, "rb") as fp, StringIO() as output_string:
+    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
         rsrcmgr = PDFResourceManager()
         device = TextConverter(rsrcmgr, output_string, codec=codec,
                                laparams=laparams)
@@ -144,7 +146,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                   caching=True, laparams=None):
     """Extract and yield LTPage objects
 
-    :param pdf_file: Path to the PDF file to be worked on
+    :param pdf_file: Either a file path or a file-like object for the PDF file
+        to be worked on.
     :param password: For encrypted PDFs, the password to decrypt.
     :param page_numbers: List of zero-indexed page numbers to extract.
     :param maxpages: The maximum number of pages to parse
@@ -156,7 +159,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
     if laparams is None:
         laparams = LAParams()
 
-    with open(pdf_file, "rb") as fp:
+    with open_filename(pdf_file, "rb") as fp:
         resource_manager = PDFResourceManager()
         device = PDFPageAggregator(resource_manager, laparams=laparams)
         interpreter = PDFPageInterpreter(resource_manager, device)

diff --git a/pdfminer/utils.py b/pdfminer/utils.py
@@ -16,6 +16,28 @@
     unicode = str
 
 
+class open_filename(object):
+    """
+    Context manager that allows opening a filename and closes it on exit,
+    (just like `open`), but does nothing for file-like objects.
+    """
+    def __init__(self, filename, *args, **kwargs):
+        if isinstance(filename, str):
+            self.file_handler = open(filename, *args, **kwargs)
+            self.closing = True
+        else:
+            self.file_handler = filename
+            self.closing = False
+
+    def __enter__(self):
+        return self.file_handler
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.closing:
+            self.file_handler.close()
+        return False
+
+
 def make_compat_bytes(in_str):
     """In Py2, does nothing. In Py3, converts to bytes, encoding to unicode."""
     assert isinstance(in_str, str), str(type(in_str))

diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -4,12 +4,19 @@
 from pdfminer.high_level import extract_text
 
 
-def run(sample_path):
+def run_with_string(sample_path):
     absolute_path = absolute_sample_path(sample_path)
     s = extract_text(absolute_path)
     return s
 
 
+def run_with_file(sample_path):
+    absolute_path = absolute_sample_path(sample_path)
+    with open(absolute_path, "rb") as in_file:
+        s = extract_text(in_file)
+    return s
+
+
 test_strings = {
     "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
                    "H e l l o  \n\nW o r l d\n\n"
@@ -20,19 +27,34 @@ def run(sample_path):
 
 
 class TestExtractText(unittest.TestCase):
-    def test_simple1(self):
+    def test_simple1_with_string(self):
+        test_file = "simple1.pdf"
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple2_with_string(self):
+        test_file = "simple2.pdf"
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple3_with_string(self):
+        test_file = "simple3.pdf"
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple1_with_file(self):
         test_file = "simple1.pdf"
-        s = run(test_file)
+        s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])
 
-    def test_simple2(self):
+    def test_simple2_with_file(self):
         test_file = "simple2.pdf"
-        s = run(test_file)
+        s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])
 
-    def test_simple3(self):
+    def test_simple3_with_file(self):
         test_file = "simple3.pdf"
-        s = run(test_file)
+        s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])