Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)
* PR Review - move open_filename to utils
pdfminer · Mar 26, 2020 · 1a4a06d · 1a4a06d
1 parent 1cc1b96
commit 1a4a06d
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 11 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
 - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
 
+### Added
+- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))
+
 ### Changed
 - Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))
 

diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
@@ -11,6 +11,7 @@
 from .pdfdevice import TagExtractor
 from .pdfinterp import PDFResourceManager, PDFPageInterpreter
 from .pdfpage import PDFPage
+from .utils import open_filename
 
 
 def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
@@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, codec='utf-8', laparams=None):
     """Parse and return the text contained in a PDF file.
 
-    :param pdf_file: Path to the PDF file to be worked on
+    :param pdf_file: Either a file path or a file-like object for the PDF file
+        to be worked on.
     :param password: For encrypted PDFs, the password to decrypt.
     :param page_numbers: List of zero-indexed page numbers to extract.
     :param maxpages: The maximum number of pages to parse
@@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
     if laparams is None:
         laparams = LAParams()
 
-    with open(pdf_file, "rb") as fp, StringIO() as output_string:
+    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
         rsrcmgr = PDFResourceManager()
         device = TextConverter(rsrcmgr, output_string, codec=codec,
                                laparams=laparams)
@@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                   caching=True, laparams=None):
     """Extract and yield LTPage objects
 
-    :param pdf_file: Path to the PDF file to be worked on
+    :param pdf_file: Either a file path or a file-like object for the PDF file
+        to be worked on.
     :param password: For encrypted PDFs, the password to decrypt.
     :param page_numbers: List of zero-indexed page numbers to extract.
     :param maxpages: The maximum number of pages to parse
@@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
     if laparams is None:
         laparams = LAParams()
 
-    with open(pdf_file, "rb") as fp:
+    with open_filename(pdf_file, "rb") as fp:
         resource_manager = PDFResourceManager()
         device = PDFPageAggregator(resource_manager, laparams=laparams)
         interpreter = PDFPageInterpreter(resource_manager, device)

diff --git a/pdfminer/utils.py b/pdfminer/utils.py
@@ -11,6 +11,28 @@
 INF = (1 << 31) - 1
 
 
+class open_filename(object):
+    """
+    Context manager that allows opening a filename and closes it on exit,
+    (just like `open`), but does nothing for file-like objects.
+    """
+    def __init__(self, filename, *args, **kwargs):
+        if isinstance(filename, str):
+            self.file_handler = open(filename, *args, **kwargs)
+            self.closing = True
+        else:
+            self.file_handler = filename
+            self.closing = False
+
+    def __enter__(self):
+        return self.file_handler
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.closing:
+            self.file_handler.close()
+        return False
+
+
 def make_compat_bytes(in_str):
     "Converts to bytes, encoding to unicode."
     assert isinstance(in_str, str), str(type(in_str))

diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -4,12 +4,19 @@
 from pdfminer.high_level import extract_text
 
 
-def run(sample_path):
+def run_with_string(sample_path):
     absolute_path = absolute_sample_path(sample_path)
     s = extract_text(absolute_path)
     return s
 
 
+def run_with_file(sample_path):
+    absolute_path = absolute_sample_path(sample_path)
+    with open(absolute_path, "rb") as in_file:
+        s = extract_text(in_file)
+    return s
+
+
 test_strings = {
     "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
                    "H e l l o  \n\nW o r l d\n\n"
@@ -20,19 +27,34 @@ def run(sample_path):
 
 
 class TestExtractText(unittest.TestCase):
-    def test_simple1(self):
+    def test_simple1_with_string(self):
+        test_file = "simple1.pdf"
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple2_with_string(self):
+        test_file = "simple2.pdf"
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple3_with_string(self):
+        test_file = "simple3.pdf"
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple1_with_file(self):
         test_file = "simple1.pdf"
-        s = run(test_file)
+        s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])
 
-    def test_simple2(self):
+    def test_simple2_with_file(self):
         test_file = "simple2.pdf"
-        s = run(test_file)
+        s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])
 
-    def test_simple3(self):
+    def test_simple3_with_file(self):
         test_file = "simple3.pdf"
-        s = run(test_file)
+        s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])