Restore Python 2 compatibility

j5int · Sep 15, 2020 · 1acb8cf · 1acb8cf
1 parent bc494ff
commit 1acb8cf
Show file tree

Hide file tree

Showing 43 changed files with 4,704 additions and 4,531 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,5 +1,6 @@
 language: python
 python:
+  - "2.7"
   - "3.4"
   - "3.5"
   - "3.6"

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
-Nothing
+### Restored by Reverting Removal
+- Support for Python 2 ([#346](https://github.com/pdfminer/pdfminer.six/pull/346))
 
 ## [20200121] - 2020-01-21
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -29,9 +29,8 @@ Any contribution is appreciated! You might want to:
 * Pull requests should be merged to develop, not master. This ensures that master always equals the released version.  
 * Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case 
   of features, this will show that your code works correctly.
-* Code should work for Python 3.4+.
-* Code should conform to PEP8 coding style.
-* New features should be well documented using docstrings.
+* Code should work for Python 2.7 and Python 3.x (for now), conform to PEP8 code style (enforced by 
+  [flake8](http://flake8.pycqa.org/en/latest/)) and be properly documented with docstrings.
 * Check spelling and grammar.
 * Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased])
 

diff --git a/README.md b/README.md
@@ -37,8 +37,8 @@ Features
 How to use
 ----------
 
- * Install Python 3.4 or newer
- * Install
+ * Install Python 2.7 or newer. Note that Python 2 support will be dropped in
+  January 2020.
 
     `pip install pdfminer.six`
 

diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py
@@ -1,4 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+Fork of PDFMiner using six for Python 2+3 compatibility
+
+PDFMiner is a tool for extracting information from PDF documents.
+Unlike other PDF-related tools, it focuses entirely on getting and analyzing
+text data. PDFMiner allows one to obtain the exact location of text in a page,
+as well as other information such as fonts or lines.
+It includes a PDF converter that can transform PDF files into other text
+formats (such as HTML). It has an extensible PDF parser that can be used for
+other purposes instead of text analysis.
+"""
+import sys
+import warnings
+
 __version__ = '20200121'
 
+
+if sys.version_info < (3, 0):
+    warnings.warn('On January 1st, 2020, '
+                  'pdfminer.six will stop supporting Python 2. '
+                  'Please upgrade to Python 3. '
+                  'For more information see '
+                  'https://github.com/pdfminer/pdfminer.six/issues/194')
+
 if __name__ == '__main__':
     print(__version__)
diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py
@@ -1,19 +1,23 @@
+
+
 """ Python implementation of Arcfour encryption algorithm.
 See https://en.wikipedia.org/wiki/RC4
 This code is in the public domain.
 
 """
 
+import six  # Python 2+3 compatibility
+
 
-class Arcfour:
+class Arcfour(object):
 
     def __init__(self, key):
         # because Py3 range is not indexable
         s = [i for i in range(256)]
         j = 0
         klen = len(key)
         for i in range(256):
-            j = (j + s[i] + key[i % klen]) % 256
+            j = (j + s[i] + six.indexbytes(key, i % klen)) % 256
             (s[i], s[j]) = (s[j], s[i])
         self.s = s
         (self.i, self.j) = (0, 0)
@@ -23,12 +27,12 @@ def process(self, data):
         (i, j) = (self.i, self.j)
         s = self.s
         r = b''
-        for c in iter(data):
+        for c in six.iterbytes(data):
             i = (i+1) % 256
             j = (j+s[i]) % 256
             (s[i], s[j]) = (s[j], s[i])
             k = s[(s[i]+s[j]) % 256]
-            r += bytes((c ^ k,))
+            r += six.int2byte(c ^ k)
         (self.i, self.j) = (i, j)
         return r
 

diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py
@@ -1,3 +1,5 @@
+
+
 """ Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
 
 This code is in the public domain.
@@ -7,6 +9,8 @@
 import re
 import struct
 
+import six  # Python 2+3 compatibility
+
 
 # ascii85decode(data)
 def ascii85decode(data):
@@ -22,8 +26,8 @@ def ascii85decode(data):
     """
     n = b = 0
     out = b''
-    for i in iter(data):
-        c = bytes((i,))
+    for i in six.iterbytes(data):
+        c = six.int2byte(i)
         if b'!' <= c and c <= b'u':
             n += 1
             b = b*85+(ord(c)-33)
@@ -43,8 +47,9 @@ def ascii85decode(data):
 
 
 # asciihexdecode(data)
-hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
-trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
+hex_re = re.compile(b'([a-f0-9]{2})', re.IGNORECASE)
+trail_re = re.compile(b'^(?:[a-f0-9]{2}|[ \t\n\r\f\v])*'
+                      b'([a-f0-9])[ \t\n\r\f\v>]*$', re.IGNORECASE)
 
 
 def asciihexdecode(data):
@@ -59,7 +64,7 @@ def asciihexdecode(data):
     """
     def decode(x):
         i = int(x, 16)
-        return bytes((i,))
+        return six.int2byte(i)
 
     out = b''
     for x in hex_re.findall(data):

diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py
@@ -1,3 +1,4 @@
+
 # CCITT Fax decoder
 #
 # Bugs: uncompressed mode untested.
@@ -14,12 +15,20 @@
 import sys
 import array
 
+import six  # Python 2+3 compatibility
+
+if six.PY3:
+    def get_bytes(data):
+        for byte in data:
+            yield byte
+else:
+    def get_bytes(data):
+        for char in data:
+            yield ord(char)
 
-def get_bytes(data):
-    yield from data
 
+class BitParser(object):
 
-class BitParser:
     def __init__(self):
         self._pos = 0
         return

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -13,7 +13,10 @@
 import os
 import os.path
 import gzip
-import pickle as pickle
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle as pickle
 import struct
 import logging
 from .psparser import PSStackParser
@@ -26,6 +29,7 @@
 from .utils import choplist
 from .utils import nunpack
 
+import six
 
 log = logging.getLogger(__name__)
 
@@ -34,7 +38,7 @@ class CMapError(Exception):
     pass
 
 
-class CMapBase:
+class CMapBase(object):
 
     debug = 0
 
@@ -73,7 +77,7 @@ def use_cmap(self, cmap):
         assert isinstance(cmap, CMap), str(type(cmap))
 
         def copy(dst, src):
-            for (k, v) in src.items():
+            for (k, v) in six.iteritems(src):
                 if isinstance(v, dict):
                     d = {}
                     dst[k] = d
@@ -86,7 +90,7 @@ def copy(dst, src):
     def decode(self, code):
         log.debug('decode: %r, %r', self, code)
         d = self.code2cid
-        for i in iter(code):
+        for i in six.iterbytes(code):
             if i in d:
                 d = d[i]
                 if isinstance(d, int):
@@ -100,7 +104,7 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):
         if code2cid is None:
             code2cid = self.code2cid
             code = ()
-        for (k, v) in sorted(code2cid.items()):
+        for (k, v) in sorted(six.iteritems(code2cid)):
             c = code+(k,)
             if isinstance(v, int):
                 out.write('code %r = cid %d\n' % (c, v))
@@ -144,7 +148,7 @@ def get_unichr(self, cid):
         return self.cid2unichr[cid]
 
     def dump(self, out=sys.stdout):
-        for (k, v) in sorted(self.cid2unichr.items()):
+        for (k, v) in sorted(six.iteritems(self.cid2unichr)):
             out.write('cid %d = unicode %r\n' % (k, v))
         return
 
@@ -179,7 +183,7 @@ def add_cid2unichr(self, cid, code):
             # Interpret as UTF-16BE.
             self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
         elif isinstance(code, int):
-            self.cid2unichr[cid] = chr(code)
+            self.cid2unichr[cid] = six.unichr(code)
         else:
             raise TypeError(code)
         return
@@ -207,7 +211,7 @@ def __init__(self, name, module, vertical):
         return
 
 
-class CMapDB:
+class CMapDB(object):
 
     _cmap_cache = {}
     _umap_cache = {}

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import logging
 import re
 import sys
@@ -22,6 +23,7 @@
 from .utils import bbox2str
 from . import utils
 
+import six
 
 log = logging.getLogger(__name__)
 
@@ -113,7 +115,7 @@ def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
                     graphicstate):
         try:
             text = font.to_unichr(cid)
-            assert isinstance(text, str), str(type(text))
+            assert isinstance(text, six.text_type), str(type(text))
         except PDFUnicodeNotDefined:
             text = self.handle_undefined_char(font, cid)
         textwidth = font.char_width(cid)
@@ -166,7 +168,7 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
                 self.outfp_binary = False
             else:
                 try:
-                    self.outfp.write("é")
+                    self.outfp.write(u"é")
                     self.outfp_binary = False
                 except TypeError:
                     self.outfp_binary = True
@@ -184,7 +186,7 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
 
     def write_text(self, text):
         text = utils.compatible_encode_method(text, self.codec, 'ignore')
-        if self.outfp_binary:
+        if six.PY3 and self.outfp_binary:
             text = text.encode()
         self.outfp.write(text)
         return
@@ -283,7 +285,7 @@ def write_header(self):
         return
 
     def write_footer(self):
-        page_links = ['<a href="#{}">{}</a>'.format(i, i)
+        page_links = ['<a href="#%s">%s</a>' % (i, i)
                       for i in range(1, self.pageno)]
         s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
             ', '.join(page_links)
@@ -386,8 +388,8 @@ def render(item):
                 if self.showpageno:
                     self.write('<div style="position:absolute; top:%dpx;">' %
                                ((self._yoffset-item.y1)*self.scale))
-                    self.write('<a name="{}">Page {}</a></div>\n'
-                               .format(item.pageid, item.pageid))
+                    self.write('<a name="%s">Page %s</a></div>\n' % (
+                        item.pageid, item.pageid))
                 for child in item:
                     render(child)
                 if item.groups is not None:
@@ -450,7 +452,7 @@ def close(self):
 
 class XMLConverter(PDFConverter):
 
-    CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
+    CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
 
     def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                  imagewriter=None, stripcontrol=False):
@@ -481,7 +483,7 @@ def write_footer(self):
 
     def write_text(self, text):
         if self.stripcontrol:
-            text = self.CONTROL.sub('', text)
+            text = self.CONTROL.sub(u'', text)
         self.write(enc(text))
         return
 

diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
@@ -1,6 +1,8 @@
 import logging
 import re
 
+import six  # Python 2+3 compatibility
+
 from .glyphlist import glyphname2unicode
 from .latin_enc import ENCODING
 from .psparser import PSLiteral
@@ -43,7 +45,7 @@ def name2unicode(name):
                                   for i in range(0, len(name_without_uni), 4)]
                 for digit in unicode_digits:
                     raise_key_error_for_invalid_unicode(digit)
-                characters = map(chr, unicode_digits)
+                characters = map(six.unichr, unicode_digits)
                 return ''.join(characters)
 
         elif name.startswith('u'):
@@ -53,7 +55,7 @@ def name2unicode(name):
                     4 <= len(name_without_u) <= 6:
                 unicode_digit = int(name_without_u, base=16)
                 raise_key_error_for_invalid_unicode(unicode_digit)
-                return chr(unicode_digit)
+                return six.unichr(unicode_digit)
 
     raise KeyError('Could not convert unicode name "%s" to character because '
                    'it does not match specification' % name)
@@ -70,7 +72,7 @@ def raise_key_error_for_invalid_unicode(unicode_digit):
                        'it is in the range D800 through DFFF' % unicode_digit)
 
 
-class EncodingDB:
+class EncodingDB(object):
 
     std2unicode = {}
     mac2unicode = {}
-Original file line number
+Diff line change
@@ -1,5 +1,6 @@
     language: python
     python:
+      - "2.7"
       - "3.4"
       - "3.5"
       - "3.6"
@@ Expand Down @@