Skip to content

Commit

Permalink
Restore Python 2 compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
David Fraser committed Sep 15, 2020
1 parent bc494ff commit 1acb8cf
Show file tree
Hide file tree
Showing 43 changed files with 4,704 additions and 4,531 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
language: python
python:
- "2.7"
- "3.4"
- "3.5"
- "3.6"
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

Nothing
## Restored by Reverting Removal
- Support for Python 2 ([#346](https://github.com/pdfminer/pdfminer.six/pull/346))

## [20200121] - 2020-01-21

Expand Down
5 changes: 2 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ Any contribution is appreciated! You might want to:
* Pull requests should be merged to develop, not master. This ensures that master always equals the released version.
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
of features, this will show that your code works correctly.
* Code should work for Python 3.4+.
* Code should conform to PEP8 coding style.
* New features should be well documented using docstrings.
* Code should work for Python 2.7 and Python 3.x (for now), conform to PEP8 code style (enforced by
[flake8](http://flake8.pycqa.org/en/latest/)) and be properly documented with docstrings.
* Check spelling and grammar.
* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased])

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ Features
How to use
----------

* Install Python 3.4 or newer
* Install
* Install Python 2.7 or newer. Note that Python 2 support will be dropped in
January 2020.

`pip install pdfminer.six`

Expand Down
23 changes: 23 additions & 0 deletions pdfminer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,27 @@
# -*- coding: utf-8 -*-
"""
Fork of PDFMiner using six for Python 2+3 compatibility
PDFMiner is a tool for extracting information from PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting and analyzing
text data. PDFMiner allows one to obtain the exact location of text in a page,
as well as other information such as fonts or lines.
It includes a PDF converter that can transform PDF files into other text
formats (such as HTML). It has an extensible PDF parser that can be used for
other purposes instead of text analysis.
"""
import sys
import warnings

__version__ = '20200121'


if sys.version_info < (3, 0):
warnings.warn('On January 1st, 2020, '
'pdfminer.six will stop supporting Python 2. '
'Please upgrade to Python 3. '
'For more information see '
'https://github.com/pdfminer/pdfminer.six/issues/194')

if __name__ == '__main__':
print(__version__)
12 changes: 8 additions & 4 deletions pdfminer/arcfour.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@


""" Python implementation of Arcfour encryption algorithm.
See https://en.wikipedia.org/wiki/RC4
This code is in the public domain.
"""

import six # Python 2+3 compatibility


class Arcfour:
class Arcfour(object):

def __init__(self, key):
# because Py3 range is not indexable
s = [i for i in range(256)]
j = 0
klen = len(key)
for i in range(256):
j = (j + s[i] + key[i % klen]) % 256
j = (j + s[i] + six.indexbytes(key, i % klen)) % 256
(s[i], s[j]) = (s[j], s[i])
self.s = s
(self.i, self.j) = (0, 0)
Expand All @@ -23,12 +27,12 @@ def process(self, data):
(i, j) = (self.i, self.j)
s = self.s
r = b''
for c in iter(data):
for c in six.iterbytes(data):
i = (i+1) % 256
j = (j+s[i]) % 256
(s[i], s[j]) = (s[j], s[i])
k = s[(s[i]+s[j]) % 256]
r += bytes((c ^ k,))
r += six.int2byte(c ^ k)
(self.i, self.j) = (i, j)
return r

Expand Down
15 changes: 10 additions & 5 deletions pdfminer/ascii85.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@


""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
This code is in the public domain.
Expand All @@ -7,6 +9,8 @@
import re
import struct

import six # Python 2+3 compatibility


# ascii85decode(data)
def ascii85decode(data):
Expand All @@ -22,8 +26,8 @@ def ascii85decode(data):
"""
n = b = 0
out = b''
for i in iter(data):
c = bytes((i,))
for i in six.iterbytes(data):
c = six.int2byte(i)
if b'!' <= c and c <= b'u':
n += 1
b = b*85+(ord(c)-33)
Expand All @@ -43,8 +47,9 @@ def ascii85decode(data):


# asciihexdecode(data)
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
hex_re = re.compile(b'([a-f0-9]{2})', re.IGNORECASE)
trail_re = re.compile(b'^(?:[a-f0-9]{2}|[ \t\n\r\f\v])*'
b'([a-f0-9])[ \t\n\r\f\v>]*$', re.IGNORECASE)


def asciihexdecode(data):
Expand All @@ -59,7 +64,7 @@ def asciihexdecode(data):
"""
def decode(x):
i = int(x, 16)
return bytes((i,))
return six.int2byte(i)

out = b''
for x in hex_re.findall(data):
Expand Down
15 changes: 12 additions & 3 deletions pdfminer/ccitt.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# CCITT Fax decoder
#
# Bugs: uncompressed mode untested.
Expand All @@ -14,12 +15,20 @@
import sys
import array

import six # Python 2+3 compatibility

if six.PY3:
def get_bytes(data):
for byte in data:
yield byte
else:
def get_bytes(data):
for char in data:
yield ord(char)

def get_bytes(data):
yield from data

class BitParser(object):

class BitParser:
def __init__(self):
self._pos = 0
return
Expand Down
20 changes: 12 additions & 8 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
import os
import os.path
import gzip
import pickle as pickle
try:
import cPickle as pickle
except ImportError:
import pickle as pickle
import struct
import logging
from .psparser import PSStackParser
Expand All @@ -26,6 +29,7 @@
from .utils import choplist
from .utils import nunpack

import six

log = logging.getLogger(__name__)

Expand All @@ -34,7 +38,7 @@ class CMapError(Exception):
pass


class CMapBase:
class CMapBase(object):

debug = 0

Expand Down Expand Up @@ -73,7 +77,7 @@ def use_cmap(self, cmap):
assert isinstance(cmap, CMap), str(type(cmap))

def copy(dst, src):
for (k, v) in src.items():
for (k, v) in six.iteritems(src):
if isinstance(v, dict):
d = {}
dst[k] = d
Expand All @@ -86,7 +90,7 @@ def copy(dst, src):
def decode(self, code):
log.debug('decode: %r, %r', self, code)
d = self.code2cid
for i in iter(code):
for i in six.iterbytes(code):
if i in d:
d = d[i]
if isinstance(d, int):
Expand All @@ -100,7 +104,7 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):
if code2cid is None:
code2cid = self.code2cid
code = ()
for (k, v) in sorted(code2cid.items()):
for (k, v) in sorted(six.iteritems(code2cid)):
c = code+(k,)
if isinstance(v, int):
out.write('code %r = cid %d\n' % (c, v))
Expand Down Expand Up @@ -144,7 +148,7 @@ def get_unichr(self, cid):
return self.cid2unichr[cid]

def dump(self, out=sys.stdout):
for (k, v) in sorted(self.cid2unichr.items()):
for (k, v) in sorted(six.iteritems(self.cid2unichr)):
out.write('cid %d = unicode %r\n' % (k, v))
return

Expand Down Expand Up @@ -179,7 +183,7 @@ def add_cid2unichr(self, cid, code):
# Interpret as UTF-16BE.
self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
elif isinstance(code, int):
self.cid2unichr[cid] = chr(code)
self.cid2unichr[cid] = six.unichr(code)
else:
raise TypeError(code)
return
Expand Down Expand Up @@ -207,7 +211,7 @@ def __init__(self, name, module, vertical):
return


class CMapDB:
class CMapDB(object):

_cmap_cache = {}
_umap_cache = {}
Expand Down
18 changes: 10 additions & 8 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import logging
import re
import sys
Expand All @@ -22,6 +23,7 @@
from .utils import bbox2str
from . import utils

import six

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -113,7 +115,7 @@ def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
graphicstate):
try:
text = font.to_unichr(cid)
assert isinstance(text, str), str(type(text))
assert isinstance(text, six.text_type), str(type(text))
except PDFUnicodeNotDefined:
text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid)
Expand Down Expand Up @@ -166,7 +168,7 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
self.outfp_binary = False
else:
try:
self.outfp.write("é")
self.outfp.write(u"é")
self.outfp_binary = False
except TypeError:
self.outfp_binary = True
Expand All @@ -184,7 +186,7 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,

def write_text(self, text):
text = utils.compatible_encode_method(text, self.codec, 'ignore')
if self.outfp_binary:
if six.PY3 and self.outfp_binary:
text = text.encode()
self.outfp.write(text)
return
Expand Down Expand Up @@ -283,7 +285,7 @@ def write_header(self):
return

def write_footer(self):
page_links = ['<a href="#{}">{}</a>'.format(i, i)
page_links = ['<a href="#%s">%s</a>' % (i, i)
for i in range(1, self.pageno)]
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
', '.join(page_links)
Expand Down Expand Up @@ -386,8 +388,8 @@ def render(item):
if self.showpageno:
self.write('<div style="position:absolute; top:%dpx;">' %
((self._yoffset-item.y1)*self.scale))
self.write('<a name="{}">Page {}</a></div>\n'
.format(item.pageid, item.pageid))
self.write('<a name="%s">Page %s</a></div>\n' % (
item.pageid, item.pageid))
for child in item:
render(child)
if item.groups is not None:
Expand Down Expand Up @@ -450,7 +452,7 @@ def close(self):

class XMLConverter(PDFConverter):

CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
imagewriter=None, stripcontrol=False):
Expand Down Expand Up @@ -481,7 +483,7 @@ def write_footer(self):

def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub('', text)
text = self.CONTROL.sub(u'', text)
self.write(enc(text))
return

Expand Down
8 changes: 5 additions & 3 deletions pdfminer/encodingdb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
import re

import six # Python 2+3 compatibility

from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING
from .psparser import PSLiteral
Expand Down Expand Up @@ -43,7 +45,7 @@ def name2unicode(name):
for i in range(0, len(name_without_uni), 4)]
for digit in unicode_digits:
raise_key_error_for_invalid_unicode(digit)
characters = map(chr, unicode_digits)
characters = map(six.unichr, unicode_digits)
return ''.join(characters)

elif name.startswith('u'):
Expand All @@ -53,7 +55,7 @@ def name2unicode(name):
4 <= len(name_without_u) <= 6:
unicode_digit = int(name_without_u, base=16)
raise_key_error_for_invalid_unicode(unicode_digit)
return chr(unicode_digit)
return six.unichr(unicode_digit)

raise KeyError('Could not convert unicode name "%s" to character because '
'it does not match specification' % name)
Expand All @@ -70,7 +72,7 @@ def raise_key_error_for_invalid_unicode(unicode_digit):
'it is in the range D800 through DFFF' % unicode_digit)


class EncodingDB:
class EncodingDB(object):

std2unicode = {}
mac2unicode = {}
Expand Down
Loading

0 comments on commit 1acb8cf

Please sign in to comment.