diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py
index 5c0e64c9..e64bdf21 100644
--- a/pdfminer/arcfour.py
+++ b/pdfminer/arcfour.py
@@ -1,22 +1,20 @@
-
-
""" Python implementation of Arcfour encryption algorithm.
See https://en.wikipedia.org/wiki/RC4
This code is in the public domain.
-
"""
-import six # Python 2+3 compatibility
-## Arcfour
-##
+import six
+
+
class Arcfour(object):
+ """ Python implementation of Arcfour encryption algorithm."""
def __init__(self, key):
- s = [i for i in range(256)] #because Py3 range is not indexable
+ s = [i for i in range(256)] # need a list: Py3 range is immutable
j = 0
klen = len(key)
for i in range(256):
- j = (j + s[i] + six.indexbytes(key,i % klen)) % 256
+ j = (j + s[i] + six.indexbytes(key, i % klen)) % 256
(s[i], s[j]) = (s[j], s[i])
self.s = s
(self.i, self.j) = (0, 0)
@@ -34,7 +32,8 @@ def process(self, data):
r += six.int2byte(c ^ k)
(self.i, self.j) = (i, j)
return r
-
+
encrypt = decrypt = process
+
new = Arcfour
diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py
index a9f501da..4c5dc347 100644
--- a/pdfminer/ascii85.py
+++ b/pdfminer/ascii85.py
@@ -1,34 +1,26 @@
-
-
-""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
-
+"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
This code is in the public domain.
-
"""
import re
import struct
-
-import six #Python 2+3 compatibility
+import six
-# ascii85decode(data)
def ascii85decode(data):
"""
In ASCII85 encoding, every four bytes are encoded with five ASCII
letters, using 85 different types of characters (as 256**4 < 85**5).
When the length of the original bytes is not a multiple of 4, a special
rule is used for round up.
-
The Adobe's ASCII85 implementation is slightly different from
its original in handling the last characters.
-
"""
n = b = 0
out = b''
for i in six.iterbytes(data):
- c=six.int2byte(i)
- if b'!' <= c and c <= b'u':
+ c = six.int2byte(i)
+ if b'!' <= c <= b'u':
n += 1
b = b*85+(ord(c)-33)
if n == 5:
@@ -45,6 +37,7 @@ def ascii85decode(data):
break
return out
+
# asciihexdecode(data)
hex_re = re.compile(b'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(b'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
@@ -60,15 +53,15 @@ def asciihexdecode(data):
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
"""
- def decode(x):
- i=int(x,16)
+ def decode(char):
+ i = int(char, 16)
return six.int2byte(i)
- out=b''
+ out = b''
for x in hex_re.findall(data):
- out+=decode(x)
+ out += decode(x)
m = trail_re.search(data)
if m:
- out+=decode(m.group(1)+b'0')
+ out += decode(m.group(1)+b'0')
return out
diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py
index efc34823..73a9f542 100644
--- a/pdfminer/ccitt.py
+++ b/pdfminer/ccitt.py
@@ -1,19 +1,16 @@
-
-# CCITT Fax decoder
-#
-# Bugs: uncompressed mode untested.
-#
-# cf.
-# ITU-T Recommendation T.4
-# "Standardization of Group 3 facsimile terminals for document transmission"
-# ITU-T Recommendation T.6
-# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS FOR GROUP 4 FACSIMILE APPARATUS"
-
+"""CCITT Fax decoder
+Bugs: uncompressed mode untested.
+ cf.
+ ITU-T Recommendation T.4
+ "Standardization of Group 3 facsimile terminals for document transmission"
+ ITU-T Recommendation T.6
+ "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
+ FOR GROUP 4 FACSIMILE APPARATUS"
+"""
import sys
import array
-
-import six #Python 2+3 compatibility
+import six
if six.PY3:
def get_bytes(data):
@@ -25,8 +22,6 @@ def get_bytes(data):
yield ord(char)
-## BitParser
-##
class BitParser(object):
def __init__(self):
@@ -34,7 +29,7 @@ def __init__(self):
return
@classmethod
- def add(klass, root, v, bits):
+ def add(cls, root, v, bits):
p = root
b = None
for i in range(len(bits)):
@@ -68,20 +63,18 @@ def _parse_bit(self, x):
return
-## CCITTG4Parser
-##
class CCITTG4Parser(BitParser):
MODE = [None, None]
- BitParser.add(MODE, 0, '1')
- BitParser.add(MODE, +1, '011')
- BitParser.add(MODE, -1, '010')
+ BitParser.add(MODE, 0, '1')
+ BitParser.add(MODE, +1, '011')
+ BitParser.add(MODE, -1, '010')
BitParser.add(MODE, 'h', '001')
BitParser.add(MODE, 'p', '0001')
- BitParser.add(MODE, +2, '000011')
- BitParser.add(MODE, -2, '000010')
- BitParser.add(MODE, +3, '0000011')
- BitParser.add(MODE, -3, '0000010')
+ BitParser.add(MODE, +2, '000011')
+ BitParser.add(MODE, -2, '000010')
+ BitParser.add(MODE, +3, '0000011')
+ BitParser.add(MODE, -3, '0000010')
BitParser.add(MODE, 'u', '0000001111')
BitParser.add(MODE, 'x1', '0000001000')
BitParser.add(MODE, 'x2', '0000001001')
@@ -93,85 +86,85 @@ class CCITTG4Parser(BitParser):
BitParser.add(MODE, 'e', '000000000001000000000001')
WHITE = [None, None]
- BitParser.add(WHITE, 0 , '00110101')
- BitParser.add(WHITE, 1 , '000111')
- BitParser.add(WHITE, 2 , '0111')
- BitParser.add(WHITE, 3 , '1000')
- BitParser.add(WHITE, 4 , '1011')
- BitParser.add(WHITE, 5 , '1100')
- BitParser.add(WHITE, 6 , '1110')
- BitParser.add(WHITE, 7 , '1111')
- BitParser.add(WHITE, 8 , '10011')
- BitParser.add(WHITE, 9 , '10100')
- BitParser.add(WHITE, 10 , '00111')
- BitParser.add(WHITE, 11 , '01000')
- BitParser.add(WHITE, 12 , '001000')
- BitParser.add(WHITE, 13 , '000011')
- BitParser.add(WHITE, 14 , '110100')
- BitParser.add(WHITE, 15 , '110101')
- BitParser.add(WHITE, 16 , '101010')
- BitParser.add(WHITE, 17 , '101011')
- BitParser.add(WHITE, 18 , '0100111')
- BitParser.add(WHITE, 19 , '0001100')
- BitParser.add(WHITE, 20 , '0001000')
- BitParser.add(WHITE, 21 , '0010111')
- BitParser.add(WHITE, 22 , '0000011')
- BitParser.add(WHITE, 23 , '0000100')
- BitParser.add(WHITE, 24 , '0101000')
- BitParser.add(WHITE, 25 , '0101011')
- BitParser.add(WHITE, 26 , '0010011')
- BitParser.add(WHITE, 27 , '0100100')
- BitParser.add(WHITE, 28 , '0011000')
- BitParser.add(WHITE, 29 , '00000010')
- BitParser.add(WHITE, 30 , '00000011')
- BitParser.add(WHITE, 31 , '00011010')
- BitParser.add(WHITE, 32 , '00011011')
- BitParser.add(WHITE, 33 , '00010010')
- BitParser.add(WHITE, 34 , '00010011')
- BitParser.add(WHITE, 35 , '00010100')
- BitParser.add(WHITE, 36 , '00010101')
- BitParser.add(WHITE, 37 , '00010110')
- BitParser.add(WHITE, 38 , '00010111')
- BitParser.add(WHITE, 39 , '00101000')
- BitParser.add(WHITE, 40 , '00101001')
- BitParser.add(WHITE, 41 , '00101010')
- BitParser.add(WHITE, 42 , '00101011')
- BitParser.add(WHITE, 43 , '00101100')
- BitParser.add(WHITE, 44 , '00101101')
- BitParser.add(WHITE, 45 , '00000100')
- BitParser.add(WHITE, 46 , '00000101')
- BitParser.add(WHITE, 47 , '00001010')
- BitParser.add(WHITE, 48 , '00001011')
- BitParser.add(WHITE, 49 , '01010010')
- BitParser.add(WHITE, 50 , '01010011')
- BitParser.add(WHITE, 51 , '01010100')
- BitParser.add(WHITE, 52 , '01010101')
- BitParser.add(WHITE, 53 , '00100100')
- BitParser.add(WHITE, 54 , '00100101')
- BitParser.add(WHITE, 55 , '01011000')
- BitParser.add(WHITE, 56 , '01011001')
- BitParser.add(WHITE, 57 , '01011010')
- BitParser.add(WHITE, 58 , '01011011')
- BitParser.add(WHITE, 59 , '01001010')
- BitParser.add(WHITE, 60 , '01001011')
- BitParser.add(WHITE, 61 , '00110010')
- BitParser.add(WHITE, 62 , '00110011')
- BitParser.add(WHITE, 63 , '00110100')
- BitParser.add(WHITE, 64 , '11011')
- BitParser.add(WHITE, 128 , '10010')
- BitParser.add(WHITE, 192 , '010111')
- BitParser.add(WHITE, 256 , '0110111')
- BitParser.add(WHITE, 320 , '00110110')
- BitParser.add(WHITE, 384 , '00110111')
- BitParser.add(WHITE, 448 , '01100100')
- BitParser.add(WHITE, 512 , '01100101')
- BitParser.add(WHITE, 576 , '01101000')
- BitParser.add(WHITE, 640 , '01100111')
- BitParser.add(WHITE, 704 , '011001100')
- BitParser.add(WHITE, 768 , '011001101')
- BitParser.add(WHITE, 832 , '011010010')
- BitParser.add(WHITE, 896 , '011010011')
- BitParser.add(WHITE, 960 , '011010100')
+ BitParser.add(WHITE, 0, '00110101')
+ BitParser.add(WHITE, 1, '000111')
+ BitParser.add(WHITE, 2, '0111')
+ BitParser.add(WHITE, 3, '1000')
+ BitParser.add(WHITE, 4, '1011')
+ BitParser.add(WHITE, 5, '1100')
+ BitParser.add(WHITE, 6, '1110')
+ BitParser.add(WHITE, 7, '1111')
+ BitParser.add(WHITE, 8, '10011')
+ BitParser.add(WHITE, 9, '10100')
+ BitParser.add(WHITE, 10, '00111')
+ BitParser.add(WHITE, 11, '01000')
+ BitParser.add(WHITE, 12, '001000')
+ BitParser.add(WHITE, 13, '000011')
+ BitParser.add(WHITE, 14, '110100')
+ BitParser.add(WHITE, 15, '110101')
+ BitParser.add(WHITE, 16, '101010')
+ BitParser.add(WHITE, 17, '101011')
+ BitParser.add(WHITE, 18, '0100111')
+ BitParser.add(WHITE, 19, '0001100')
+ BitParser.add(WHITE, 20, '0001000')
+ BitParser.add(WHITE, 21, '0010111')
+ BitParser.add(WHITE, 22, '0000011')
+ BitParser.add(WHITE, 23, '0000100')
+ BitParser.add(WHITE, 24, '0101000')
+ BitParser.add(WHITE, 25, '0101011')
+ BitParser.add(WHITE, 26, '0010011')
+ BitParser.add(WHITE, 27, '0100100')
+ BitParser.add(WHITE, 28, '0011000')
+ BitParser.add(WHITE, 29, '00000010')
+ BitParser.add(WHITE, 30, '00000011')
+ BitParser.add(WHITE, 31, '00011010')
+ BitParser.add(WHITE, 32, '00011011')
+ BitParser.add(WHITE, 33, '00010010')
+ BitParser.add(WHITE, 34, '00010011')
+ BitParser.add(WHITE, 35, '00010100')
+ BitParser.add(WHITE, 36, '00010101')
+ BitParser.add(WHITE, 37, '00010110')
+ BitParser.add(WHITE, 38, '00010111')
+ BitParser.add(WHITE, 39, '00101000')
+ BitParser.add(WHITE, 40, '00101001')
+ BitParser.add(WHITE, 41, '00101010')
+ BitParser.add(WHITE, 42, '00101011')
+ BitParser.add(WHITE, 43, '00101100')
+ BitParser.add(WHITE, 44, '00101101')
+ BitParser.add(WHITE, 45, '00000100')
+ BitParser.add(WHITE, 46, '00000101')
+ BitParser.add(WHITE, 47, '00001010')
+ BitParser.add(WHITE, 48, '00001011')
+ BitParser.add(WHITE, 49, '01010010')
+ BitParser.add(WHITE, 50, '01010011')
+ BitParser.add(WHITE, 51, '01010100')
+ BitParser.add(WHITE, 52, '01010101')
+ BitParser.add(WHITE, 53, '00100100')
+ BitParser.add(WHITE, 54, '00100101')
+ BitParser.add(WHITE, 55, '01011000')
+ BitParser.add(WHITE, 56, '01011001')
+ BitParser.add(WHITE, 57, '01011010')
+ BitParser.add(WHITE, 58, '01011011')
+ BitParser.add(WHITE, 59, '01001010')
+ BitParser.add(WHITE, 60, '01001011')
+ BitParser.add(WHITE, 61, '00110010')
+ BitParser.add(WHITE, 62, '00110011')
+ BitParser.add(WHITE, 63, '00110100')
+ BitParser.add(WHITE, 64, '11011')
+ BitParser.add(WHITE, 128, '10010')
+ BitParser.add(WHITE, 192, '010111')
+ BitParser.add(WHITE, 256, '0110111')
+ BitParser.add(WHITE, 320, '00110110')
+ BitParser.add(WHITE, 384, '00110111')
+ BitParser.add(WHITE, 448, '01100100')
+ BitParser.add(WHITE, 512, '01100101')
+ BitParser.add(WHITE, 576, '01101000')
+ BitParser.add(WHITE, 640, '01100111')
+ BitParser.add(WHITE, 704, '011001100')
+ BitParser.add(WHITE, 768, '011001101')
+ BitParser.add(WHITE, 832, '011010010')
+ BitParser.add(WHITE, 896, '011010011')
+ BitParser.add(WHITE, 960, '011010100')
BitParser.add(WHITE, 1024, '011010101')
BitParser.add(WHITE, 1088, '011010110')
BitParser.add(WHITE, 1152, '011010111')
@@ -199,85 +192,85 @@ class CCITTG4Parser(BitParser):
BitParser.add(WHITE, 2560, '000000011111')
BLACK = [None, None]
- BitParser.add(BLACK, 0 , '0000110111')
- BitParser.add(BLACK, 1 , '010')
- BitParser.add(BLACK, 2 , '11')
- BitParser.add(BLACK, 3 , '10')
- BitParser.add(BLACK, 4 , '011')
- BitParser.add(BLACK, 5 , '0011')
- BitParser.add(BLACK, 6 , '0010')
- BitParser.add(BLACK, 7 , '00011')
- BitParser.add(BLACK, 8 , '000101')
- BitParser.add(BLACK, 9 , '000100')
- BitParser.add(BLACK, 10 , '0000100')
- BitParser.add(BLACK, 11 , '0000101')
- BitParser.add(BLACK, 12 , '0000111')
- BitParser.add(BLACK, 13 , '00000100')
- BitParser.add(BLACK, 14 , '00000111')
- BitParser.add(BLACK, 15 , '000011000')
- BitParser.add(BLACK, 16 , '0000010111')
- BitParser.add(BLACK, 17 , '0000011000')
- BitParser.add(BLACK, 18 , '0000001000')
- BitParser.add(BLACK, 19 , '00001100111')
- BitParser.add(BLACK, 20 , '00001101000')
- BitParser.add(BLACK, 21 , '00001101100')
- BitParser.add(BLACK, 22 , '00000110111')
- BitParser.add(BLACK, 23 , '00000101000')
- BitParser.add(BLACK, 24 , '00000010111')
- BitParser.add(BLACK, 25 , '00000011000')
- BitParser.add(BLACK, 26 , '000011001010')
- BitParser.add(BLACK, 27 , '000011001011')
- BitParser.add(BLACK, 28 , '000011001100')
- BitParser.add(BLACK, 29 , '000011001101')
- BitParser.add(BLACK, 30 , '000001101000')
- BitParser.add(BLACK, 31 , '000001101001')
- BitParser.add(BLACK, 32 , '000001101010')
- BitParser.add(BLACK, 33 , '000001101011')
- BitParser.add(BLACK, 34 , '000011010010')
- BitParser.add(BLACK, 35 , '000011010011')
- BitParser.add(BLACK, 36 , '000011010100')
- BitParser.add(BLACK, 37 , '000011010101')
- BitParser.add(BLACK, 38 , '000011010110')
- BitParser.add(BLACK, 39 , '000011010111')
- BitParser.add(BLACK, 40 , '000001101100')
- BitParser.add(BLACK, 41 , '000001101101')
- BitParser.add(BLACK, 42 , '000011011010')
- BitParser.add(BLACK, 43 , '000011011011')
- BitParser.add(BLACK, 44 , '000001010100')
- BitParser.add(BLACK, 45 , '000001010101')
- BitParser.add(BLACK, 46 , '000001010110')
- BitParser.add(BLACK, 47 , '000001010111')
- BitParser.add(BLACK, 48 , '000001100100')
- BitParser.add(BLACK, 49 , '000001100101')
- BitParser.add(BLACK, 50 , '000001010010')
- BitParser.add(BLACK, 51 , '000001010011')
- BitParser.add(BLACK, 52 , '000000100100')
- BitParser.add(BLACK, 53 , '000000110111')
- BitParser.add(BLACK, 54 , '000000111000')
- BitParser.add(BLACK, 55 , '000000100111')
- BitParser.add(BLACK, 56 , '000000101000')
- BitParser.add(BLACK, 57 , '000001011000')
- BitParser.add(BLACK, 58 , '000001011001')
- BitParser.add(BLACK, 59 , '000000101011')
- BitParser.add(BLACK, 60 , '000000101100')
- BitParser.add(BLACK, 61 , '000001011010')
- BitParser.add(BLACK, 62 , '000001100110')
- BitParser.add(BLACK, 63 , '000001100111')
- BitParser.add(BLACK, 64 , '0000001111')
- BitParser.add(BLACK, 128 , '000011001000')
- BitParser.add(BLACK, 192 , '000011001001')
- BitParser.add(BLACK, 256 , '000001011011')
- BitParser.add(BLACK, 320 , '000000110011')
- BitParser.add(BLACK, 384 , '000000110100')
- BitParser.add(BLACK, 448 , '000000110101')
- BitParser.add(BLACK, 512 , '0000001101100')
- BitParser.add(BLACK, 576 , '0000001101101')
- BitParser.add(BLACK, 640 , '0000001001010')
- BitParser.add(BLACK, 704 , '0000001001011')
- BitParser.add(BLACK, 768 , '0000001001100')
- BitParser.add(BLACK, 832 , '0000001001101')
- BitParser.add(BLACK, 896 , '0000001110010')
- BitParser.add(BLACK, 960 , '0000001110011')
+ BitParser.add(BLACK, 0, '0000110111')
+ BitParser.add(BLACK, 1, '010')
+ BitParser.add(BLACK, 2, '11')
+ BitParser.add(BLACK, 3, '10')
+ BitParser.add(BLACK, 4, '011')
+ BitParser.add(BLACK, 5, '0011')
+ BitParser.add(BLACK, 6, '0010')
+ BitParser.add(BLACK, 7, '00011')
+ BitParser.add(BLACK, 8, '000101')
+ BitParser.add(BLACK, 9, '000100')
+ BitParser.add(BLACK, 10, '0000100')
+ BitParser.add(BLACK, 11, '0000101')
+ BitParser.add(BLACK, 12, '0000111')
+ BitParser.add(BLACK, 13, '00000100')
+ BitParser.add(BLACK, 14, '00000111')
+ BitParser.add(BLACK, 15, '000011000')
+ BitParser.add(BLACK, 16, '0000010111')
+ BitParser.add(BLACK, 17, '0000011000')
+ BitParser.add(BLACK, 18, '0000001000')
+ BitParser.add(BLACK, 19, '00001100111')
+ BitParser.add(BLACK, 20, '00001101000')
+ BitParser.add(BLACK, 21, '00001101100')
+ BitParser.add(BLACK, 22, '00000110111')
+ BitParser.add(BLACK, 23, '00000101000')
+ BitParser.add(BLACK, 24, '00000010111')
+ BitParser.add(BLACK, 25, '00000011000')
+ BitParser.add(BLACK, 26, '000011001010')
+ BitParser.add(BLACK, 27, '000011001011')
+ BitParser.add(BLACK, 28, '000011001100')
+ BitParser.add(BLACK, 29, '000011001101')
+ BitParser.add(BLACK, 30, '000001101000')
+ BitParser.add(BLACK, 31, '000001101001')
+ BitParser.add(BLACK, 32, '000001101010')
+ BitParser.add(BLACK, 33, '000001101011')
+ BitParser.add(BLACK, 34, '000011010010')
+ BitParser.add(BLACK, 35, '000011010011')
+ BitParser.add(BLACK, 36, '000011010100')
+ BitParser.add(BLACK, 37, '000011010101')
+ BitParser.add(BLACK, 38, '000011010110')
+ BitParser.add(BLACK, 39, '000011010111')
+ BitParser.add(BLACK, 40, '000001101100')
+ BitParser.add(BLACK, 41, '000001101101')
+ BitParser.add(BLACK, 42, '000011011010')
+ BitParser.add(BLACK, 43, '000011011011')
+ BitParser.add(BLACK, 44, '000001010100')
+ BitParser.add(BLACK, 45, '000001010101')
+ BitParser.add(BLACK, 46, '000001010110')
+ BitParser.add(BLACK, 47, '000001010111')
+ BitParser.add(BLACK, 48, '000001100100')
+ BitParser.add(BLACK, 49, '000001100101')
+ BitParser.add(BLACK, 50, '000001010010')
+ BitParser.add(BLACK, 51, '000001010011')
+ BitParser.add(BLACK, 52, '000000100100')
+ BitParser.add(BLACK, 53, '000000110111')
+ BitParser.add(BLACK, 54, '000000111000')
+ BitParser.add(BLACK, 55, '000000100111')
+ BitParser.add(BLACK, 56, '000000101000')
+ BitParser.add(BLACK, 57, '000001011000')
+ BitParser.add(BLACK, 58, '000001011001')
+ BitParser.add(BLACK, 59, '000000101011')
+ BitParser.add(BLACK, 60, '000000101100')
+ BitParser.add(BLACK, 61, '000001011010')
+ BitParser.add(BLACK, 62, '000001100110')
+ BitParser.add(BLACK, 63, '000001100111')
+ BitParser.add(BLACK, 64, '0000001111')
+ BitParser.add(BLACK, 128, '000011001000')
+ BitParser.add(BLACK, 192, '000011001001')
+ BitParser.add(BLACK, 256, '000001011011')
+ BitParser.add(BLACK, 320, '000000110011')
+ BitParser.add(BLACK, 384, '000000110100')
+ BitParser.add(BLACK, 448, '000000110101')
+ BitParser.add(BLACK, 512, '0000001101100')
+ BitParser.add(BLACK, 576, '0000001101101')
+ BitParser.add(BLACK, 640, '0000001001010')
+ BitParser.add(BLACK, 704, '0000001001011')
+ BitParser.add(BLACK, 768, '0000001001100')
+ BitParser.add(BLACK, 832, '0000001001101')
+ BitParser.add(BLACK, 896, '0000001110010')
+ BitParser.add(BLACK, 960, '0000001110011')
BitParser.add(BLACK, 1024, '0000001110100')
BitParser.add(BLACK, 1088, '0000001110101')
BitParser.add(BLACK, 1152, '0000001110110')
@@ -434,7 +427,7 @@ def reset(self):
return
def output_line(self, y, bits):
- print (y, ''.join(str(b) for b in bits))
+ print(y, ''.join(str(b) for b in bits))
return
def _reset_line(self):
@@ -454,12 +447,13 @@ def _flush_line(self):
return
def _do_vertical(self, dx):
- #print '* vertical(%d): curpos=%r, color=%r' % (dx, self._curpos, self._color)
- #print ' refline:', self._get_refline(self._curpos+1)
+ # print '* vertical(%d): curpos=%r, color=%r'
+ # % (dx, self._curpos, self._color)
+ # print ' refline:', self._get_refline(self._curpos+1)
x1 = self._curpos+1
while 1:
if x1 == 0:
- if (self._color == 1 and self._refline[x1] != self._color):
+ if self._color == 1 and self._refline[x1] != self._color:
break
elif x1 == len(self._refline):
break
@@ -481,12 +475,12 @@ def _do_vertical(self, dx):
return
def _do_pass(self):
- #print '* pass: curpos=%r, color=%r' % (self._curpos, self._color)
- #print ' refline:', self._get_refline(self._curpos+1)
+ # print '* pass: curpos=%r, color=%r' % (self._curpos, self._color)
+ # print ' refline:', self._get_refline(self._curpos+1)
x1 = self._curpos+1
while 1:
if x1 == 0:
- if (self._color == 1 and self._refline[x1] != self._color):
+ if self._color == 1 and self._refline[x1] != self._color:
break
elif x1 == len(self._refline):
break
@@ -496,7 +490,7 @@ def _do_pass(self):
x1 += 1
while 1:
if x1 == 0:
- if (self._color == 0 and self._refline[x1] == self._color):
+ if self._color == 0 and self._refline[x1] == self._color:
break
elif x1 == len(self._refline):
break
@@ -510,7 +504,8 @@ def _do_pass(self):
return
def _do_horizontal(self, n1, n2):
- #print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color)
+ # print '* horizontal(%d,%d): curpos=%r, color=%r'
+ # % (n1, n2, self._curpos, self._color)
if self._curpos < 0:
self._curpos = 0
x = self._curpos
@@ -528,7 +523,7 @@ def _do_horizontal(self, n1, n2):
return
def _do_uncompressed(self, bits):
- #print '* uncompressed(%r): curpos=%r' % (bits, self._curpos)
+ # print '* uncompressed(%r): curpos=%r' % (bits, self._curpos)
for c in bits:
self._curline[self._curpos] = int(c)
self._curpos += 1
@@ -536,8 +531,6 @@ def _do_uncompressed(self, bits):
return
-
-
class CCITTFaxDecoder(CCITTG4Parser):
def __init__(self, width, bytealign=False, reversed=False):
@@ -606,5 +599,6 @@ def close(self):
fp.close()
return
+
if __name__ == '__main__':
sys.exit(main(sys.argv))
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
index 19dddf35..2697a55d 100644
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@@ -1,14 +1,10 @@
-
-
""" Adobe character mapping (CMap) support.
CMaps provide the mapping between character codes and Unicode
code-points to character ids (CIDs).
More information is available on the Adobe website:
-
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
-
"""
import sys
@@ -21,6 +17,7 @@
import pickle as pickle
import struct
import logging
+import six
from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import PSEOF
@@ -31,7 +28,6 @@
from .utils import choplist
from .utils import nunpack
-import six #Python 2+3 compatibility
log = logging.getLogger(__name__)
@@ -40,8 +36,6 @@ class CMapError(Exception):
pass
-## CMapBase
-##
class CMapBase(object):
debug = 0
@@ -67,8 +61,6 @@ def use_cmap(self, cmap):
return
-## CMap
-##
class CMap(CMapBase):
def __init__(self, **kwargs):
@@ -119,8 +111,6 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):
return
-## IdentityCMap
-##
class IdentityCMap(CMapBase):
def decode(self, code):
@@ -131,8 +121,6 @@ def decode(self, code):
return ()
-## UnicodeMap
-##
class UnicodeMap(CMapBase):
def __init__(self, **kwargs):
@@ -153,12 +141,11 @@ def dump(self, out=sys.stdout):
return
-## FileCMap
-##
class FileCMap(CMap):
def add_code2cid(self, code, cid):
- assert isinstance(code, str) and isinstance(cid, int), str((type(code), type(cid)))
+ assert isinstance(code, str) and isinstance(cid, int),\
+ str((type(code), type(cid)))
d = self.code2cid
for c in code[:-1]:
c = ord(c)
@@ -173,8 +160,6 @@ def add_code2cid(self, code, cid):
return
-## FileUnicodeMap
-##
class FileUnicodeMap(UnicodeMap):
def add_cid2unichr(self, cid, code):
@@ -192,8 +177,6 @@ def add_cid2unichr(self, cid, code):
return
-## PyCMap
-##
class PyCMap(CMap):
def __init__(self, name, module):
@@ -204,8 +187,6 @@ def __init__(self, name, module):
return
-## PyUnicodeMap
-##
class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical):
@@ -218,8 +199,6 @@ def __init__(self, name, module, vertical):
return
-## CMapDB
-##
class CMapDB(object):
_cmap_cache = {}
@@ -260,18 +239,17 @@ def get_cmap(klass, name):
return cmap
@classmethod
- def get_unicode_map(klass, name, vertical=False):
+ def get_unicode_map(cls, name, vertical=False):
try:
- return klass._umap_cache[name][vertical]
+ return cls._umap_cache[name][vertical]
except KeyError:
pass
- data = klass._load_data('to-unicode-%s' % name)
- klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
+ data = cls._load_data('to-unicode-%s' % name)
+ cls._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v)
+ for v in (False, True)]
return umaps[vertical]
-## CMapParser
-##
class CMapParser(PSStackParser):
def __init__(self, cmap, fp):
@@ -315,7 +293,7 @@ def do_keyword(self, pos, token):
return
if not self._in_cmap:
return
- #
+
if token is self.KEYWORD_DEF:
try:
((_, k), (_, v)) = self.pop(2)
@@ -347,8 +325,9 @@ def do_keyword(self, pos, token):
if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
- if (not isinstance(s, str) or not isinstance(e, str) or
- not isinstance(cid, int) or len(s) != len(e)):
+ if (not isinstance(s, str) or
+ not isinstance(e, str) or
+ not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4]
eprefix = e[:-4]
@@ -359,7 +338,7 @@ def do_keyword(self, pos, token):
s1 = nunpack(svar)
e1 = nunpack(evar)
vlen = len(svar)
- #assert s1 <= e1, str((s1, e1))
+ # assert s1 <= e1, str((s1, e1))
for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
@@ -381,12 +360,12 @@ def do_keyword(self, pos, token):
if token is self.KEYWORD_ENDBFRANGE:
objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs):
- if (not isinstance(s, bytes) or not isinstance(e, bytes) or
- len(s) != len(e)):
- continue
+ if (not isinstance(s, bytes) or
+ not isinstance(e, bytes) or len(s) != len(e)):
+ continue
s1 = nunpack(s)
e1 = nunpack(e)
- #assert s1 <= e1, str((s1, e1))
+ # assert s1 <= e1, str((s1, e1))
if isinstance(code, list):
for i in range(e1-s1+1):
self.cmap.add_cid2unichr(s1+i, code[i])
@@ -427,11 +406,12 @@ def main(argv):
for fname in args:
fp = file(fname, 'rb')
cmap = FileUnicodeMap()
- #cmap = FileCMap()
+ # cmap = FileCMap()
CMapParser(cmap, fp).run()
fp.close()
cmap.dump()
return
+
if __name__ == '__main__':
sys.exit(main(sys.argv))
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 02545e83..43554265 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
import logging
import re
+import six
from .pdfdevice import PDFTextDevice
from .pdffont import PDFUnicodeNotDefined
from .layout import LTContainer
@@ -23,13 +24,10 @@
from .utils import bbox2str
from . import utils
-import six # Python 2+3 compatibility
log = logging.getLogger(__name__)
-## PDFLayoutAnalyzer
-##
class PDFLayoutAnalyzer(PDFTextDevice):
def __init__(self, rsrcmgr, pageno=1, laparams=None):
@@ -86,7 +84,8 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
if x0 == x1 or y0 == y1:
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1),
- stroke, fill, evenodd, gstate.scolor, gstate.ncolor))
+ stroke, fill, evenodd, gstate.scolor,
+ gstate.ncolor))
return
if shape == 'mlllh':
# rectangle
@@ -99,9 +98,10 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
- (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
+ (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2),
- stroke, fill, evenodd, gstate.scolor, gstate.ncolor))
+ stroke, fill, evenodd, gstate.scolor,
+ gstate.ncolor))
return
# other shapes
pts = []
@@ -109,7 +109,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
for i in range(1, len(p), 2):
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill,
- evenodd, gstate.scolor, gstate.ncolor))
+ evenodd, gstate.scolor, gstate.ncolor))
return
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
@@ -120,7 +120,8 @@ def render_char(self, matrix, font, fontsize, scaling, rise, cid):
text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid)
textdisp = font.char_disp(cid)
- item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
+ item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth,
+ textdisp)
self.cur_item.add(item)
return item.adv
@@ -132,12 +133,11 @@ def receive_layout(self, ltpage):
return
-## PDFPageAggregator
-##
class PDFPageAggregator(PDFLayoutAnalyzer):
def __init__(self, rsrcmgr, pageno=1, laparams=None):
- PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
+ PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
+ laparams=laparams)
self.result = None
return
@@ -149,12 +149,11 @@ def get_result(self):
return self.result
-## PDFConverter
-##
class PDFConverter(PDFLayoutAnalyzer):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
- PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
+ PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
+ laparams=laparams)
self.outfp = outfp
self.codec = codec
if hasattr(self.outfp, 'mode'):
@@ -177,13 +176,12 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
return
-## TextConverter
-##
class TextConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False, imagewriter=None):
- PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
+ PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec,
+ pageno=pageno, laparams=laparams)
self.showpageno = showpageno
self.imagewriter = imagewriter
return
@@ -226,12 +224,10 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
return
-## HTMLConverter
-##
class HTMLConverter(PDFConverter):
RECT_COLORS = {
- #'char': 'green',
+ # 'char': 'green',
'figure': 'yellow',
'textline': 'magenta',
'textbox': 'cyan',
@@ -250,7 +246,8 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
pagemargin=50, imagewriter=None, debug=0,
rect_colors={'curve': 'black', 'page': 'gray'},
text_colors={'char': 'black'}):
- PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
+ PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
+ laparams=laparams)
self.scale = scale
self.fontscale = fontscale
self.layoutmode = layoutmode
@@ -277,15 +274,19 @@ def write(self, text):
def write_header(self):
self.write('
\n')
if self.codec:
- self.write('\n' % self.codec)
+ self.write('\n' % self.codec)
else:
- self.write('\n')
+ self.write('\n')
self.write('\n')
return
def write_footer(self):
- self.write('Page: %s
\n' %
- ', '.join('%s' % (i, i) for i in range(1, self.pageno)))
+ self.write(''
+ 'Page: %s
\n' %
+ ', '.join('%s' %
+ (i, i) for i in range(1, self.pageno)))
self.write('\n')
return
@@ -296,22 +297,24 @@ def write_text(self, text):
def place_rect(self, color, borderwidth, x, y, w, h):
color = self.rect_colors.get(color)
if color is not None:
- self.write('\n' %
+ self.write('\n' %
(color, borderwidth,
x*self.scale, (self._yoffset-y)*self.scale,
w*self.scale, h*self.scale))
return
def place_border(self, color, borderwidth, item):
- self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
+ self.place_rect(color, borderwidth, item.x0, item.y1,
+ item.width, item.height)
return
def place_image(self, item, borderwidth, x, y, w, h):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
- self.write('\n' %
+ self.write('\n' %
(enc(name, None), borderwidth,
x*self.scale, (self._yoffset-y)*self.scale,
w*self.scale, h*self.scale))
@@ -320,8 +323,10 @@ def place_image(self, item, borderwidth, x, y, w, h):
def place_text(self, color, text, x, y, size):
color = self.text_colors.get(color)
if color is not None:
- self.write('' %
- (color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale))
+ self.write('' %
+ (color, x*self.scale, (self._yoffset-y)*self.scale,
+ size*self.scale*self.fontscale))
self.write_text(text)
self.write('\n')
return
@@ -329,14 +334,15 @@ def place_text(self, color, text, x, y, size):
def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
self._fontstack.append(self._font)
self._font = None
- self.write('' %
(color, borderwidth, writing_mode,
x*self.scale, (self._yoffset-y)*self.scale,
w*self.scale, h*self.scale))
return
- def end_div(self, color):
+ def end_div(self, _):
if self._font is not None:
self.write('')
self._font = self._fontstack.pop()
@@ -373,7 +379,8 @@ def render(item):
if self.showpageno:
self.write('
' %
((self._yoffset-item.y1)*self.scale))
- self.write('
Page %s\n' % (item.pageid, item.pageid))
+ self.write('
Page %s \n' %
+ (item.pageid, item.pageid))
for child in item:
render(child)
if item.groups is not None:
@@ -382,12 +389,14 @@ def render(item):
elif isinstance(item, LTCurve):
self.place_border('curve', 1, item)
elif isinstance(item, LTFigure):
- self.begin_div('figure', 1, item.x0, item.y1, item.width, item.height)
+ self.begin_div('figure', 1, item.x0, item.y1,
+ item.width, item.height)
for child in item:
render(child)
self.end_div('figure')
elif isinstance(item, LTImage):
- self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
+ self.place_image(item, 1, item.x0, item.y1,
+ item.width, item.height)
else:
if self.layoutmode == 'exact':
if isinstance(item, LTTextLine):
@@ -396,12 +405,14 @@ def render(item):
render(child)
elif isinstance(item, LTTextBox):
self.place_border('textbox', 1, item)
- self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
+ self.place_text('textbox', str(item.index+1),
+ item.x0, item.y1, 20)
for child in item:
render(child)
elif isinstance(item, LTChar):
self.place_border('char', 1, item)
- self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
+ self.place_text('char', item.get_text(), item.x0,
+ item.y1, item.size)
else:
if isinstance(item, LTTextLine):
for child in item:
@@ -409,13 +420,15 @@ def render(item):
if self.layoutmode != 'loose':
self.put_newline()
elif isinstance(item, LTTextBox):
- self.begin_div('textbox', 1, item.x0, item.y1, item.width, item.height,
+ self.begin_div('textbox', 1, item.x0, item.y1,
+ item.width, item.height,
item.get_writing_mode())
for child in item:
render(child)
self.end_div('textbox')
elif isinstance(item, LTChar):
- self.put_text(item.get_text(), item.fontname, item.size)
+ self.put_text(item.get_text(), item.fontname,
+ item.size)
elif isinstance(item, LTText):
self.write_text(item.get_text())
return
@@ -428,15 +441,14 @@ def close(self):
return
-## XMLConverter
-##
class XMLConverter(PDFConverter):
CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None, imagewriter=None, stripcontrol=False):
- PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
+ PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
+ laparams=laparams)
self.imagewriter = imagewriter
self.stripcontrol = stripcontrol
self.write_header()
@@ -470,7 +482,7 @@ def receive_layout(self, ltpage):
def show_group(item):
if isinstance(item, LTTextBox):
self.write('\n' %
- (item.index, bbox2str(item.bbox)))
+ (item.index, bbox2str(item.bbox)))
elif isinstance(item, LTTextGroup):
self.write('\n' % bbox2str(item.bbox))
for child in item:
@@ -481,7 +493,7 @@ def show_group(item):
def render(item):
if isinstance(item, LTPage):
self.write('\n' %
- (item.pageid, bbox2str(item.bbox), item.rotate))
+ (item.pageid, bbox2str(item.bbox), item.rotate))
for child in item:
render(child)
if item.groups is not None:
@@ -492,16 +504,17 @@ def render(item):
self.write('\n')
elif isinstance(item, LTLine):
self.write('\n' %
- (item.linewidth, bbox2str(item.bbox)))
+ (item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTRect):
self.write('\n' %
- (item.linewidth, bbox2str(item.bbox)))
+ (item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTCurve):
self.write('\n' %
- (item.linewidth, bbox2str(item.bbox), item.get_pts()))
+ (item.linewidth, bbox2str(item.bbox),
+ item.get_pts()))
elif isinstance(item, LTFigure):
self.write('\n')
@@ -515,13 +528,14 @@ def render(item):
if isinstance(item, LTTextBoxVertical):
wmode = ' wmode="vertical"'
self.write('\n' %
- (item.index, bbox2str(item.bbox), wmode))
+ (item.index, bbox2str(item.bbox), wmode))
for child in item:
render(child)
self.write('\n')
elif isinstance(item, LTChar):
self.write('' %
- (enc(item.fontname, None), bbox2str(item.bbox), item.size))
+ (enc(item.fontname, None), bbox2str(item.bbox),
+ item.size))
self.write_text(item.get_text())
self.write('\n')
elif isinstance(item, LTText):
@@ -530,10 +544,10 @@ def render(item):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.write('\n' %
- (enc(name, None), item.width, item.height))
+ (enc(name, None), item.width, item.height))
else:
self.write('\n' %
- (item.width, item.height))
+ (item.width, item.height))
else:
assert False, str(('Unhandled', item))
return
diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
index 870bd28e..9bb9e449 100644
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -1,16 +1,14 @@
import re
+import six
from .psparser import PSLiteral
from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING
-import six # Python 2+3 compatibility
STRIP_NAME = re.compile(r'[0-9]+')
-## name2unicode
-##
def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers."""
if name in glyphname2unicode:
@@ -21,8 +19,6 @@ def name2unicode(name):
return six.unichr(int(m.group(0)))
-## EncodingDB
-##
class EncodingDB(object):
std2unicode = {}
@@ -48,8 +44,8 @@ class EncodingDB(object):
}
@classmethod
- def get_encoding(klass, name, diff=None):
- cid2unicode = klass.encodings.get(name, klass.std2unicode)
+ def get_encoding(cls, name, diff=None):
+ cid2unicode = cls.encodings.get(name, cls.std2unicode)
if diff:
cid2unicode = cid2unicode.copy()
cid = 0
diff --git a/pdfminer/fontmetrics.py b/pdfminer/fontmetrics.py
index 8b3779ac..27d26cf9 100644
--- a/pdfminer/fontmetrics.py
+++ b/pdfminer/fontmetrics.py
@@ -1,5 +1,3 @@
-
-
""" Font metrics for the Adobe core 14 fonts.
Font metrics are used to compute the boundary of each character
@@ -11,8 +9,7 @@
"""
-### BEGIN Verbatim copy of the license part
-
+# BEGIN Verbatim copy of the license part
#
# Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe
#
@@ -25,8 +22,7 @@
# paragraph is not modified. Adobe Systems has no responsibility or
# obligation to support the use of the AFM files.
#
-
-### END Verbatim copy of the license part
+# END Verbatim copy of the license part
FONT_METRICS = {
'Courier': ({'FontName': 'Courier', 'Descent': -194.0, 'FontBBox': (-6.0, -249.0, 639.0, 803.0), 'FontWeight': 'Medium', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {u' ': 600, u'!': 600, u'"': 600, u'#': 600, u'$': 600, u'%': 600, u'&': 600, u"'": 600, u'(': 600, u')': 600, u'*': 600, u'+': 600, u',': 600, u'-': 600, u'.': 600, u'/': 600, u'0': 600, u'1': 600, u'2': 600, u'3': 600, u'4': 600, u'5': 600, u'6': 600, u'7': 600, u'8': 600, u'9': 600, u':': 600, u';': 600, u'<': 600, u'=': 600, u'>': 600, u'?': 600, u'@': 600, u'A': 600, u'B': 600, u'C': 600, u'D': 600, u'E': 600, u'F': 600, u'G': 600, u'H': 600, u'I': 600, u'J': 600, u'K': 600, u'L': 600, u'M': 600, u'N': 600, u'O': 600, u'P': 600, u'Q': 600, u'R': 600, u'S': 600, u'T': 600, u'U': 600, u'V': 600, u'W': 600, u'X': 600, u'Y': 600, u'Z': 600, u'[': 600, u'\\': 600, u']': 600, u'^': 600, u'_': 600, u'`': 600, u'a': 600, u'b': 600, u'c': 600, u'd': 600, u'e': 600, u'f': 600, u'g': 600, u'h': 600, u'i': 600, u'j': 600, u'k': 600, u'l': 600, u'm': 600, u'n': 600, u'o': 600, u'p': 600, u'q': 600, u'r': 600, u's': 600, u't': 600, u'u': 600, u'v': 600, u'w': 600, u'x': 600, u'y': 600, u'z': 600, u'{': 600, u'|': 600, u'}': 600, u'~': 600, u'\xa1': 600, u'\xa2': 600, u'\xa3': 600, u'\xa4': 600, u'\xa5': 600, u'\xa6': 600, u'\xa7': 600, u'\xa8': 600, u'\xa9': 600, u'\xaa': 600, u'\xab': 600, u'\xac': 600, u'\xae': 600, u'\xaf': 600, u'\xb0': 600, u'\xb1': 600, u'\xb2': 600, u'\xb3': 600, u'\xb4': 600, u'\xb5': 600, u'\xb6': 600, u'\xb7': 600, u'\xb8': 600, u'\xb9': 600, u'\xba': 600, u'\xbb': 600, u'\xbc': 600, u'\xbd': 600, u'\xbe': 600, u'\xbf': 600, u'\xc0': 600, u'\xc1': 600, u'\xc2': 600, u'\xc3': 600, u'\xc4': 600, u'\xc5': 600, u'\xc6': 600, u'\xc7': 600, u'\xc8': 600, u'\xc9': 600, u'\xca': 600, u'\xcb': 600, u'\xcc': 600, u'\xcd': 600, u'\xce': 600, u'\xcf': 600, u'\xd0': 600, u'\xd1': 600, u'\xd2': 600, u'\xd3': 600, u'\xd4': 600, 
u'\xd5': 600, u'\xd6': 600, u'\xd7': 600, u'\xd8': 600, u'\xd9': 600, u'\xda': 600, u'\xdb': 600, u'\xdc': 600, u'\xdd': 600, u'\xde': 600, u'\xdf': 600, u'\xe0': 600, u'\xe1': 600, u'\xe2': 600, u'\xe3': 600, u'\xe4': 600, u'\xe5': 600, u'\xe6': 600, u'\xe7': 600, u'\xe8': 600, u'\xe9': 600, u'\xea': 600, u'\xeb': 600, u'\xec': 600, u'\xed': 600, u'\xee': 600, u'\xef': 600, u'\xf0': 600, u'\xf1': 600, u'\xf2': 600, u'\xf3': 600, u'\xf4': 600, u'\xf5': 600, u'\xf6': 600, u'\xf7': 600, u'\xf8': 600, u'\xf9': 600, u'\xfa': 600, u'\xfb': 600, u'\xfc': 600, u'\xfd': 600, u'\xfe': 600, u'\xff': 600, u'\u0100': 600, u'\u0101': 600, u'\u0102': 600, u'\u0103': 600, u'\u0104': 600, u'\u0105': 600, u'\u0106': 600, u'\u0107': 600, u'\u010c': 600, u'\u010d': 600, u'\u010e': 600, u'\u010f': 600, u'\u0110': 600, u'\u0111': 600, u'\u0112': 600, u'\u0113': 600, u'\u0116': 600, u'\u0117': 600, u'\u0118': 600, u'\u0119': 600, u'\u011a': 600, u'\u011b': 600, u'\u011e': 600, u'\u011f': 600, u'\u0122': 600, u'\u0123': 600, u'\u012a': 600, u'\u012b': 600, u'\u012e': 600, u'\u012f': 600, u'\u0130': 600, u'\u0131': 600, u'\u0136': 600, u'\u0137': 600, u'\u0139': 600, u'\u013a': 600, u'\u013b': 600, u'\u013c': 600, u'\u013d': 600, u'\u013e': 600, u'\u0141': 600, u'\u0142': 600, u'\u0143': 600, u'\u0144': 600, u'\u0145': 600, u'\u0146': 600, u'\u0147': 600, u'\u0148': 600, u'\u014c': 600, u'\u014d': 600, u'\u0150': 600, u'\u0151': 600, u'\u0152': 600, u'\u0153': 600, u'\u0154': 600, u'\u0155': 600, u'\u0156': 600, u'\u0157': 600, u'\u0158': 600, u'\u0159': 600, u'\u015a': 600, u'\u015b': 600, u'\u015e': 600, u'\u015f': 600, u'\u0160': 600, u'\u0161': 600, u'\u0162': 600, u'\u0163': 600, u'\u0164': 600, u'\u0165': 600, u'\u016a': 600, u'\u016b': 600, u'\u016e': 600, u'\u016f': 600, u'\u0170': 600, u'\u0171': 600, u'\u0172': 600, u'\u0173': 600, u'\u0178': 600, u'\u0179': 600, u'\u017a': 600, u'\u017b': 600, u'\u017c': 600, u'\u017d': 600, u'\u017e': 600, u'\u0192': 600, u'\u0218': 600, 
u'\u0219': 600, u'\u02c6': 600, u'\u02c7': 600, u'\u02d8': 600, u'\u02d9': 600, u'\u02da': 600, u'\u02db': 600, u'\u02dc': 600, u'\u02dd': 600, u'\u2013': 600, u'\u2014': 600, u'\u2018': 600, u'\u2019': 600, u'\u201a': 600, u'\u201c': 600, u'\u201d': 600, u'\u201e': 600, u'\u2020': 600, u'\u2021': 600, u'\u2022': 600, u'\u2026': 600, u'\u2030': 600, u'\u2039': 600, u'\u203a': 600, u'\u2044': 600, u'\u2122': 600, u'\u2202': 600, u'\u2206': 600, u'\u2211': 600, u'\u2212': 600, u'\u221a': 600, u'\u2260': 600, u'\u2264': 600, u'\u2265': 600, u'\u25ca': 600, u'\uf6c3': 600, u'\ufb01': 600, u'\ufb02': 600}),
diff --git a/pdfminer/glyphlist.py b/pdfminer/glyphlist.py
index 848b0971..259f4242 100644
--- a/pdfminer/glyphlist.py
+++ b/pdfminer/glyphlist.py
@@ -12,7 +12,7 @@
"""
-# ###################################################################################
+# #############################################################################
# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
#
# Permission is hereby granted, free of charge, to any person obtaining a
@@ -42,7 +42,7 @@
# those concerning merchantability or fitness for a particular purpose or
# non-infringement of any third party rights regarding the Adobe
# materials.
-# ###################################################################################
+# #############################################################################
# Name: Adobe Glyph List
# Table version: 2.0
# Date: September 20, 2002
@@ -4336,4 +4336,4 @@
'zuhiragana': u'\u305A',
'zukatakana': u'\u30BA',
}
-#--end
+# --end
diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
index cdef1e7f..e4a3c1b9 100644
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@@ -3,33 +3,43 @@
Functions that encapsulate "usual" use-cases for pdfminer, for use making
bundled scripts and for using pdfminer as a module for routine tasks.
"""
-
-import six
import sys
+import six
-from .pdfdocument import PDFDocument
-from .pdfparser import PDFParser
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
-from .pdfdevice import PDFDevice, TagExtractor
+from .pdfdevice import TagExtractor
from .pdfpage import PDFPage
from .converter import XMLConverter, HTMLConverter, TextConverter
-from .cmapdb import CMapDB
from .image import ImageWriter
-def extract_text_to_fp(inf, outfp,
- _py2_no_more_posargs=None, # Bloody Python2 needs a shim
- output_type='text', codec='utf-8', laparams = None,
- maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
- layoutmode='normal', output_dir=None, strip_control=False,
- debug=False, disable_caching=False, **other):
+def extract_text_to_fp(inf,
+ outfp,
+ _py2_no_more_posargs=None, # Python2 needs a shim
+ output_type='text',
+ codec='utf-8',
+ laparams=None,
+ maxpages=0,
+ page_numbers=None,
+ password="",
+ scale=1.0,
+ rotation=0,
+ layoutmode='normal',
+ output_dir=None,
+ strip_control=False,
+ debug=False,
+ disable_caching=False,
+ **other):
"""
Parses text from inf-file and writes to outfp file-like object.
Takes loads of optional arguments but the defaults are somewhat sane.
- Beware laparams: Including an empty LAParams is not the same as passing None!
- Returns nothing, acting as it does on two streams. Use StringIO to get strings.
-
- output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works properly.
+ Beware laparams:
+ Including an empty LAParams is not the same as passing None!
+ Returns nothing, acting as it does on two streams. Use StringIO
+ to get strings.
+
+ output_type: May be 'text', 'xml', 'html', 'tag'.
+ Only 'text' works properly.
codec: Text decoding codec
laparams: An LAParams object from pdfminer.layout.
Default is None but may not layout correctly.
@@ -50,7 +60,7 @@ def extract_text_to_fp(inf, outfp,
imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)
-
+
rsrcmgr = PDFResourceManager(caching=not disable_caching)
if output_type == 'text':
@@ -79,6 +89,6 @@ def extract_text_to_fp(inf, outfp,
caching=not disable_caching,
check_extractable=True):
page.rotate = (page.rotate + rotation) % 360
- interpreter.process_page(page)
+ interpreter.process_page(page)
device.close()
diff --git a/pdfminer/image.py b/pdfminer/image.py
index e85815c8..bc6f07b0 100644
--- a/pdfminer/image.py
+++ b/pdfminer/image.py
@@ -13,8 +13,6 @@ def align32(x):
return ((x+3)//4)*4
-## BMPWriter
-##
class BMPWriter(object):
def __init__(self, fp, bits, width, height):
@@ -33,9 +31,11 @@ def __init__(self, fp, bits, width, height):
self.linesize = align32((self.width*self.bits+7)//8)
self.datasize = self.linesize * self.height
headersize = 14+40+ncols*4
- info = struct.pack('' %
- (self.char_margin, self.line_margin, self.word_margin, self.all_texts))
+ return ('' %
+ (self.char_margin, self.line_margin,
+ self.word_margin, self.all_texts))
-## LTItem
-##
class LTItem(object):
- def analyze(self, laparams):
+ def analyze(self, _):
"""Perform the layout analysis."""
return
-## LTText
-##
class LTText(object):
def __repr__(self):
@@ -76,8 +69,6 @@ def get_text(self):
raise NotImplementedError
-## LTComponent
-##
class LTComponent(LTItem):
def __init__(self, bbox):
@@ -92,10 +83,13 @@ def __repr__(self):
# Disable comparison.
def __lt__(self, _):
raise ValueError
+
def __le__(self, _):
raise ValueError
+
def __gt__(self, _):
raise ValueError
+
def __ge__(self, _):
raise ValueError
@@ -121,15 +115,13 @@ def hdistance(self, obj):
assert isinstance(obj, LTComponent), str(type(obj))
if self.is_hoverlap(obj):
return 0
- else:
- return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
+ return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def hoverlap(self, obj):
assert isinstance(obj, LTComponent), str(type(obj))
if self.is_hoverlap(obj):
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
- else:
- return 0
+ return 0
def is_voverlap(self, obj):
assert isinstance(obj, LTComponent), str(type(obj))
@@ -139,22 +131,19 @@ def vdistance(self, obj):
assert isinstance(obj, LTComponent), str(type(obj))
if self.is_voverlap(obj):
return 0
- else:
- return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
+ return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
def voverlap(self, obj):
assert isinstance(obj, LTComponent), str(type(obj))
if self.is_voverlap(obj):
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
- else:
- return 0
+ return 0
-## LTCurve
-##
class LTCurve(LTComponent):
- def __init__(self, linewidth, pts, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None):
+ def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False,
+ stroking_color=None, non_stroking_color=None):
LTComponent.__init__(self, get_bound(pts))
self.pts = pts
self.linewidth = linewidth
@@ -169,27 +158,27 @@ def get_pts(self):
return ','.join('%.3f,%.3f' % p for p in self.pts)
-## LTLine
-##
class LTLine(LTCurve):
- def __init__(self, linewidth, p0, p1, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None):
- LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, stroking_color, non_stroking_color)
+ def __init__(self, linewidth, p0, p1, stroke=False, fill=False,
+ evenodd=False, stroking_color=None, non_stroking_color=None):
+ LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd,
+ stroking_color, non_stroking_color)
return
-## LTRect
-##
class LTRect(LTCurve):
- def __init__(self, linewidth, bbox, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None):
+ def __init__(self, linewidth, bbox, stroke=False, fill=False,
+ evenodd=False, stroking_color=None, non_stroking_color=None):
(x0, y0, x1, y1) = bbox
- LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke, fill, evenodd, stroking_color, non_stroking_color)
+ LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0),
+ (x1, y1), (x0, y1)],
+ stroke, fill, evenodd, stroking_color,
+ non_stroking_color)
return
-## LTImage
-##
class LTImage(LTComponent):
def __init__(self, name, stream, bbox):
@@ -211,8 +200,6 @@ def __repr__(self):
bbox2str(self.bbox), self.srcsize))
-## LTAnno
-##
class LTAnno(LTItem, LTText):
def __init__(self, text):
@@ -223,8 +210,6 @@ def get_text(self):
return self._text
-## LTChar
-##
class LTChar(LTComponent, LTText):
def __init__(self, matrix, font, fontsize, scaling, rise,
@@ -255,8 +240,8 @@ def __init__(self, matrix, font, fontsize, scaling, rise,
ty = descent + rise
bll = (0, ty)
bur = (self.adv, ty+height)
- (a, b, c, d, e, f) = self.matrix
- self.upright = (0 < a*d*scaling and b*c <= 0)
+ (a, b, c, d, _, _) = self.matrix
+ self.upright = (b*c <= 0 < a*d*scaling)
(x0, y0) = apply_matrix_pt(self.matrix, bll)
(x1, y1) = apply_matrix_pt(self.matrix, bur)
if x1 < x0:
@@ -279,13 +264,14 @@ def __repr__(self):
def get_text(self):
return self._text
- def is_compatible(self, obj):
- """Returns True if two characters can coexist in the same line."""
+ def is_compatible(self, _):
+ """Returns always True.
+ Was documented as:
+ Returns True if two characters can coexist in the same line.
+ """
return True
-## LTContainer
-##
class LTContainer(LTComponent):
def __init__(self, bbox):
@@ -314,8 +300,6 @@ def analyze(self, laparams):
return
-## LTExpandableContainer
-##
class LTExpandableContainer(LTContainer):
def __init__(self):
@@ -329,8 +313,6 @@ def add(self, obj):
return
-## LTTextContainer
-##
class LTTextContainer(LTExpandableContainer, LTText):
def __init__(self):
@@ -339,11 +321,10 @@ def __init__(self):
return
def get_text(self):
- return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
+ return ''.join(obj.get_text()
+ for obj in self if isinstance(obj, LTText))
-## LTTextLine
-##
class LTTextLine(LTTextContainer):
def __init__(self, word_margin):
@@ -417,13 +398,10 @@ def find_neighbors(self, plane, ratio):
abs(obj.y1-self.y1) < d))]
-## LTTextBox
-##
-## A set of text objects that are grouped within
-## a certain rectangular area.
-##
class LTTextBox(LTTextContainer):
-
+ """
+ A set of text objects that are grouped within a certain rectangular area.
+ """
def __init__(self):
LTTextContainer.__init__(self)
self.index = -1
@@ -457,8 +435,6 @@ def get_writing_mode(self):
return 'tb-rl'
-## LTTextGroup
-##
class LTTextGroup(LTTextContainer):
def __init__(self, objs):
@@ -489,8 +465,6 @@ def analyze(self, laparams):
return
-## LTLayoutContainer
-##
class LTLayoutContainer(LTContainer):
def __init__(self, bbox):
@@ -516,7 +490,8 @@ def group_objects(self, laparams, objs):
# (char_margin)
halign = (obj0.is_compatible(obj1) and
obj0.is_voverlap(obj1) and
- (min(obj0.height, obj1.height) * laparams.line_overlap <
+ (min(obj0.height, obj1.height) *
+ laparams.line_overlap <
obj0.voverlap(obj1)) and
(obj0.hdistance(obj1) <
max(obj0.width, obj1.width) * laparams.char_margin))
@@ -538,13 +513,15 @@ def group_objects(self, laparams, objs):
valign = (laparams.detect_vertical and
obj0.is_compatible(obj1) and
obj0.is_hoverlap(obj1) and
- (min(obj0.width, obj1.width) * laparams.line_overlap <
+ (min(obj0.width, obj1.width) *
+ laparams.line_overlap <
obj0.hoverlap(obj1)) and
(obj0.vdistance(obj1) <
- max(obj0.height, obj1.height) * laparams.char_margin))
+ max(obj0.height, obj1.height) *
+ laparams.char_margin))
if ((halign and isinstance(line, LTTextLineHorizontal)) or
- (valign and isinstance(line, LTTextLineVertical))):
+ (valign and isinstance(line, LTTextLineVertical))):
line.add(obj1)
elif line is not None:
yield line
@@ -577,7 +554,8 @@ def group_textlines(self, laparams, lines):
boxes = {}
for line in lines:
neighbors = line.find_neighbors(plane, laparams.line_margin)
- if line not in neighbors: continue
+ if line not in neighbors:
+ continue
members = []
for obj1 in neighbors:
members.append(obj1)
@@ -592,7 +570,8 @@ def group_textlines(self, laparams, lines):
boxes[obj] = box
done = set()
for line in lines:
- if line not in boxes: continue
+ if line not in boxes:
+ continue
box = boxes[line]
if box in done:
continue
@@ -621,7 +600,8 @@ def dist(obj1, obj2):
y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1)
- return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
+ return (x1-x0)*(y1-y0) - \
+ obj1.width*obj1.height - obj2.width*obj2.height
def isany(obj1, obj2):
"""Check if there's any other object between obj1 and obj2.
@@ -634,8 +614,8 @@ def isany(obj1, obj2):
return objs.difference((obj1, obj2))
def key_obj(t):
- (c,d,_,_) = t
- return (c,d)
+ (c, d, _, _) = t
+ return c, d
# XXX this still takes O(n^2) :(
dists = []
@@ -654,14 +634,14 @@ def key_obj(t):
dists.append((1, d, obj1, obj2))
continue
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
- isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
+ isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
group = LTTextGroupTBRL([obj1, obj2])
else:
group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1)
plane.remove(obj2)
- dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
- if (obj1 in plane and obj2 in plane) ]
+ dists = [(c, d, obj1, obj2) for (c, d, obj1, obj2) in dists
+ if (obj1 in plane and obj2 in plane)]
for other in plane:
dists.append((0, dist(group, other), group, other))
dists = csort(dists, key=key_obj)
@@ -672,7 +652,8 @@ def key_obj(t):
def analyze(self, laparams):
# textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page.
- (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
+ (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
+ self)
for obj in otherobjs:
obj.analyze(laparams)
if not textobjs:
@@ -682,7 +663,7 @@ def analyze(self, laparams):
for obj in empties:
obj.analyze(laparams)
textboxes = list(self.group_textlines(laparams, textlines))
- if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 and textboxes:
+ if -1 <= laparams.boxes_flow <= +1 and textboxes:
self.groups = self.group_textboxes(laparams, textboxes)
assigner = IndexAssigner()
for group in self.groups:
@@ -692,24 +673,22 @@ def analyze(self, laparams):
else:
def getkey(box):
if isinstance(box, LTTextBoxVertical):
- return (0, -box.x1, box.y0)
- else:
- return (1, box.y0, box.x0)
+ return 0, -box.x1, box.y0
+ return 1, box.y0, box.x0
textboxes.sort(key=getkey)
self._objs = textboxes + otherobjs + empties
return
-## LTFigure
-##
class LTFigure(LTLayoutContainer):
def __init__(self, name, bbox, matrix):
self.name = name
self.matrix = matrix
(x, y, w, h) = bbox
- bbox = get_bound(apply_matrix_pt(matrix, (p, q))
- for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
+ bbox = get_bound(
+ apply_matrix_pt(matrix, (p, q))
+ for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
LTLayoutContainer.__init__(self, bbox)
return
@@ -725,10 +704,7 @@ def analyze(self, laparams):
return
-## LTPage
-##
class LTPage(LTLayoutContainer):
-
def __init__(self, pageid, bbox, rotate=0):
LTLayoutContainer.__init__(self, bbox)
self.pageid = pageid
diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py
index 078ac040..adb97809 100644
--- a/pdfminer/lzw.py
+++ b/pdfminer/lzw.py
@@ -1,16 +1,13 @@
+import logging
from io import BytesIO
+import six
-import six #Python 2+3 compatibility
-
-import logging
class CorruptDataError(Exception):
pass
-## LZWDecoder
-##
class LZWDecoder(object):
def __init__(self, fp):
@@ -98,5 +95,5 @@ def run(self):
# lzwdecode
def lzwdecode(data):
fp = BytesIO(data)
- s=LZWDecoder(fp).run()
+ s = LZWDecoder(fp).run()
return b''.join(s)
diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py
index 6fe6eaa2..9de732b8 100644
--- a/pdfminer/pdfcolor.py
+++ b/pdfminer/pdfcolor.py
@@ -1,10 +1,8 @@
+import six
from .psparser import LIT
-import six #Python 2+3 compatibility
-## PDFColorSpace
-##
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
@@ -12,26 +10,24 @@
class PDFColorSpace(object):
- def __init__(self, name, ncomponents):
- self.name = name
+ def __init__(self, my_name, ncomponents):
+ self.name = my_name
self.ncomponents = ncomponents
return
def __repr__(self):
- return '' % (self.name, self.ncomponents)
+ return '' % \
+ (self.name, self.ncomponents)
PREDEFINED_COLORSPACE = {}
-for (name, n) in six.iteritems({
- 'CalRGB': 3,
- 'CalGray': 1,
- 'Lab': 3,
- 'DeviceRGB': 3,
- 'DeviceCMYK': 4,
- 'DeviceGray': 1,
- 'Separation': 1,
- 'Indexed': 1,
- 'Pattern': 1,
-}) :
- PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)
-
\ No newline at end of file
+for (name, n) in six.iteritems({'CalRGB': 3,
+ 'CalGray': 1,
+ 'Lab': 3,
+ 'DeviceRGB': 3,
+ 'DeviceCMYK': 4,
+ 'DeviceGray': 1,
+ 'Separation': 1,
+ 'Indexed': 1,
+ 'Pattern': 1, }):
+ PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py
index 94351016..3936af1d 100644
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@@ -1,10 +1,8 @@
from .pdffont import PDFUnicodeNotDefined
-
from . import utils
-## PDFDevice
-##
+
class PDFDevice(object):
def __init__(self, rsrcmgr):
@@ -59,8 +57,6 @@ def render_string(self, textstate, seq):
return
-## PDFTextDevice
-##
class PDFTextDevice(PDFDevice):
def render_string(self, textstate, seq):
@@ -84,8 +80,8 @@ def render_string(self, textstate, seq):
scaling, charspace, wordspace, rise, dxscale)
return
- def render_string_horizontal(self, seq, matrix, pos,
- font, fontsize, scaling, charspace, wordspace, rise, dxscale):
+ def render_string_horizontal(self, seq, matrix, pos, font, fontsize,
+ scaling, charspace, wordspace, rise, dxscale):
(x, y) = pos
needcharspace = False
for obj in seq:
@@ -96,15 +92,16 @@ def render_string_horizontal(self, seq, matrix, pos,
for cid in font.decode(obj):
if needcharspace:
x += charspace
- x += self.render_char(utils.translate_matrix(matrix, (x, y)),
+ x += self.render_char(utils.translate_matrix(matrix,
+ (x, y)),
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
x += wordspace
needcharspace = True
- return (x, y)
+ return x, y
- def render_string_vertical(self, seq, matrix, pos,
- font, fontsize, scaling, charspace, wordspace, rise, dxscale):
+ def render_string_vertical(self, seq, matrix, pos, font, fontsize, scaling,
+ charspace, wordspace, rise, dxscale):
(x, y) = pos
needcharspace = False
for obj in seq:
@@ -115,19 +112,18 @@ def render_string_vertical(self, seq, matrix, pos,
for cid in font.decode(obj):
if needcharspace:
y += charspace
- y += self.render_char(utils.translate_matrix(matrix, (x, y)),
+ y += self.render_char(utils.translate_matrix(matrix,
+ (x, y)),
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
y += wordspace
needcharspace = True
- return (x, y)
+ return x, y
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
return 0
-## TagExtractor
-##
class TagExtractor(PDFDevice):
def __init__(self, rsrcmgr, outfp, codec='utf-8'):
@@ -157,7 +153,8 @@ def render_string(self, textstate, seq):
return
def begin_page(self, page, ctm):
- output = '' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
+ output = '' % \
+ (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
self.outfp.write(utils.make_compat_bytes(output))
return
@@ -169,7 +166,8 @@ def end_page(self, page):
def begin_tag(self, tag, props=None):
s = ''
if isinstance(props, dict):
- s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
+ s = ''.join(' %s="%s"' %
+ (utils.enc(k), utils.enc(str(v))) for (k, v)
in sorted(props.iteritems()))
out_s = '<%s%s>' % (utils.enc(tag.name), s)
self.outfp.write(utils.make_compat_bytes(out_s))
diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
index 5fb9cce4..927dfc12 100644
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@@ -2,8 +2,7 @@
import re
import struct
import logging
-
-import six # Python 2+3 compatibility
+import six
try:
import hashlib as md5
except ImportError:
@@ -39,34 +38,37 @@
log = logging.getLogger(__name__)
-## Exceptions
-##
+
class PDFNoValidXRef(PDFSyntaxError):
pass
+
class PDFNoOutlines(PDFException):
pass
+
class PDFDestinationNotFound(PDFException):
pass
+
class PDFEncryptionError(PDFException):
pass
+
class PDFPasswordIncorrect(PDFEncryptionError):
pass
+
class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
+
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')
-## XRefs
-##
class PDFBaseXRef(object):
def get_trailer(self):
@@ -82,8 +84,6 @@ def get_pos(self, objid):
raise KeyError(objid)
-## PDFXRef
-##
class PDFXRef(PDFBaseXRef):
def __init__(self):
@@ -109,14 +109,16 @@ def load(self, parser):
break
f = line.strip().split(b' ')
if len(f) != 2:
- raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
+ raise PDFNoValidXRef('Trailer not found: %r: line=%r' %
+ (parser, line))
try:
if six.PY2:
(start, nobjs) = map(long, f)
else:
(start, nobjs) = map(int, f)
except ValueError:
- raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
+ raise PDFNoValidXRef('Invalid line: %r: line=%r' %
+ (parser, line))
for objid in range(start, start+nobjs):
try:
(_, line) = parser.nextline()
@@ -124,11 +126,13 @@ def load(self, parser):
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
f = line.strip().split(b' ')
if len(f) != 3:
- raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
+ raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' %
+ (parser, line))
(pos, genno, use) = f
if use != b'n':
continue
- self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
+ self.offsets[objid] = \
+ (None, long(pos) if six.PY2 else int(pos), int(genno))
log.info('xref objects: %r', self.offsets)
self.load_trailer(parser)
return
@@ -160,8 +164,6 @@ def get_pos(self, objid):
raise
-## PDFXRefFallback
-##
class PDFXRefFallback(PDFXRef):
def __repr__(self):
@@ -182,7 +184,7 @@ def load(self, parser):
log.info('trailer: %r', self.trailer)
break
if six.PY3:
- line=line.decode('latin-1') #default pdf encoding
+ line = line.decode('latin-1') # default pdf encoding
m = self.PDFOBJ_CUE.match(line)
if not m:
continue
@@ -193,7 +195,8 @@ def load(self, parser):
# expand ObjStm.
parser.seek(pos)
(_, obj) = parser.nextobject()
- if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
+ if isinstance(obj, PDFStream) and obj.get('Type') is\
+ LITERAL_OBJSTM:
stream = stream_value(obj)
try:
n = stream['N']
@@ -216,8 +219,6 @@ def load(self, parser):
return
-## PDFXRefStream
-##
class PDFXRefStream(PDFBaseXRef):
def __init__(self):
@@ -228,14 +229,15 @@ def __init__(self):
return
def __repr__(self):
- return '' % (self.ranges)
+ return '' % self.ranges
def load(self, parser):
(_, objid) = parser.nexttoken() # ignored
(_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken()
(_, stream) = parser.nextobject()
- if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
+ if not isinstance(stream, PDFStream) or stream['Type'] \
+ is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size']
index_array = stream.get('Index', (0, size))
@@ -267,7 +269,7 @@ def get_objids(self):
def get_pos(self, objid):
index = 0
for (start, nobjs) in self.ranges:
- if start <= objid and objid < start+nobjs:
+ if start <= objid < start+nobjs:
index += objid - start
break
else:
@@ -288,8 +290,6 @@ def get_pos(self, objid):
raise KeyError(objid)
-## PDFSecurityHandler
-##
class PDFStandardSecurityHandler(object):
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
@@ -306,7 +306,8 @@ def __init__(self, docid, param, password=''):
def init(self):
self.init_params()
if self.r not in self.supported_revisions:
- raise PDFEncryptionError('Unsupported revision: param=%r' % self.param)
+ raise PDFEncryptionError('Unsupported revision:'
+ ' param=%r' % self.param)
self.init_key()
return
@@ -412,7 +413,8 @@ def decrypt(self, objid, genno, data, attrs=None):
return self.decrypt_rc4(objid, genno, data)
def decrypt_rc4(self, objid, genno, data):
- key = self.key + struct.pack('H', fp.read(2))[0]
+ value = b1 << 24 | b2 << 16 |\
+ struct.unpack('>H', fp.read(2))[0]
stack.append(value)
return d
@@ -180,84 +175,84 @@ def getdict(data):
class CFFFont(object):
STANDARD_STRINGS = (
- '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
- 'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
- 'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
- 'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
- 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
- 'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
- 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
- 'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
- 'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
- 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
- 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
- 'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
- 'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
- 'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
- 'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
- 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
- 'quotesinglbase', 'quotedblbase', 'quotedblright',
- 'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
- 'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
- 'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
- 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
- 'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
- 'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
- 'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
- 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
- 'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
- 'multiply', 'threesuperior', 'copyright', 'Aacute',
- 'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
- 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
- 'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
- 'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
- 'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
- 'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
- 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
- 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
- 'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
- 'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
- 'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
- 'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
- 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
- 'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
- 'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
- 'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
- 'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
- 'commasuperior', 'threequartersemdash', 'periodsuperior',
- 'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
- 'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
- 'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
- 'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
- 'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
- 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
- 'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
- 'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
- 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
- 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
- 'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
- 'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
- 'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
- 'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
- 'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
- 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
- 'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
- 'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
- 'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
- 'seveninferior', 'eightinferior', 'nineinferior',
- 'centinferior', 'dollarinferior', 'periodinferior',
- 'commainferior', 'Agravesmall', 'Aacutesmall',
- 'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
- 'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
- 'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
- 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
- 'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
- 'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
- 'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
- 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
- 'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
- '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
- 'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
+ '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
+ 'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
+ 'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
+ 'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
+ 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
+ 'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
+ 'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
+ 'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
+ 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+ 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+ 'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
+ 'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
+ 'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
+ 'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
+ 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
+ 'quotesinglbase', 'quotedblbase', 'quotedblright',
+ 'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
+ 'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
+ 'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
+ 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
+ 'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
+ 'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
+ 'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
+ 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
+ 'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
+ 'multiply', 'threesuperior', 'copyright', 'Aacute',
+ 'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
+ 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
+ 'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
+ 'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
+ 'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
+ 'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
+ 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
+ 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
+ 'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
+ 'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
+ 'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
+ 'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
+ 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
+ 'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
+ 'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
+ 'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
+ 'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
+ 'commasuperior', 'threequartersemdash', 'periodsuperior',
+ 'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
+ 'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
+ 'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
+ 'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
+ 'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
+ 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
+ 'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
+ 'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
+ 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
+ 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
+ 'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
+ 'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
+ 'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
+ 'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
+ 'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
+ 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
+ 'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
+ 'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
+ 'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
+ 'seveninferior', 'eightinferior', 'nineinferior',
+ 'centinferior', 'dollarinferior', 'periodinferior',
+ 'commainferior', 'Agravesmall', 'Aacutesmall',
+ 'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
+ 'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
+ 'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
+ 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
+ 'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
+ 'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
+ 'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
+ 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
+ 'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
+ '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
+ 'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
)
class INDEX(object):
@@ -289,7 +284,8 @@ def __init__(self, name, fp):
self.name = name
self.fp = fp
# Header
- (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
+ (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB',
+ self.fp.read(4))
self.fp.read(hdrsize-4)
# Name INDEX
self.name_index = self.INDEX(self.fp)
@@ -316,7 +312,8 @@ def __init__(self, name, fp):
if format == b'\x00':
# Format 0
(n,) = struct.unpack('B', self.fp.read(1))
- for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
+ for (code, gid) in enumerate(struct.unpack('B'*n,
+ self.fp.read(n))):
self.code2gid[code] = gid
self.gid2code[gid] = code
elif format == b'\x01':
@@ -339,7 +336,8 @@ def __init__(self, name, fp):
if format == b'\x00':
# Format 0
n = self.nglyphs-1
- for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
+ for (gid, sid) in enumerate(struct.unpack('>'+'H'*n,
+ self.fp.read(2*n))):
gid += 1
name = self.getstr(sid)
self.name2gid[name] = gid
@@ -360,9 +358,9 @@ def __init__(self, name, fp):
assert False, str(('Unhandled', format))
else:
raise ValueError('unsupported charset format: %r' % format)
- #print self.code2gid
- #print self.name2gid
- #assert 0
+ # print self.code2gid
+ # print self.name2gid
+ # assert 0
return
def getstr(self, sid):
@@ -371,8 +369,6 @@ def getstr(self, sid):
return self.string_index[sid-len(self.STANDARD_STRINGS)]
-## TrueTypeFont
-##
class TrueTypeFont(object):
class CMapNotFound(Exception):
@@ -405,7 +401,8 @@ def create_unicode_map(self):
fp.seek(base_offset+st_offset)
(fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
if fmttype == 0:
- char2gid.update(enumerate(struct.unpack('>256B', fp.read(256))))
+ char2gid.update(enumerate(struct.unpack('>256B',
+ fp.read(256))))
elif fmttype == 2:
subheaderkeys = struct.unpack('>256H', fp.read(512))
firstbytes = [0]*8192
@@ -414,8 +411,10 @@ def create_unicode_map(self):
nhdrs = max(subheaderkeys)//8 + 1
hdrs = []
for i in range(nhdrs):
- (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
- hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
+ (firstcode, entcount, delta, offset) = struct.unpack(
+ '>HHhH', fp.read(8))
+ hdrs.append((i, firstcode, entcount,
+ delta, fp.tell()-2+offset))
for (i, firstcode, entcount, delta, pos) in hdrs:
if not entcount:
continue
@@ -439,7 +438,9 @@ def create_unicode_map(self):
if idr:
fp.seek(pos+idr)
for c in range(sc, ec+1):
- char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff
+ char2gid[c] = (
+ struct.unpack('>H',
+ fp.read(2))[0] + idd) & 0xffff
else:
for c in range(sc, ec+1):
char2gid[c] = (c + idd) & 0xffff
@@ -452,8 +453,6 @@ def create_unicode_map(self):
return unicode_map
-## Fonts
-##
class PDFFontError(PDFException):
pass
@@ -461,11 +460,11 @@ class PDFFontError(PDFException):
class PDFUnicodeNotDefined(PDFFontError):
pass
+
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')
-# PDFFont
class PDFFont(object):
def __init__(self, descriptor, widths, default_width=None):
@@ -478,7 +477,8 @@ def __init__(self, descriptor, widths, default_width=None):
self.ascent = num_value(descriptor.get('Ascent', 0))
self.descent = num_value(descriptor.get('Descent', 0))
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
- self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
+ self.default_width = default_width or num_value(descriptor.get(
+ 'MissingWidth', 0))
self.leading = num_value(descriptor.get('Leading', 0))
self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
self.hscale = self.vscale = .001
@@ -542,7 +542,8 @@ def __init__(self, descriptor, widths, spec):
else:
encoding = LITERAL_STANDARD_ENCODING
if isinstance(encoding, dict):
- name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
+ name = literal_name(encoding.get('BaseEncoding',
+ LITERAL_STANDARD_ENCODING))
diff = list_value(encoding.get('Differences', []))
self.cid2unicode = EncodingDB.get_encoding(name, diff)
else:
@@ -582,7 +583,7 @@ def __init__(self, rsrcmgr, spec):
except KeyError:
descriptor = dict_value(spec.get('FontDescriptor', {}))
firstchar = int_value(spec.get('FirstChar', 0))
- #lastchar = int_value(spec.get('LastChar', 255))
+ # lastchar = int_value(spec.get('LastChar', 255))
widths = list_value(spec.get('Widths', [0]*256))
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
PDFSimpleFont.__init__(self, descriptor, widths, spec)
@@ -599,19 +600,17 @@ def __repr__(self):
return '' % self.basefont
-# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
def __repr__(self):
return '' % self.basefont
-# PDFType3Font
class PDFType3Font(PDFSimpleFont):
- def __init__(self, rsrcmgr, spec):
+ def __init__(self, _, spec):
firstchar = int_value(spec.get('FirstChar', 0))
- #lastchar = int_value(spec.get('LastChar', 0))
+ # lastchar = int_value(spec.get('LastChar', 0))
widths = list_value(spec.get('Widths', [0]*256))
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
if 'FontDescriptor' in spec:
@@ -629,7 +628,6 @@ def __repr__(self):
return ''
-# PDFCIDFont
class PDFCIDFont(PDFFont):
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
@@ -640,8 +638,11 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
raise PDFFontError('BaseFont is missing')
self.basefont = 'unknown'
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
- self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
- resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
+ self.cidcoding = '%s-%s' % (
+ resolve1(self.cidsysteminfo.get('Registry',
+ b'unknown')).decode("latin1"),
+ resolve1(self.cidsysteminfo.get('Ordering',
+ b'unknown')).decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:
@@ -678,15 +679,17 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
pass
else:
try:
- self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
- except CMapDB.CMapNotFound as e:
+ self.unicode_map = CMapDB.get_unicode_map(
+ self.cidcoding, self.cmap.is_vertical())
+ except CMapDB.CMapNotFound:
pass
self.vertical = self.cmap.is_vertical()
if self.vertical:
# writing mode: vertical
widths = get_widths2(list_value(spec.get('W2', [])))
- self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in six.iteritems(widths))
+ self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in
+ six.iteritems(widths))
(vy, w) = spec.get('DW2', [880, -1000])
self.default_disp = (None, vy)
widths = dict((cid, w) for (cid, (w, _)) in six.iteritems(widths))
@@ -701,7 +704,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
return
def __repr__(self):
- return '' % (self.basefont, self.cidcoding)
+ return '' %\
+ (self.basefont, self.cidcoding)
def is_vertical(self):
return self.vertical
@@ -713,7 +717,9 @@ def decode(self, bytes):
return self.cmap.decode(bytes)
def char_disp(self, cid):
- "Returns an integer for horizontal fonts, a tuple for vertical fonts."
+ """
+ Returns an integer for horizontal fonts, a tuple for vertical fonts.
+ """
return self.disps.get(cid, self.default_disp)
def to_unichr(self, cid):
@@ -725,15 +731,15 @@ def to_unichr(self, cid):
raise PDFUnicodeNotDefined(self.cidcoding, cid)
-# main
def main(argv):
for fname in argv[1:]:
fp = open(fname, 'rb')
- #font = TrueTypeFont(fname, fp)
+ # font = TrueTypeFont(fname, fp)
font = CFFFont(fname, fp)
- print (font)
+ print(font)
fp.close()
return
+
if __name__ == '__main__':
sys.exit(main(sys.argv))
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index 0c2328d3..e3a6aed2 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -2,6 +2,7 @@
import re
import logging
from io import BytesIO
+import six
from .cmapdb import CMapDB
from .cmapdb import CMap
from .psparser import PSTypeError
@@ -31,28 +32,25 @@
from .utils import mult_matrix
from .utils import MATRIX_IDENTITY
-import six # Python 2+3 compatibility
log = logging.getLogger(__name__)
-## Exceptions
-##
+
class PDFResourceError(PDFException):
pass
+
class PDFInterpreterError(PDFException):
pass
-## Constants
-##
+
LITERAL_PDF = LIT('PDF')
LITERAL_TEXT = LIT('Text')
LITERAL_FONT = LIT('Font')
LITERAL_FORM = LIT('Form')
LITERAL_IMAGE = LIT('Image')
-## PDFTextState
-##
+
class PDFTextState(object):
def __init__(self):
@@ -70,7 +68,8 @@ def __init__(self):
return
def __repr__(self):
- return ('' %
(self.font, self.fontsize, self.charspace, self.wordspace,
@@ -97,8 +96,6 @@ def reset(self):
return
-## PDFGraphicState
-##
class PDFGraphicState(object):
def __init__(self):
@@ -139,10 +136,7 @@ def __repr__(self):
self.scolor, self.ncolor))
-## Resource Manager
-##
class PDFResourceManager(object):
-
"""Repository of shared resources.
ResourceManager facilitates reuse of shared resources
@@ -162,7 +156,7 @@ def get_procset(self, procs):
elif proc is LITERAL_TEXT:
pass
else:
- #raise PDFResourceError('ProcSet %r is not supported.' % proc)
+ # raise PDFResourceError('ProcSet %r is not supported.' % proc)
pass
return
@@ -219,8 +213,6 @@ def get_font(self, objid, spec):
return font
-## PDFContentParser
-##
class PDFContentParser(PSStackParser):
def __init__(self, streams):
@@ -264,20 +256,22 @@ def get_inline_data(self, pos, target=b'EI'):
while i <= len(target):
self.fillbuf()
if i:
- c = six.indexbytes(self.buf,self.charpos)
- c=six.int2byte(c)
+ c = six.indexbytes(self.buf, self.charpos)
+ c = six.int2byte(c)
data += c
self.charpos += 1
if len(target) <= i and c.isspace():
i += 1
- elif i < len(target) and c == (six.int2byte(target[i]) if six.PY3 else target[i]):
+ elif i < len(target) \
+ and c == (six.int2byte(target[i])
+ if six.PY3 else target[i]):
i += 1
else:
i = 0
else:
try:
j = self.buf.index(target[0], self.charpos)
- #print 'found', (0, self.buf[j:j+10])
+ # print 'found', (0, self.buf[j:j+10])
data += self.buf[self.charpos:j+1]
self.charpos = j+1
i = 1
@@ -286,7 +280,7 @@ def get_inline_data(self, pos, target=b'EI'):
self.charpos = len(self.buf)
data = data[:-(len(target)+1)] # strip the last part
data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
- return (pos, data)
+ return pos, data
def flush(self):
self.add_results(*self.popall())
@@ -304,7 +298,8 @@ def do_keyword(self, pos, token):
try:
(_, objs) = self.end_type('inline')
if len(objs) % 2 != 0:
- raise PSTypeError('Invalid dictionary construct: %r' % objs)
+ raise PSTypeError('Invalid dictionary construct: %r' %
+ objs)
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
(pos, data) = self.get_inline_data(pos+len(b'ID '))
obj = PDFStream(d, data)
@@ -318,8 +313,6 @@ def do_keyword(self, pos, token):
return
-## Interpreter
-##
class PDFPageInterpreter(object):
def __init__(self, rsrcmgr, device):
@@ -330,9 +323,8 @@ def __init__(self, rsrcmgr, device):
def dup(self):
return self.__class__(self.rsrcmgr, self.device)
- # init_resources(resources):
- # Prepare the fonts and XObjects listed in the Resource attribute.
def init_resources(self, resources):
+ """Prepare the fonts and XObjects listed in the Resource attribute."""
self.resources = resources
self.fontmap = {}
self.xobjmap = {}
@@ -345,9 +337,11 @@ def get_colorspace(spec):
name = literal_name(spec[0])
else:
name = literal_name(spec)
- if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
+ if name == 'ICCBased' and \
+ isinstance(spec, list) and len(spec) >= 2:
return PDFColorSpace(name, stream_value(spec[1])['N'])
- elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
+ elif name == 'DeviceN' and\
+ isinstance(spec, list) and len(spec) >= 2:
return PDFColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE.get(name)
@@ -370,9 +364,8 @@ def get_colorspace(spec):
self.xobjmap[xobjid] = xobjstrm
return
- # init_state(ctm)
- # Initialize the text and graphic states for rendering a page.
def init_state(self, ctm):
+ """Initialize the text and graphic states for rendering a page."""
# gstack: stack for graphical states.
self.gstack = []
self.ctm = ctm
@@ -400,7 +393,7 @@ def pop(self, n):
return x
def get_current_state(self):
- return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
+ return self.ctm, self.textstate.copy(), self.graphicstate.copy()
def set_current_state(self, state):
(self.ctm, self.textstate, self.graphicstate) = state
@@ -460,8 +453,8 @@ def do_i(self, flatness):
return
# load-gstate
- def do_gs(self, name):
- #XXX
+ def do_gs(self, _):
+ # XXX
return
# moveto
@@ -505,7 +498,8 @@ def do_re(self, x, y, w, h):
# stroke
def do_S(self):
- self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
+ self.device.paint_path(self.graphicstate, True, False, False,
+ self.curpath)
self.curpath = []
return
@@ -517,7 +511,8 @@ def do_s(self):
# fill
def do_f(self):
- self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
+ self.device.paint_path(self.graphicstate, False, True, False,
+ self.curpath)
self.curpath = []
return
# fill (obsolete)
@@ -525,19 +520,22 @@ def do_f(self):
# fill-even-odd
def do_f_a(self):
- self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
+ self.device.paint_path(self.graphicstate, False, True, True,
+ self.curpath)
self.curpath = []
return
# fill-and-stroke
def do_B(self):
- self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
+ self.device.paint_path(self.graphicstate, True, True, False,
+ self.curpath)
self.curpath = []
return
# fill-and-stroke-even-odd
def do_B_a(self):
- self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
+ self.device.paint_path(self.graphicstate, True, True, True,
+ self.curpath)
self.curpath = []
return
@@ -587,37 +585,37 @@ def do_cs(self, name):
# setgray-stroking
def do_G(self, gray):
self.graphicstate.color = gray
- #self.do_CS(LITERAL_DEVICE_GRAY)
+ # self.do_CS(LITERAL_DEVICE_GRAY)
return
# setgray-non-stroking
def do_g(self, gray):
self.graphicstate.color = gray
- #self.do_cs(LITERAL_DEVICE_GRAY)
+ # self.do_cs(LITERAL_DEVICE_GRAY)
return
# setrgb-stroking
def do_RG(self, r, g, b):
self.graphicstate.color = (r, g, b)
- #self.do_CS(LITERAL_DEVICE_RGB)
+ # self.do_CS(LITERAL_DEVICE_RGB)
return
# setrgb-non-stroking
def do_rg(self, r, g, b):
self.graphicstate.color = (r, g, b)
- #self.do_cs(LITERAL_DEVICE_RGB)
+ # self.do_cs(LITERAL_DEVICE_RGB)
return
# setcmyk-stroking
def do_K(self, c, m, y, k):
self.graphicstate.color = (c, m, y, k)
- #self.do_CS(LITERAL_DEVICE_CMYK)
+ # self.do_CS(LITERAL_DEVICE_CMYK)
return
# setcmyk-non-stroking
def do_k(self, c, m, y, k):
self.graphicstate.color = (c, m, y, k)
- #self.do_cs(LITERAL_DEVICE_CMYK)
+ # self.do_cs(LITERAL_DEVICE_CMYK)
return
# setcolor
@@ -737,7 +735,7 @@ def do_Td(self, tx, ty):
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.linematrix = (0, 0)
- #print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
+ # print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
return
# text-move
@@ -746,7 +744,7 @@ def do_TD(self, tx, ty):
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.leading = ty
self.textstate.linematrix = (0, 0)
- #print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
+ # print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
return
# textmatrix
@@ -758,13 +756,15 @@ def do_Tm(self, a, b, c, d, e, f):
# nextline
def do_T_a(self):
(a, b, c, d, e, f) = self.textstate.matrix
- self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f)
+ self.textstate.matrix = (a, b, c, d,
+ self.textstate.leading*c+e,
+ self.textstate.leading*d+f)
self.textstate.linematrix = (0, 0)
return
# show-pos
def do_TJ(self, seq):
- #print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
+ # print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
if self.textstate.font is None:
if settings.STRICT:
raise PDFInterpreterError('No font specified!')
@@ -824,9 +824,11 @@ def do_Do(self, xobjid):
# earlier PDFs (prior to v1.2) use the page's Resources entry
# instead of having their own Resources entry.
xobjres = xobj.get('Resources')
- resources = dict_value(xobjres) if xobjres else self.resources.copy()
+ resources = dict_value(xobjres) if xobjres \
+ else self.resources.copy()
self.device.begin_figure(xobjid, bbox, matrix)
- interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
+ interpreter.render_contents(resources, [xobj],
+ ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
@@ -877,7 +879,8 @@ def execute(self, streams):
break
if isinstance(obj, PSKeyword):
name = keyword_name(obj)
- method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
+ method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').\
+ replace("'", '_q')
if hasattr(self, method):
func = getattr(self, method)
nargs = six.get_function_code(func).co_argcount-1
@@ -891,7 +894,8 @@ def execute(self, streams):
func()
else:
if settings.STRICT:
- raise PDFInterpreterError('Unknown operator: %r' % name)
+ raise PDFInterpreterError('Unknown operator: %r' %
+ name)
else:
self.push(obj)
return
diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py
index 418aeb24..b97eba35 100644
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@@ -1,5 +1,6 @@
import logging
+import six
from . import settings
from .psparser import LIT
from .pdftypes import PDFObjectNotFound
@@ -11,7 +12,6 @@
from .pdfdocument import PDFDocument
from .pdfdocument import PDFTextExtractionNotAllowed
-import six # Python 2+3 compatibility
log = logging.getLogger(__name__)
@@ -19,10 +19,8 @@
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
-## PDFPage
-##
-class PDFPage(object):
+class PDFPage(object):
"""An object that holds the information about a page.
A PDFPage object is merely a convenience class that has a set
@@ -73,12 +71,13 @@ def __init__(self, doc, pageid, attrs):
return
def __repr__(self):
- return '' % (self.resources, self.mediabox)
+ return '' % \
+ (self.resources, self.mediabox)
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
@classmethod
- def create_pages(klass, document):
+ def create_pages(cls, document):
def search(obj, parent):
if isinstance(obj, int):
objid = obj
@@ -87,7 +86,7 @@ def search(obj, parent):
objid = obj.objid
tree = dict_value(obj).copy()
for (k, v) in six.iteritems(parent):
- if k in klass.INHERITABLE_ATTRS and k not in tree:
+ if k in cls.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
tree_type = tree.get('Type')
@@ -104,8 +103,9 @@ def search(obj, parent):
yield (objid, tree)
pages = False
if 'Pages' in document.catalog:
- for (objid, tree) in search(document.catalog['Pages'], document.catalog):
- yield klass(document, objid, tree)
+ for (objid, tree) in search(document.catalog['Pages'],
+ document.catalog):
+ yield cls(document, objid, tree)
pages = True
if not pages:
# fallback when /Pages is missing.
@@ -113,14 +113,15 @@ def search(obj, parent):
for objid in xref.get_objids():
try:
obj = document.getobj(objid)
- if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
- yield klass(document, objid, obj)
+ if isinstance(obj, dict) and obj.get('Type')\
+ is LITERAL_PAGE:
+ yield cls(document, objid, obj)
except PDFObjectNotFound:
pass
return
@classmethod
- def get_pages(klass, fp,
+ def get_pages(cls, fp,
pagenos=None, maxpages=0, password='',
caching=True, check_extractable=True):
# Create a PDF parser object associated with the file object.
@@ -129,9 +130,10 @@ def get_pages(klass, fp,
doc = PDFDocument(parser, password=password, caching=caching)
# Check if the document allows text extraction. If not, abort.
if check_extractable and not doc.is_extractable:
- raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
+ raise PDFTextExtractionNotAllowed('Text extraction is not allowed:'
+ ' %r' % fp)
# Process each page contained in the document.
- for (pageno, page) in enumerate(klass.create_pages(doc)):
+ for (pageno, page) in enumerate(cls.create_pages(doc)):
if pagenos and (pageno not in pagenos):
continue
yield page
diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
index 1dc17d7b..702f792a 100644
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@@ -15,16 +15,11 @@
log = logging.getLogger(__name__)
-## Exceptions
-##
class PDFSyntaxError(PDFException):
pass
-## PDFParser
-##
class PDFParser(PSStackParser):
-
"""
PDFParser fetch PDF objects from a file stream.
It can handle indirect references by referring to
@@ -64,14 +59,11 @@ def do_keyword(self, pos, token):
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
self.add_results(*self.pop(1))
-
elif token is self.KEYWORD_ENDOBJ:
self.add_results(*self.pop(4))
-
elif token is self.KEYWORD_NULL:
# null object
self.push((pos, None))
-
elif token is self.KEYWORD_R:
# reference to indirect object
try:
@@ -81,7 +73,6 @@ def do_keyword(self, pos, token):
self.push((pos, obj))
except PSSyntaxError:
pass
-
elif token is self.KEYWORD_STREAM:
# stream object
((_, dic),) = self.pop(1)
@@ -106,7 +97,7 @@ def do_keyword(self, pos, token):
self.seek(pos+objlen)
while 1:
try:
- (linepos, line) = self.nextline()
+ (_, line) = self.nextline()
except PSEOF:
if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF')
@@ -122,10 +113,10 @@ def do_keyword(self, pos, token):
data += line
self.seek(pos+objlen)
# XXX limit objlen not to exceed object boundary
- log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
+ log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...',
+ pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher)
self.push((pos, obj))
-
else:
# others
self.push((pos, token))
@@ -133,10 +124,7 @@ def do_keyword(self, pos, token):
return
-## PDFStreamParser
-##
class PDFStreamParser(PDFParser):
-
"""
PDFStreamParser is used to parse PDF content streams
that is contained in each page and has instructions
@@ -154,6 +142,7 @@ def flush(self):
return
KEYWORD_OBJ = KWD(b'obj')
+
def do_keyword(self, pos, token):
if token is self.KEYWORD_R:
# reference to indirect object
diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py
index 40cca46b..9eeccbff 100644
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@@ -1,6 +1,7 @@
import zlib
import logging
+import six
from .lzw import lzwdecode
from .ascii85 import ascii85decode
from .ascii85 import asciihexdecode
@@ -13,12 +14,9 @@
from .utils import apply_png_predictor
from .utils import isnumber
-import six #Python 2+3 compatibility
-
log = logging.getLogger(__name__)
LITERAL_CRYPT = LIT('Crypt')
-
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
@@ -29,29 +27,30 @@
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
-## PDF Objects
-##
class PDFObject(PSObject):
pass
+
class PDFException(PSException):
pass
+
class PDFTypeError(PDFException):
pass
+
class PDFValueError(PDFException):
pass
+
class PDFObjectNotFound(PDFException):
pass
+
class PDFNotImplementedError(PDFException):
pass
-## PDFObjRef
-##
class PDFObjRef(PDFObject):
def __init__(self, doc, objid, _):
@@ -60,11 +59,11 @@ def __init__(self, doc, objid, _):
raise PDFValueError('PDF object id cannot be 0.')
self.doc = doc
self.objid = objid
- #self.genno = genno # Never used.
+ # self.genno = genno # Never used.
return
def __repr__(self):
- return '<PDFObjRef:%d>' % (self.objid)
+ return '<PDFObjRef:%d>' % self.objid
def resolve(self, default=None):
try:
@@ -73,7 +72,6 @@ def resolve(self, default=None):
return default
-# resolve
def resolve1(x, default=None):
"""Resolves an object.
@@ -179,8 +177,6 @@ def stream_value(x):
return x
-## PDFStream type
-##
class PDFStream(PDFObject):
def __init__(self, attrs, rawdata, decipher=None):
@@ -201,10 +197,12 @@ def set_objid(self, objid, genno):
def __repr__(self):
if self.data is None:
assert self.rawdata is not None
- return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
+ return '<PDFStream(%r): raw=%d, %r>' % \
+ (self.objid, len(self.rawdata), self.attrs)
else:
assert self.data is not None
- return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.attrs)
+ return '<PDFStream(%r): len=%d, %r>' % \
+ (self.objid, len(self.data), self.attrs)
def __contains__(self, name):
return name in self.attrs
@@ -239,10 +237,11 @@ def get_filters(self):
if hasattr(fltr, 'resolve'):
fltr = fltr.resolve()[0]
_filters.append(fltr)
- return list(zip(_filters, params)) #solves https://github.com/pdfminer/pdfminer.six/issues/15
+ return list(zip(_filters, params))
def decode(self):
- assert self.data is None and self.rawdata is not None, str((self.data, self.rawdata))
+ assert self.data is None and self.rawdata is not None, \
+ str((self.data, self.rawdata))
data = self.rawdata
if self.decipher:
# Handle encryption
@@ -252,14 +251,15 @@ def decode(self):
self.data = data
self.rawdata = None
return
- for (f,params) in filters:
+ for (f, params) in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
try:
data = zlib.decompress(data)
except zlib.error as e:
if settings.STRICT:
- raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
+ raise PDFException('Invalid zlib bytes: %r, %r' %
+ (e, data))
data = b''
elif f in LITERALS_LZW_DECODE:
data = lzwdecode(data)
@@ -272,7 +272,8 @@ def decode(self):
elif f in LITERALS_CCITTFAX_DECODE:
data = ccittfaxdecode(data, params)
elif f in LITERALS_DCT_DECODE:
- # This is probably a JPG stream - it does not need to be decoded twice.
+ # This is probably a JPG stream
+ # it does not need to be decoded twice.
# Just return the stream to the user.
pass
elif f == LITERAL_CRYPT:
@@ -290,10 +291,13 @@ def decode(self):
# PNG predictor
colors = int_value(params.get('Colors', 1))
columns = int_value(params.get('Columns', 1))
- bitspercomponent = int_value(params.get('BitsPerComponent', 8))
- data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
+ bitspercomponent = int_value(params.get('BitsPerComponent',
+ 8))
+ data = apply_png_predictor(pred, colors, columns,
+ bitspercomponent, data)
else:
- raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
+ raise PDFNotImplementedError('Unsupported predictor: %r' %
+ pred)
self.data = data
self.rawdata = None
return
diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
index 9b214af0..1ee52633 100644
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -4,8 +4,7 @@
import re
import logging
-
-import six # Python 2+3 compatibility
+import six
from . import settings
from .utils import choplist
@@ -13,8 +12,6 @@
log = logging.getLogger(__name__)
-## PS Exceptions
-##
class PSException(Exception):
pass
@@ -35,22 +32,15 @@ class PSValueError(PSException):
pass
-## Basic PostScript Types
-##
+# Basic PostScript Types
-## PSObject
-##
class PSObject(object):
-
"""Base class for all PS or PDF-related data types."""
pass
-## PSLiteral
-##
class PSLiteral(PSObject):
-
"""A class that represents a PostScript literal.
Postscript literals are used as identifiers, such as
@@ -66,14 +56,11 @@ def __init__(self, name):
self.name = name
def __repr__(self):
- name=self.name
+ name = self.name
return '/%r' % name
-## PSKeyword
-##
class PSKeyword(PSObject):
-
"""A class that represents a PostScript keyword.
PostScript keywords are a dozen of predefined words.
@@ -89,14 +76,11 @@ def __init__(self, name):
return
def __repr__(self):
- name=self.name
+ name = self.name
return '/%r' % name
-## PSSymbolTable
-##
class PSSymbolTable(object):
-
"""A utility class for storing PSLiteral/PSKeyword objects.
Interned objects can be checked its identity with "is" operator.
@@ -115,6 +99,7 @@ def intern(self, name):
self.dict[name] = lit
return lit
+
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
@@ -132,31 +117,30 @@ def literal_name(x):
if settings.STRICT:
raise PSTypeError('Literal required: %r' % (x,))
else:
- name=x
+ name = x
else:
- name=x.name
+ name = x.name
if six.PY3:
try:
- name = str(name,'utf-8')
+ name = str(name, 'utf-8')
except:
pass
return name
+
def keyword_name(x):
if not isinstance(x, PSKeyword):
if settings.STRICT:
raise PSTypeError('Keyword required: %r' % x)
else:
- name=x
+ name = x
else:
- name=x.name
+ name = x.name
if six.PY3:
- name = str(name,'utf-8','ignore')
+ name = str(name, 'utf-8', 'ignore')
return name
-## PSBaseParser
-##
EOL = re.compile(br'[\r\n]')
SPC = re.compile(br'\s')
NONSPC = re.compile(br'\S')
@@ -168,13 +152,12 @@ def keyword_name(x):
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(br'[()\134]')
OCT_STRING = re.compile(br'[0-7]')
-ESC_STRING = {b'b': 8, b't': 9, b'n': 10, b'f': 12, b'r': 13, b'(': 40, b')': 41, b'\\': 92}
+ESC_STRING = {b'b': 8, b't': 9, b'n': 10, b'f': 12, b'r': 13,
+ b'(': 40, b')': 41, b'\\': 92}
class PSBaseParser(object):
-
- """Most basic PostScript parser that performs only tokenization.
- """
+ """Most basic PostScript parser that performs only tokenization."""
BUFSIZ = 4096
def __init__(self, fp):
@@ -183,7 +166,8 @@ def __init__(self, fp):
return
def __repr__(self):
- return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos)
+ return '<%s: %r, bufpos=%d>' % \
+ (self.__class__.__name__, self.fp, self.bufpos)
def flush(self):
return
@@ -193,7 +177,7 @@ def close(self):
return
def tell(self):
- return self.bufpos+self.charpos
+ return self.bufpos + self.charpos
def poll(self, pos=None, n=80):
pos0 = self.fp.tell()
@@ -205,8 +189,7 @@ def poll(self, pos=None, n=80):
return
def seek(self, pos):
- """Seeks the parser to the given position.
- """
+ """Seeks the parser to the given position."""
log.debug('seek: %r', pos)
self.fp.seek(pos)
# reset the status for nextline()
@@ -232,8 +215,7 @@ def fillbuf(self):
return
def nextline(self):
- """Fetches a next line that ends either with \\r or \\n.
- """
+ """Fetches a next line that ends either with \\r or \\n."""
linebuf = b''
linepos = self.bufpos + self.charpos
eol = False
@@ -259,11 +241,10 @@ def nextline(self):
self.charpos = len(self.buf)
log.debug('nextline: %r, %r', linepos, linebuf)
- return (linepos, linebuf)
+ return linepos, linebuf
def revreadlines(self):
"""Fetches a next line backword.
-
This is used to locate the trailers at the end of a file.
"""
self.fp.seek(0, 2)
@@ -338,12 +319,12 @@ def _parse_comment(self, s, i):
m = EOL.search(s, i)
if not m:
self._curtoken += s[i:]
- return (self._parse_comment, len(s))
+ return self._parse_comment, len(s)
j = m.start(0)
self._curtoken += s[i:j]
self._parse1 = self._parse_main
# We ignore comments.
- #self._tokens.append(self._curtoken)
+ # self._tokens.append(self._curtoken)
return j
def _parse_literal(self, s, i):
@@ -359,7 +340,7 @@ def _parse_literal(self, s, i):
self._parse1 = self._parse_literal_hex
return j+1
try:
- self._curtoken=str(self._curtoken,'utf-8')
+ self._curtoken = str(self._curtoken, 'utf-8')
except:
pass
self._add_token(LIT(self._curtoken))
@@ -444,7 +425,8 @@ def _parse_string(self, s, i):
return j+1
if c == b')':
self.paren -= 1
- if self.paren: # WTF, they said balanced parens need no special treatment.
+ if self.paren:
+ # WTF, they said balanced parens need no special treatment.
self._curtoken += c
return j+1
self._add_token(self._curtoken)
@@ -490,7 +472,8 @@ def _parse_hexstring(self, s, i):
return len(s)
j = m.start(0)
self._curtoken += s[i:j]
- token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken))
+ token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),
+ SPC.sub(b'', self._curtoken))
self._add_token(token)
self._parse1 = self._parse_main
return j
@@ -504,8 +487,6 @@ def nexttoken(self):
return token
-## PSStackParser
-##
class PSStackParser(PSBaseParser):
def __init__(self, fp):
@@ -559,7 +540,7 @@ def end_type(self, type):
objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop()
log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
- return (pos, objs)
+ return pos, objs
def do_keyword(self, pos, token):
return
@@ -568,12 +549,15 @@ def nextobject(self):
"""Yields a list of objects.
Returns keywords, literals, strings, numbers, arrays and dictionaries.
- Arrays and dictionaries are represented as Python lists and dictionaries.
+ Arrays and dictionaries are represented as Python lists and
+ dictionaries.
"""
while not self.results:
(pos, token) = self.nexttoken()
- #print (pos,token), (self.curtype, self.curstack)
- if isinstance(token, (six.integer_types, float, bool, six.string_types, six.binary_type, PSLiteral)):
+ # print (pos,token), (self.curtype, self.curstack)
+ if isinstance(token, (six.integer_types, float, bool,
+ six.string_types,
+ six.binary_type, PSLiteral)):
# normal token
self.push((pos, token))
elif token == KEYWORD_ARRAY_BEGIN:
@@ -594,9 +578,11 @@ def nextobject(self):
try:
(pos, objs) = self.end_type('d')
if len(objs) % 2 != 0:
- raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
+ raise PSSyntaxError('Invalid dictionary construct: %r'
+ % objs)
# construct a Python dictionary.
- d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
+ d = dict((literal_name(k), v) for (k, v) in
+ choplist(2, objs) if v is not None)
self.push((pos, d))
except PSTypeError:
if settings.STRICT:
@@ -611,11 +597,13 @@ def nextobject(self):
except PSTypeError:
if settings.STRICT:
raise
- elif isinstance(token,PSKeyword):
- log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
+ elif isinstance(token, PSKeyword):
+ log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
+ token, self.curstack)
self.do_keyword(pos, token)
else:
- log.error('unknown token: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
+ log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
+ token, self.curstack)
self.do_keyword(pos, token)
raise
if self.context:
diff --git a/pdfminer/rijndael.py b/pdfminer/rijndael.py
index 2d3a7ab2..02e41972 100644
--- a/pdfminer/rijndael.py
+++ b/pdfminer/rijndael.py
@@ -1,28 +1,24 @@
-
-
-""" Python implementation of Rijndael encryption algorithm.
-
+"""Python implementation of Rijndael encryption algorithm.
This code is in the public domain.
-
This code is based on a public domain C implementation
by Philip J. Erdelsky:
- http://www.efgh.com/software/rijndael.htm
-
+ http://www.efgh.com/software/rijndael.htm
"""
import struct
def KEYLENGTH(keybits):
- return (keybits)//8
+ return keybits//8
def RKLENGTH(keybits):
- return (keybits)//8+28
+ return keybits//8+28
def NROUNDS(keybits):
- return (keybits)//32+6
+ return keybits//32+6
+
Te0 = [
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -701,20 +697,28 @@ def NROUNDS(keybits):
# 128-bit blocks, Rijndael never uses more than 10 rcon values
]
-if len(struct.pack('L',0)) == 4:
+
+if len(struct.pack('L', 0)) == 4:
# 32bit
- def GETU32(x): return struct.unpack('>L', x)[0]
- def PUTU32(x): return struct.pack('>L', x)
+ def GETU32(x):
+ return struct.unpack('>L', x)[0]
+
+ def PUTU32(x):
+ return struct.pack('>L', x)
else:
# 64bit
- def GETU32(x): return struct.unpack('>I', x)[0]
- def PUTU32(x): return struct.pack('>I', x)
+ def GETU32(x):
+ return struct.unpack('>I', x)[0]
+
+ def PUTU32(x):
+ return struct.pack('>I', x)
-# Expand the cipher key into the encryption key schedule.
-#
-# @return the number of rounds for the given cipher key size.
def rijndaelSetupEncrypt(key, keybits):
+ """Expand the cipher key into the encryption key schedule.
+
+ :return: a tuple of the key schedule and the number of rounds.
+ """
i = p = 0
rk = [0]*RKLENGTH(keybits)
rk[0] = GETU32(key[0:4])
@@ -726,15 +730,16 @@ def rijndaelSetupEncrypt(key, keybits):
temp = rk[p+3]
rk[p+4] = (rk[p+0] ^
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
- (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
- (Te4[(temp ) & 0xff] & 0x0000ff00) ^
- (Te4[(temp >> 24) ] & 0x000000ff) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24)] & 0x000000ff) ^
rcon[i])
rk[p+5] = rk[p+1] ^ rk[p+4]
rk[p+6] = rk[p+2] ^ rk[p+5]
rk[p+7] = rk[p+3] ^ rk[p+6]
i += 1
- if i == 10: return (rk, 10)
+ if i == 10:
+ return rk, 10
p += 4
rk[4] = GETU32(key[16:20])
@@ -744,15 +749,16 @@ def rijndaelSetupEncrypt(key, keybits):
temp = rk[p+5]
rk[p+6] = (rk[p+0] ^
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
- (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
- (Te4[(temp ) & 0xff] & 0x0000ff00) ^
- (Te4[(temp >> 24) ] & 0x000000ff) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24)] & 0x000000ff) ^
rcon[i])
rk[p+7] = rk[p+1] ^ rk[p+6]
rk[p+8] = rk[p+2] ^ rk[p+7]
rk[p+9] = rk[p+3] ^ rk[p+8]
i += 1
- if i == 8: return (rk, 12)
+ if i == 8:
+ return rk, 12
rk[p+10] = rk[p+4] ^ rk[p+9]
rk[p+11] = rk[p+5] ^ rk[p+10]
p += 6
@@ -764,21 +770,22 @@ def rijndaelSetupEncrypt(key, keybits):
temp = rk[p+7]
rk[p+8] = (rk[p+0] ^
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
- (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
- (Te4[(temp ) & 0xff] & 0x0000ff00) ^
- (Te4[(temp >> 24) ] & 0x000000ff) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24)] & 0x000000ff) ^
rcon[i])
rk[p+9] = rk[p+1] ^ rk[p+8]
rk[p+10] = rk[p+2] ^ rk[p+9]
rk[p+11] = rk[p+3] ^ rk[p+10]
i += 1
- if i == 7: return (rk, 14)
+ if i == 7:
+ return rk, 14
temp = rk[p+11]
rk[p+12] = (rk[p+4] ^
- (Te4[(temp >> 24) ] & 0xff000000) ^
+ (Te4[(temp >> 24)] & 0xff000000) ^
(Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(temp ) & 0xff] & 0x000000ff))
+ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[temp & 0xff] & 0x000000ff))
rk[p+13] = rk[p+5] ^ rk[p+12]
rk[p+14] = rk[p+6] ^ rk[p+13]
rk[p+15] = rk[p+7] ^ rk[p+14]
@@ -787,10 +794,11 @@ def rijndaelSetupEncrypt(key, keybits):
raise ValueError(keybits)
-# Expand the cipher key into the decryption key schedule.
-#
-# @return the number of rounds for the given cipher key size.
def rijndaelSetupDecrypt(key, keybits):
+ """Expand the cipher key into the decryption key schedule.
+
+ :return: a tuple of the key schedule and the number of rounds.
+ """
# expand the cipher key:
(rk, nrounds) = rijndaelSetupEncrypt(key, keybits)
@@ -798,45 +806,38 @@ def rijndaelSetupDecrypt(key, keybits):
i = 0
j = 4*nrounds
while i < j:
- temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp
- temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp
- temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp
- temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp
+ for num in range(0, 4):
+ rk[i + num], rk[j + num] = rk[j + num], rk[i + num]
i += 4
j -= 4
- # apply the inverse MixColumn transform to all round keys but the first and the last:
+ # apply the inverse MixColumn transform to all
+ # round keys but the first and the last:
p = 0
for i in range(1, nrounds):
p += 4
- rk[p+0] = (
- Td0[Te4[(rk[p+0] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[p+0] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[p+0] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[p+0] ) & 0xff] & 0xff])
- rk[p+1] = (
- Td0[Te4[(rk[p+1] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[p+1] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[p+1] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[p+1] ) & 0xff] & 0xff])
- rk[p+2] = (
- Td0[Te4[(rk[p+2] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[p+2] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[p+2] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[p+2] ) & 0xff] & 0xff])
- rk[p+3] = (
- Td0[Te4[(rk[p+3] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[p+3] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[p+3] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[p+3] ) & 0xff] & 0xff])
-
- return (rk, nrounds)
+ rk[p+0] = (Td0[Te4[(rk[p+0] >> 24)] & 0xff] ^
+ Td1[Te4[(rk[p+0] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[p+0] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[p+0] & 0xff] & 0xff])
+ rk[p+1] = (Td0[Te4[(rk[p+1] >> 24)] & 0xff] ^
+ Td1[Te4[(rk[p+1] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[p+1] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[p+1] & 0xff] & 0xff])
+ rk[p+2] = (Td0[Te4[(rk[p+2] >> 24)] & 0xff] ^
+ Td1[Te4[(rk[p+2] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[p+2] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[p+2] & 0xff] & 0xff])
+ rk[p+3] = (Td0[Te4[(rk[p+3] >> 24)] & 0xff] ^
+ Td1[Te4[(rk[p+3] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[p+3] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[p+3] & 0xff] & 0xff])
+ return rk, nrounds
def rijndaelEncrypt(rk, nrounds, plaintext):
assert len(plaintext) == 16, str(len(plaintext))
- # map byte array block to cipher state
- # and add initial round key:
+ # map byte array block to cipher state and add initial round key:
s0 = GETU32(plaintext[0:4]) ^ rk[0]
s1 = GETU32(plaintext[4:8]) ^ rk[1]
s2 = GETU32(plaintext[8:12]) ^ rk[2]
@@ -846,89 +847,65 @@ def rijndaelEncrypt(rk, nrounds, plaintext):
r = nrounds >> 1
p = 0
while 1:
- t0 = (
- Te0[(s0 >> 24) ] ^
- Te1[(s1 >> 16) & 0xff] ^
- Te2[(s2 >> 8) & 0xff] ^
- Te3[(s3 ) & 0xff] ^
- rk[p+4])
- t1 = (
- Te0[(s1 >> 24) ] ^
- Te1[(s2 >> 16) & 0xff] ^
- Te2[(s3 >> 8) & 0xff] ^
- Te3[(s0 ) & 0xff] ^
- rk[p+5])
- t2 = (
- Te0[(s2 >> 24) ] ^
- Te1[(s3 >> 16) & 0xff] ^
- Te2[(s0 >> 8) & 0xff] ^
- Te3[(s1 ) & 0xff] ^
- rk[p+6])
- t3 = (
- Te0[(s3 >> 24) ] ^
- Te1[(s0 >> 16) & 0xff] ^
- Te2[(s1 >> 8) & 0xff] ^
- Te3[(s2 ) & 0xff] ^
- rk[p+7])
+ t0 = (Te0[(s0 >> 24)] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[s3 & 0xff] ^
+ rk[p+4])
+ t1 = (Te0[(s1 >> 24)] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[s0 & 0xff] ^
+ rk[p+5])
+ t2 = (Te0[(s2 >> 24)] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[s1 & 0xff] ^
+ rk[p+6])
+ t3 = (Te0[(s3 >> 24)] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[s2 & 0xff] ^
+ rk[p+7])
p += 8
r -= 1
- if r == 0: break
- s0 = (
- Te0[(t0 >> 24) ] ^
- Te1[(t1 >> 16) & 0xff] ^
- Te2[(t2 >> 8) & 0xff] ^
- Te3[(t3 ) & 0xff] ^
- rk[p+0])
- s1 = (
- Te0[(t1 >> 24) ] ^
- Te1[(t2 >> 16) & 0xff] ^
- Te2[(t3 >> 8) & 0xff] ^
- Te3[(t0 ) & 0xff] ^
- rk[p+1])
- s2 = (
- Te0[(t2 >> 24) ] ^
- Te1[(t3 >> 16) & 0xff] ^
- Te2[(t0 >> 8) & 0xff] ^
- Te3[(t1 ) & 0xff] ^
- rk[p+2])
- s3 = (
- Te0[(t3 >> 24) ] ^
- Te1[(t0 >> 16) & 0xff] ^
- Te2[(t1 >> 8) & 0xff] ^
- Te3[(t2 ) & 0xff] ^
- rk[p+3])
+ if r == 0:
+ break
+ s0 = (Te0[(t0 >> 24)] ^ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[p+0])
+ s1 = (Te0[(t1 >> 24)] ^ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[p+1])
+ s2 = (Te0[(t2 >> 24)] ^ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[p+2])
+ s3 = (Te0[(t3 >> 24)] ^ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[p+3])
ciphertext = b''
- # apply last round and
- # map cipher state to byte array block:
- s0 = (
- (Te4[(t0 >> 24) ] & 0xff000000) ^
- (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t3 ) & 0xff] & 0x000000ff) ^
- rk[p+0])
+ # apply last round and map cipher state to byte array block:
+ s0 = ((Te4[(t0 >> 24)] & 0xff000000) ^
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t3 & 0xff] & 0x000000ff) ^
+ rk[p+0])
ciphertext += PUTU32(s0)
- s1 = (
- (Te4[(t1 >> 24) ] & 0xff000000) ^
- (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t0 ) & 0xff] & 0x000000ff) ^
- rk[p+1])
+ s1 = ((Te4[(t1 >> 24)] & 0xff000000) ^
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t0 & 0xff] & 0x000000ff) ^
+ rk[p+1])
ciphertext += PUTU32(s1)
- s2 = (
- (Te4[(t2 >> 24) ] & 0xff000000) ^
- (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t1 ) & 0xff] & 0x000000ff) ^
- rk[p+2])
+ s2 = ((Te4[(t2 >> 24)] & 0xff000000) ^
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t1 & 0xff] & 0x000000ff) ^
+ rk[p+2])
ciphertext += PUTU32(s2)
- s3 = (
- (Te4[(t3 >> 24) ] & 0xff000000) ^
- (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t2 ) & 0xff] & 0x000000ff) ^
- rk[p+3])
+ s3 = ((Te4[(t3 >> 24)] & 0xff000000) ^
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t2 & 0xff] & 0x000000ff) ^
+ rk[p+3])
ciphertext += PUTU32(s3)
assert len(ciphertext) == 16, str(len(ciphertext))
@@ -949,89 +926,53 @@ def rijndaelDecrypt(rk, nrounds, ciphertext):
r = nrounds >> 1
p = 0
while 1:
- t0 = (
- Td0[(s0 >> 24) ] ^
- Td1[(s3 >> 16) & 0xff] ^
- Td2[(s2 >> 8) & 0xff] ^
- Td3[(s1 ) & 0xff] ^
- rk[p+4])
- t1 = (
- Td0[(s1 >> 24) ] ^
- Td1[(s0 >> 16) & 0xff] ^
- Td2[(s3 >> 8) & 0xff] ^
- Td3[(s2 ) & 0xff] ^
- rk[p+5])
- t2 = (
- Td0[(s2 >> 24) ] ^
- Td1[(s1 >> 16) & 0xff] ^
- Td2[(s0 >> 8) & 0xff] ^
- Td3[(s3 ) & 0xff] ^
- rk[p+6])
- t3 = (
- Td0[(s3 >> 24) ] ^
- Td1[(s2 >> 16) & 0xff] ^
- Td2[(s1 >> 8) & 0xff] ^
- Td3[(s0 ) & 0xff] ^
- rk[p+7])
+ t0 = (Td0[(s0 >> 24)] ^ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[p+4])
+ t1 = (Td0[(s1 >> 24)] ^ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[p+5])
+ t2 = (Td0[(s2 >> 24)] ^ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[p+6])
+ t3 = (Td0[(s3 >> 24)] ^ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[p+7])
p += 8
r -= 1
- if r == 0: break
- s0 = (
- Td0[(t0 >> 24) ] ^
- Td1[(t3 >> 16) & 0xff] ^
- Td2[(t2 >> 8) & 0xff] ^
- Td3[(t1 ) & 0xff] ^
- rk[p+0])
- s1 = (
- Td0[(t1 >> 24) ] ^
- Td1[(t0 >> 16) & 0xff] ^
- Td2[(t3 >> 8) & 0xff] ^
- Td3[(t2 ) & 0xff] ^
- rk[p+1])
- s2 = (
- Td0[(t2 >> 24) ] ^
- Td1[(t1 >> 16) & 0xff] ^
- Td2[(t0 >> 8) & 0xff] ^
- Td3[(t3 ) & 0xff] ^
- rk[p+2])
- s3 = (
- Td0[(t3 >> 24) ] ^
- Td1[(t2 >> 16) & 0xff] ^
- Td2[(t1 >> 8) & 0xff] ^
- Td3[(t0 ) & 0xff] ^
- rk[p+3])
+ if r == 0:
+ break
+ s0 = (Td0[(t0 >> 24)] ^ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[p+0])
+ s1 = (Td0[(t1 >> 24)] ^ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[p+1])
+ s2 = (Td0[(t2 >> 24)] ^ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[p+2])
+ s3 = (Td0[(t3 >> 24)] ^ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[p+3])
plaintext = b''
- # apply last round and
- # map cipher state to byte array block:
- s0 = (
- (Td4[(t0 >> 24) ] & 0xff000000) ^
- (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t1 ) & 0xff] & 0x000000ff) ^
- rk[p+0])
+ # apply last round and map cipher state to byte array block:
+ s0 = ((Td4[(t0 >> 24)] & 0xff000000) ^
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t1 & 0xff] & 0x000000ff) ^
+ rk[p+0])
plaintext += PUTU32(s0)
- s1 = (
- (Td4[(t1 >> 24) ] & 0xff000000) ^
- (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t2 ) & 0xff] & 0x000000ff) ^
- rk[p+1])
+ s1 = ((Td4[(t1 >> 24)] & 0xff000000) ^
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t2 & 0xff] & 0x000000ff) ^
+ rk[p+1])
plaintext += PUTU32(s1)
- s2 = (
- (Td4[(t2 >> 24) ] & 0xff000000) ^
- (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t3 ) & 0xff] & 0x000000ff) ^
- rk[p+2])
+ s2 = ((Td4[(t2 >> 24)] & 0xff000000) ^
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t3 & 0xff] & 0x000000ff) ^
+ rk[p+2])
plaintext += PUTU32(s2)
- s3 = (
- (Td4[(t3 >> 24) ] & 0xff000000) ^
- (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t0 ) & 0xff] & 0x000000ff) ^
- rk[p+3])
+ s3 = ((Td4[(t3 >> 24)] & 0xff000000) ^
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t0 & 0xff] & 0x000000ff) ^
+ rk[p+3])
plaintext += PUTU32(s3)
assert len(plaintext) == 16, str(len(plaintext))
@@ -1040,7 +981,6 @@ def rijndaelDecrypt(rk, nrounds, ciphertext):
# decrypt(key, fin, fout, keybits=256)
class RijndaelDecryptor(object):
-
"""
>>> key = b'00010203050607080a0b0c0d0f101112'.decode('hex')
>>> ciphertext = b'd8f532538289ef7d06b506a4fd5be9c9'.decode('hex')
@@ -1049,10 +989,13 @@ class RijndaelDecryptor(object):
"""
def __init__(self, key, keybits=256):
- assert len(key) == KEYLENGTH(keybits), str((len(key), KEYLENGTH(keybits)))
+ assert len(key) == KEYLENGTH(keybits), \
+ str((len(key), KEYLENGTH(keybits)))
(self.rk, self.nrounds) = rijndaelSetupDecrypt(key, keybits)
- assert len(self.rk) == RKLENGTH(keybits), str((len(self.rk), RKLENGTH(keybits)))
- assert self.nrounds == NROUNDS(keybits), str((self.nrounds, NROUNDS(keybits)))
+ assert len(self.rk) == RKLENGTH(keybits), \
+ str((len(self.rk), RKLENGTH(keybits)))
+ assert self.nrounds == NROUNDS(keybits), \
+ str((self.nrounds, NROUNDS(keybits)))
return
def decrypt(self, ciphertext):
@@ -1064,10 +1007,13 @@ def decrypt(self, ciphertext):
class RijndaelEncryptor(object):
def __init__(self, key, keybits=256):
- assert len(key) == KEYLENGTH(keybits), str((len(key), KEYLENGTH(keybits)))
+ assert len(key) == KEYLENGTH(keybits), \
+ str((len(key), KEYLENGTH(keybits)))
(self.rk, self.nrounds) = rijndaelSetupEncrypt(key, keybits)
- assert len(self.rk) == RKLENGTH(keybits), str((len(self.rk), RKLENGTH(keybits)))
- assert self.nrounds == NROUNDS(keybits), str((self.nrounds, NROUNDS(keybits)))
+ assert len(self.rk) == RKLENGTH(keybits), \
+ str((len(self.rk), RKLENGTH(keybits)))
+ assert self.nrounds == NROUNDS(keybits), \
+ str((self.nrounds, NROUNDS(keybits)))
return
def encrypt(self, plaintext):
diff --git a/pdfminer/runlength.py b/pdfminer/runlength.py
index 54bc7691..814c3943 100644
--- a/pdfminer/runlength.py
+++ b/pdfminer/runlength.py
@@ -1,12 +1,11 @@
+"""
+RunLength decoder (Adobe version) implementation based on PDF Reference
+version 1.4 section 3.3.4.
+ * public domain *
+"""
-#
-# RunLength decoder (Adobe version) implementation based on PDF Reference
-# version 1.4 section 3.3.4.
-#
-# * public domain *
-#
+import six
-import six #Python 2+3 compatibility
def rldecode(data):
"""
@@ -25,20 +24,19 @@ def rldecode(data):
decoded = b''
i = 0
while i < len(data):
- #print 'data[%d]=:%d:' % (i,ord(data[i]))
- length = six.indexbytes(data,i)
+ # print 'data[%d]=:%d:' % (i,ord(data[i]))
+ length = six.indexbytes(data, i)
if length == 128:
break
- if length >= 0 and length < 128:
- for j in range(i+1,(i+1)+(length+1)):
- decoded+=six.int2byte(six.indexbytes(data,j))
- #print 'length=%d, run=%s' % (length+1,run)
-
+ if 0 <= length < 128:
+ for j in range(i+1, (i+1)+(length+1)):
+ decoded += six.int2byte(six.indexbytes(data, j))
+ # print 'length=%d, run=%s' % (length+1,run)
+
i = (i+1) + (length+1)
if length > 128:
- run = six.int2byte(six.indexbytes(data,i+1))*(257-length)
- #print 'length=%d, run=%s' % (257-length,run)
- decoded+=run
+ run = six.int2byte(six.indexbytes(data, i+1))*(257-length)
+ # print 'length=%d, run=%s' % (257-length,run)
+ decoded += run
i = (i+1) + 1
return decoded
-
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 339759a1..bba4e74b 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -3,51 +3,61 @@
Miscellaneous Routines.
"""
import struct
-# from sys import maxint as INF #doesn't work anymore under Python3,
+import six
+
+# from sys import maxint as INF doesn't work anymore under Python3,
# but PDF still uses 32 bits ints
-INF = (1<<31) - 1
+INF = (1 << 31) - 1
-import six #Python 2+3 compatibility
if six.PY3:
import chardet # For str encoding detection in Py3
unicode = str
+
def make_compat_bytes(in_str):
- "In Py2, does nothing. In Py3, converts to bytes, encoding to unicode."
+ """In Py2, does nothing. In Py3, converts to bytes, encoding as UTF-8."""
assert isinstance(in_str, str), str(type(in_str))
if six.PY2:
return in_str
- else:
- return in_str.encode()
+ return in_str.encode()
+
def make_compat_str(in_str):
- "In Py2, does nothing. In Py3, converts to string, guessing encoding."
+ """In Py2, does nothing. In Py3, converts to string, guessing encoding."""
assert isinstance(in_str, (bytes, str, unicode)), str(type(in_str))
if six.PY3 and isinstance(in_str, bytes):
- enc = chardet.detect(in_str)
- in_str = in_str.decode(enc['encoding'])
+ encoding = chardet.detect(in_str)
+ in_str = in_str.decode(encoding['encoding'])
return in_str
-def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
- "When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."
+
+def compatible_encode_method(bytesorstring,
+ encoding='utf-8',
+ erraction='ignore'):
+ """When Py2 str.encode is called, it often means bytes.encode in Py3.
+ This does either.
+ """
+
if six.PY2:
- assert isinstance(bytesorstring, (str, unicode)), str(type(bytesorstring))
+ assert isinstance(bytesorstring, (str, unicode)),\
+ str(type(bytesorstring))
return bytesorstring.encode(encoding, erraction)
+
if six.PY3:
- if isinstance(bytesorstring, str): return bytesorstring
+ if isinstance(bytesorstring, str):
+ return bytesorstring
assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
return bytesorstring.decode(encoding, erraction)
-## PNG Predictor
-##
-def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
+
+def apply_png_predictor(_, colors, columns, bitspercomponent, data):
+ """Apply png predictor"""
+
if bitspercomponent != 8:
- # unsupported
raise ValueError("Unsupported `bitspercomponent': %d" %
bitspercomponent)
nbytes = colors * columns * bitspercomponent // 8
- i = 0
buf = b''
line0 = b'\x00' * columns
for i in range(0, len(data), nbytes+1):
@@ -84,56 +94,48 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
c = ((c+a+b)//2) & 255
line2 += six.int2byte(c)
else:
- # unsupported
raise ValueError("Unsupported predictor value: %d" % ft)
buf += line2
line0 = line2
return buf
-## Matrix operations
-##
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix(m1, m0):
+ """Returns the multiplication of two matrices."""
(a1, b1, c1, d1, e1, f1) = m1
(a0, b0, c0, d0, e0, f0) = m0
- """Returns the multiplication of two matrices."""
- return (a0*a1+c0*b1, b0*a1+d0*b1,
- a0*c1+c0*d1, b0*c1+d0*d1,
- a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
+ return (a0*a1+c0*b1, b0*a1+d0*b1, a0*c1+c0*d1,
+ b0*c1+d0*d1, a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
"""Translates a matrix by (x, y)."""
(a, b, c, d, e, f) = m
(x, y) = v
- return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
+ return a, b, c, d, x*a+y*c+e, x*b+y*d+f
def apply_matrix_pt(m, v):
+ """Applies a matrix to a point."""
(a, b, c, d, e, f) = m
(x, y) = v
- """Applies a matrix to a point."""
- return (a*x+c*y+e, b*x+d*y+f)
+ return a*x+c*y+e, b*x+d*y+f
def apply_matrix_norm(m, v):
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
- (a, b, c, d, e, f) = m
+ (a, b, c, d, _, _) = m
(p, q) = v
- return (a*p+c*q, b*p+d*q)
-
+ return a*p+c*q, b*p+d*q
-## Utility functions
-##
-# isnumber
def isnumber(x):
return isinstance(x, (six.integer_types, float))
-# uniq
+
def uniq(objs):
"""Eliminates duplicated elements."""
done = set()
@@ -145,14 +147,12 @@ def uniq(objs):
return
-# csort
def csort(objs, key):
"""Order-preserving sorting function."""
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
-# fsplit
def fsplit(pred, objs):
"""Split a list into two classes according to the predicate."""
t = []
@@ -162,17 +162,15 @@ def fsplit(pred, objs):
t.append(obj)
else:
f.append(obj)
- return (t, f)
+ return t, f
-# drange
def drange(v0, v1, d):
"""Returns a discrete range."""
assert v0 < v1, str((v0, v1, d))
return range(int(v0)//d, int(v1+d)//d)
-# get_bound
def get_bound(pts):
"""Compute a minimal rectangle that covers all the points."""
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
@@ -181,10 +179,9 @@ def get_bound(pts):
y0 = min(y0, y)
x1 = max(x1, x)
y1 = max(y1, y)
- return (x0, y0, x1, y1)
+ return x0, y0, x1, y1
-# pick
def pick(seq, func, maxobj=None):
"""Picks the object obj where func(obj) has the highest value."""
maxscore = None
@@ -195,7 +192,6 @@ def pick(seq, func, maxobj=None):
return maxobj
-# choplist
def choplist(n, seq):
"""Groups every n elements of the list."""
r = []
@@ -207,7 +203,6 @@ def choplist(n, seq):
return
-# nunpack
def nunpack(s, default=0):
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
l = len(s)
@@ -227,7 +222,6 @@ def nunpack(s, default=0):
raise TypeError('invalid length: %d' % l)
-# decode_text
PDFDocEncoding = ''.join(six.unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
@@ -268,16 +262,15 @@ def decode_text(s):
"""Decodes a PDFDocEncoding string to Unicode."""
if s.startswith(b'\xfe\xff'):
return six.text_type(s[2:], 'utf-16be', 'ignore')
- else:
- return ''.join(PDFDocEncoding[c] for c in s)
+ return ''.join(PDFDocEncoding[c] for c in s)
-# enc
def enc(x, codec='ascii'):
"""Encodes a string for SGML/XML/HTML"""
if isinstance(x, bytes):
return ''
- x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
+ x = x.replace('&', '&amp;').replace('>', '&gt;').\
+ replace('<', '&lt;').replace('"', '&quot;')
if codec:
x = x.encode(codec, 'xmlcharrefreplace')
return x
@@ -293,14 +286,13 @@ def matrix2str(m):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
-## Plane
-##
-## A set-like data structure for objects placed on a plane.
-## Can efficiently find objects in a certain rectangular area.
-## It maintains two parallel lists of objects, each of
-## which is sorted by its x or y coordinate.
-##
class Plane(object):
+ """Plane
+ A set-like data structure for objects placed on a plane.
+ Can efficiently find objects in a certain rectangular area.
+ It maintains two parallel lists of objects, each of
+ which is sorted by its x or y coordinate.
+ """
def __init__(self, bbox, gridsize=50):
self._seq = [] # preserve the object order.
@@ -311,10 +303,10 @@ def __init__(self, bbox, gridsize=50):
return
def __repr__(self):
- return ('<Plane objs=%r>' % list(self))
+ return '<Plane objs=%r>' % list(self)
def __iter__(self):
- return ( obj for obj in self._seq if obj in self._objs )
+ return (obj for obj in self._seq if obj in self._objs)
def __len__(self):
return len(self._objs)
@@ -324,8 +316,8 @@ def __contains__(self, obj):
def _getrange(self, bbox):
(x0, y0, x1, y1) = bbox
- if (x1 <= self.x0 or self.x1 <= x0 or
- y1 <= self.y0 or self.y1 <= y0): return
+ if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
+ return
x0 = max(self.x0, x0)
y0 = max(self.y0, y0)
x1 = min(self.x1, x1)
@@ -375,8 +367,8 @@ def find(self, bbox):
if obj in done:
continue
done.add(obj)
- if (obj.x1 <= x0 or x1 <= obj.x0 or
- obj.y1 <= y0 or y1 <= obj.y0):
+ if obj.x1 <= x0 or x1 <= obj.x0 \
+ or obj.y1 <= y0 or y1 <= obj.y0:
continue
yield obj
return