diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index 5c0e64c9..e64bdf21 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -1,22 +1,20 @@ - - """ Python implementation of Arcfour encryption algorithm. See https://en.wikipedia.org/wiki/RC4 This code is in the public domain. - """ -import six # Python 2+3 compatibility -## Arcfour -## +import six + + class Arcfour(object): + """ Python implementation of Arcfour encryption algorithm.""" def __init__(self, key): - s = [i for i in range(256)] #because Py3 range is not indexable + s = [i for i in range(256)] # because Py3 range is not indexable j = 0 klen = len(key) for i in range(256): - j = (j + s[i] + six.indexbytes(key,i % klen)) % 256 + j = (j + s[i] + six.indexbytes(key, i % klen)) % 256 (s[i], s[j]) = (s[j], s[i]) self.s = s (self.i, self.j) = (0, 0) @@ -34,7 +32,8 @@ def process(self, data): r += six.int2byte(c ^ k) (self.i, self.j) = (i, j) return r - + encrypt = decrypt = process + new = Arcfour diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py index a9f501da..4c5dc347 100644 --- a/pdfminer/ascii85.py +++ b/pdfminer/ascii85.py @@ -1,34 +1,26 @@ - - -""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version). - +"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version). This code is in the public domain. - """ import re import struct - -import six #Python 2+3 compatibility +import six -# ascii85decode(data) def ascii85decode(data): """ In ASCII85 encoding, every four bytes are encoded with five ASCII letters, using 85 different types of characters (as 256**4 < 85**5). When the length of the original bytes is not a multiple of 4, a special rule is used for round up. - The Adobe's ASCII85 implementation is slightly different from its original in handling the last characters. - """ n = b = 0 out = b'' for i in six.iterbytes(data): - c=six.int2byte(i) - if b'!' <= c and c <= b'u': + c = six.int2byte(i) + if b'!' <= c <= b'u': n += 1 b = b*85+(ord(c)-33) if n == 5: @@ -45,6 +37,7 @@ def ascii85decode(data): break return out + # asciihexdecode(data) hex_re = re.compile(b'([a-f\d]{2})', re.IGNORECASE) trail_re = re.compile(b'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) @@ -60,15 +53,15 @@ def asciihexdecode(data): the EOD marker after reading an odd number of hexadecimal digits, it will behave as if a 0 followed the last digit. """ - def decode(x): - i=int(x,16) + def decode(char): + i = int(char, 16) return six.int2byte(i) - out=b'' + out = b'' for x in hex_re.findall(data): - out+=decode(x) + out += decode(x) m = trail_re.search(data) if m: - out+=decode(m.group(1)+b'0') + out += decode(m.group(1)+b'0') return out diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py index efc34823..73a9f542 100644 --- a/pdfminer/ccitt.py +++ b/pdfminer/ccitt.py @@ -1,19 +1,16 @@ - -# CCITT Fax decoder -# -# Bugs: uncompressed mode untested. -# -# cf. -# ITU-T Recommendation T.4 -# "Standardization of Group 3 facsimile terminals for document transmission" -# ITU-T Recommendation T.6 -# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS FOR GROUP 4 FACSIMILE APPARATUS" - +"""CCITT Fax decoder +Bugs: uncompressed mode untested. + cf. + ITU-T Recommendation T.4 + "Standardization of Group 3 facsimile terminals for document transmission" + ITU-T Recommendation T.6 + "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS + FOR GROUP 4 FACSIMILE APPARATUS" +""" import sys import array - -import six #Python 2+3 compatibility +import six if six.PY3: def get_bytes(data): @@ -25,8 +22,6 @@ def get_bytes(data): yield ord(char) -## BitParser -## class BitParser(object): def __init__(self): @@ -34,7 +29,7 @@ def __init__(self): return @classmethod - def add(klass, root, v, bits): + def add(cls, root, v, bits): p = root b = None for i in range(len(bits)): @@ -68,20 +63,18 @@ def _parse_bit(self, x): return -## CCITTG4Parser -## class CCITTG4Parser(BitParser): MODE = [None, None] - BitParser.add(MODE, 0, '1') - BitParser.add(MODE, +1, '011') - BitParser.add(MODE, -1, '010') + BitParser.add(MODE, 0, '1') + BitParser.add(MODE, +1, '011') + BitParser.add(MODE, -1, '010') BitParser.add(MODE, 'h', '001') BitParser.add(MODE, 'p', '0001') - BitParser.add(MODE, +2, '000011') - BitParser.add(MODE, -2, '000010') - BitParser.add(MODE, +3, '0000011') - BitParser.add(MODE, -3, '0000010') + BitParser.add(MODE, +2, '000011') + BitParser.add(MODE, -2, '000010') + BitParser.add(MODE, +3, '0000011') + BitParser.add(MODE, -3, '0000010') BitParser.add(MODE, 'u', '0000001111') BitParser.add(MODE, 'x1', '0000001000') BitParser.add(MODE, 'x2', '0000001001') @@ -93,85 +86,85 @@ class CCITTG4Parser(BitParser): BitParser.add(MODE, 'e', '000000000001000000000001') WHITE = [None, None] - BitParser.add(WHITE, 0 , '00110101') - BitParser.add(WHITE, 1 , '000111') - BitParser.add(WHITE, 2 , '0111') - BitParser.add(WHITE, 3 , '1000') - BitParser.add(WHITE, 4 , '1011') - BitParser.add(WHITE, 5 , '1100') - BitParser.add(WHITE, 6 , '1110') - BitParser.add(WHITE, 7 , '1111') - BitParser.add(WHITE, 8 , '10011') - BitParser.add(WHITE, 9 , '10100') - BitParser.add(WHITE, 10 , '00111') - BitParser.add(WHITE, 11 , '01000') - BitParser.add(WHITE, 12 , '001000') - BitParser.add(WHITE, 13 , '000011') - BitParser.add(WHITE, 14 , '110100') - BitParser.add(WHITE, 15 , '110101') - BitParser.add(WHITE, 16 , '101010') - BitParser.add(WHITE, 17 , '101011') - BitParser.add(WHITE, 18 , '0100111') - BitParser.add(WHITE, 19 , '0001100') - BitParser.add(WHITE, 20 , '0001000') - BitParser.add(WHITE, 21 , '0010111') - BitParser.add(WHITE, 22 , '0000011') - BitParser.add(WHITE, 23 , '0000100') - BitParser.add(WHITE, 24 , '0101000') - BitParser.add(WHITE, 25 , '0101011') - BitParser.add(WHITE, 26 , '0010011') - BitParser.add(WHITE, 27 , '0100100') - BitParser.add(WHITE, 28 , '0011000') - BitParser.add(WHITE, 29 , '00000010') - BitParser.add(WHITE, 30 , '00000011') - BitParser.add(WHITE, 31 , '00011010') - BitParser.add(WHITE, 32 , '00011011') - BitParser.add(WHITE, 33 , '00010010') - BitParser.add(WHITE, 34 , '00010011') - BitParser.add(WHITE, 35 , '00010100') - BitParser.add(WHITE, 36 , '00010101') - BitParser.add(WHITE, 37 , '00010110') - BitParser.add(WHITE, 38 , '00010111') - BitParser.add(WHITE, 39 , '00101000') - BitParser.add(WHITE, 40 , '00101001') - BitParser.add(WHITE, 41 , '00101010') - BitParser.add(WHITE, 42 , '00101011') - BitParser.add(WHITE, 43 , '00101100') - BitParser.add(WHITE, 44 , '00101101') - BitParser.add(WHITE, 45 , '00000100') - BitParser.add(WHITE, 46 , '00000101') - BitParser.add(WHITE, 47 , '00001010') - BitParser.add(WHITE, 48 , '00001011') - BitParser.add(WHITE, 49 , '01010010') - BitParser.add(WHITE, 50 , '01010011') - BitParser.add(WHITE, 51 , '01010100') - BitParser.add(WHITE, 52 , '01010101') - BitParser.add(WHITE, 53 , '00100100') - BitParser.add(WHITE, 54 , '00100101') - BitParser.add(WHITE, 55 , '01011000') - BitParser.add(WHITE, 56 , '01011001') - BitParser.add(WHITE, 57 , '01011010') - BitParser.add(WHITE, 58 , '01011011') - BitParser.add(WHITE, 59 , '01001010') - BitParser.add(WHITE, 60 , '01001011') - BitParser.add(WHITE, 61 , '00110010') - BitParser.add(WHITE, 62 , '00110011') - BitParser.add(WHITE, 63 , '00110100') - BitParser.add(WHITE, 64 , '11011') - BitParser.add(WHITE, 128 , '10010') - BitParser.add(WHITE, 192 , '010111') - BitParser.add(WHITE, 256 , '0110111') - BitParser.add(WHITE, 320 , '00110110') - BitParser.add(WHITE, 384 , '00110111') - BitParser.add(WHITE, 448 , '01100100') - BitParser.add(WHITE, 512 , '01100101') - BitParser.add(WHITE, 576 , '01101000') - BitParser.add(WHITE, 640 , '01100111') - BitParser.add(WHITE, 704 , '011001100') - BitParser.add(WHITE, 768 , '011001101') - BitParser.add(WHITE, 832 , '011010010') - BitParser.add(WHITE, 896 , '011010011') - BitParser.add(WHITE, 960 , '011010100') + BitParser.add(WHITE, 0, '00110101') + BitParser.add(WHITE, 1, '000111') + BitParser.add(WHITE, 2, '0111') + BitParser.add(WHITE, 3, '1000') + BitParser.add(WHITE, 4, '1011') + BitParser.add(WHITE, 5, '1100') + BitParser.add(WHITE, 6, '1110') + BitParser.add(WHITE, 7, '1111') + BitParser.add(WHITE, 8, '10011') + BitParser.add(WHITE, 9, '10100') + BitParser.add(WHITE, 10, '00111') + BitParser.add(WHITE, 11, '01000') + BitParser.add(WHITE, 12, '001000') + BitParser.add(WHITE, 13, '000011') + BitParser.add(WHITE, 14, '110100') + BitParser.add(WHITE, 15, '110101') + BitParser.add(WHITE, 16, '101010') + BitParser.add(WHITE, 17, '101011') + BitParser.add(WHITE, 18, '0100111') + BitParser.add(WHITE, 19, '0001100') + BitParser.add(WHITE, 20, '0001000') + BitParser.add(WHITE, 21, '0010111') + BitParser.add(WHITE, 22, '0000011') + BitParser.add(WHITE, 23, '0000100') + BitParser.add(WHITE, 24, '0101000') + BitParser.add(WHITE, 25, '0101011') + BitParser.add(WHITE, 26, '0010011') + BitParser.add(WHITE, 27, '0100100') + BitParser.add(WHITE, 28, '0011000') + BitParser.add(WHITE, 29, '00000010') + BitParser.add(WHITE, 30, '00000011') + BitParser.add(WHITE, 31, '00011010') + BitParser.add(WHITE, 32, '00011011') + BitParser.add(WHITE, 33, '00010010') + BitParser.add(WHITE, 34, '00010011') + BitParser.add(WHITE, 35, '00010100') + BitParser.add(WHITE, 36, '00010101') + BitParser.add(WHITE, 37, '00010110') + BitParser.add(WHITE, 38, '00010111') + BitParser.add(WHITE, 39, '00101000') + BitParser.add(WHITE, 40, '00101001') + BitParser.add(WHITE, 41, '00101010') + BitParser.add(WHITE, 42, '00101011') + BitParser.add(WHITE, 43, '00101100') + BitParser.add(WHITE, 44, '00101101') + BitParser.add(WHITE, 45, '00000100') + BitParser.add(WHITE, 46, '00000101') + BitParser.add(WHITE, 47, '00001010') + BitParser.add(WHITE, 48, '00001011') + BitParser.add(WHITE, 49, '01010010') + BitParser.add(WHITE, 50, '01010011') + BitParser.add(WHITE, 51, '01010100') + BitParser.add(WHITE, 52, '01010101') + BitParser.add(WHITE, 53, '00100100') + BitParser.add(WHITE, 54, '00100101') + BitParser.add(WHITE, 55, '01011000') + BitParser.add(WHITE, 56, '01011001') + BitParser.add(WHITE, 57, '01011010') + BitParser.add(WHITE, 58, '01011011') + BitParser.add(WHITE, 59, '01001010') + BitParser.add(WHITE, 60, '01001011') + BitParser.add(WHITE, 61, '00110010') + BitParser.add(WHITE, 62, '00110011') + BitParser.add(WHITE, 63, '00110100') + BitParser.add(WHITE, 64, '11011') + BitParser.add(WHITE, 128, '10010') + BitParser.add(WHITE, 192, '010111') + BitParser.add(WHITE, 256, '0110111') + BitParser.add(WHITE, 320, '00110110') + BitParser.add(WHITE, 384, '00110111') + BitParser.add(WHITE, 448, '01100100') + BitParser.add(WHITE, 512, '01100101') + BitParser.add(WHITE, 576, '01101000') + BitParser.add(WHITE, 640, '01100111') + BitParser.add(WHITE, 704, '011001100') + BitParser.add(WHITE, 768, '011001101') + BitParser.add(WHITE, 832, '011010010') + BitParser.add(WHITE, 896, '011010011') + BitParser.add(WHITE, 960, '011010100') BitParser.add(WHITE, 1024, '011010101') BitParser.add(WHITE, 1088, '011010110') BitParser.add(WHITE, 1152, '011010111') @@ -199,85 +192,85 @@ class CCITTG4Parser(BitParser): BitParser.add(WHITE, 2560, '000000011111') BLACK = [None, None] - BitParser.add(BLACK, 0 , '0000110111') - BitParser.add(BLACK, 1 , '010') - BitParser.add(BLACK, 2 , '11') - BitParser.add(BLACK, 3 , '10') - BitParser.add(BLACK, 4 , '011') - BitParser.add(BLACK, 5 , '0011') - BitParser.add(BLACK, 6 , '0010') - BitParser.add(BLACK, 7 , '00011') - BitParser.add(BLACK, 8 , '000101') - BitParser.add(BLACK, 9 , '000100') - BitParser.add(BLACK, 10 , '0000100') - BitParser.add(BLACK, 11 , '0000101') - BitParser.add(BLACK, 12 , '0000111') - BitParser.add(BLACK, 13 , '00000100') - BitParser.add(BLACK, 14 , '00000111') - BitParser.add(BLACK, 15 , '000011000') - BitParser.add(BLACK, 16 , '0000010111') - BitParser.add(BLACK, 17 , '0000011000') - BitParser.add(BLACK, 18 , '0000001000') - BitParser.add(BLACK, 19 , '00001100111') - BitParser.add(BLACK, 20 , '00001101000') - BitParser.add(BLACK, 21 , '00001101100') - BitParser.add(BLACK, 22 , '00000110111') - BitParser.add(BLACK, 23 , '00000101000') - BitParser.add(BLACK, 24 , '00000010111') - BitParser.add(BLACK, 25 , '00000011000') - BitParser.add(BLACK, 26 , '000011001010') - BitParser.add(BLACK, 27 , '000011001011') - BitParser.add(BLACK, 28 , '000011001100') - BitParser.add(BLACK, 29 , '000011001101') - BitParser.add(BLACK, 30 , '000001101000') - BitParser.add(BLACK, 31 , '000001101001') - BitParser.add(BLACK, 32 , '000001101010') - BitParser.add(BLACK, 33 , '000001101011') - BitParser.add(BLACK, 34 , '000011010010') - BitParser.add(BLACK, 35 , '000011010011') - BitParser.add(BLACK, 36 , '000011010100') - BitParser.add(BLACK, 37 , '000011010101') - BitParser.add(BLACK, 38 , '000011010110') - BitParser.add(BLACK, 39 , '000011010111') - BitParser.add(BLACK, 40 , '000001101100') - BitParser.add(BLACK, 41 , '000001101101') - BitParser.add(BLACK, 42 , '000011011010') - BitParser.add(BLACK, 43 , '000011011011') - BitParser.add(BLACK, 44 , '000001010100') - BitParser.add(BLACK, 45 , '000001010101') - BitParser.add(BLACK, 46 , '000001010110') - BitParser.add(BLACK, 47 , '000001010111') - BitParser.add(BLACK, 48 , '000001100100') - BitParser.add(BLACK, 49 , '000001100101') - BitParser.add(BLACK, 50 , '000001010010') - BitParser.add(BLACK, 51 , '000001010011') - BitParser.add(BLACK, 52 , '000000100100') - BitParser.add(BLACK, 53 , '000000110111') - BitParser.add(BLACK, 54 , '000000111000') - BitParser.add(BLACK, 55 , '000000100111') - BitParser.add(BLACK, 56 , '000000101000') - BitParser.add(BLACK, 57 , '000001011000') - BitParser.add(BLACK, 58 , '000001011001') - BitParser.add(BLACK, 59 , '000000101011') - BitParser.add(BLACK, 60 , '000000101100') - BitParser.add(BLACK, 61 , '000001011010') - BitParser.add(BLACK, 62 , '000001100110') - BitParser.add(BLACK, 63 , '000001100111') - BitParser.add(BLACK, 64 , '0000001111') - BitParser.add(BLACK, 128 , '000011001000') - BitParser.add(BLACK, 192 , '000011001001') - BitParser.add(BLACK, 256 , '000001011011') - BitParser.add(BLACK, 320 , '000000110011') - BitParser.add(BLACK, 384 , '000000110100') - BitParser.add(BLACK, 448 , '000000110101') - BitParser.add(BLACK, 512 , '0000001101100') - BitParser.add(BLACK, 576 , '0000001101101') - BitParser.add(BLACK, 640 , '0000001001010') - BitParser.add(BLACK, 704 , '0000001001011') - BitParser.add(BLACK, 768 , '0000001001100') - BitParser.add(BLACK, 832 , '0000001001101') - BitParser.add(BLACK, 896 , '0000001110010') - BitParser.add(BLACK, 960 , '0000001110011') + BitParser.add(BLACK, 0, '0000110111') + BitParser.add(BLACK, 1, '010') + BitParser.add(BLACK, 2, '11') + BitParser.add(BLACK, 3, '10') + BitParser.add(BLACK, 4, '011') + BitParser.add(BLACK, 5, '0011') + BitParser.add(BLACK, 6, '0010') + BitParser.add(BLACK, 7, '00011') + BitParser.add(BLACK, 8, '000101') + BitParser.add(BLACK, 9, '000100') + BitParser.add(BLACK, 10, '0000100') + BitParser.add(BLACK, 11, '0000101') + BitParser.add(BLACK, 12, '0000111') + BitParser.add(BLACK, 13, '00000100') + BitParser.add(BLACK, 14, '00000111') + BitParser.add(BLACK, 15, '000011000') + BitParser.add(BLACK, 16, '0000010111') + BitParser.add(BLACK, 17, '0000011000') + BitParser.add(BLACK, 18, '0000001000') + BitParser.add(BLACK, 19, '00001100111') + BitParser.add(BLACK, 20, '00001101000') + BitParser.add(BLACK, 21, '00001101100') + BitParser.add(BLACK, 22, '00000110111') + BitParser.add(BLACK, 23, '00000101000') + BitParser.add(BLACK, 24, '00000010111') + BitParser.add(BLACK, 25, '00000011000') + BitParser.add(BLACK, 26, '000011001010') + BitParser.add(BLACK, 27, '000011001011') + BitParser.add(BLACK, 28, '000011001100') + BitParser.add(BLACK, 29, '000011001101') + BitParser.add(BLACK, 30, '000001101000') + BitParser.add(BLACK, 31, '000001101001') + BitParser.add(BLACK, 32, '000001101010') + BitParser.add(BLACK, 33, '000001101011') + BitParser.add(BLACK, 34, '000011010010') + BitParser.add(BLACK, 35, '000011010011') + BitParser.add(BLACK, 36, '000011010100') + BitParser.add(BLACK, 37, '000011010101') + BitParser.add(BLACK, 38, '000011010110') + BitParser.add(BLACK, 39, '000011010111') + BitParser.add(BLACK, 40, '000001101100') + BitParser.add(BLACK, 41, '000001101101') + BitParser.add(BLACK, 42, '000011011010') + BitParser.add(BLACK, 43, '000011011011') + BitParser.add(BLACK, 44, '000001010100') + BitParser.add(BLACK, 45, '000001010101') + BitParser.add(BLACK, 46, '000001010110') + BitParser.add(BLACK, 47, '000001010111') + BitParser.add(BLACK, 48, '000001100100') + BitParser.add(BLACK, 49, '000001100101') + BitParser.add(BLACK, 50, '000001010010') + BitParser.add(BLACK, 51, '000001010011') + BitParser.add(BLACK, 52, '000000100100') + BitParser.add(BLACK, 53, '000000110111') + BitParser.add(BLACK, 54, '000000111000') + BitParser.add(BLACK, 55, '000000100111') + BitParser.add(BLACK, 56, '000000101000') + BitParser.add(BLACK, 57, '000001011000') + BitParser.add(BLACK, 58, '000001011001') + BitParser.add(BLACK, 59, '000000101011') + BitParser.add(BLACK, 60, '000000101100') + BitParser.add(BLACK, 61, '000001011010') + BitParser.add(BLACK, 62, '000001100110') + BitParser.add(BLACK, 63, '000001100111') + BitParser.add(BLACK, 64, '0000001111') + BitParser.add(BLACK, 128, '000011001000') + BitParser.add(BLACK, 192, '000011001001') + BitParser.add(BLACK, 256, '000001011011') + BitParser.add(BLACK, 320, '000000110011') + BitParser.add(BLACK, 384, '000000110100') + BitParser.add(BLACK, 448, '000000110101') + BitParser.add(BLACK, 512, '0000001101100') + BitParser.add(BLACK, 576, '0000001101101') + BitParser.add(BLACK, 640, '0000001001010') + BitParser.add(BLACK, 704, '0000001001011') + BitParser.add(BLACK, 768, '0000001001100') + BitParser.add(BLACK, 832, '0000001001101') + BitParser.add(BLACK, 896, '0000001110010') + BitParser.add(BLACK, 960, '0000001110011') BitParser.add(BLACK, 1024, '0000001110100') BitParser.add(BLACK, 1088, '0000001110101') BitParser.add(BLACK, 1152, '0000001110110') @@ -434,7 +427,7 @@ def reset(self): return def output_line(self, y, bits): - print (y, ''.join(str(b) for b in bits)) + print(y, ''.join(str(b) for b in bits)) return def _reset_line(self): @@ -454,12 +447,13 @@ def _flush_line(self): return def _do_vertical(self, dx): - #print '* vertical(%d): curpos=%r, color=%r' % (dx, self._curpos, self._color) - #print ' refline:', self._get_refline(self._curpos+1) + # print '* vertical(%d): curpos=%r, color=%r' + # % (dx, self._curpos, self._color) + # print ' refline:', self._get_refline(self._curpos+1) x1 = self._curpos+1 while 1: if x1 == 0: - if (self._color == 1 and self._refline[x1] != self._color): + if self._color == 1 and self._refline[x1] != self._color: break elif x1 == len(self._refline): break @@ -481,12 +475,12 @@ def _do_vertical(self, dx): return def _do_pass(self): - #print '* pass: curpos=%r, color=%r' % (self._curpos, self._color) - #print ' refline:', self._get_refline(self._curpos+1) + # print '* pass: curpos=%r, color=%r' % (self._curpos, self._color) + # print ' refline:', self._get_refline(self._curpos+1) x1 = self._curpos+1 while 1: if x1 == 0: - if (self._color == 1 and self._refline[x1] != self._color): + if self._color == 1 and self._refline[x1] != self._color: break elif x1 == len(self._refline): break @@ -496,7 +490,7 @@ def _do_pass(self): x1 += 1 while 1: if x1 == 0: - if (self._color == 0 and self._refline[x1] == self._color): + if self._color == 0 and self._refline[x1] == self._color: break elif x1 == len(self._refline): break @@ -510,7 +504,8 @@ def _do_pass(self): return def _do_horizontal(self, n1, n2): - #print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color) + # print '* horizontal(%d,%d): curpos=%r, color=%r' + # % (n1, n2, self._curpos, self._color) if self._curpos < 0: self._curpos = 0 x = self._curpos @@ -528,7 +523,7 @@ def _do_horizontal(self, n1, n2): return def _do_uncompressed(self, bits): - #print '* uncompressed(%r): curpos=%r' % (bits, self._curpos) + # print '* uncompressed(%r): curpos=%r' % (bits, self._curpos) for c in bits: self._curline[self._curpos] = int(c) self._curpos += 1 @@ -536,8 +531,6 @@ def _do_uncompressed(self, bits): return - - class CCITTFaxDecoder(CCITTG4Parser): def __init__(self, width, bytealign=False, reversed=False): @@ -606,5 +599,6 @@ def close(self): fp.close() return + if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 19dddf35..2697a55d 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -1,14 +1,10 @@ - - """ Adobe character mapping (CMap) support. CMaps provide the mapping between character codes and Unicode code-points to character ids (CIDs). More information is available on the Adobe website: - http://opensource.adobe.com/wiki/display/cmap/CMap+Resources - """ import sys @@ -21,6 +17,7 @@ import pickle as pickle import struct import logging +import six from .psparser import PSStackParser from .psparser import PSSyntaxError from .psparser import PSEOF @@ -31,7 +28,6 @@ from .utils import choplist from .utils import nunpack -import six #Python 2+3 compatibility log = logging.getLogger(__name__) @@ -40,8 +36,6 @@ class CMapError(Exception): pass -## CMapBase -## class CMapBase(object): debug = 0 @@ -67,8 +61,6 @@ def use_cmap(self, cmap): return -## CMap -## class CMap(CMapBase): def __init__(self, **kwargs): @@ -119,8 +111,6 @@ def dump(self, out=sys.stdout, code2cid=None, code=None): return -## IdentityCMap -## class IdentityCMap(CMapBase): def decode(self, code): @@ -131,8 +121,6 @@ def decode(self, code): return () -## UnicodeMap -## class UnicodeMap(CMapBase): def __init__(self, **kwargs): @@ -153,12 +141,11 @@ def dump(self, out=sys.stdout): return -## FileCMap -## class FileCMap(CMap): def add_code2cid(self, code, cid): - assert isinstance(code, str) and isinstance(cid, int), str((type(code), type(cid))) + assert isinstance(code, str) and isinstance(cid, int),\ + str((type(code), type(cid))) d = self.code2cid for c in code[:-1]: c = ord(c) @@ -173,8 +160,6 @@ def add_code2cid(self, code, cid): return -## FileUnicodeMap -## class FileUnicodeMap(UnicodeMap): def add_cid2unichr(self, cid, code): @@ -192,8 +177,6 @@ def add_cid2unichr(self, cid, code): return -## PyCMap -## class PyCMap(CMap): def __init__(self, name, module): @@ -204,8 +187,6 @@ def __init__(self, name, module): return -## PyUnicodeMap -## class PyUnicodeMap(UnicodeMap): def __init__(self, name, module, vertical): @@ -218,8 +199,6 @@ def __init__(self, name, module, vertical): return -## CMapDB -## class CMapDB(object): _cmap_cache = {} @@ -260,18 +239,17 @@ def get_cmap(klass, name): return cmap @classmethod - def get_unicode_map(klass, name, vertical=False): + def get_unicode_map(cls, name, vertical=False): try: - return klass._umap_cache[name][vertical] + return cls._umap_cache[name][vertical] except KeyError: pass - data = klass._load_data('to-unicode-%s' % name) - klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)] + data = cls._load_data('to-unicode-%s' % name) + cls._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) + for v in (False, True)] return umaps[vertical] -## CMapParser -## class CMapParser(PSStackParser): def __init__(self, cmap, fp): @@ -315,7 +293,7 @@ def do_keyword(self, pos, token): return if not self._in_cmap: return - # + if token is self.KEYWORD_DEF: try: ((_, k), (_, v)) = self.pop(2) @@ -347,8 +325,9 @@ def do_keyword(self, pos, token): if token is self.KEYWORD_ENDCIDRANGE: objs = [obj for (__, obj) in self.popall()] for (s, e, cid) in choplist(3, objs): - if (not isinstance(s, str) or not isinstance(e, str) or - not isinstance(cid, int) or len(s) != len(e)): + if (not isinstance(s, str) or + not isinstance(e, str) or + not isinstance(cid, int) or len(s) != len(e)): continue sprefix = s[:-4] eprefix = e[:-4] @@ -359,7 +338,7 @@ def do_keyword(self, pos, token): s1 = nunpack(svar) e1 = nunpack(evar) vlen = len(svar) - #assert s1 <= e1, str((s1, e1)) + # assert s1 <= e1, str((s1, e1)) for i in range(e1-s1+1): x = sprefix+struct.pack('>L', s1+i)[-vlen:] self.cmap.add_code2cid(x, cid+i) @@ -381,12 +360,12 @@ def do_keyword(self, pos, token): if token is self.KEYWORD_ENDBFRANGE: objs = [obj for (__, obj) in self.popall()] for (s, e, code) in choplist(3, objs): - if (not isinstance(s, bytes) or not isinstance(e, bytes) or - len(s) != len(e)): - continue + if (not isinstance(s, bytes) or + not isinstance(e, bytes) or len(s) != len(e)): + continue s1 = nunpack(s) e1 = nunpack(e) - #assert s1 <= e1, str((s1, e1)) + # assert s1 <= e1, str((s1, e1)) if isinstance(code, list): for i in range(e1-s1+1): self.cmap.add_cid2unichr(s1+i, code[i]) @@ -427,11 +406,12 @@ def main(argv): for fname in args: fp = file(fname, 'rb') cmap = FileUnicodeMap() - #cmap = FileCMap() + # cmap = FileCMap() CMapParser(cmap, fp).run() fp.close() cmap.dump() return + if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 02545e83..43554265 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import logging import re +import six from .pdfdevice import PDFTextDevice from .pdffont import PDFUnicodeNotDefined from .layout import LTContainer @@ -23,13 +24,10 @@ from .utils import bbox2str from . import utils -import six # Python 2+3 compatibility log = logging.getLogger(__name__) -## PDFLayoutAnalyzer -## class PDFLayoutAnalyzer(PDFTextDevice): def __init__(self, rsrcmgr, pageno=1, laparams=None): @@ -86,7 +84,8 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1)) if x0 == x1 or y0 == y1: self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1), - stroke, fill, evenodd, gstate.scolor, gstate.ncolor)) + stroke, fill, evenodd, gstate.scolor, + gstate.ncolor)) return if shape == 'mlllh': # rectangle @@ -99,9 +98,10 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): (x2, y2) = apply_matrix_pt(self.ctm, (x2, y2)) (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3)) if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or - (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): + (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2), - stroke, fill, evenodd, gstate.scolor, gstate.ncolor)) + stroke, fill, evenodd, gstate.scolor, + gstate.ncolor)) return # other shapes pts = [] @@ -109,7 +109,7 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): for i in range(1, len(p), 2): pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill, - evenodd, gstate.scolor, gstate.ncolor)) + evenodd, gstate.scolor, gstate.ncolor)) return def render_char(self, matrix, font, fontsize, scaling, rise, cid): @@ -120,7 +120,8 @@ def render_char(self, matrix, font, fontsize, scaling, rise, cid): text = self.handle_undefined_char(font, cid) textwidth = font.char_width(cid) textdisp = font.char_disp(cid) - item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp) + item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, + textdisp) self.cur_item.add(item) return item.adv @@ -132,12 +133,11 @@ def receive_layout(self, ltpage): return -## PDFPageAggregator -## class PDFPageAggregator(PDFLayoutAnalyzer): def __init__(self, rsrcmgr, pageno=1, laparams=None): - PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) + PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, + laparams=laparams) self.result = None return @@ -149,12 +149,11 @@ def get_result(self): return self.result -## PDFConverter -## class PDFConverter(PDFLayoutAnalyzer): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None): - PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) + PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, + laparams=laparams) self.outfp = outfp self.codec = codec if hasattr(self.outfp, 'mode'): @@ -177,13 +176,12 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None): return -## TextConverter -## class TextConverter(PDFConverter): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, showpageno=False, imagewriter=None): - PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) + PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, + pageno=pageno, laparams=laparams) self.showpageno = showpageno self.imagewriter = imagewriter return @@ -226,12 +224,10 @@ def paint_path(self, gstate, stroke, fill, evenodd, path): return -## HTMLConverter -## class HTMLConverter(PDFConverter): RECT_COLORS = { - #'char': 'green', + # 'char': 'green', 'figure': 'yellow', 'textline': 'magenta', 'textbox': 'cyan', @@ -250,7 +246,8 @@ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, pagemargin=50, imagewriter=None, debug=0, rect_colors={'curve': 'black', 'page': 'gray'}, text_colors={'char': 'black'}): - PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) + PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, + laparams=laparams) self.scale = scale self.fontscale = fontscale self.layoutmode = layoutmode @@ -277,15 +274,19 @@ def write(self, text): def write_header(self): self.write('\n') if self.codec: - self.write('\n' % self.codec) + self.write('\n' % self.codec) else: - self.write('\n') + self.write('\n') self.write('\n') return def write_footer(self): - self.write('
Page: %s
\n' % - ', '.join('%s' % (i, i) for i in range(1, self.pageno))) + self.write('
' + 'Page: %s
\n' % + ', '.join('%s' % + (i, i) for i in range(1, self.pageno))) self.write('\n') return @@ -296,22 +297,24 @@ def write_text(self, text): def place_rect(self, color, borderwidth, x, y, w, h): color = self.rect_colors.get(color) if color is not None: - self.write('\n' % + self.write('\n' % (color, borderwidth, x*self.scale, (self._yoffset-y)*self.scale, w*self.scale, h*self.scale)) return def place_border(self, color, borderwidth, item): - self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height) + self.place_rect(color, borderwidth, item.x0, item.y1, + item.width, item.height) return def place_image(self, item, borderwidth, x, y, w, h): if self.imagewriter is not None: name = self.imagewriter.export_image(item) - self.write('\n' % + self.write('\n' % (enc(name, None), borderwidth, x*self.scale, (self._yoffset-y)*self.scale, w*self.scale, h*self.scale)) @@ -320,8 +323,10 @@ def place_image(self, item, borderwidth, x, y, w, h): def place_text(self, color, text, x, y, size): color = self.text_colors.get(color) if color is not None: - self.write('' % - (color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale)) + self.write('' % + (color, x*self.scale, (self._yoffset-y)*self.scale, + size*self.scale*self.fontscale)) self.write_text(text) self.write('\n') return @@ -329,14 +334,15 @@ def place_text(self, color, text, x, y, size): def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False): self._fontstack.append(self._font) self._font = None - self.write('
' % (color, borderwidth, writing_mode, x*self.scale, (self._yoffset-y)*self.scale, w*self.scale, h*self.scale)) return - def end_div(self, color): + def end_div(self, _): if self._font is not None: self.write('') self._font = self._fontstack.pop() @@ -373,7 +379,8 @@ def render(item): if self.showpageno: self.write('
' % ((self._yoffset-item.y1)*self.scale)) - self.write('Page %s
\n' % (item.pageid, item.pageid)) + self.write('Page %s
\n' % + (item.pageid, item.pageid)) for child in item: render(child) if item.groups is not None: @@ -382,12 +389,14 @@ def render(item): elif isinstance(item, LTCurve): self.place_border('curve', 1, item) elif isinstance(item, LTFigure): - self.begin_div('figure', 1, item.x0, item.y1, item.width, item.height) + self.begin_div('figure', 1, item.x0, item.y1, + item.width, item.height) for child in item: render(child) self.end_div('figure') elif isinstance(item, LTImage): - self.place_image(item, 1, item.x0, item.y1, item.width, item.height) + self.place_image(item, 1, item.x0, item.y1, + item.width, item.height) else: if self.layoutmode == 'exact': if isinstance(item, LTTextLine): @@ -396,12 +405,14 @@ def render(item): render(child) elif isinstance(item, LTTextBox): self.place_border('textbox', 1, item) - self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20) + self.place_text('textbox', str(item.index+1), + item.x0, item.y1, 20) for child in item: render(child) elif isinstance(item, LTChar): self.place_border('char', 1, item) - self.place_text('char', item.get_text(), item.x0, item.y1, item.size) + self.place_text('char', item.get_text(), item.x0, + item.y1, item.size) else: if isinstance(item, LTTextLine): for child in item: @@ -409,13 +420,15 @@ def render(item): if self.layoutmode != 'loose': self.put_newline() elif isinstance(item, LTTextBox): - self.begin_div('textbox', 1, item.x0, item.y1, item.width, item.height, + self.begin_div('textbox', 1, item.x0, item.y1, + item.width, item.height, item.get_writing_mode()) for child in item: render(child) self.end_div('textbox') elif isinstance(item, LTChar): - self.put_text(item.get_text(), item.fontname, item.size) + self.put_text(item.get_text(), item.fontname, + item.size) elif isinstance(item, LTText): self.write_text(item.get_text()) return @@ -428,15 +441,14 @@ def close(self): return -## XMLConverter -## class XMLConverter(PDFConverter): CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]') def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, imagewriter=None, stripcontrol=False): - PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) + PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, + laparams=laparams) self.imagewriter = imagewriter self.stripcontrol = stripcontrol self.write_header() @@ -470,7 +482,7 @@ def receive_layout(self, ltpage): def show_group(item): if isinstance(item, LTTextBox): self.write('\n' % - (item.index, bbox2str(item.bbox))) + (item.index, bbox2str(item.bbox))) elif isinstance(item, LTTextGroup): self.write('\n' % bbox2str(item.bbox)) for child in item: @@ -481,7 +493,7 @@ def show_group(item): def render(item): if isinstance(item, LTPage): self.write('\n' % - (item.pageid, bbox2str(item.bbox), item.rotate)) + (item.pageid, bbox2str(item.bbox), item.rotate)) for child in item: render(child) if item.groups is not None: @@ -492,16 +504,17 @@ def render(item): self.write('\n') elif isinstance(item, LTLine): self.write('\n' % - (item.linewidth, bbox2str(item.bbox))) + (item.linewidth, bbox2str(item.bbox))) elif isinstance(item, LTRect): self.write('\n' % - (item.linewidth, bbox2str(item.bbox))) + (item.linewidth, bbox2str(item.bbox))) elif isinstance(item, LTCurve): self.write('\n' % - (item.linewidth, bbox2str(item.bbox), item.get_pts())) + (item.linewidth, bbox2str(item.bbox), + item.get_pts())) elif isinstance(item, LTFigure): self.write('
\n' % - (item.name, bbox2str(item.bbox))) + (item.name, bbox2str(item.bbox))) for child in item: render(child) self.write('
\n') @@ -515,13 +528,14 @@ def render(item): if isinstance(item, LTTextBoxVertical): wmode = ' wmode="vertical"' self.write('\n' % - (item.index, bbox2str(item.bbox), wmode)) + (item.index, bbox2str(item.bbox), wmode)) for child in item: render(child) self.write('\n') elif isinstance(item, LTChar): self.write('' % - (enc(item.fontname, None), bbox2str(item.bbox), item.size)) + (enc(item.fontname, None), bbox2str(item.bbox), + item.size)) self.write_text(item.get_text()) self.write('\n') elif isinstance(item, LTText): @@ -530,10 +544,10 @@ def render(item): if self.imagewriter is not None: name = self.imagewriter.export_image(item) self.write('\n' % - (enc(name, None), item.width, item.height)) + (enc(name, None), item.width, item.height)) else: self.write('\n' % - (item.width, item.height)) + (item.width, item.height)) else: assert False, str(('Unhandled', item)) return diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 870bd28e..9bb9e449 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,16 +1,14 @@ import re +import six from .psparser import PSLiteral from .glyphlist import glyphname2unicode from .latin_enc import ENCODING -import six # Python 2+3 compatibility STRIP_NAME = re.compile(r'[0-9]+') -## name2unicode -## def name2unicode(name): """Converts Adobe glyph names to Unicode numbers.""" if name in glyphname2unicode: @@ -21,8 +19,6 @@ def name2unicode(name): return six.unichr(int(m.group(0))) -## EncodingDB -## class EncodingDB(object): std2unicode = {} @@ -48,8 +44,8 @@ class EncodingDB(object): } @classmethod - def get_encoding(klass, name, diff=None): - cid2unicode = klass.encodings.get(name, klass.std2unicode) + def get_encoding(cls, name, diff=None): + cid2unicode = cls.encodings.get(name, cls.std2unicode) if diff: cid2unicode = cid2unicode.copy() cid = 0 diff --git a/pdfminer/fontmetrics.py b/pdfminer/fontmetrics.py index 8b3779ac..27d26cf9 100644 --- a/pdfminer/fontmetrics.py +++ b/pdfminer/fontmetrics.py @@ -1,5 +1,3 @@ - - """ Font metrics for the Adobe core 14 fonts. Font metrics are used to compute the boundary of each character @@ -11,8 +9,7 @@ """ -### BEGIN Verbatim copy of the license part - +# BEGIN Verbatim copy of the license part # # Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe # @@ -25,8 +22,7 @@ # paragraph is not modified. Adobe Systems has no responsibility or # obligation to support the use of the AFM files. # - -### END Verbatim copy of the license part +# END Verbatim copy of the license part FONT_METRICS = { 'Courier': ({'FontName': 'Courier', 'Descent': -194.0, 'FontBBox': (-6.0, -249.0, 639.0, 803.0), 'FontWeight': 'Medium', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {u' ': 600, u'!': 600, u'"': 600, u'#': 600, u'$': 600, u'%': 600, u'&': 600, u"'": 600, u'(': 600, u')': 600, u'*': 600, u'+': 600, u',': 600, u'-': 600, u'.': 600, u'/': 600, u'0': 600, u'1': 600, u'2': 600, u'3': 600, u'4': 600, u'5': 600, u'6': 600, u'7': 600, u'8': 600, u'9': 600, u':': 600, u';': 600, u'<': 600, u'=': 600, u'>': 600, u'?': 600, u'@': 600, u'A': 600, u'B': 600, u'C': 600, u'D': 600, u'E': 600, u'F': 600, u'G': 600, u'H': 600, u'I': 600, u'J': 600, u'K': 600, u'L': 600, u'M': 600, u'N': 600, u'O': 600, u'P': 600, u'Q': 600, u'R': 600, u'S': 600, u'T': 600, u'U': 600, u'V': 600, u'W': 600, u'X': 600, u'Y': 600, u'Z': 600, u'[': 600, u'\\': 600, u']': 600, u'^': 600, u'_': 600, u'`': 600, u'a': 600, u'b': 600, u'c': 600, u'd': 600, u'e': 600, u'f': 600, u'g': 600, u'h': 600, u'i': 600, u'j': 600, u'k': 600, u'l': 600, u'm': 600, u'n': 600, u'o': 600, u'p': 600, u'q': 600, u'r': 600, u's': 600, u't': 600, u'u': 600, u'v': 600, u'w': 600, u'x': 600, u'y': 600, u'z': 600, u'{': 600, u'|': 600, u'}': 600, u'~': 600, u'\xa1': 600, u'\xa2': 600, u'\xa3': 600, u'\xa4': 600, u'\xa5': 600, u'\xa6': 600, u'\xa7': 600, u'\xa8': 600, u'\xa9': 600, u'\xaa': 600, u'\xab': 600, u'\xac': 600, u'\xae': 600, u'\xaf': 600, u'\xb0': 600, u'\xb1': 600, u'\xb2': 600, u'\xb3': 600, u'\xb4': 600, u'\xb5': 600, u'\xb6': 600, u'\xb7': 600, u'\xb8': 600, u'\xb9': 600, u'\xba': 600, u'\xbb': 600, u'\xbc': 600, u'\xbd': 600, u'\xbe': 600, u'\xbf': 600, u'\xc0': 600, u'\xc1': 600, u'\xc2': 600, u'\xc3': 600, u'\xc4': 600, u'\xc5': 600, u'\xc6': 600, u'\xc7': 600, u'\xc8': 600, u'\xc9': 600, u'\xca': 600, u'\xcb': 600, u'\xcc': 600, u'\xcd': 600, u'\xce': 600, u'\xcf': 600, u'\xd0': 600, u'\xd1': 600, u'\xd2': 600, u'\xd3': 600, u'\xd4': 600, u'\xd5': 600, u'\xd6': 600, u'\xd7': 600, u'\xd8': 600, u'\xd9': 600, u'\xda': 600, u'\xdb': 600, u'\xdc': 600, u'\xdd': 600, u'\xde': 600, u'\xdf': 600, u'\xe0': 600, u'\xe1': 600, u'\xe2': 600, u'\xe3': 600, u'\xe4': 600, u'\xe5': 600, u'\xe6': 600, u'\xe7': 600, u'\xe8': 600, u'\xe9': 600, u'\xea': 600, u'\xeb': 600, u'\xec': 600, u'\xed': 600, u'\xee': 600, u'\xef': 600, u'\xf0': 600, u'\xf1': 600, u'\xf2': 600, u'\xf3': 600, u'\xf4': 600, u'\xf5': 600, u'\xf6': 600, u'\xf7': 600, u'\xf8': 600, u'\xf9': 600, u'\xfa': 600, u'\xfb': 600, u'\xfc': 600, u'\xfd': 600, u'\xfe': 600, u'\xff': 600, u'\u0100': 600, u'\u0101': 600, u'\u0102': 600, u'\u0103': 600, u'\u0104': 600, u'\u0105': 600, u'\u0106': 600, u'\u0107': 600, u'\u010c': 600, u'\u010d': 600, u'\u010e': 600, u'\u010f': 600, u'\u0110': 600, u'\u0111': 600, u'\u0112': 600, u'\u0113': 600, u'\u0116': 600, u'\u0117': 600, u'\u0118': 600, u'\u0119': 600, u'\u011a': 600, u'\u011b': 600, u'\u011e': 600, u'\u011f': 600, u'\u0122': 600, u'\u0123': 600, u'\u012a': 600, u'\u012b': 600, u'\u012e': 600, u'\u012f': 600, u'\u0130': 600, u'\u0131': 600, u'\u0136': 600, u'\u0137': 600, u'\u0139': 600, u'\u013a': 600, u'\u013b': 600, u'\u013c': 600, u'\u013d': 600, u'\u013e': 600, u'\u0141': 600, u'\u0142': 600, u'\u0143': 600, u'\u0144': 600, u'\u0145': 600, u'\u0146': 600, u'\u0147': 600, u'\u0148': 600, u'\u014c': 600, u'\u014d': 600, u'\u0150': 600, u'\u0151': 600, u'\u0152': 600, u'\u0153': 600, u'\u0154': 600, u'\u0155': 600, u'\u0156': 600, u'\u0157': 600, u'\u0158': 600, u'\u0159': 600, u'\u015a': 600, u'\u015b': 600, u'\u015e': 600, u'\u015f': 600, u'\u0160': 600, u'\u0161': 600, u'\u0162': 600, u'\u0163': 600, u'\u0164': 600, u'\u0165': 600, u'\u016a': 600, u'\u016b': 600, u'\u016e': 600, u'\u016f': 600, u'\u0170': 600, u'\u0171': 600, u'\u0172': 600, u'\u0173': 600, u'\u0178': 600, u'\u0179': 600, u'\u017a': 600, u'\u017b': 600, u'\u017c': 600, u'\u017d': 600, u'\u017e': 600, u'\u0192': 600, u'\u0218': 600, u'\u0219': 600, u'\u02c6': 600, u'\u02c7': 600, u'\u02d8': 600, u'\u02d9': 600, u'\u02da': 600, u'\u02db': 600, u'\u02dc': 600, u'\u02dd': 600, u'\u2013': 600, u'\u2014': 600, u'\u2018': 600, u'\u2019': 600, u'\u201a': 600, u'\u201c': 600, u'\u201d': 600, u'\u201e': 600, u'\u2020': 600, u'\u2021': 600, u'\u2022': 600, u'\u2026': 600, u'\u2030': 600, u'\u2039': 600, u'\u203a': 600, u'\u2044': 600, u'\u2122': 600, u'\u2202': 600, u'\u2206': 600, u'\u2211': 600, u'\u2212': 600, u'\u221a': 600, u'\u2260': 600, u'\u2264': 600, u'\u2265': 600, u'\u25ca': 600, u'\uf6c3': 600, u'\ufb01': 600, u'\ufb02': 600}), diff --git a/pdfminer/glyphlist.py b/pdfminer/glyphlist.py index 848b0971..259f4242 100644 --- a/pdfminer/glyphlist.py +++ b/pdfminer/glyphlist.py @@ -12,7 +12,7 @@ """ -# ################################################################################### +# ############################################################################# # Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated # # Permission is hereby granted, free of charge, to any person obtaining a @@ -42,7 +42,7 @@ # those concerning merchantability or fitness for a particular purpose or # non-infringement of any third party rights regarding the Adobe # materials. -# ################################################################################### +# ############################################################################# # Name: Adobe Glyph List # Table version: 2.0 # Date: September 20, 2002 @@ -4336,4 +4336,4 @@ 'zuhiragana': u'\u305A', 'zukatakana': u'\u30BA', } -#--end +# --end diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index cdef1e7f..e4a3c1b9 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -3,33 +3,43 @@ Functions that encapsulate "usual" use-cases for pdfminer, for use making bundled scripts and for using pdfminer as a module for routine tasks. """ - -import six import sys +import six -from .pdfdocument import PDFDocument -from .pdfparser import PDFParser from .pdfinterp import PDFResourceManager, PDFPageInterpreter -from .pdfdevice import PDFDevice, TagExtractor +from .pdfdevice import TagExtractor from .pdfpage import PDFPage from .converter import XMLConverter, HTMLConverter, TextConverter -from .cmapdb import CMapDB from .image import ImageWriter -def extract_text_to_fp(inf, outfp, - _py2_no_more_posargs=None, # Bloody Python2 needs a shim - output_type='text', codec='utf-8', laparams = None, - maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0, - layoutmode='normal', output_dir=None, strip_control=False, - debug=False, disable_caching=False, **other): +def extract_text_to_fp(inf, + outfp, + _py2_no_more_posargs=None, # Python2 needs a shim + output_type='text', + codec='utf-8', + laparams=None, + maxpages=0, + page_numbers=None, + password="", + scale=1.0, + rotation=0, + layoutmode='normal', + output_dir=None, + strip_control=False, + debug=False, + disable_caching=False, + **other): """ Parses text from inf-file and writes to outfp file-like object. Takes loads of optional arguments but the defaults are somewhat sane. - Beware laparams: Including an empty LAParams is not the same as passing None! - Returns nothing, acting as it does on two streams. Use StringIO to get strings. - - output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works properly. + Beware laparams: + Including an empty LAParams is not the same as passing None! + Returns nothing, acting as it does on two streams. Use StringIO + to get strings. + + output_type: May be 'text', 'xml', 'html', 'tag'. + Only 'text' works properly. codec: Text decoding codec laparams: An LAParams object from pdfminer.layout. Default is None but may not layout correctly. @@ -50,7 +60,7 @@ def extract_text_to_fp(inf, outfp, imagewriter = None if output_dir: imagewriter = ImageWriter(output_dir) - + rsrcmgr = PDFResourceManager(caching=not disable_caching) if output_type == 'text': @@ -79,6 +89,6 @@ def extract_text_to_fp(inf, outfp, caching=not disable_caching, check_extractable=True): page.rotate = (page.rotate + rotation) % 360 - interpreter.process_page(page) + interpreter.process_page(page) device.close() diff --git a/pdfminer/image.py b/pdfminer/image.py index e85815c8..bc6f07b0 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -13,8 +13,6 @@ def align32(x): return ((x+3)//4)*4 -## BMPWriter -## class BMPWriter(object): def __init__(self, fp, bits, width, height): @@ -33,9 +31,11 @@ def __init__(self, fp, bits, width, height): self.linesize = align32((self.width*self.bits+7)//8) self.datasize = self.linesize * self.height headersize = 14+40+ncols*4 - info = struct.pack('' % - (self.char_margin, self.line_margin, self.word_margin, self.all_texts)) + return ('' % + (self.char_margin, self.line_margin, + self.word_margin, self.all_texts)) -## LTItem -## class LTItem(object): - def analyze(self, laparams): + def analyze(self, _): """Perform the layout analysis.""" return -## LTText -## class LTText(object): def __repr__(self): @@ -76,8 +69,6 @@ def get_text(self): raise NotImplementedError -## LTComponent -## class LTComponent(LTItem): def __init__(self, bbox): @@ -92,10 +83,13 @@ def __repr__(self): # Disable comparison. def __lt__(self, _): raise ValueError + def __le__(self, _): raise ValueError + def __gt__(self, _): raise ValueError + def __ge__(self, _): raise ValueError @@ -121,15 +115,13 @@ def hdistance(self, obj): assert isinstance(obj, LTComponent), str(type(obj)) if self.is_hoverlap(obj): return 0 - else: - return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) + return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) def hoverlap(self, obj): assert isinstance(obj, LTComponent), str(type(obj)) if self.is_hoverlap(obj): return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) - else: - return 0 + return 0 def is_voverlap(self, obj): assert isinstance(obj, LTComponent), str(type(obj)) @@ -139,22 +131,19 @@ def vdistance(self, obj): assert isinstance(obj, LTComponent), str(type(obj)) if self.is_voverlap(obj): return 0 - else: - return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) + return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) def voverlap(self, obj): assert isinstance(obj, LTComponent), str(type(obj)) if self.is_voverlap(obj): return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) - else: - return 0 + return 0 -## LTCurve -## class LTCurve(LTComponent): - def __init__(self, linewidth, pts, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None): + def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False, + stroking_color=None, non_stroking_color=None): LTComponent.__init__(self, get_bound(pts)) self.pts = pts self.linewidth = linewidth @@ -169,27 +158,27 @@ def get_pts(self): return ','.join('%.3f,%.3f' % p for p in self.pts) -## LTLine -## class LTLine(LTCurve): - def __init__(self, linewidth, p0, p1, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None): - LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, stroking_color, non_stroking_color) + def __init__(self, linewidth, p0, p1, stroke=False, fill=False, + evenodd=False, stroking_color=None, non_stroking_color=None): + LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, + stroking_color, non_stroking_color) return -## LTRect -## class LTRect(LTCurve): - def __init__(self, linewidth, bbox, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None): + def __init__(self, linewidth, bbox, stroke=False, fill=False, + evenodd=False, stroking_color=None, non_stroking_color=None): (x0, y0, x1, y1) = bbox - LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke, fill, evenodd, stroking_color, non_stroking_color) + LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), + (x1, y1), (x0, y1)], + stroke, fill, evenodd, stroking_color, + non_stroking_color) return -## LTImage -## class LTImage(LTComponent): def __init__(self, name, stream, bbox): @@ -211,8 +200,6 @@ def __repr__(self): bbox2str(self.bbox), self.srcsize)) -## LTAnno -## class LTAnno(LTItem, LTText): def __init__(self, text): @@ -223,8 +210,6 @@ def get_text(self): return self._text -## LTChar -## class LTChar(LTComponent, LTText): def __init__(self, matrix, font, fontsize, scaling, rise, @@ -255,8 +240,8 @@ def __init__(self, matrix, font, fontsize, scaling, rise, ty = descent + rise bll = (0, ty) bur = (self.adv, ty+height) - (a, b, c, d, e, f) = self.matrix - self.upright = (0 < a*d*scaling and b*c <= 0) + (a, b, c, d, _, _) = self.matrix + self.upright = (b*c <= 0 < a*d*scaling) (x0, y0) = apply_matrix_pt(self.matrix, bll) (x1, y1) = apply_matrix_pt(self.matrix, bur) if x1 < x0: @@ -279,13 +264,14 @@ def __repr__(self): def get_text(self): return self._text - def is_compatible(self, obj): - """Returns True if two characters can coexist in the same line.""" + def is_compatible(self, _): + """Returns always True. + Was documented as: + Returns True if two characters can coexist in the same line. + """ return True -## LTContainer -## class LTContainer(LTComponent): def __init__(self, bbox): @@ -314,8 +300,6 @@ def analyze(self, laparams): return -## LTExpandableContainer -## class LTExpandableContainer(LTContainer): def __init__(self): @@ -329,8 +313,6 @@ def add(self, obj): return -## LTTextContainer -## class LTTextContainer(LTExpandableContainer, LTText): def __init__(self): @@ -339,11 +321,10 @@ def __init__(self): return def get_text(self): - return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText)) + return ''.join(obj.get_text() + for obj in self if isinstance(obj, LTText)) -## LTTextLine -## class LTTextLine(LTTextContainer): def __init__(self, word_margin): @@ -417,13 +398,10 @@ def find_neighbors(self, plane, ratio): abs(obj.y1-self.y1) < d))] -## LTTextBox -## -## A set of text objects that are grouped within -## a certain rectangular area. -## class LTTextBox(LTTextContainer): - + """ + A set of text objects that are grouped within a certain rectangular area. + """ def __init__(self): LTTextContainer.__init__(self) self.index = -1 @@ -457,8 +435,6 @@ def get_writing_mode(self): return 'tb-rl' -## LTTextGroup -## class LTTextGroup(LTTextContainer): def __init__(self, objs): @@ -489,8 +465,6 @@ def analyze(self, laparams): return -## LTLayoutContainer -## class LTLayoutContainer(LTContainer): def __init__(self, bbox): @@ -516,7 +490,8 @@ def group_objects(self, laparams, objs): # (char_margin) halign = (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and - (min(obj0.height, obj1.height) * laparams.line_overlap < + (min(obj0.height, obj1.height) * + laparams.line_overlap < obj0.voverlap(obj1)) and (obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin)) @@ -538,13 +513,15 @@ def group_objects(self, laparams, objs): valign = (laparams.detect_vertical and obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and - (min(obj0.width, obj1.width) * laparams.line_overlap < + (min(obj0.width, obj1.width) * + laparams.line_overlap < obj0.hoverlap(obj1)) and (obj0.vdistance(obj1) < - max(obj0.height, obj1.height) * laparams.char_margin)) + max(obj0.height, obj1.height) * + laparams.char_margin)) if ((halign and isinstance(line, LTTextLineHorizontal)) or - (valign and isinstance(line, LTTextLineVertical))): + (valign and isinstance(line, LTTextLineVertical))): line.add(obj1) elif line is not None: yield line @@ -577,7 +554,8 @@ def group_textlines(self, laparams, lines): boxes = {} for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) - if line not in neighbors: continue + if line not in neighbors: + continue members = [] for obj1 in neighbors: members.append(obj1) @@ -592,7 +570,8 @@ def group_textlines(self, laparams, lines): boxes[obj] = box done = set() for line in lines: - if line not in boxes: continue + if line not in boxes: + continue box = boxes[line] if box in done: continue @@ -621,7 +600,8 @@ def dist(obj1, obj2): y0 = min(obj1.y0, obj2.y0) x1 = max(obj1.x1, obj2.x1) y1 = max(obj1.y1, obj2.y1) - return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height) + return (x1-x0)*(y1-y0) - \ + obj1.width*obj1.height - obj2.width*obj2.height def isany(obj1, obj2): """Check if there's any other object between obj1 and obj2. @@ -634,8 +614,8 @@ def isany(obj1, obj2): return objs.difference((obj1, obj2)) def key_obj(t): - (c,d,_,_) = t - return (c,d) + (c, d, _, _) = t + return c, d # XXX this still takes O(n^2) :( dists = [] @@ -654,14 +634,14 @@ def key_obj(t): dists.append((1, d, obj1, obj2)) continue if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or - isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): + isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): group = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) plane.remove(obj1) plane.remove(obj2) - dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists - if (obj1 in plane and obj2 in plane) ] + dists = [(c, d, obj1, obj2) for (c, d, obj1, obj2) in dists + if (obj1 in plane and obj2 in plane)] for other in plane: dists.append((0, dist(group, other), group, other)) dists = csort(dists, key=key_obj) @@ -672,7 +652,8 @@ def key_obj(t): def analyze(self, laparams): # textobjs is a list of LTChar objects, i.e. # it has all the individual characters in the page. - (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self) + (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), + self) for obj in otherobjs: obj.analyze(laparams) if not textobjs: @@ -682,7 +663,7 @@ def analyze(self, laparams): for obj in empties: obj.analyze(laparams) textboxes = list(self.group_textlines(laparams, textlines)) - if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 and textboxes: + if -1 <= laparams.boxes_flow <= +1 and textboxes: self.groups = self.group_textboxes(laparams, textboxes) assigner = IndexAssigner() for group in self.groups: @@ -692,24 +673,22 @@ def analyze(self, laparams): else: def getkey(box): if isinstance(box, LTTextBoxVertical): - return (0, -box.x1, box.y0) - else: - return (1, box.y0, box.x0) + return 0, -box.x1, box.y0 + return 1, box.y0, box.x0 textboxes.sort(key=getkey) self._objs = textboxes + otherobjs + empties return -## LTFigure -## class LTFigure(LTLayoutContainer): def __init__(self, name, bbox, matrix): self.name = name self.matrix = matrix (x, y, w, h) = bbox - bbox = get_bound(apply_matrix_pt(matrix, (p, q)) - for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h))) + bbox = get_bound( + apply_matrix_pt(matrix, (p, q)) + for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h))) LTLayoutContainer.__init__(self, bbox) return @@ -725,10 +704,7 @@ def analyze(self, laparams): return -## LTPage -## class LTPage(LTLayoutContainer): - def __init__(self, pageid, bbox, rotate=0): LTLayoutContainer.__init__(self, bbox) self.pageid = pageid diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index 078ac040..adb97809 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -1,16 +1,13 @@ +import logging from io import BytesIO +import six -import six #Python 2+3 compatibility - -import logging class CorruptDataError(Exception): pass -## LZWDecoder -## class LZWDecoder(object): def __init__(self, fp): @@ -98,5 +95,5 @@ def run(self): # lzwdecode def lzwdecode(data): fp = BytesIO(data) - s=LZWDecoder(fp).run() + s = LZWDecoder(fp).run() return b''.join(s) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index 6fe6eaa2..9de732b8 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -1,10 +1,8 @@ +import six from .psparser import LIT -import six #Python 2+3 compatibility -## PDFColorSpace -## LITERAL_DEVICE_GRAY = LIT('DeviceGray') LITERAL_DEVICE_RGB = LIT('DeviceRGB') LITERAL_DEVICE_CMYK = LIT('DeviceCMYK') @@ -12,26 +10,24 @@ class PDFColorSpace(object): - def __init__(self, name, ncomponents): - self.name = name + def __init__(self, my_name, ncomponents): + self.name = my_name self.ncomponents = ncomponents return def __repr__(self): - return '' % (self.name, self.ncomponents) + return '' % \ + (self.name, self.ncomponents) PREDEFINED_COLORSPACE = {} -for (name, n) in six.iteritems({ - 'CalRGB': 3, - 'CalGray': 1, - 'Lab': 3, - 'DeviceRGB': 3, - 'DeviceCMYK': 4, - 'DeviceGray': 1, - 'Separation': 1, - 'Indexed': 1, - 'Pattern': 1, -}) : - PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n) - \ No newline at end of file +for (name, n) in six.iteritems({'CalRGB': 3, + 'CalGray': 1, + 'Lab': 3, + 'DeviceRGB': 3, + 'DeviceCMYK': 4, + 'DeviceGray': 1, + 'Separation': 1, + 'Indexed': 1, + 'Pattern': 1, }): + PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 94351016..3936af1d 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,10 +1,8 @@ from .pdffont import PDFUnicodeNotDefined - from . import utils -## PDFDevice -## + class PDFDevice(object): def __init__(self, rsrcmgr): @@ -59,8 +57,6 @@ def render_string(self, textstate, seq): return -## PDFTextDevice -## class PDFTextDevice(PDFDevice): def render_string(self, textstate, seq): @@ -84,8 +80,8 @@ def render_string(self, textstate, seq): scaling, charspace, wordspace, rise, dxscale) return - def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + def render_string_horizontal(self, seq, matrix, pos, font, fontsize, + scaling, charspace, wordspace, rise, dxscale): (x, y) = pos needcharspace = False for obj in seq: @@ -96,15 +92,16 @@ def render_string_horizontal(self, seq, matrix, pos, for cid in font.decode(obj): if needcharspace: x += charspace - x += self.render_char(utils.translate_matrix(matrix, (x, y)), + x += self.render_char(utils.translate_matrix(matrix, + (x, y)), font, fontsize, scaling, rise, cid) if cid == 32 and wordspace: x += wordspace needcharspace = True - return (x, y) + return x, y - def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + def render_string_vertical(self, seq, matrix, pos, font, fontsize, scaling, + charspace, wordspace, rise, dxscale): (x, y) = pos needcharspace = False for obj in seq: @@ -115,19 +112,18 @@ def render_string_vertical(self, seq, matrix, pos, for cid in font.decode(obj): if needcharspace: y += charspace - y += self.render_char(utils.translate_matrix(matrix, (x, y)), + y += self.render_char(utils.translate_matrix(matrix, + (x, y)), font, fontsize, scaling, rise, cid) if cid == 32 and wordspace: y += wordspace needcharspace = True - return (x, y) + return x, y def render_char(self, matrix, font, fontsize, scaling, rise, cid): return 0 -## TagExtractor -## class TagExtractor(PDFDevice): def __init__(self, rsrcmgr, outfp, codec='utf-8'): @@ -157,7 +153,8 @@ def render_string(self, textstate, seq): return def begin_page(self, page, ctm): - output = '' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate) + output = '' % \ + (self.pageno, utils.bbox2str(page.mediabox), page.rotate) self.outfp.write(utils.make_compat_bytes(output)) return @@ -169,7 +166,8 @@ def end_page(self, page): def begin_tag(self, tag, props=None): s = '' if isinstance(props, dict): - s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) + s = ''.join(' %s="%s"' % + (utils.enc(k), utils.enc(str(v))) for (k, v) in sorted(props.iteritems())) out_s = '<%s%s>' % (utils.enc(tag.name), s) self.outfp.write(utils.make_compat_bytes(out_s)) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 5fb9cce4..927dfc12 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -2,8 +2,7 @@ import re import struct import logging - -import six # Python 2+3 compatibility +import six try: import hashlib as md5 except ImportError: @@ -39,34 +38,37 @@ log = logging.getLogger(__name__) -## Exceptions -## + class PDFNoValidXRef(PDFSyntaxError): pass + class PDFNoOutlines(PDFException): pass + class PDFDestinationNotFound(PDFException): pass + class PDFEncryptionError(PDFException): pass + class PDFPasswordIncorrect(PDFEncryptionError): pass + class PDFTextExtractionNotAllowed(PDFEncryptionError): pass + # some predefined literals and keywords. LITERAL_OBJSTM = LIT('ObjStm') LITERAL_XREF = LIT('XRef') LITERAL_CATALOG = LIT('Catalog') -## XRefs -## class PDFBaseXRef(object): def get_trailer(self): @@ -82,8 +84,6 @@ def get_pos(self, objid): raise KeyError(objid) -## PDFXRef -## class PDFXRef(PDFBaseXRef): def __init__(self): @@ -109,14 +109,16 @@ def load(self, parser): break f = line.strip().split(b' ') if len(f) != 2: - raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) + raise PDFNoValidXRef('Trailer not found: %r: line=%r' % + (parser, line)) try: if six.PY2: (start, nobjs) = map(long, f) else: (start, nobjs) = map(int, f) except ValueError: - raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) + raise PDFNoValidXRef('Invalid line: %r: line=%r' % + (parser, line)) for objid in range(start, start+nobjs): try: (_, line) = parser.nextline() @@ -124,11 +126,13 @@ def load(self, parser): raise PDFNoValidXRef('Unexpected EOF - file corrupted?') f = line.strip().split(b' ') if len(f) != 3: - raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) + raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % + (parser, line)) (pos, genno, use) = f if use != b'n': continue - self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno)) + self.offsets[objid] = \ + (None, long(pos) if six.PY2 else int(pos), int(genno)) log.info('xref objects: %r', self.offsets) self.load_trailer(parser) return @@ -160,8 +164,6 @@ def get_pos(self, objid): raise -## PDFXRefFallback -## class PDFXRefFallback(PDFXRef): def __repr__(self): @@ -182,7 +184,7 @@ def load(self, parser): log.info('trailer: %r', self.trailer) break if six.PY3: - line=line.decode('latin-1') #default pdf encoding + line = line.decode('latin-1') # default pdf encoding m = self.PDFOBJ_CUE.match(line) if not m: continue @@ -193,7 +195,8 @@ def load(self, parser): # expand ObjStm. parser.seek(pos) (_, obj) = parser.nextobject() - if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM: + if isinstance(obj, PDFStream) and obj.get('Type') is\ + LITERAL_OBJSTM: stream = stream_value(obj) try: n = stream['N'] @@ -216,8 +219,6 @@ def load(self, parser): return -## PDFXRefStream -## class PDFXRefStream(PDFBaseXRef): def __init__(self): @@ -228,14 +229,15 @@ def __init__(self): return def __repr__(self): - return '' % (self.ranges) + return '' % self.ranges def load(self, parser): (_, objid) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() (_, stream) = parser.nextobject() - if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF: + if not isinstance(stream, PDFStream) or stream['Type'] \ + is not LITERAL_XREF: raise PDFNoValidXRef('Invalid PDF stream spec.') size = stream['Size'] index_array = stream.get('Index', (0, size)) @@ -267,7 +269,7 @@ def get_objids(self): def get_pos(self, objid): index = 0 for (start, nobjs) in self.ranges: - if start <= objid and objid < start+nobjs: + if start <= objid < start+nobjs: index += objid - start break else: @@ -288,8 +290,6 @@ def get_pos(self, objid): raise KeyError(objid) -## PDFSecurityHandler -## class PDFStandardSecurityHandler(object): PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' @@ -306,7 +306,8 @@ def __init__(self, docid, param, password=''): def init(self): self.init_params() if self.r not in self.supported_revisions: - raise PDFEncryptionError('Unsupported revision: param=%r' % self.param) + raise PDFEncryptionError('Unsupported revision:' + ' param=%r' % self.param) self.init_key() return @@ -412,7 +413,8 @@ def decrypt(self, objid, genno, data, attrs=None): return self.decrypt_rc4(objid, genno, data) def decrypt_rc4(self, objid, genno, data): - key = self.key + struct.pack('H', fp.read(2))[0] + value = b1 << 24 | b2 << 16 |\ + struct.unpack('>H', fp.read(2))[0] stack.append(value) return d @@ -180,84 +175,84 @@ def getdict(data): class CFFFont(object): STANDARD_STRINGS = ( - '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign', - 'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft', - 'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period', - 'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', - 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal', - 'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G', - 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash', - 'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a', - 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', - 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - 'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown', - 'cent', 'sterling', 'fraction', 'yen', 'florin', 'section', - 'currency', 'quotesingle', 'quotedblleft', 'guillemotleft', - 'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash', - 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet', - 'quotesinglbase', 'quotedblbase', 'quotedblright', - 'guillemotright', 'ellipsis', 'perthousand', 'questiondown', - 'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve', - 'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut', - 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash', - 'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash', - 'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu', - 'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn', - 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn', - 'threequarters', 'twosuperior', 'registered', 'minus', 'eth', - 'multiply', 'threesuperior', 'copyright', 'Aacute', - 'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde', - 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave', - 'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde', - 'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde', - 'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave', - 'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex', - 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute', - 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex', - 'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex', - 'odieresis', 'ograve', 'otilde', 'scaron', 'uacute', - 'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis', - 'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle', - 'dollarsuperior', 'ampersandsmall', 'Acutesmall', - 'parenleftsuperior', 'parenrightsuperior', 'twodotenleader', - 'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle', - 'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle', - 'sevenoldstyle', 'eightoldstyle', 'nineoldstyle', - 'commasuperior', 'threequartersemdash', 'periodsuperior', - 'questionsmall', 'asuperior', 'bsuperior', 'centsuperior', - 'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior', - 'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior', - 'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior', - 'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall', - 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall', - 'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall', - 'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall', - 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall', - 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall', - 'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall', - 'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall', - 'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior', - 'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall', - 'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths', - 'onethird', 'twothirds', 'zerosuperior', 'foursuperior', - 'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior', - 'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior', - 'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior', - 'seveninferior', 'eightinferior', 'nineinferior', - 'centinferior', 'dollarinferior', 'periodinferior', - 'commainferior', 'Agravesmall', 'Aacutesmall', - 'Acircumflexsmall', 'Atildesmall', 'Adieresissmall', - 'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall', - 'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall', - 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall', - 'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall', - 'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall', - 'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall', - 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall', - 'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', - '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book', - 'Light', 'Medium', 'Regular', 'Roman', 'Semibold', + '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign', + 'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft', + 'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period', + 'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', + 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal', + 'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', + 'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash', + 'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a', + 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + 'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown', + 'cent', 'sterling', 'fraction', 'yen', 'florin', 'section', + 'currency', 'quotesingle', 'quotedblleft', 'guillemotleft', + 'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash', + 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet', + 'quotesinglbase', 'quotedblbase', 'quotedblright', + 'guillemotright', 'ellipsis', 'perthousand', 'questiondown', + 'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve', + 'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut', + 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash', + 'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash', + 'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu', + 'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn', + 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn', + 'threequarters', 'twosuperior', 'registered', 'minus', 'eth', + 'multiply', 'threesuperior', 'copyright', 'Aacute', + 'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde', + 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave', + 'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde', + 'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde', + 'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave', + 'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex', + 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute', + 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex', + 'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex', + 'odieresis', 'ograve', 'otilde', 'scaron', 'uacute', + 'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis', + 'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle', + 'dollarsuperior', 'ampersandsmall', 'Acutesmall', + 'parenleftsuperior', 'parenrightsuperior', 'twodotenleader', + 'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle', + 'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle', + 'sevenoldstyle', 'eightoldstyle', 'nineoldstyle', + 'commasuperior', 'threequartersemdash', 'periodsuperior', + 'questionsmall', 'asuperior', 'bsuperior', 'centsuperior', + 'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior', + 'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior', + 'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior', + 'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall', + 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall', + 'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall', + 'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall', + 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall', + 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall', + 'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall', + 'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall', + 'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior', + 'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall', + 'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths', + 'onethird', 'twothirds', 'zerosuperior', 'foursuperior', + 'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior', + 'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior', + 'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior', + 'seveninferior', 'eightinferior', 'nineinferior', + 'centinferior', 'dollarinferior', 'periodinferior', + 'commainferior', 'Agravesmall', 'Aacutesmall', + 'Acircumflexsmall', 'Atildesmall', 'Adieresissmall', + 'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall', + 'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall', + 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall', + 'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall', + 'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall', + 'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall', + 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall', + 'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', + '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book', + 'Light', 'Medium', 'Regular', 'Roman', 'Semibold', ) class INDEX(object): @@ -289,7 +284,8 @@ def __init__(self, name, fp): self.name = name self.fp = fp # Header - (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4)) + (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', + self.fp.read(4)) self.fp.read(hdrsize-4) # Name INDEX self.name_index = self.INDEX(self.fp) @@ -316,7 +312,8 @@ def __init__(self, name, fp): if format == b'\x00': # Format 0 (n,) = struct.unpack('B', self.fp.read(1)) - for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))): + for (code, gid) in enumerate(struct.unpack('B'*n, + self.fp.read(n))): self.code2gid[code] = gid self.gid2code[gid] = code elif format == b'\x01': @@ -339,7 +336,8 @@ def __init__(self, name, fp): if format == b'\x00': # Format 0 n = self.nglyphs-1 - for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))): + for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, + self.fp.read(2*n))): gid += 1 name = self.getstr(sid) self.name2gid[name] = gid @@ -360,9 +358,9 @@ def __init__(self, name, fp): assert False, str(('Unhandled', format)) else: raise ValueError('unsupported charset format: %r' % format) - #print self.code2gid - #print self.name2gid - #assert 0 + # print self.code2gid + # print self.name2gid + # assert 0 return def getstr(self, sid): @@ -371,8 +369,6 @@ def getstr(self, sid): return self.string_index[sid-len(self.STANDARD_STRINGS)] -## TrueTypeFont -## class TrueTypeFont(object): class CMapNotFound(Exception): @@ -405,7 +401,8 @@ def create_unicode_map(self): fp.seek(base_offset+st_offset) (fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6)) if fmttype == 0: - char2gid.update(enumerate(struct.unpack('>256B', fp.read(256)))) + char2gid.update(enumerate(struct.unpack('>256B', + fp.read(256)))) elif fmttype == 2: subheaderkeys = struct.unpack('>256H', fp.read(512)) firstbytes = [0]*8192 @@ -414,8 +411,10 @@ def create_unicode_map(self): nhdrs = max(subheaderkeys)//8 + 1 hdrs = [] for i in range(nhdrs): - (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8)) - hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset)) + (firstcode, entcount, delta, offset) = struct.unpack( + '>HHhH', fp.read(8)) + hdrs.append((i, firstcode, entcount, + delta, fp.tell()-2+offset)) for (i, firstcode, entcount, delta, pos) in hdrs: if not entcount: continue @@ -439,7 +438,9 @@ def create_unicode_map(self): if idr: fp.seek(pos+idr) for c in range(sc, ec+1): - char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff + char2gid[c] = ( + struct.unpack('>H', + fp.read(2))[0] + idd) & 0xffff else: for c in range(sc, ec+1): char2gid[c] = (c + idd) & 0xffff @@ -452,8 +453,6 @@ def create_unicode_map(self): return unicode_map -## Fonts -## class PDFFontError(PDFException): pass @@ -461,11 +460,11 @@ class PDFFontError(PDFException): class PDFUnicodeNotDefined(PDFFontError): pass + LITERAL_STANDARD_ENCODING = LIT('StandardEncoding') LITERAL_TYPE1C = LIT('Type1C') -# PDFFont class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): @@ -478,7 +477,8 @@ def __init__(self, descriptor, widths, default_width=None): self.ascent = num_value(descriptor.get('Ascent', 0)) self.descent = num_value(descriptor.get('Descent', 0)) self.italic_angle = num_value(descriptor.get('ItalicAngle', 0)) - self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0)) + self.default_width = default_width or num_value(descriptor.get( + 'MissingWidth', 0)) self.leading = num_value(descriptor.get('Leading', 0)) self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0))) self.hscale = self.vscale = .001 @@ -542,7 +542,8 @@ def __init__(self, descriptor, widths, spec): else: encoding = LITERAL_STANDARD_ENCODING if isinstance(encoding, dict): - name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) + name = literal_name(encoding.get('BaseEncoding', + LITERAL_STANDARD_ENCODING)) diff = list_value(encoding.get('Differences', [])) self.cid2unicode = EncodingDB.get_encoding(name, diff) else: @@ -582,7 +583,7 @@ def __init__(self, rsrcmgr, spec): except KeyError: descriptor = dict_value(spec.get('FontDescriptor', {})) firstchar = int_value(spec.get('FirstChar', 0)) - #lastchar = int_value(spec.get('LastChar', 255)) + # lastchar = int_value(spec.get('LastChar', 255)) widths = list_value(spec.get('Widths', [0]*256)) widths = dict((i+firstchar, w) for (i, w) in enumerate(widths)) PDFSimpleFont.__init__(self, descriptor, widths, spec) @@ -599,19 +600,17 @@ def __repr__(self): return '' % self.basefont -# PDFTrueTypeFont class PDFTrueTypeFont(PDFType1Font): def __repr__(self): return '' % self.basefont -# PDFType3Font class PDFType3Font(PDFSimpleFont): - def __init__(self, rsrcmgr, spec): + def __init__(self, _, spec): firstchar = int_value(spec.get('FirstChar', 0)) - #lastchar = int_value(spec.get('LastChar', 0)) + # lastchar = int_value(spec.get('LastChar', 0)) widths = list_value(spec.get('Widths', [0]*256)) widths = dict((i+firstchar, w) for (i, w) in enumerate(widths)) if 'FontDescriptor' in spec: @@ -629,7 +628,6 @@ def __repr__(self): return '' -# PDFCIDFont class PDFCIDFont(PDFFont): def __init__(self, rsrcmgr, spec, strict=settings.STRICT): @@ -640,8 +638,11 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) - self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), - resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) + self.cidcoding = '%s-%s' % ( + resolve1(self.cidsysteminfo.get('Registry', + b'unknown')).decode("latin1"), + resolve1(self.cidsysteminfo.get('Ordering', + b'unknown')).decode("latin1")) try: name = literal_name(spec['Encoding']) except KeyError: @@ -678,15 +679,17 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): pass else: try: - self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical()) - except CMapDB.CMapNotFound as e: + self.unicode_map = CMapDB.get_unicode_map( + self.cidcoding, self.cmap.is_vertical()) + except CMapDB.CMapNotFound: pass self.vertical = self.cmap.is_vertical() if self.vertical: # writing mode: vertical widths = get_widths2(list_value(spec.get('W2', []))) - self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in six.iteritems(widths)) + self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in + six.iteritems(widths)) (vy, w) = spec.get('DW2', [880, -1000]) self.default_disp = (None, vy) widths = dict((cid, w) for (cid, (w, _)) in six.iteritems(widths)) @@ -701,7 +704,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): return def __repr__(self): - return '' % (self.basefont, self.cidcoding) + return '' %\ + (self.basefont, self.cidcoding) def is_vertical(self): return self.vertical @@ -713,7 +717,9 @@ def decode(self, bytes): return self.cmap.decode(bytes) def char_disp(self, cid): - "Returns an integer for horizontal fonts, a tuple for vertical fonts." + """ + Returns an integer for horizontal fonts, a tuple for vertical fonts. + """ return self.disps.get(cid, self.default_disp) def to_unichr(self, cid): @@ -725,15 +731,15 @@ def to_unichr(self, cid): raise PDFUnicodeNotDefined(self.cidcoding, cid) -# main def main(argv): for fname in argv[1:]: fp = open(fname, 'rb') - #font = TrueTypeFont(fname, fp) + # font = TrueTypeFont(fname, fp) font = CFFFont(fname, fp) - print (font) + print(font) fp.close() return + if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 0c2328d3..e3a6aed2 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -2,6 +2,7 @@ import re import logging from io import BytesIO +import six from .cmapdb import CMapDB from .cmapdb import CMap from .psparser import PSTypeError @@ -31,28 +32,25 @@ from .utils import mult_matrix from .utils import MATRIX_IDENTITY -import six # Python 2+3 compatibility log = logging.getLogger(__name__) -## Exceptions -## + class PDFResourceError(PDFException): pass + class PDFInterpreterError(PDFException): pass -## Constants -## + LITERAL_PDF = LIT('PDF') LITERAL_TEXT = LIT('Text') LITERAL_FONT = LIT('Font') LITERAL_FORM = LIT('Form') LITERAL_IMAGE = LIT('Image') -## PDFTextState -## + class PDFTextState(object): def __init__(self): @@ -70,7 +68,8 @@ def __init__(self): return def __repr__(self): - return ('' % (self.font, self.fontsize, self.charspace, self.wordspace, @@ -97,8 +96,6 @@ def reset(self): return -## PDFGraphicState -## class PDFGraphicState(object): def __init__(self): @@ -139,10 +136,7 @@ def __repr__(self): self.scolor, self.ncolor)) -## Resource Manager -## class PDFResourceManager(object): - """Repository of shared resources. ResourceManager facilitates reuse of shared resources @@ -162,7 +156,7 @@ def get_procset(self, procs): elif proc is LITERAL_TEXT: pass else: - #raise PDFResourceError('ProcSet %r is not supported.' % proc) + # raise PDFResourceError('ProcSet %r is not supported.' % proc) pass return @@ -219,8 +213,6 @@ def get_font(self, objid, spec): return font -## PDFContentParser -## class PDFContentParser(PSStackParser): def __init__(self, streams): @@ -264,20 +256,22 @@ def get_inline_data(self, pos, target=b'EI'): while i <= len(target): self.fillbuf() if i: - c = six.indexbytes(self.buf,self.charpos) - c=six.int2byte(c) + c = six.indexbytes(self.buf, self.charpos) + c = six.int2byte(c) data += c self.charpos += 1 if len(target) <= i and c.isspace(): i += 1 - elif i < len(target) and c == (six.int2byte(target[i]) if six.PY3 else target[i]): + elif i < len(target) \ + and c == (six.int2byte(target[i]) + if six.PY3 else target[i]): i += 1 else: i = 0 else: try: j = self.buf.index(target[0], self.charpos) - #print 'found', (0, self.buf[j:j+10]) + # print 'found', (0, self.buf[j:j+10]) data += self.buf[self.charpos:j+1] self.charpos = j+1 i = 1 @@ -286,7 +280,7 @@ def get_inline_data(self, pos, target=b'EI'): self.charpos = len(self.buf) data = data[:-(len(target)+1)] # strip the last part data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data) - return (pos, data) + return pos, data def flush(self): self.add_results(*self.popall()) @@ -304,7 +298,8 @@ def do_keyword(self, pos, token): try: (_, objs) = self.end_type('inline') if len(objs) % 2 != 0: - raise PSTypeError('Invalid dictionary construct: %r' % objs) + raise PSTypeError('Invalid dictionary construct: %r' % + objs) d = dict((literal_name(k), v) for (k, v) in choplist(2, objs)) (pos, data) = self.get_inline_data(pos+len(b'ID ')) obj = PDFStream(d, data) @@ -318,8 +313,6 @@ def do_keyword(self, pos, token): return -## Interpreter -## class PDFPageInterpreter(object): def __init__(self, rsrcmgr, device): @@ -330,9 +323,8 @@ def __init__(self, rsrcmgr, device): def dup(self): return self.__class__(self.rsrcmgr, self.device) - # init_resources(resources): - # Prepare the fonts and XObjects listed in the Resource attribute. def init_resources(self, resources): + """Prepare the fonts and XObjects listed in the Resource attribute.""" self.resources = resources self.fontmap = {} self.xobjmap = {} @@ -345,9 +337,11 @@ def get_colorspace(spec): name = literal_name(spec[0]) else: name = literal_name(spec) - if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): + if name == 'ICCBased' and \ + isinstance(spec, list) and len(spec) >= 2: return PDFColorSpace(name, stream_value(spec[1])['N']) - elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): + elif name == 'DeviceN' and\ + isinstance(spec, list) and len(spec) >= 2: return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE.get(name) @@ -370,9 +364,8 @@ def get_colorspace(spec): self.xobjmap[xobjid] = xobjstrm return - # init_state(ctm) - # Initialize the text and graphic states for rendering a page. def init_state(self, ctm): + """Initialize the text and graphic states for rendering a page.""" # gstack: stack for graphical states. self.gstack = [] self.ctm = ctm @@ -400,7 +393,7 @@ def pop(self, n): return x def get_current_state(self): - return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) + return self.ctm, self.textstate.copy(), self.graphicstate.copy() def set_current_state(self, state): (self.ctm, self.textstate, self.graphicstate) = state @@ -460,8 +453,8 @@ def do_i(self, flatness): return # load-gstate - def do_gs(self, name): - #XXX + def do_gs(self, _): + # XXX return # moveto @@ -505,7 +498,8 @@ def do_re(self, x, y, w, h): # stroke def do_S(self): - self.device.paint_path(self.graphicstate, True, False, False, self.curpath) + self.device.paint_path(self.graphicstate, True, False, False, + self.curpath) self.curpath = [] return @@ -517,7 +511,8 @@ def do_s(self): # fill def do_f(self): - self.device.paint_path(self.graphicstate, False, True, False, self.curpath) + self.device.paint_path(self.graphicstate, False, True, False, + self.curpath) self.curpath = [] return # fill (obsolete) @@ -525,19 +520,22 @@ def do_f(self): # fill-even-odd def do_f_a(self): - self.device.paint_path(self.graphicstate, False, True, True, self.curpath) + self.device.paint_path(self.graphicstate, False, True, True, + self.curpath) self.curpath = [] return # fill-and-stroke def do_B(self): - self.device.paint_path(self.graphicstate, True, True, False, self.curpath) + self.device.paint_path(self.graphicstate, True, True, False, + self.curpath) self.curpath = [] return # fill-and-stroke-even-odd def do_B_a(self): - self.device.paint_path(self.graphicstate, True, True, True, self.curpath) + self.device.paint_path(self.graphicstate, True, True, True, + self.curpath) self.curpath = [] return @@ -587,37 +585,37 @@ def do_cs(self, name): # setgray-stroking def do_G(self, gray): self.graphicstate.color = gray - #self.do_CS(LITERAL_DEVICE_GRAY) + # self.do_CS(LITERAL_DEVICE_GRAY) return # setgray-non-stroking def do_g(self, gray): self.graphicstate.color = gray - #self.do_cs(LITERAL_DEVICE_GRAY) + # self.do_cs(LITERAL_DEVICE_GRAY) return # setrgb-stroking def do_RG(self, r, g, b): self.graphicstate.color = (r, g, b) - #self.do_CS(LITERAL_DEVICE_RGB) + # self.do_CS(LITERAL_DEVICE_RGB) return # setrgb-non-stroking def do_rg(self, r, g, b): self.graphicstate.color = (r, g, b) - #self.do_cs(LITERAL_DEVICE_RGB) + # self.do_cs(LITERAL_DEVICE_RGB) return # setcmyk-stroking def do_K(self, c, m, y, k): self.graphicstate.color = (c, m, y, k) - #self.do_CS(LITERAL_DEVICE_CMYK) + # self.do_CS(LITERAL_DEVICE_CMYK) return # setcmyk-non-stroking def do_k(self, c, m, y, k): self.graphicstate.color = (c, m, y, k) - #self.do_cs(LITERAL_DEVICE_CMYK) + # self.do_cs(LITERAL_DEVICE_CMYK) return # setcolor @@ -737,7 +735,7 @@ def do_Td(self, tx, ty): (a, b, c, d, e, f) = self.textstate.matrix self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) self.textstate.linematrix = (0, 0) - #print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate) + # print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate) return # text-move @@ -746,7 +744,7 @@ def do_TD(self, tx, ty): self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) self.textstate.leading = ty self.textstate.linematrix = (0, 0) - #print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate) + # print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate) return # textmatrix @@ -758,13 +756,15 @@ def do_Tm(self, a, b, c, d, e, f): # nextline def do_T_a(self): (a, b, c, d, e, f) = self.textstate.matrix - self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f) + self.textstate.matrix = (a, b, c, d, + self.textstate.leading*c+e, + self.textstate.leading*d+f) self.textstate.linematrix = (0, 0) return # show-pos def do_TJ(self, seq): - #print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate) + # print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate) if self.textstate.font is None: if settings.STRICT: raise PDFInterpreterError('No font specified!') @@ -824,9 +824,11 @@ def do_Do(self, xobjid): # earlier PDFs (prior to v1.2) use the page's Resources entry # instead of having their own Resources entry. xobjres = xobj.get('Resources') - resources = dict_value(xobjres) if xobjres else self.resources.copy() + resources = dict_value(xobjres) if xobjres \ + else self.resources.copy() self.device.begin_figure(xobjid, bbox, matrix) - interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm)) + interpreter.render_contents(resources, [xobj], + ctm=mult_matrix(matrix, self.ctm)) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY) @@ -877,7 +879,8 @@ def execute(self, streams): break if isinstance(obj, PSKeyword): name = keyword_name(obj) - method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q') + method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').\ + replace("'", '_q') if hasattr(self, method): func = getattr(self, method) nargs = six.get_function_code(func).co_argcount-1 @@ -891,7 +894,8 @@ def execute(self, streams): func() else: if settings.STRICT: - raise PDFInterpreterError('Unknown operator: %r' % name) + raise PDFInterpreterError('Unknown operator: %r' % + name) else: self.push(obj) return diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 418aeb24..b97eba35 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -1,5 +1,6 @@ import logging +import six from . import settings from .psparser import LIT from .pdftypes import PDFObjectNotFound @@ -11,7 +12,6 @@ from .pdfdocument import PDFDocument from .pdfdocument import PDFTextExtractionNotAllowed -import six # Python 2+3 compatibility log = logging.getLogger(__name__) @@ -19,10 +19,8 @@ LITERAL_PAGE = LIT('Page') LITERAL_PAGES = LIT('Pages') -## PDFPage -## -class PDFPage(object): +class PDFPage(object): """An object that holds the information about a page. A PDFPage object is merely a convenience class that has a set @@ -73,12 +71,13 @@ def __init__(self, doc, pageid, attrs): return def __repr__(self): - return '' % (self.resources, self.mediabox) + return '' % \ + (self.resources, self.mediabox) INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) @classmethod - def create_pages(klass, document): + def create_pages(cls, document): def search(obj, parent): if isinstance(obj, int): objid = obj @@ -87,7 +86,7 @@ def search(obj, parent): objid = obj.objid tree = dict_value(obj).copy() for (k, v) in six.iteritems(parent): - if k in klass.INHERITABLE_ATTRS and k not in tree: + if k in cls.INHERITABLE_ATTRS and k not in tree: tree[k] = v tree_type = tree.get('Type') @@ -104,8 +103,9 @@ def search(obj, parent): yield (objid, tree) pages = False if 'Pages' in document.catalog: - for (objid, tree) in search(document.catalog['Pages'], document.catalog): - yield klass(document, objid, tree) + for (objid, tree) in search(document.catalog['Pages'], + document.catalog): + yield cls(document, objid, tree) pages = True if not pages: # fallback when /Pages is missing. @@ -113,14 +113,15 @@ def search(obj, parent): for objid in xref.get_objids(): try: obj = document.getobj(objid) - if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE: - yield klass(document, objid, obj) + if isinstance(obj, dict) and obj.get('Type')\ + is LITERAL_PAGE: + yield cls(document, objid, obj) except PDFObjectNotFound: pass return @classmethod - def get_pages(klass, fp, + def get_pages(cls, fp, pagenos=None, maxpages=0, password='', caching=True, check_extractable=True): # Create a PDF parser object associated with the file object. @@ -129,9 +130,10 @@ def get_pages(klass, fp, doc = PDFDocument(parser, password=password, caching=caching) # Check if the document allows text extraction. If not, abort. if check_extractable and not doc.is_extractable: - raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) + raise PDFTextExtractionNotAllowed('Text extraction is not allowed:' + ' %r' % fp) # Process each page contained in the document. - for (pageno, page) in enumerate(klass.create_pages(doc)): + for (pageno, page) in enumerate(cls.create_pages(doc)): if pagenos and (pageno not in pagenos): continue yield page diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 1dc17d7b..702f792a 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -15,16 +15,11 @@ log = logging.getLogger(__name__) -## Exceptions -## class PDFSyntaxError(PDFException): pass -## PDFParser -## class PDFParser(PSStackParser): - """ PDFParser fetch PDF objects from a file stream. It can handle indirect references by referring to @@ -64,14 +59,11 @@ def do_keyword(self, pos, token): if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): self.add_results(*self.pop(1)) - elif token is self.KEYWORD_ENDOBJ: self.add_results(*self.pop(4)) - elif token is self.KEYWORD_NULL: # null object self.push((pos, None)) - elif token is self.KEYWORD_R: # reference to indirect object try: @@ -81,7 +73,6 @@ def do_keyword(self, pos, token): self.push((pos, obj)) except PSSyntaxError: pass - elif token is self.KEYWORD_STREAM: # stream object ((_, dic),) = self.pop(1) @@ -106,7 +97,7 @@ def do_keyword(self, pos, token): self.seek(pos+objlen) while 1: try: - (linepos, line) = self.nextline() + (_, line) = self.nextline() except PSEOF: if settings.STRICT: raise PDFSyntaxError('Unexpected EOF') @@ -122,10 +113,10 @@ def do_keyword(self, pos, token): data += line self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary - log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10]) + log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', + pos, objlen, dic, data[:10]) obj = PDFStream(dic, data, self.doc.decipher) self.push((pos, obj)) - else: # others self.push((pos, token)) @@ -133,10 +124,7 @@ def do_keyword(self, pos, token): return -## PDFStreamParser -## class PDFStreamParser(PDFParser): - """ PDFStreamParser is used to parse PDF content streams that is contained in each page and has instructions @@ -154,6 +142,7 @@ def flush(self): return KEYWORD_OBJ = KWD(b'obj') + def do_keyword(self, pos, token): if token is self.KEYWORD_R: # reference to indirect object diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 40cca46b..9eeccbff 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -1,6 +1,7 @@ import zlib import logging +import six from .lzw import lzwdecode from .ascii85 import ascii85decode from .ascii85 import asciihexdecode @@ -13,12 +14,9 @@ from .utils import apply_png_predictor from .utils import isnumber -import six #Python 2+3 compatibility - log = logging.getLogger(__name__) LITERAL_CRYPT = LIT('Crypt') - # Abbreviation of Filter names in PDF 4.8.6. "Inline Images" LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl')) LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW')) @@ -29,29 +27,30 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT')) -## PDF Objects -## class PDFObject(PSObject): pass + class PDFException(PSException): pass + class PDFTypeError(PDFException): pass + class PDFValueError(PDFException): pass + class PDFObjectNotFound(PDFException): pass + class PDFNotImplementedError(PDFException): pass -## PDFObjRef -## class PDFObjRef(PDFObject): def __init__(self, doc, objid, _): @@ -60,11 +59,11 @@ def __init__(self, doc, objid, _): raise PDFValueError('PDF object id cannot be 0.') self.doc = doc self.objid = objid - #self.genno = genno # Never used. + # self.genno = genno # Never used. return def __repr__(self): - return '' % (self.objid) + return '' % self.objid def resolve(self, default=None): try: @@ -73,7 +72,6 @@ def resolve(self, default=None): return default -# resolve def resolve1(x, default=None): """Resolves an object. @@ -179,8 +177,6 @@ def stream_value(x): return x -## PDFStream type -## class PDFStream(PDFObject): def __init__(self, attrs, rawdata, decipher=None): @@ -201,10 +197,12 @@ def set_objid(self, objid, genno): def __repr__(self): if self.data is None: assert self.rawdata is not None - return '' % (self.objid, len(self.rawdata), self.attrs) + return '' % \ + (self.objid, len(self.rawdata), self.attrs) else: assert self.data is not None - return '' % (self.objid, len(self.data), self.attrs) + return '' % \ + (self.objid, len(self.data), self.attrs) def __contains__(self, name): return name in self.attrs @@ -239,10 +237,11 @@ def get_filters(self): if hasattr(fltr, 'resolve'): fltr = fltr.resolve()[0] _filters.append(fltr) - return list(zip(_filters, params)) #solves https://github.com/pdfminer/pdfminer.six/issues/15 + return list(zip(_filters, params)) def decode(self): - assert self.data is None and self.rawdata is not None, str((self.data, self.rawdata)) + assert self.data is None and self.rawdata is not None, \ + str((self.data, self.rawdata)) data = self.rawdata if self.decipher: # Handle encryption @@ -252,14 +251,15 @@ def decode(self): self.data = data self.rawdata = None return - for (f,params) in filters: + for (f, params) in filters: if f in LITERALS_FLATE_DECODE: # will get errors if the document is encrypted. try: data = zlib.decompress(data) except zlib.error as e: if settings.STRICT: - raise PDFException('Invalid zlib bytes: %r, %r' % (e, data)) + raise PDFException('Invalid zlib bytes: %r, %r' % + (e, data)) data = b'' elif f in LITERALS_LZW_DECODE: data = lzwdecode(data) @@ -272,7 +272,8 @@ def decode(self): elif f in LITERALS_CCITTFAX_DECODE: data = ccittfaxdecode(data, params) elif f in LITERALS_DCT_DECODE: - # This is probably a JPG stream - it does not need to be decoded twice. + # This is probably a JPG stream + # it does not need to be decoded twice. # Just return the stream to the user. pass elif f == LITERAL_CRYPT: @@ -290,10 +291,13 @@ def decode(self): # PNG predictor colors = int_value(params.get('Colors', 1)) columns = int_value(params.get('Columns', 1)) - bitspercomponent = int_value(params.get('BitsPerComponent', 8)) - data = apply_png_predictor(pred, colors, columns, bitspercomponent, data) + bitspercomponent = int_value(params.get('BitsPerComponent', + 8)) + data = apply_png_predictor(pred, colors, columns, + bitspercomponent, data) else: - raise PDFNotImplementedError('Unsupported predictor: %r' % pred) + raise PDFNotImplementedError('Unsupported predictor: %r' % + pred) self.data = data self.rawdata = None return diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 9b214af0..1ee52633 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -4,8 +4,7 @@ import re import logging - -import six # Python 2+3 compatibility +import six from . import settings from .utils import choplist @@ -13,8 +12,6 @@ log = logging.getLogger(__name__) -## PS Exceptions -## class PSException(Exception): pass @@ -35,22 +32,15 @@ class PSValueError(PSException): pass -## Basic PostScript Types -## +# Basic PostScript Types -## PSObject -## class PSObject(object): - """Base class for all PS or PDF-related data types.""" pass -## PSLiteral -## class PSLiteral(PSObject): - """A class that represents a PostScript literal. Postscript literals are used as identifiers, such as @@ -66,14 +56,11 @@ def __init__(self, name): self.name = name def __repr__(self): - name=self.name + name = self.name return '/%r' % name -## PSKeyword -## class PSKeyword(PSObject): - """A class that represents a PostScript keyword. PostScript keywords are a dozen of predefined words. @@ -89,14 +76,11 @@ def __init__(self, name): return def __repr__(self): - name=self.name + name = self.name return '/%r' % name -## PSSymbolTable -## class PSSymbolTable(object): - """A utility class for storing PSLiteral/PSKeyword objects. Interned objects can be checked its identity with "is" operator. @@ -115,6 +99,7 @@ def intern(self, name): self.dict[name] = lit return lit + PSLiteralTable = PSSymbolTable(PSLiteral) PSKeywordTable = PSSymbolTable(PSKeyword) LIT = PSLiteralTable.intern @@ -132,31 +117,30 @@ def literal_name(x): if settings.STRICT: raise PSTypeError('Literal required: %r' % (x,)) else: - name=x + name = x else: - name=x.name + name = x.name if six.PY3: try: - name = str(name,'utf-8') + name = str(name, 'utf-8') except: pass return name + def keyword_name(x): if not isinstance(x, PSKeyword): if settings.STRICT: raise PSTypeError('Keyword required: %r' % x) else: - name=x + name = x else: - name=x.name + name = x.name if six.PY3: - name = str(name,'utf-8','ignore') + name = str(name, 'utf-8', 'ignore') return name -## PSBaseParser -## EOL = re.compile(br'[\r\n]') SPC = re.compile(br'\s') NONSPC = re.compile(br'\S') @@ -168,13 +152,12 @@ def keyword_name(x): END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]') END_STRING = re.compile(br'[()\134]') OCT_STRING = re.compile(br'[0-7]') -ESC_STRING = {b'b': 8, b't': 9, b'n': 10, b'f': 12, b'r': 13, b'(': 40, b')': 41, b'\\': 92} +ESC_STRING = {b'b': 8, b't': 9, b'n': 10, b'f': 12, b'r': 13, + b'(': 40, b')': 41, b'\\': 92} class PSBaseParser(object): - - """Most basic PostScript parser that performs only tokenization. - """ + """Most basic PostScript parser that performs only tokenization.""" BUFSIZ = 4096 def __init__(self, fp): @@ -183,7 +166,8 @@ def __init__(self, fp): return def __repr__(self): - return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos) + return '<%s: %r, bufpos=%d>' % \ + (self.__class__.__name__, self.fp, self.bufpos) def flush(self): return @@ -193,7 +177,7 @@ def close(self): return def tell(self): - return self.bufpos+self.charpos + return self.bufpos + self.charpos def poll(self, pos=None, n=80): pos0 = self.fp.tell() @@ -205,8 +189,7 @@ def poll(self, pos=None, n=80): return def seek(self, pos): - """Seeks the parser to the given position. - """ + """Seeks the parser to the given position.""" log.debug('seek: %r', pos) self.fp.seek(pos) # reset the status for nextline() @@ -232,8 +215,7 @@ def fillbuf(self): return def nextline(self): - """Fetches a next line that ends either with \\r or \\n. - """ + """Fetches a next line that ends either with \\r or \\n.""" linebuf = b'' linepos = self.bufpos + self.charpos eol = False @@ -259,11 +241,10 @@ def nextline(self): self.charpos = len(self.buf) log.debug('nextline: %r, %r', linepos, linebuf) - return (linepos, linebuf) + return linepos, linebuf def revreadlines(self): """Fetches a next line backword. - This is used to locate the trailers at the end of a file. """ self.fp.seek(0, 2) @@ -338,12 +319,12 @@ def _parse_comment(self, s, i): m = EOL.search(s, i) if not m: self._curtoken += s[i:] - return (self._parse_comment, len(s)) + return self._parse_comment, len(s) j = m.start(0) self._curtoken += s[i:j] self._parse1 = self._parse_main # We ignore comments. - #self._tokens.append(self._curtoken) + # self._tokens.append(self._curtoken) return j def _parse_literal(self, s, i): @@ -359,7 +340,7 @@ def _parse_literal(self, s, i): self._parse1 = self._parse_literal_hex return j+1 try: - self._curtoken=str(self._curtoken,'utf-8') + self._curtoken = str(self._curtoken, 'utf-8') except: pass self._add_token(LIT(self._curtoken)) @@ -444,7 +425,8 @@ def _parse_string(self, s, i): return j+1 if c == b')': self.paren -= 1 - if self.paren: # WTF, they said balanced parens need no special treatment. + if self.paren: + # WTF, they said balanced parens need no special treatment. self._curtoken += c return j+1 self._add_token(self._curtoken) @@ -490,7 +472,8 @@ def _parse_hexstring(self, s, i): return len(s) j = m.start(0) self._curtoken += s[i:j] - token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken)) + token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)), + SPC.sub(b'', self._curtoken)) self._add_token(token) self._parse1 = self._parse_main return j @@ -504,8 +487,6 @@ def nexttoken(self): return token -## PSStackParser -## class PSStackParser(PSBaseParser): def __init__(self, fp): @@ -559,7 +540,7 @@ def end_type(self, type): objs = [obj for (_, obj) in self.curstack] (pos, self.curtype, self.curstack) = self.context.pop() log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs) - return (pos, objs) + return pos, objs def do_keyword(self, pos, token): return @@ -568,12 +549,15 @@ def nextobject(self): """Yields a list of objects. Returns keywords, literals, strings, numbers, arrays and dictionaries. - Arrays and dictionaries are represented as Python lists and dictionaries. + Arrays and dictionaries are represented as Python lists and + dictionaries. """ while not self.results: (pos, token) = self.nexttoken() - #print (pos,token), (self.curtype, self.curstack) - if isinstance(token, (six.integer_types, float, bool, six.string_types, six.binary_type, PSLiteral)): + # print (pos,token), (self.curtype, self.curstack) + if isinstance(token, (six.integer_types, float, bool, + six.string_types, + six.binary_type, PSLiteral)): # normal token self.push((pos, token)) elif token == KEYWORD_ARRAY_BEGIN: @@ -594,9 +578,11 @@ def nextobject(self): try: (pos, objs) = self.end_type('d') if len(objs) % 2 != 0: - raise PSSyntaxError('Invalid dictionary construct: %r' % objs) + raise PSSyntaxError('Invalid dictionary construct: %r' + % objs) # construct a Python dictionary. - d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None) + d = dict((literal_name(k), v) for (k, v) in + choplist(2, objs) if v is not None) self.push((pos, d)) except PSTypeError: if settings.STRICT: @@ -611,11 +597,13 @@ def nextobject(self): except PSTypeError: if settings.STRICT: raise - elif isinstance(token,PSKeyword): - log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack) + elif isinstance(token, PSKeyword): + log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, + token, self.curstack) self.do_keyword(pos, token) else: - log.error('unknown token: pos=%r, token=%r, stack=%r', pos, token, self.curstack) + log.error('unknown token: pos=%r, token=%r, stack=%r', pos, + token, self.curstack) self.do_keyword(pos, token) raise if self.context: diff --git a/pdfminer/rijndael.py b/pdfminer/rijndael.py index 2d3a7ab2..02e41972 100644 --- a/pdfminer/rijndael.py +++ b/pdfminer/rijndael.py @@ -1,28 +1,24 @@ - - -""" Python implementation of Rijndael encryption algorithm. - +"""Python implementation of Rijndael encryption algorithm. This code is in the public domain. - This code is based on a public domain C implementation by Philip J. Erdelsky: - http://www.efgh.com/software/rijndael.htm - + http://www.efgh.com/software/rijndael.htm """ import struct def KEYLENGTH(keybits): - return (keybits)//8 + return keybits//8 def RKLENGTH(keybits): - return (keybits)//8+28 + return keybits//8+28 def NROUNDS(keybits): - return (keybits)//32+6 + return keybits//32+6 + Te0 = [ 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, @@ -701,20 +697,28 @@ def NROUNDS(keybits): # 128-bit blocks, Rijndael never uses more than 10 rcon values ] -if len(struct.pack('L',0)) == 4: + +if len(struct.pack('L', 0)) == 4: # 32bit - def GETU32(x): return struct.unpack('>L', x)[0] - def PUTU32(x): return struct.pack('>L', x) + def GETU32(x): + return struct.unpack('>L', x)[0] + + def PUTU32(x): + return struct.pack('>L', x) else: # 64bit - def GETU32(x): return struct.unpack('>I', x)[0] - def PUTU32(x): return struct.pack('>I', x) + def GETU32(x): + return struct.unpack('>I', x)[0] + + def PUTU32(x): + return struct.pack('>I', x) -# Expand the cipher key into the encryption key schedule. -# -# @return the number of rounds for the given cipher key size. def rijndaelSetupEncrypt(key, keybits): + """Expand the cipher key into the encryption key schedule. + + :return: the number of rounds for the given cipher key size. + """ i = p = 0 rk = [0]*RKLENGTH(keybits) rk[0] = GETU32(key[0:4]) @@ -726,15 +730,16 @@ def rijndaelSetupEncrypt(key, keybits): temp = rk[p+3] rk[p+4] = (rk[p+0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ - (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ - (Te4[(temp ) & 0xff] & 0x0000ff00) ^ - (Te4[(temp >> 24) ] & 0x000000ff) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[temp & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24)] & 0x000000ff) ^ rcon[i]) rk[p+5] = rk[p+1] ^ rk[p+4] rk[p+6] = rk[p+2] ^ rk[p+5] rk[p+7] = rk[p+3] ^ rk[p+6] i += 1 - if i == 10: return (rk, 10) + if i == 10: + return rk, 10 p += 4 rk[4] = GETU32(key[16:20]) @@ -744,15 +749,16 @@ def rijndaelSetupEncrypt(key, keybits): temp = rk[p+5] rk[p+6] = (rk[p+0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ - (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ - (Te4[(temp ) & 0xff] & 0x0000ff00) ^ - (Te4[(temp >> 24) ] & 0x000000ff) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[temp & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24)] & 0x000000ff) ^ rcon[i]) rk[p+7] = rk[p+1] ^ rk[p+6] rk[p+8] = rk[p+2] ^ rk[p+7] rk[p+9] = rk[p+3] ^ rk[p+8] i += 1 - if i == 8: return (rk, 12) + if i == 8: + return rk, 12 rk[p+10] = rk[p+4] ^ rk[p+9] rk[p+11] = rk[p+5] ^ rk[p+10] p += 6 @@ -764,21 +770,22 @@ def rijndaelSetupEncrypt(key, keybits): temp = rk[p+7] rk[p+8] = (rk[p+0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ - (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ - (Te4[(temp ) & 0xff] & 0x0000ff00) ^ - (Te4[(temp >> 24) ] & 0x000000ff) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[temp & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24)] & 0x000000ff) ^ rcon[i]) rk[p+9] = rk[p+1] ^ rk[p+8] rk[p+10] = rk[p+2] ^ rk[p+9] rk[p+11] = rk[p+3] ^ rk[p+10] i += 1 - if i == 7: return (rk, 14) + if i == 7: + return rk, 14 temp = rk[p+11] rk[p+12] = (rk[p+4] ^ - (Te4[(temp >> 24) ] & 0xff000000) ^ + (Te4[(temp >> 24)] & 0xff000000) ^ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(temp ) & 0xff] & 0x000000ff)) + (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[temp & 0xff] & 0x000000ff)) rk[p+13] = rk[p+5] ^ rk[p+12] rk[p+14] = rk[p+6] ^ rk[p+13] rk[p+15] = rk[p+7] ^ rk[p+14] @@ -787,10 +794,11 @@ def rijndaelSetupEncrypt(key, keybits): raise ValueError(keybits) -# Expand the cipher key into the decryption key schedule. -# -# @return the number of rounds for the given cipher key size. def rijndaelSetupDecrypt(key, keybits): + """Expand the cipher key into the decryption key schedule. + + :return: the number of rounds for the given cipher key size. + """ # expand the cipher key: (rk, nrounds) = rijndaelSetupEncrypt(key, keybits) @@ -798,45 +806,38 @@ def rijndaelSetupDecrypt(key, keybits): i = 0 j = 4*nrounds while i < j: - temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp - temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp - temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp - temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp + for num in range(0, 4): + rk[i + num], rk[j + num] = rk[j + num], rk[i + num] i += 4 j -= 4 - # apply the inverse MixColumn transform to all round keys but the first and the last: + # apply the inverse MixColumn transform to all + # round keys but the first and the last: p = 0 for i in range(1, nrounds): p += 4 - rk[p+0] = ( - Td0[Te4[(rk[p+0] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[p+0] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[p+0] >> 8) & 0xff] & 0xff] ^ - Td3[Te4[(rk[p+0] ) & 0xff] & 0xff]) - rk[p+1] = ( - Td0[Te4[(rk[p+1] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[p+1] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[p+1] >> 8) & 0xff] & 0xff] ^ - Td3[Te4[(rk[p+1] ) & 0xff] & 0xff]) - rk[p+2] = ( - Td0[Te4[(rk[p+2] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[p+2] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[p+2] >> 8) & 0xff] & 0xff] ^ - Td3[Te4[(rk[p+2] ) & 0xff] & 0xff]) - rk[p+3] = ( - Td0[Te4[(rk[p+3] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[p+3] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[p+3] >> 8) & 0xff] & 0xff] ^ - Td3[Te4[(rk[p+3] ) & 0xff] & 0xff]) - - return (rk, nrounds) + rk[p+0] = (Td0[Te4[(rk[p+0] >> 24)] & 0xff] ^ + Td1[Te4[(rk[p+0] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[p+0] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[rk[p+0] & 0xff] & 0xff]) + rk[p+1] = (Td0[Te4[(rk[p+1] >> 24)] & 0xff] ^ + Td1[Te4[(rk[p+1] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[p+1] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[rk[p+1] & 0xff] & 0xff]) + rk[p+2] = (Td0[Te4[(rk[p+2] >> 24)] & 0xff] ^ + Td1[Te4[(rk[p+2] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[p+2] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[rk[p+2] & 0xff] & 0xff]) + rk[p+3] = (Td0[Te4[(rk[p+3] >> 24)] & 0xff] ^ + Td1[Te4[(rk[p+3] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[p+3] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[rk[p+3] & 0xff] & 0xff]) + return rk, nrounds def rijndaelEncrypt(rk, nrounds, plaintext): assert len(plaintext) == 16, str(len(plaintext)) - # map byte array block to cipher state - # and add initial round key: + # map byte array block to cipher state and add initial round key: s0 = GETU32(plaintext[0:4]) ^ rk[0] s1 = GETU32(plaintext[4:8]) ^ rk[1] s2 = GETU32(plaintext[8:12]) ^ rk[2] @@ -846,89 +847,65 @@ def rijndaelEncrypt(rk, nrounds, plaintext): r = nrounds >> 1 p = 0 while 1: - t0 = ( - Te0[(s0 >> 24) ] ^ - Te1[(s1 >> 16) & 0xff] ^ - Te2[(s2 >> 8) & 0xff] ^ - Te3[(s3 ) & 0xff] ^ - rk[p+4]) - t1 = ( - Te0[(s1 >> 24) ] ^ - Te1[(s2 >> 16) & 0xff] ^ - Te2[(s3 >> 8) & 0xff] ^ - Te3[(s0 ) & 0xff] ^ - rk[p+5]) - t2 = ( - Te0[(s2 >> 24) ] ^ - Te1[(s3 >> 16) & 0xff] ^ - Te2[(s0 >> 8) & 0xff] ^ - Te3[(s1 ) & 0xff] ^ - rk[p+6]) - t3 = ( - Te0[(s3 >> 24) ] ^ - Te1[(s0 >> 16) & 0xff] ^ - Te2[(s1 >> 8) & 0xff] ^ - Te3[(s2 ) & 0xff] ^ - rk[p+7]) + t0 = (Te0[(s0 >> 24)] ^ + Te1[(s1 >> 16) & 0xff] ^ + Te2[(s2 >> 8) & 0xff] ^ + Te3[s3 & 0xff] ^ + rk[p+4]) + t1 = (Te0[(s1 >> 24)] ^ + Te1[(s2 >> 16) & 0xff] ^ + Te2[(s3 >> 8) & 0xff] ^ + Te3[s0 & 0xff] ^ + rk[p+5]) + t2 = (Te0[(s2 >> 24)] ^ + Te1[(s3 >> 16) & 0xff] ^ + Te2[(s0 >> 8) & 0xff] ^ + Te3[s1 & 0xff] ^ + rk[p+6]) + t3 = (Te0[(s3 >> 24)] ^ + Te1[(s0 >> 16) & 0xff] ^ + Te2[(s1 >> 8) & 0xff] ^ + Te3[s2 & 0xff] ^ + rk[p+7]) p += 8 r -= 1 - if r == 0: break - s0 = ( - Te0[(t0 >> 24) ] ^ - Te1[(t1 >> 16) & 0xff] ^ - Te2[(t2 >> 8) & 0xff] ^ - Te3[(t3 ) & 0xff] ^ - rk[p+0]) - s1 = ( - Te0[(t1 >> 24) ] ^ - Te1[(t2 >> 16) & 0xff] ^ - Te2[(t3 >> 8) & 0xff] ^ - Te3[(t0 ) & 0xff] ^ - rk[p+1]) - s2 = ( - Te0[(t2 >> 24) ] ^ - Te1[(t3 >> 16) & 0xff] ^ - Te2[(t0 >> 8) & 0xff] ^ - Te3[(t1 ) & 0xff] ^ - rk[p+2]) - s3 = ( - Te0[(t3 >> 24) ] ^ - Te1[(t0 >> 16) & 0xff] ^ - Te2[(t1 >> 8) & 0xff] ^ - Te3[(t2 ) & 0xff] ^ - rk[p+3]) + if r == 0: + break + s0 = (Te0[(t0 >> 24)] ^ Te1[(t1 >> 16) & 0xff] ^ + Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[p+0]) + s1 = (Te0[(t1 >> 24)] ^ Te1[(t2 >> 16) & 0xff] ^ + Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[p+1]) + s2 = (Te0[(t2 >> 24)] ^ Te1[(t3 >> 16) & 0xff] ^ + Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[p+2]) + s3 = (Te0[(t3 >> 24)] ^ Te1[(t0 >> 16) & 0xff] ^ + Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[p+3]) ciphertext = b'' - # apply last round and - # map cipher state to byte array block: - s0 = ( - (Te4[(t0 >> 24) ] & 0xff000000) ^ - (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t3 ) & 0xff] & 0x000000ff) ^ - rk[p+0]) + # apply last round and map cipher state to byte array block: + s0 = ((Te4[(t0 >> 24)] & 0xff000000) ^ + (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[t3 & 0xff] & 0x000000ff) ^ + rk[p+0]) ciphertext += PUTU32(s0) - s1 = ( - (Te4[(t1 >> 24) ] & 0xff000000) ^ - (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t0 ) & 0xff] & 0x000000ff) ^ - rk[p+1]) + s1 = ((Te4[(t1 >> 24)] & 0xff000000) ^ + (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[t0 & 0xff] & 0x000000ff) ^ + rk[p+1]) ciphertext += PUTU32(s1) - s2 = ( - (Te4[(t2 >> 24) ] & 0xff000000) ^ - (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t1 ) & 0xff] & 0x000000ff) ^ - rk[p+2]) + s2 = ((Te4[(t2 >> 24)] & 0xff000000) ^ + (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[t1 & 0xff] & 0x000000ff) ^ + rk[p+2]) ciphertext += PUTU32(s2) - s3 = ( - (Te4[(t3 >> 24) ] & 0xff000000) ^ - (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t2 ) & 0xff] & 0x000000ff) ^ - rk[p+3]) + s3 = ((Te4[(t3 >> 24)] & 0xff000000) ^ + (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[t2 & 0xff] & 0x000000ff) ^ + rk[p+3]) ciphertext += PUTU32(s3) assert len(ciphertext) == 16, str(len(ciphertext)) @@ -949,89 +926,53 @@ def rijndaelDecrypt(rk, nrounds, ciphertext): r = nrounds >> 1 p = 0 while 1: - t0 = ( - Td0[(s0 >> 24) ] ^ - Td1[(s3 >> 16) & 0xff] ^ - Td2[(s2 >> 8) & 0xff] ^ - Td3[(s1 ) & 0xff] ^ - rk[p+4]) - t1 = ( - Td0[(s1 >> 24) ] ^ - Td1[(s0 >> 16) & 0xff] ^ - Td2[(s3 >> 8) & 0xff] ^ - Td3[(s2 ) & 0xff] ^ - rk[p+5]) - t2 = ( - Td0[(s2 >> 24) ] ^ - Td1[(s1 >> 16) & 0xff] ^ - Td2[(s0 >> 8) & 0xff] ^ - Td3[(s3 ) & 0xff] ^ - rk[p+6]) - t3 = ( - Td0[(s3 >> 24) ] ^ - Td1[(s2 >> 16) & 0xff] ^ - Td2[(s1 >> 8) & 0xff] ^ - Td3[(s0 ) & 0xff] ^ - rk[p+7]) + t0 = (Td0[(s0 >> 24)] ^ Td1[(s3 >> 16) & 0xff] ^ + Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[p+4]) + t1 = (Td0[(s1 >> 24)] ^ Td1[(s0 >> 16) & 0xff] ^ + Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[p+5]) + t2 = (Td0[(s2 >> 24)] ^ Td1[(s1 >> 16) & 0xff] ^ + Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[p+6]) + t3 = (Td0[(s3 >> 24)] ^ Td1[(s2 >> 16) & 0xff] ^ + Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[p+7]) p += 8 r -= 1 - if r == 0: break - s0 = ( - Td0[(t0 >> 24) ] ^ - Td1[(t3 >> 16) & 0xff] ^ - Td2[(t2 >> 8) & 0xff] ^ - Td3[(t1 ) & 0xff] ^ - rk[p+0]) - s1 = ( - Td0[(t1 >> 24) ] ^ - Td1[(t0 >> 16) & 0xff] ^ - Td2[(t3 >> 8) & 0xff] ^ - Td3[(t2 ) & 0xff] ^ - rk[p+1]) - s2 = ( - Td0[(t2 >> 24) ] ^ - Td1[(t1 >> 16) & 0xff] ^ - Td2[(t0 >> 8) & 0xff] ^ - Td3[(t3 ) & 0xff] ^ - rk[p+2]) - s3 = ( - Td0[(t3 >> 24) ] ^ - Td1[(t2 >> 16) & 0xff] ^ - Td2[(t1 >> 8) & 0xff] ^ - Td3[(t0 ) & 0xff] ^ - rk[p+3]) + if r == 0: + break + s0 = (Td0[(t0 >> 24)] ^ Td1[(t3 >> 16) & 0xff] ^ + Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[p+0]) + s1 = (Td0[(t1 >> 24)] ^ Td1[(t0 >> 16) & 0xff] ^ + Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[p+1]) + s2 = (Td0[(t2 >> 24)] ^ Td1[(t1 >> 16) & 0xff] ^ + Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[p+2]) + s3 = (Td0[(t3 >> 24)] ^ Td1[(t2 >> 16) & 0xff] ^ + Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[p+3]) plaintext = b'' - # apply last round and - # map cipher state to byte array block: - s0 = ( - (Td4[(t0 >> 24) ] & 0xff000000) ^ - (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t1 ) & 0xff] & 0x000000ff) ^ - rk[p+0]) + # apply last round and map cipher state to byte array block: + s0 = ((Td4[(t0 >> 24)] & 0xff000000) ^ + (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[t1 & 0xff] & 0x000000ff) ^ + rk[p+0]) plaintext += PUTU32(s0) - s1 = ( - (Td4[(t1 >> 24) ] & 0xff000000) ^ - (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t2 ) & 0xff] & 0x000000ff) ^ - rk[p+1]) + s1 = ((Td4[(t1 >> 24)] & 0xff000000) ^ + (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[t2 & 0xff] & 0x000000ff) ^ + rk[p+1]) plaintext += PUTU32(s1) - s2 = ( - (Td4[(t2 >> 24) ] & 0xff000000) ^ - (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t3 ) & 0xff] & 0x000000ff) ^ - rk[p+2]) + s2 = ((Td4[(t2 >> 24)] & 0xff000000) ^ + (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[t3 & 0xff] & 0x000000ff) ^ + rk[p+2]) plaintext += PUTU32(s2) - s3 = ( - (Td4[(t3 >> 24) ] & 0xff000000) ^ - (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t0 ) & 0xff] & 0x000000ff) ^ - rk[p+3]) + s3 = ((Td4[(t3 >> 24)] & 0xff000000) ^ + (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[t0 & 0xff] & 0x000000ff) ^ + rk[p+3]) plaintext += PUTU32(s3) assert len(plaintext) == 16, str(len(plaintext)) @@ -1040,7 +981,6 @@ def rijndaelDecrypt(rk, nrounds, ciphertext): # decrypt(key, fin, fout, keybits=256) class RijndaelDecryptor(object): - """ >>> key = b'00010203050607080a0b0c0d0f101112'.decode('hex') >>> ciphertext = b'd8f532538289ef7d06b506a4fd5be9c9'.decode('hex') @@ -1049,10 +989,13 @@ class RijndaelDecryptor(object): """ def __init__(self, key, keybits=256): - assert len(key) == KEYLENGTH(keybits), str((len(key), KEYLENGTH(keybits))) + assert len(key) == KEYLENGTH(keybits), \ + str((len(key), KEYLENGTH(keybits))) (self.rk, self.nrounds) = rijndaelSetupDecrypt(key, keybits) - assert len(self.rk) == RKLENGTH(keybits), str((len(self.rk), RKLENGTH(keybits))) - assert self.nrounds == NROUNDS(keybits), str((self.nrounds, NROUNDS(keybits))) + assert len(self.rk) == RKLENGTH(keybits), \ + str((len(self.rk), RKLENGTH(keybits))) + assert self.nrounds == NROUNDS(keybits), \ + str((self.nrounds, NROUNDS(keybits))) return def decrypt(self, ciphertext): @@ -1064,10 +1007,13 @@ def decrypt(self, ciphertext): class RijndaelEncryptor(object): def __init__(self, key, keybits=256): - assert len(key) == KEYLENGTH(keybits), str((len(key), KEYLENGTH(keybits))) + assert len(key) == KEYLENGTH(keybits), \ + str((len(key), KEYLENGTH(keybits))) (self.rk, self.nrounds) = rijndaelSetupEncrypt(key, keybits) - assert len(self.rk) == RKLENGTH(keybits), str((len(self.rk), RKLENGTH(keybits))) - assert self.nrounds == NROUNDS(keybits), str((self.nrounds, NROUNDS(keybits))) + assert len(self.rk) == RKLENGTH(keybits), \ + str((len(self.rk), RKLENGTH(keybits))) + assert self.nrounds == NROUNDS(keybits), \ + str((self.nrounds, NROUNDS(keybits))) return def encrypt(self, plaintext): diff --git a/pdfminer/runlength.py b/pdfminer/runlength.py index 54bc7691..814c3943 100644 --- a/pdfminer/runlength.py +++ b/pdfminer/runlength.py @@ -1,12 +1,11 @@ +""" +RunLength decoder (Adobe version) implementation based on PDF Reference +version 1.4 section 3.3.4. + * public domain * +""" -# -# RunLength decoder (Adobe version) implementation based on PDF Reference -# version 1.4 section 3.3.4. -# -# * public domain * -# +import six -import six #Python 2+3 compatibility def rldecode(data): """ @@ -25,20 +24,19 @@ def rldecode(data): decoded = b'' i = 0 while i < len(data): - #print 'data[%d]=:%d:' % (i,ord(data[i])) - length = six.indexbytes(data,i) + # print 'data[%d]=:%d:' % (i,ord(data[i])) + length = six.indexbytes(data, i) if length == 128: break - if length >= 0 and length < 128: - for j in range(i+1,(i+1)+(length+1)): - decoded+=six.int2byte(six.indexbytes(data,j)) - #print 'length=%d, run=%s' % (length+1,run) - + if 0 <= length < 128: + for j in range(i+1, (i+1)+(length+1)): + decoded += six.int2byte(six.indexbytes(data, j)) + # print 'length=%d, run=%s' % (length+1,run) + i = (i+1) + (length+1) if length > 128: - run = six.int2byte(six.indexbytes(data,i+1))*(257-length) - #print 'length=%d, run=%s' % (257-length,run) - decoded+=run + run = six.int2byte(six.indexbytes(data, i+1))*(257-length) + # print 'length=%d, run=%s' % (257-length,run) + decoded += run i = (i+1) + 1 return decoded - diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 339759a1..bba4e74b 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -3,51 +3,61 @@ Miscellaneous Routines. """ import struct -# from sys import maxint as INF #doesn't work anymore under Python3, +import six + +# from sys import maxint as INF doesn't work anymore under Python3, # but PDF still uses 32 bits ints -INF = (1<<31) - 1 +INF = (1 << 31) - 1 -import six #Python 2+3 compatibility if six.PY3: import chardet # For str encoding detection in Py3 unicode = str + def make_compat_bytes(in_str): - "In Py2, does nothing. In Py3, converts to bytes, encoding to unicode." + """In Py2, does nothing. In Py3, converts to bytes, encoding to unicode.""" assert isinstance(in_str, str), str(type(in_str)) if six.PY2: return in_str - else: - return in_str.encode() + return in_str.encode() + def make_compat_str(in_str): - "In Py2, does nothing. In Py3, converts to string, guessing encoding." + """In Py2, does nothing. In Py3, converts to string, guessing encoding.""" assert isinstance(in_str, (bytes, str, unicode)), str(type(in_str)) if six.PY3 and isinstance(in_str, bytes): - enc = chardet.detect(in_str) - in_str = in_str.decode(enc['encoding']) + encoding = chardet.detect(in_str) + in_str = in_str.decode(encoding['encoding']) return in_str -def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'): - "When Py2 str.encode is called, it often means bytes.encode in Py3. This does either." + +def compatible_encode_method(bytesorstring, + encoding='utf-8', + erraction='ignore'): + """When Py2 str.encode is called, it often means bytes.encode in Py3. + This does either. + """ + if six.PY2: - assert isinstance(bytesorstring, (str, unicode)), str(type(bytesorstring)) + assert isinstance(bytesorstring, (str, unicode)),\ + str(type(bytesorstring)) return bytesorstring.encode(encoding, erraction) + if six.PY3: - if isinstance(bytesorstring, str): return bytesorstring + if isinstance(bytesorstring, str): + return bytesorstring assert isinstance(bytesorstring, bytes), str(type(bytesorstring)) return bytesorstring.decode(encoding, erraction) -## PNG Predictor -## -def apply_png_predictor(pred, colors, columns, bitspercomponent, data): + +def apply_png_predictor(_, colors, columns, bitspercomponent, data): + """Apply png predictor""" + if bitspercomponent != 8: - # unsupported raise ValueError("Unsupported `bitspercomponent': %d" % bitspercomponent) nbytes = colors * columns * bitspercomponent // 8 - i = 0 buf = b'' line0 = b'\x00' * columns for i in range(0, len(data), nbytes+1): @@ -84,56 +94,48 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): c = ((c+a+b)//2) & 255 line2 += six.int2byte(c) else: - # unsupported raise ValueError("Unsupported predictor value: %d" % ft) buf += line2 line0 = line2 return buf -## Matrix operations -## MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) def mult_matrix(m1, m0): + """Returns the multiplication of two matrices.""" (a1, b1, c1, d1, e1, f1) = m1 (a0, b0, c0, d0, e0, f0) = m0 - """Returns the multiplication of two matrices.""" - return (a0*a1+c0*b1, b0*a1+d0*b1, - a0*c1+c0*d1, b0*c1+d0*d1, - a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) + return (a0*a1+c0*b1, b0*a1+d0*b1, a0*c1+c0*d1, + b0*c1+d0*d1, a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) def translate_matrix(m, v): """Translates a matrix by (x, y).""" (a, b, c, d, e, f) = m (x, y) = v - return (a, b, c, d, x*a+y*c+e, x*b+y*d+f) + return a, b, c, d, x*a+y*c+e, x*b+y*d+f def apply_matrix_pt(m, v): + """Applies a matrix to a point.""" (a, b, c, d, e, f) = m (x, y) = v - """Applies a matrix to a point.""" - return (a*x+c*y+e, b*x+d*y+f) + return a*x+c*y+e, b*x+d*y+f def apply_matrix_norm(m, v): """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))""" - (a, b, c, d, e, f) = m + (a, b, c, d, _, _) = m (p, q) = v - return (a*p+c*q, b*p+d*q) - + return a*p+c*q, b*p+d*q -## Utility functions -## -# isnumber def isnumber(x): return isinstance(x, (six.integer_types, float)) -# uniq + def uniq(objs): """Eliminates duplicated elements.""" done = set() @@ -145,14 +147,12 @@ def uniq(objs): return -# csort def csort(objs, key): """Order-preserving sorting function.""" idxs = dict((obj, i) for (i, obj) in enumerate(objs)) return sorted(objs, key=lambda obj: (key(obj), idxs[obj])) -# fsplit def fsplit(pred, objs): """Split a list into two classes according to the predicate.""" t = [] @@ -162,17 +162,15 @@ def fsplit(pred, objs): t.append(obj) else: f.append(obj) - return (t, f) + return t, f -# drange def drange(v0, v1, d): """Returns a discrete range.""" assert v0 < v1, str((v0, v1, d)) return range(int(v0)//d, int(v1+d)//d) -# get_bound def get_bound(pts): """Compute a minimal rectangle that covers all the points.""" (x0, y0, x1, y1) = (INF, INF, -INF, -INF) @@ -181,10 +179,9 @@ def get_bound(pts): y0 = min(y0, y) x1 = max(x1, x) y1 = max(y1, y) - return (x0, y0, x1, y1) + return x0, y0, x1, y1 -# pick def pick(seq, func, maxobj=None): """Picks the object obj where func(obj) has the highest value.""" maxscore = None @@ -195,7 +192,6 @@ def pick(seq, func, maxobj=None): return maxobj -# choplist def choplist(n, seq): """Groups every n elements of the list.""" r = [] @@ -207,7 +203,6 @@ def choplist(n, seq): return -# nunpack def nunpack(s, default=0): """Unpacks 1 to 4 or 8 byte integers (big endian).""" l = len(s) @@ -227,7 +222,6 @@ def nunpack(s, default=0): raise TypeError('invalid length: %d' % l) -# decode_text PDFDocEncoding = ''.join(six.unichr(x) for x in ( 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, @@ -268,16 +262,15 @@ def decode_text(s): """Decodes a PDFDocEncoding string to Unicode.""" if s.startswith(b'\xfe\xff'): return six.text_type(s[2:], 'utf-16be', 'ignore') - else: - return ''.join(PDFDocEncoding[c] for c in s) + return ''.join(PDFDocEncoding[c] for c in s) -# enc def enc(x, codec='ascii'): """Encodes a string for SGML/XML/HTML""" if isinstance(x, bytes): return '' - x = x.replace('&', '&').replace('>', '>').replace('<', '<').replace('"', '"') + x = x.replace('&', '&').replace('>', '>').\ + replace('<', '<').replace('"', '"') if codec: x = x.encode(codec, 'xmlcharrefreplace') return x @@ -293,14 +286,13 @@ def matrix2str(m): return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f) -## Plane -## -## A set-like data structure for objects placed on a plane. -## Can efficiently find objects in a certain rectangular area. -## It maintains two parallel lists of objects, each of -## which is sorted by its x or y coordinate. -## class Plane(object): + """Plane + A set-like data structure for objects placed on a plane. + Can efficiently find objects in a certain rectangular area. + It maintains two parallel lists of objects, each of + which is sorted by its x or y coordinate. + """ def __init__(self, bbox, gridsize=50): self._seq = [] # preserve the object order. @@ -311,10 +303,10 @@ def __init__(self, bbox, gridsize=50): return def __repr__(self): - return ('' % list(self)) + return '' % list(self) def __iter__(self): - return ( obj for obj in self._seq if obj in self._objs ) + return (obj for obj in self._seq if obj in self._objs) def __len__(self): return len(self._objs) @@ -324,8 +316,8 @@ def __contains__(self, obj): def _getrange(self, bbox): (x0, y0, x1, y1) = bbox - if (x1 <= self.x0 or self.x1 <= x0 or - y1 <= self.y0 or self.y1 <= y0): return + if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0: + return x0 = max(self.x0, x0) y0 = max(self.y0, y0) x1 = min(self.x1, x1) @@ -375,8 +367,8 @@ def find(self, bbox): if obj in done: continue done.add(obj) - if (obj.x1 <= x0 or x1 <= obj.x0 or - obj.y1 <= y0 or y1 <= obj.y0): + if obj.x1 <= x0 or x1 <= obj.x0 \ + or obj.y1 <= y0 or y1 <= obj.y0: continue yield obj return