diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index d28659fd8..e7e405a88 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -43,7 +43,7 @@ jobs: - name: Test with flake8 run: | - flake8 . --ignore=E203,W503,E501,F405,E226,E128,E225,F403,E201,E202,E231,W504,E241,F401,E261,E302,E211,E701,E228,E111,F841,E117,E127,E251,E266,E + flake8 . --ignore=E203,W503,W504,E,F403,F405 if: matrix.python-version != '2.7' - name: Test with pytest diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py index bace8bf72..5d85cbc4f 100644 --- a/PyPDF2/__init__.py +++ b/PyPDF2/__init__.py @@ -3,4 +3,12 @@ from .pagerange import PageRange, parse_filename_page_ranges from ._version import __version__ -__all__ = ["pdf", "PdfFileMerger"] +__all__ = [ + "__version__", + "PageRange", + "parse_filename_page_ranges", + "pdf", + "PdfFileMerger", + "PdfFileReader", + "PdfFileWriter", +] diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index d0b98786f..1ecce31c8 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -55,7 +55,7 @@ def compress(data): # Unable to import zlib. Attempt to use the System.IO.Compression # library from the .NET framework. (IronPython only) import System - from System import IO, Collections, Array + from System import IO, Array def _string_to_bytearr(buf): retval = Array.CreateInstance(System.Byte, len(buf)) @@ -275,7 +275,7 @@ def decode(data, decodeParms=None): x = 0 hitEod = False # remove all whitespace from data - data = [y for y in data if not (y in ' \n\r\t')] + data = [y for y in data if y not in ' \n\r\t'] while not hitEod: c = data[x] if len(retval) == 0 and c == "<" and data[x+1] == "~": @@ -363,7 +363,7 @@ def decode(data, decodeParms=None, height=0): width = decodeParms["/Columns"] imgSize = len(data) - tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h' + tiff_header_struct = '<2shlh' + 'hhll' * 8 + 'h' tiffHeader = struct.pack(tiff_header_struct, b'II', # Byte order indication: Little endian 42, # Version number (always 42) diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 75d098621..334d76609 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -381,7 +381,7 @@ def readStringFromStream(stream): # break occurs. If it's a multi-char EOL, consume the # second character: tok = stream.read(1) - if not tok in b_("\n\r"): + if tok not in b_("\n\r"): stream.seek(-1, 1) # Then don't add anything to the actual string, since this # line break was escaped: @@ -483,10 +483,10 @@ def readFromStream(stream, pdf): try: try: ret=name.decode('utf-8') - except (UnicodeEncodeError, UnicodeDecodeError) as e: + except (UnicodeEncodeError, UnicodeDecodeError): ret=name.decode('gbk') return NameObject(ret) - except (UnicodeEncodeError, UnicodeDecodeError) as e: + except (UnicodeEncodeError, UnicodeDecodeError): # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number if not pdf.strict: @@ -843,7 +843,7 @@ def getData(self): decoded._data = filters.decodeStreamData(self) for key, value in list(self.items()): - if not key in ("/Length", "/Filter", "/DecodeParms"): + if key not in ("/Length", "/Filter", "/DecodeParms"): decoded[key] = value self.decodedSelf = decoded return decoded._data diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py index eae26ed18..00393bbb8 100644 --- a/PyPDF2/merger.py +++ b/PyPDF2/merger.py @@ -311,7 +311,6 @@ def _trim_dests(self, pdf, dests, pages): page set. """ new_dests = [] - prev_header_added = True for k, o in list(dests.items()): for j in range(*pages): if pdf.getPage(j).getObject() == o['/Page'].getObject(): @@ -356,7 +355,7 @@ def _write_dests(self): if p.id == v['/Page']: v[NameObject('/Page')] = p.out_pagedata pageno = i - pdf = p.src + pdf = p.src # noqa: F841 break if pageno is not None: self.output.addNamedDestinationObject(v) @@ -429,7 +428,7 @@ def _write_bookmarks(self, bookmarks=None, parent=None): b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)}) pageno = i - pdf = p.src + pdf = p.src # noqa: F841 break if pageno is not None: del b['/Page'], b['/Type'] diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py index 83fd8a99b..73fcad77f 100644 --- a/PyPDF2/pagerange.py +++ b/PyPDF2/pagerange.py @@ -89,8 +89,7 @@ def __init__(self, arg): @staticmethod def valid(input): """ True if input is a valid initializer for a PageRange. """ - return isinstance(input, slice) or \ - isinstance(input, PageRange) or \ + return isinstance(input, (slice, PageRange)) or \ (isString(input) and bool(re.match(PAGE_RANGE_RE, input))) @@ -144,7 +143,7 @@ def parse_filename_page_ranges(args): for arg in args + [None]: if PageRange.valid(arg): if not pdf_filename: - raise ValueError("The first argument must be a filename, " \ + raise ValueError("The first argument must be a filename, " "not a page range.") pairs.append( (pdf_filename, PageRange(arg)) ) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index fbe097fc2..0f7692bf5 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -41,7 +41,6 @@ __maintainer__ = "Phaseit, Inc." __maintainer_email = "PyPDF2@phaseit.net" -import string import math import struct import sys @@ -57,7 +56,6 @@ else: from io import BytesIO -from . import filters from . import utils import warnings import codecs @@ -543,7 +541,6 @@ def _sweepIndirectReferences(self, externMap, data): if debug: print((data, "TYPE", data.__class__.__name__)) if isinstance(data, DictionaryObject): for key, value in list(data.items()): - origvalue = value value = self._sweepIndirectReferences(externMap, value) if isinstance(value, StreamObject): # a dictionary value is a stream. streams must be indirect @@ -794,6 +791,11 @@ def removeImages(self, ignoreByteStringObject=False): to ignore ByteString Objects. """ pages = self.getObject(self._pages)['/Kids'] + jump_operators = [ + b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'), + b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'), + b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh') + ] for j in range(len(pages)): page = pages[j] pageRef = self.getObject(page) @@ -804,36 +806,29 @@ def removeImages(self, ignoreByteStringObject=False): _operations = [] seq_graphics = False for operands, operator in content.operations: - if operator == b_('Tj'): - text = operands[0] - if ignoreByteStringObject: - if not isinstance(text, TextStringObject): - operands[0] = TextStringObject() - elif operator == b_("'"): + if operator in [b_('Tj'), b_("'")]: text = operands[0] if ignoreByteStringObject: if not isinstance(text, TextStringObject): operands[0] = TextStringObject() elif operator == b_('"'): text = operands[2] - if ignoreByteStringObject: - if not isinstance(text, TextStringObject): - operands[2] = TextStringObject() + if ignoreByteStringObject and not isinstance(text, TextStringObject): + operands[2] = TextStringObject() elif operator == b_("TJ"): for i in range(len(operands[0])): - if ignoreByteStringObject: - if not isinstance(operands[0][i], TextStringObject): - operands[0][i] = TextStringObject() + if ( + ignoreByteStringObject + and not isinstance(operands[0][i], TextStringObject) + ): + operands[0][i] = TextStringObject() if operator == b_('q'): seq_graphics = True if operator == b_('Q'): seq_graphics = False - if seq_graphics: - if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'), - b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'), - b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]: - continue + if seq_graphics and operator in jump_operators: + continue if operator == b_('re'): continue _operations.append((operands, operator)) @@ -856,23 +851,13 @@ def removeText(self, ignoreByteStringObject=False): if not isinstance(content, ContentStream): content = ContentStream(content, pageRef) for operands,operator in content.operations: - if operator == b_('Tj'): + if operator in [b_('Tj'), b_("'")]: text = operands[0] if not ignoreByteStringObject: if isinstance(text, TextStringObject): operands[0] = TextStringObject() else: - if isinstance(text, TextStringObject) or \ - isinstance(text, ByteStringObject): - operands[0] = TextStringObject() - elif operator == b_("'"): - text = operands[0] - if not ignoreByteStringObject: - if isinstance(text, TextStringObject): - operands[0] = TextStringObject() - else: - if isinstance(text, TextStringObject) or \ - isinstance(text, ByteStringObject): + if isinstance(text, (TextStringObject, ByteStringObject)): operands[0] = TextStringObject() elif operator == b_('"'): text = operands[2] @@ -880,8 +865,7 @@ def removeText(self, ignoreByteStringObject=False): if isinstance(text, TextStringObject): operands[2] = TextStringObject() else: - if isinstance(text, TextStringObject) or \ - isinstance(text, ByteStringObject): + if isinstance(text, (TextStringObject, ByteStringObject)): operands[2] = TextStringObject() elif operator == b_("TJ"): for i in range(len(operands[0])): @@ -889,8 +873,7 @@ def removeText(self, ignoreByteStringObject=False): if isinstance(operands[0][i], TextStringObject): operands[0][i] = TextStringObject() else: - if isinstance(operands[0][i], TextStringObject) or \ - isinstance(operands[0][i], ByteStringObject): + if isinstance(operands[0][i], (TextStringObject, ByteStringObject)): operands[0][i] = TextStringObject() pageRef.__setitem__(NameObject('/Contents'), content) @@ -1172,9 +1155,8 @@ def _showwarning(message, category, filename, lineno, file=warndest, line=None): if hasattr(stream, 'mode') and 'b' not in stream.mode: warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning) if isString(stream): - fileobj = open(stream, 'rb') - stream = BytesIO(b_(fileobj.read())) - fileobj.close() + with open(stream, 'rb') as fileobj: + stream = BytesIO(b_(fileobj.read())) self.read(stream) self.stream = stream @@ -1729,7 +1711,7 @@ def getObject(self, indirectReference): return retval def _decryptObject(self, obj, key): - if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject): + if isinstance(obj, (ByteStringObject, TextStringObject)): obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes)) elif isinstance(obj, StreamObject): obj._data = utils.RC4_encrypt(key, obj._data) @@ -1752,7 +1734,10 @@ def readObjectHeader(self, stream): idnum = readUntilWhitespace(stream) extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1) generation = readUntilWhitespace(stream) - obj = stream.read(3) + + # although it's not used, it might still be necessary to read + _obj = stream.read(3) # noqa: F841 + readNonWhitespace(stream) stream.seek(-1, 1) if (extra and self.strict): @@ -1938,8 +1923,8 @@ def used_before(num, generation): # The rest of the elements depend on the xref_type if xref_type == 0: # linked list of free objects - next_free_object = getEntry(1) - next_generation = getEntry(2) + next_free_object = getEntry(1) # noqa: F841 + next_generation = getEntry(2) # noqa: F841 elif xref_type == 1: # objects that are in use but are not compressed byte_offset = getEntry(1) diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py index cb392a18b..3270d86f8 100644 --- a/PyPDF2/utils.py +++ b/PyPDF2/utils.py @@ -196,11 +196,10 @@ def markLocation(stream): # Mainly for debugging RADIUS = 5000 stream.seek(-RADIUS, 1) - outputDoc = open('PyPDF2_pdfLocation.txt', 'wb') - outputDoc.write(stream.read(RADIUS)) - outputDoc.write(b'HERE') - outputDoc.write(stream.read(RADIUS)) - outputDoc.close() + with open('PyPDF2_pdfLocation.txt', 'wb') as outputDoc: + outputDoc.write(stream.read(RADIUS)) + outputDoc.write(b'HERE') + outputDoc.write(stream.read(RADIUS)) stream.seek(-RADIUS, 1) @@ -242,7 +241,7 @@ def b_(s): if len(s) < 2: bc[s] = r return r - except Exception as e: + except Exception: print(s) r = s.encode('utf-8') if len(s) < 2: diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py index 3670f264a..9aec5e017 100644 --- a/PyPDF2/xmp.py +++ b/PyPDF2/xmp.py @@ -2,7 +2,6 @@ import datetime import decimal from .generic import PdfObject -from xml.dom import getDOMImplementation from xml.dom.minidom import parseString from .utils import u_ diff --git a/Scripts/2-up.py b/Scripts/2-up.py index c3212b428..2540e0114 100644 --- a/Scripts/2-up.py +++ b/Scripts/2-up.py @@ -16,7 +16,7 @@ def main(): print("usage: python 2-up.py input_file output_file") sys.exit(1) print("2-up input " + sys.argv[1]) - reader = PdfFileReader(open(sys.argv[1], "rb")) + reader = PdfFileReader(sys.argv[1]) writer = PdfFileWriter() for iter in range(0, reader.getNumPages() - 1, 2): lhs = reader.getPage(iter) diff --git a/Scripts/booklet.py b/Scripts/booklet.py index d669c6b3d..9eeac3de7 100644 --- a/Scripts/booklet.py +++ b/Scripts/booklet.py @@ -1,12 +1,12 @@ #!/usr/bin/env python """ - Layout the pages from a PDF file to print a booklet or brochure. +Layout the pages from a PDF file to print a booklet or brochure. - The resulting media size is twice the size of the first page - of the source document. If you print the resulting PDF in duplex - (short edge), you get a center fold brochure that you can staple - together and read as a booklet. +The resulting media size is twice the size of the first page +of the source document. If you print the resulting PDF in duplex +(short edge), you get a center fold brochure that you can staple +together and read as a booklet. """ from __future__ import division, print_function @@ -63,7 +63,8 @@ def mergePageByNumber(dstPage, pageNumber, xOffset): mergePageByNumber(page, i, offsets[0]) mergePageByNumber(page, virtualPages - i - 1, offsets[1]) - writer.write(open(args.output, "wb")) + with open(args.output, "wb") as fp: + writer.write(fp) if __name__ == "__main__": diff --git a/Scripts/pdf-image-extractor.py b/Scripts/pdf-image-extractor.py index 916c81564..c2e2aa00e 100644 --- a/Scripts/pdf-image-extractor.py +++ b/Scripts/pdf-image-extractor.py @@ -11,7 +11,7 @@ def main(pdf: str): - reader = PyPDF2.PdfFileReader(open(pdf, "rb")) + reader = PyPDF2.PdfFileReader(pdf) page = reader.pages[30] if "/XObject" in page["/Resources"]: diff --git a/Tests/test_basic_features.py b/Tests/test_basic_features.py index 75e72c324..f3a41fe41 100644 --- a/Tests/test_basic_features.py +++ b/Tests/test_basic_features.py @@ -14,7 +14,7 @@ def test_basic_features(): output = PdfFileWriter() document1 = os.path.join(RESOURCE_ROOT, "crazyones.pdf") - input1 = PdfFileReader(open(document1, "rb")) + input1 = PdfFileReader(document1) # print how many pages input1 has: print("document1.pdf has %d pages." % input1.getNumPages()) @@ -32,7 +32,7 @@ def test_basic_features(): # add page 4 from input1, but first add a watermark from another PDF: page4 = input1.getPage(0) watermark_pdf = document1 - watermark = PdfFileReader(open(watermark_pdf, "rb")) + watermark = PdfFileReader(watermark_pdf) page4.mergePage(watermark.getPage(0)) output.addPage(page4) diff --git a/Tests/test_merger.py b/Tests/test_merger.py index ca2a40fc3..49048a741 100644 --- a/Tests/test_merger.py +++ b/Tests/test_merger.py @@ -1,5 +1,4 @@ import os -import binascii import sys import PyPDF2 @@ -34,8 +33,8 @@ def test_merge(): # file_merger.append(reader) # File handle - fh = open(pdf_path, "rb") - file_merger.append(fh) + with open(pdf_path, "rb") as fh: + file_merger.append(fh) file_merger.addBookmark("A bookmark", 0) diff --git a/Tests/test_page.py b/Tests/test_page.py index 19ae52fc3..5b15f9f58 100644 --- a/Tests/test_page.py +++ b/Tests/test_page.py @@ -15,7 +15,7 @@ def test_page_operations(): output is as expected. """ pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") - reader = PdfFileReader(open(pdf_path, "rb")) + reader = PdfFileReader(pdf_path) page = reader.pages[0] page.mergeRotatedScaledPage(page, 90, 1, 1) page.mergeScaledTranslatedPage(page, 1, 1, 1) diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 96cddeb8b..6cf736d59 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -2,7 +2,7 @@ import os import pytest import PyPDF2.utils -from PyPDF2.filters import decodeStreamData, _xobj_to_image +from PyPDF2.filters import _xobj_to_image TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -29,7 +29,7 @@ def test_read_metadata(): ], ) def test_get_annotations(src): - reader = PyPDF2.PdfFileReader(open(src, "rb")) + reader = PyPDF2.PdfFileReader(src) for page in reader.pages: print("/Annots" in page) @@ -49,7 +49,7 @@ def test_get_annotations(src): ], ) def test_get_attachments(src): - reader = PyPDF2.PdfFileReader(open(src, "rb")) + reader = PyPDF2.PdfFileReader(src) attachments = {} for i in range(reader.getNumPages()): @@ -71,7 +71,7 @@ def test_get_attachments(src): ], ) def test_get_outlines(src, outline_elements): - reader = PyPDF2.PdfFileReader(open(src, "rb")) + reader = PyPDF2.PdfFileReader(src) outlines = reader.getOutlines() assert len(outlines) == outline_elements @@ -85,7 +85,7 @@ def test_get_outlines(src, outline_elements): ], ) def test_get_images(src, nb_images): - reader = PyPDF2.PdfFileReader(open(src, "rb")) + reader = PyPDF2.PdfFileReader(src) with pytest.raises(TypeError): page = reader.pages["0"] diff --git a/Tests/test_writer.py b/Tests/test_writer.py index 8c39258ff..bd8cd9d8c 100644 --- a/Tests/test_writer.py +++ b/Tests/test_writer.py @@ -3,7 +3,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2.utils import PageSizeNotDefinedError -from PyPDF2.generic import IndirectObject, RectangleObject +from PyPDF2.generic import RectangleObject TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -20,8 +20,8 @@ def test_writer_operations(): pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") pdf_outline_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf") - reader = PdfFileReader(open(pdf_path, "rb")) - reader_outline = PdfFileReader(open(pdf_outline_path, "rb")) + reader = PdfFileReader(pdf_path) + reader_outline = PdfFileReader(pdf_outline_path) output = PdfFileWriter() page = reader.pages[0] @@ -62,7 +62,7 @@ def test_writer_operations(): def test_remove_images(): pdf_path = os.path.join(RESOURCE_ROOT, "side-by-side-subfig.pdf") - reader = PdfFileReader(open(pdf_path, "rb")) + reader = PdfFileReader(pdf_path) output = PdfFileWriter() page = reader.pages[0] diff --git a/requirements/ci.in b/requirements/ci.in index eb92c4503..3545b25f2 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -1,5 +1,6 @@ pytest flake8 flake8-bugbear +flake8_implicit_str_concat pytest-cov pillow diff --git a/requirements/ci.txt b/requirements/ci.txt index d25907cfd..c926afe44 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -4,9 +4,10 @@ # # pip-compile requirements/ci.in # -attrs==21.4.0 +attrs==20.3.0 # via # flake8-bugbear + # flake8-implicit-str-concat # pytest coverage[toml]==6.2 # via pytest-cov @@ -16,6 +17,8 @@ flake8==4.0.1 # flake8-bugbear flake8-bugbear==22.3.23 # via -r requirements/ci.in +flake8-implicit-str-concat==0.2.0 + # via -r requirements/ci.in importlib-metadata==4.2.0 # via # flake8 @@ -25,6 +28,8 @@ iniconfig==1.1.1 # via pytest mccabe==0.6.1 # via flake8 +more-itertools==8.12.0 + # via flake8-implicit-str-concat packaging==21.3 # via pytest pillow==8.4.0 diff --git a/setup.py b/setup.py index 52e2d5b72..76a5bc380 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,8 @@ VERSIONFILE = "PyPDF2/_version.py" -verstrline = open(VERSIONFILE, "rt").read() +with open(VERSIONFILE, "rt") as fp: + verstrline = fp.read() VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" mo = re.search(VSRE, verstrline, re.M) if mo: