py-pdf · MartinThoma · Apr 13, 2022 · Apr 13, 2022 · Apr 13, 2022 · Apr 13, 2022
diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
@@ -43,7 +43,7 @@ jobs:
 
     - name: Test with flake8
       run: |
-        flake8 . --ignore=E203,W503,E501,F405,E226,E128,E225,F403,E201,E202,E231,W504,E241,F401,E261,E302,E211,E701,E228,E111,F841,E117,E127,E251,E266,E
+        flake8 . --ignore=E203,W503,W504,E,F403,F405
       if: matrix.python-version != '2.7'
 
     - name: Test with pytest

diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py
@@ -3,4 +3,12 @@
 from .pagerange import PageRange, parse_filename_page_ranges
 from ._version import __version__
 
-__all__ = ["pdf", "PdfFileMerger"]
+__all__ = [
+    "__version__",
+    "PageRange",
+    "parse_filename_page_ranges",
+    "pdf",
+    "PdfFileMerger",
+    "PdfFileReader",
+    "PdfFileWriter",
+]
diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
@@ -55,7 +55,7 @@ def compress(data):
     # Unable to import zlib.  Attempt to use the System.IO.Compression
     # library from the .NET framework. (IronPython only)
     import System
-    from System import IO, Collections, Array
+    from System import IO, Array
 
     def _string_to_bytearr(buf):
         retval = Array.CreateInstance(System.Byte, len(buf))
@@ -275,7 +275,7 @@ def decode(data, decodeParms=None):
             x = 0
             hitEod = False
             # remove all whitespace from data
-            data = [y for y in data if not (y in ' \n\r\t')]
+            data = [y for y in data if y not in ' \n\r\t']
             while not hitEod:
                 c = data[x]
                 if len(retval) == 0 and c == "<" and data[x+1] == "~":
@@ -363,7 +363,7 @@ def decode(data, decodeParms=None, height=0):
 
         width = decodeParms["/Columns"]
         imgSize = len(data)
-        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
+        tiff_header_struct = '<2shlh' + 'hhll' * 8 + 'h'
         tiffHeader = struct.pack(tiff_header_struct,
                            b'II',  # Byte order indication: Little endian
                            42,  # Version number (always 42)

diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -381,7 +381,7 @@ def readStringFromStream(stream):
                     # break occurs.  If it's a multi-char EOL, consume the
                     # second character:
                     tok = stream.read(1)
-                    if not tok in b_("\n\r"):
+                    if tok not in b_("\n\r"):
                         stream.seek(-1, 1)
                     # Then don't add anything to the actual string, since this
                     # line break was escaped:
@@ -483,10 +483,10 @@ def readFromStream(stream, pdf):
         try:
             try:
                 ret=name.decode('utf-8')
-            except (UnicodeEncodeError, UnicodeDecodeError) as e:
+            except (UnicodeEncodeError, UnicodeDecodeError):
                 ret=name.decode('gbk')
             return NameObject(ret)
-        except (UnicodeEncodeError, UnicodeDecodeError) as e:
+        except (UnicodeEncodeError, UnicodeDecodeError):
             # Name objects should represent irregular characters
             # with a '#' followed by the symbol's hex number
             if not pdf.strict:
@@ -843,7 +843,7 @@ def getData(self):
 
             decoded._data = filters.decodeStreamData(self)
             for key, value in list(self.items()):
-                if not key in ("/Length", "/Filter", "/DecodeParms"):
+                if key not in ("/Length", "/Filter", "/DecodeParms"):
                     decoded[key] = value
             self.decodedSelf = decoded
             return decoded._data

diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py
@@ -311,7 +311,6 @@ def _trim_dests(self, pdf, dests, pages):
         page set.
         """
         new_dests = []
-        prev_header_added = True
         for k, o in list(dests.items()):
             for j in range(*pages):
                 if pdf.getPage(j).getObject() == o['/Page'].getObject():
@@ -356,7 +355,7 @@ def _write_dests(self):
                     if p.id == v['/Page']:
                         v[NameObject('/Page')] = p.out_pagedata
                         pageno = i
-                        pdf = p.src
+                        pdf = p.src  # noqa: F841
                         break
             if pageno is not None:
                 self.output.addNamedDestinationObject(v)
@@ -429,7 +428,7 @@ def _write_bookmarks(self, bookmarks=None, parent=None):
                         b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})
 
                         pageno = i
-                        pdf = p.src
+                        pdf = p.src  # noqa: F841
                         break
             if pageno is not None:
                 del b['/Page'], b['/Type']

diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py
@@ -89,8 +89,7 @@ def __init__(self, arg):
     @staticmethod
     def valid(input):
         """ True if input is a valid initializer for a PageRange. """
-        return isinstance(input, slice)  or \
-               isinstance(input, PageRange) or \
+        return isinstance(input, (slice, PageRange))  or \
                (isString(input)
                 and bool(re.match(PAGE_RANGE_RE, input)))
 
@@ -144,7 +143,7 @@ def parse_filename_page_ranges(args):
     for arg in args + [None]:
         if PageRange.valid(arg):
             if not pdf_filename:
-                raise ValueError("The first argument must be a filename, " \
+                raise ValueError("The first argument must be a filename, "
                                  "not a page range.")
 
             pairs.append( (pdf_filename, PageRange(arg)) )

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -41,7 +41,6 @@
 __maintainer__ = "Phaseit, Inc."
 __maintainer_email = "[email protected]"
 
-import string
 import math
 import struct
 import sys
@@ -57,7 +56,6 @@
 else:
     from io import BytesIO
 
-from . import filters
 from . import utils
 import warnings
 import codecs
@@ -543,7 +541,6 @@ def _sweepIndirectReferences(self, externMap, data):
         if debug: print((data, "TYPE", data.__class__.__name__))
         if isinstance(data, DictionaryObject):
             for key, value in list(data.items()):
-                origvalue = value
                 value = self._sweepIndirectReferences(externMap, value)
                 if isinstance(value, StreamObject):
                     # a dictionary value is a stream.  streams must be indirect
@@ -794,6 +791,11 @@ def removeImages(self, ignoreByteStringObject=False):
             to ignore ByteString Objects.
         """
         pages = self.getObject(self._pages)['/Kids']
+        jump_operators = [
+            b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
+            b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
+            b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')
+        ]
         for j in range(len(pages)):
             page = pages[j]
             pageRef = self.getObject(page)
@@ -804,36 +806,29 @@ def removeImages(self, ignoreByteStringObject=False):
             _operations = []
             seq_graphics = False
             for operands, operator in content.operations:
-                if operator == b_('Tj'):
-                    text = operands[0]
-                    if ignoreByteStringObject:
-                        if not isinstance(text, TextStringObject):
-                            operands[0] = TextStringObject()
-                elif operator == b_("'"):
+                if operator in [b_('Tj'), b_("'")]:
                     text = operands[0]
                     if ignoreByteStringObject:
                         if not isinstance(text, TextStringObject):
                             operands[0] = TextStringObject()
                 elif operator == b_('"'):
                     text = operands[2]
-                    if ignoreByteStringObject:
-                        if not isinstance(text, TextStringObject):
-                            operands[2] = TextStringObject()
+                    if ignoreByteStringObject and not isinstance(text, TextStringObject):
+                        operands[2] = TextStringObject()
                 elif operator == b_("TJ"):
                     for i in range(len(operands[0])):
-                        if ignoreByteStringObject:
-                            if not isinstance(operands[0][i], TextStringObject):
-                                operands[0][i] = TextStringObject()
+                        if (
+                            ignoreByteStringObject
+                            and not isinstance(operands[0][i], TextStringObject)
+                        ):
+                            operands[0][i] = TextStringObject()
 
                 if operator == b_('q'):
                     seq_graphics = True
                 if operator == b_('Q'):
                     seq_graphics = False
-                if seq_graphics:
-                    if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
-                            b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
-                            b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]:
-                        continue
+                if seq_graphics and operator in jump_operators:
+                    continue
                 if operator == b_('re'):
                     continue
                 _operations.append((operands, operator))
@@ -856,41 +851,29 @@ def removeText(self, ignoreByteStringObject=False):
             if not isinstance(content, ContentStream):
                 content = ContentStream(content, pageRef)
             for operands,operator in content.operations:
-                if operator == b_('Tj'):
+                if operator in [b_('Tj'), b_("'")]:
                     text = operands[0]
                     if not ignoreByteStringObject:
                         if isinstance(text, TextStringObject):
                             operands[0] = TextStringObject()
                     else:
-                        if isinstance(text, TextStringObject) or \
-                                isinstance(text, ByteStringObject):
-                            operands[0] = TextStringObject()
-                elif operator == b_("'"):
-                    text = operands[0]
-                    if not ignoreByteStringObject:
-                        if isinstance(text, TextStringObject):
-                            operands[0] = TextStringObject()
-                    else:
-                        if isinstance(text, TextStringObject) or \
-                                isinstance(text, ByteStringObject):
+                        if isinstance(text, (TextStringObject, ByteStringObject)):
                             operands[0] = TextStringObject()
                 elif operator == b_('"'):
                     text = operands[2]
                     if not ignoreByteStringObject:
                         if isinstance(text, TextStringObject):
                             operands[2] = TextStringObject()
                     else:
-                        if isinstance(text, TextStringObject) or \
-                                isinstance(text, ByteStringObject):
+                        if isinstance(text, (TextStringObject, ByteStringObject)):
                             operands[2] = TextStringObject()
                 elif operator == b_("TJ"):
                     for i in range(len(operands[0])):
                         if not ignoreByteStringObject:
                             if isinstance(operands[0][i], TextStringObject):
                                 operands[0][i] = TextStringObject()
                         else:
-                            if isinstance(operands[0][i], TextStringObject) or \
-                                    isinstance(operands[0][i], ByteStringObject):
+                            if isinstance(operands[0][i], (TextStringObject, ByteStringObject)):
                                 operands[0][i] = TextStringObject()
 
             pageRef.__setitem__(NameObject('/Contents'), content)
@@ -1172,9 +1155,8 @@ def _showwarning(message, category, filename, lineno, file=warndest, line=None):
         if hasattr(stream, 'mode') and 'b' not in stream.mode:
             warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
         if isString(stream):
-            fileobj = open(stream, 'rb')
-            stream = BytesIO(b_(fileobj.read()))
-            fileobj.close()
+            with open(stream, 'rb') as fileobj:
+                stream = BytesIO(b_(fileobj.read()))
         self.read(stream)
         self.stream = stream
 
@@ -1729,7 +1711,7 @@ def getObject(self, indirectReference):
         return retval
 
     def _decryptObject(self, obj, key):
-        if isinstance(obj, ByteStringObject) or isinstance(obj, TextStringObject):
+        if isinstance(obj, (ByteStringObject, TextStringObject)):
             obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
         elif isinstance(obj, StreamObject):
             obj._data = utils.RC4_encrypt(key, obj._data)
@@ -1752,7 +1734,10 @@ def readObjectHeader(self, stream):
         idnum = readUntilWhitespace(stream)
         extra |= utils.skipOverWhitespace(stream); stream.seek(-1, 1)
         generation = readUntilWhitespace(stream)
-        obj = stream.read(3)
+
+        # The 3 bytes read here are unused, but the read advances the stream position; do not remove it.
+        _obj = stream.read(3)  # noqa: F841
+
         readNonWhitespace(stream)
         stream.seek(-1, 1)
         if (extra and self.strict):
@@ -1938,8 +1923,8 @@ def used_before(num, generation):
                         # The rest of the elements depend on the xref_type
                         if xref_type == 0:
                             # linked list of free objects
-                            next_free_object = getEntry(1)
-                            next_generation = getEntry(2)
+                            next_free_object = getEntry(1)  # noqa: F841
+                            next_generation = getEntry(2)  # noqa: F841
                         elif xref_type == 1:
                             # objects that are in use but are not compressed
                             byte_offset = getEntry(1)

diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py
@@ -196,11 +196,10 @@ def markLocation(stream):
     # Mainly for debugging
     RADIUS = 5000
     stream.seek(-RADIUS, 1)
-    outputDoc = open('PyPDF2_pdfLocation.txt', 'wb')
-    outputDoc.write(stream.read(RADIUS))
-    outputDoc.write(b'HERE')
-    outputDoc.write(stream.read(RADIUS))
-    outputDoc.close()
+    with open('PyPDF2_pdfLocation.txt', 'wb') as outputDoc:
+        outputDoc.write(stream.read(RADIUS))
+        outputDoc.write(b'HERE')
+        outputDoc.write(stream.read(RADIUS))
     stream.seek(-RADIUS, 1)
 
 
@@ -242,7 +241,7 @@ def b_(s):
                 if len(s) < 2:
                     bc[s] = r
                 return r
-            except Exception as e:
+            except Exception:
                 print(s)
                 r = s.encode('utf-8')
                 if len(s) < 2:

diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py
@@ -2,7 +2,6 @@
 import datetime
 import decimal
 from .generic import PdfObject
-from xml.dom import getDOMImplementation
 from xml.dom.minidom import parseString
 from .utils import u_
 

diff --git a/Scripts/2-up.py b/Scripts/2-up.py
@@ -16,7 +16,7 @@ def main():
         print("usage: python 2-up.py input_file output_file")
         sys.exit(1)
     print("2-up input " + sys.argv[1])
-    reader = PdfFileReader(open(sys.argv[1], "rb"))
+    reader = PdfFileReader(sys.argv[1])
     writer = PdfFileWriter()
     for iter in range(0, reader.getNumPages() - 1, 2):
         lhs = reader.getPage(iter)

diff --git a/Scripts/booklet.py b/Scripts/booklet.py
@@ -1,12 +1,12 @@
 #!/usr/bin/env python
 
 """
-    Layout the pages from a PDF file to print a booklet or brochure.
+Layout the pages from a PDF file to print a booklet or brochure.
 
-    The resulting media size is twice the size of the first page
-    of the source document. If you print the resulting PDF in duplex
-    (short edge), you get a center fold brochure that you can staple
-    together and read as a booklet.
+The resulting media size is twice the size of the first page
+of the source document. If you print the resulting PDF in duplex
+(short edge), you get a center fold brochure that you can staple
+together and read as a booklet.
 """
 
 from __future__ import division, print_function
@@ -63,7 +63,8 @@ def mergePageByNumber(dstPage, pageNumber, xOffset):
         mergePageByNumber(page, i, offsets[0])
         mergePageByNumber(page, virtualPages - i - 1, offsets[1])
 
-    writer.write(open(args.output, "wb"))
+    with open(args.output, "wb") as fp:
+        writer.write(fp)
 
 
 if __name__ == "__main__":

diff --git a/Scripts/pdf-image-extractor.py b/Scripts/pdf-image-extractor.py
@@ -11,7 +11,7 @@
 
 
 def main(pdf: str):
-    reader = PyPDF2.PdfFileReader(open(pdf, "rb"))
+    reader = PyPDF2.PdfFileReader(pdf)
     page = reader.pages[30]
 
     if "/XObject" in page["/Resources"]:

diff --git a/Tests/test_basic_features.py b/Tests/test_basic_features.py
@@ -14,7 +14,7 @@
 def test_basic_features():
     output = PdfFileWriter()
     document1 = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
-    input1 = PdfFileReader(open(document1, "rb"))
+    input1 = PdfFileReader(document1)
 
     # print how many pages input1 has:
     print("document1.pdf has %d pages." % input1.getNumPages())
@@ -32,7 +32,7 @@ def test_basic_features():
     # add page 4 from input1, but first add a watermark from another PDF:
     page4 = input1.getPage(0)
     watermark_pdf = document1
-    watermark = PdfFileReader(open(watermark_pdf, "rb"))
+    watermark = PdfFileReader(watermark_pdf)
     page4.mergePage(watermark.getPage(0))
     output.addPage(page4)