py-pdf · MartinThoma · Apr 15, 2022 · Apr 15, 2022 · Apr 15, 2022 · Apr 15, 2022
diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
@@ -1,5 +1,3 @@
-# vim: sw=4:expandtab:foldmethod=marker
-#
 # Copyright (c) 2006, Mathieu Fenniak
 # All rights reserved.
 #
@@ -40,7 +38,7 @@
     from cStringIO import StringIO
 else:
     from io import StringIO
-    import struct
+import struct
 
 try:
     import zlib
@@ -356,6 +354,10 @@ def decode(data, decodeParms=None):
 class CCITTFaxDecode(object):
     def decode(data, decodeParms=None, height=0):
         if decodeParms:
+            from PyPDF2.generic import ArrayObject
+            if isinstance(decodeParms, ArrayObject):
+                if len(decodeParms) == 1:
+                    decodeParms = decodeParms[0]
             if decodeParms.get("/K", 1) == -1:
                 CCITTgroup = 4
             else:
@@ -451,6 +453,10 @@ def _xobj_to_image(x_object_obj):
             img_byte_arr = io.BytesIO()
             img.save(img_byte_arr, format="PNG")
             data = img_byte_arr.getvalue()
+        elif x_object_obj["/Filter"] in (["/LZWDecode"], ['/ASCII85Decode'], ['/CCITTFaxDecode']):
+            from PyPDF2.utils import b_
+            extension = ".png"
+            data = b_(data)
         elif x_object_obj["/Filter"] == "/DCTDecode":
             extension = ".jpg"
         elif x_object_obj["/Filter"] == "/JPXDecode":

diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -44,6 +44,8 @@
 import decimal
 import codecs
 
+from PyPDF2.utils import ERR_STREAM_TRUNCATED_PREMATURELY
+
 ObjectPrefix = b_('/<[tf(n%')
 NumberSigns = b_('+-')
 IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
@@ -199,17 +201,15 @@ def readFromStream(stream, pdf):
         while True:
             tok = stream.read(1)
             if not tok:
-                # stream has truncated prematurely
-                raise PdfStreamError("Stream has ended unexpectedly")
+                raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
             if tok.isspace():
                 break
             idnum += tok
         generation = b_("")
         while True:
             tok = stream.read(1)
             if not tok:
-                # stream has truncated prematurely
-                raise PdfStreamError("Stream has ended unexpectedly")
+                raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
             if tok.isspace():
                 if not generation:
                     continue
@@ -273,10 +273,11 @@ def readFromStream(stream):
     readFromStream = staticmethod(readFromStream)
 
 
-##
-# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
-# TextStringObject to represent the string.
 def createStringObject(string):
+    """
+    Given a string (either a "str" or "unicode"), create a ByteStringObject or a
+    TextStringObject to represent the string.
+    """
     if isinstance(string, utils.string_type):
         return TextStringObject(string)
     elif isinstance(string, utils.bytes_type):
@@ -306,8 +307,7 @@ def readHexStringFromStream(stream):
     while True:
         tok = readNonWhitespace(stream)
         if not tok:
-            # stream has truncated prematurely
-            raise PdfStreamError("Stream has ended unexpectedly")
+            raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
         if tok == b_(">"):
             break
         x += tok
@@ -328,8 +328,7 @@ def readStringFromStream(stream):
     while True:
         tok = stream.read(1)
         if not tok:
-            # stream has truncated prematurely
-            raise PdfStreamError("Stream has ended unexpectedly")
+            raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
         if tok == b_("("):
             parens += 1
         elif tok == b_(")"):
@@ -392,16 +391,17 @@ def readStringFromStream(stream):
     return createStringObject(txt)
 
 
-##
-# Represents a string object where the text encoding could not be determined.
-# This occurs quite often, as the PDF spec doesn't provide an alternate way to
-# represent strings -- for example, the encryption data stored in files (like
-# /O) is clearly not text, but is still stored in a "String" object.
 class ByteStringObject(utils.bytes_type, PdfObject):
+    """
+    Represents a string object where the text encoding could not be determined.
+    This occurs quite often, as the PDF spec doesn't provide an alternate way to
+    represent strings -- for example, the encryption data stored in files (like
+    /O) is clearly not text, but is still stored in a "String" object.
+    """
 
     ##
     # For compatibility with TextStringObject.original_bytes.  This method
-    # returns self.
+    # returns self.
     original_bytes = property(lambda self: self)
 
     def writeToStream(self, stream, encryption_key):
@@ -413,12 +413,14 @@ def writeToStream(self, stream, encryption_key):
         stream.write(b_(">"))
 
 
-##
-# Represents a string object that has been decoded into a real unicode string.
-# If read from a PDF document, this string appeared to match the
-# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
-# occur.
 class TextStringObject(utils.string_type, PdfObject):
+    """
+    Represents a string object that has been decoded into a real unicode string.
+    If read from a PDF document, this string appeared to match the
+    PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
+    occur.
+    """
+
     autodetect_pdfdocencoding = False
     autodetect_utf16 = False
 
@@ -569,8 +571,7 @@ def readFromStream(stream, pdf):
                 skipOverComment(stream)
                 continue
             if not tok:
-                # stream has truncated prematurely
-                raise PdfStreamError("Stream has ended unexpectedly")
+                raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
 
             if debug: print(("Tok:", tok))
             if tok == b_(">"):

diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py
@@ -1,5 +1,3 @@
-# vim: sw=4:expandtab:foldmethod=marker
-#
 # Copyright (c) 2006, Mathieu Fenniak
 # All rights reserved.
 #

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 #
-# vim: sw=4:expandtab:foldmethod=marker
-#
 # Copyright (c) 2006, Mathieu Fenniak
 # Copyright (c) 2007, Ashish Kulkarni <[email protected]>
 #
@@ -1637,7 +1635,7 @@ def _getObjectFromStream(self, indirectReference):
                 streamData.seek(0, 0)
                 lines = streamData.readlines()
                 for i in range(0, len(lines)):
-                    print((lines[i]))
+                    print(lines[i])
                 streamData.seek(pos, 0)
             try:
                 obj = readObject(streamData, self)
@@ -2588,11 +2586,6 @@ def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expan
                                                  ctm[1][0], ctm[1][1],
                                                  ctm[2][0], ctm[2][1]], expand)
 
-    ##
-    # Applys a transformation matrix the page.
-    #
-    # @param ctm   A 6 elements tuple containing the operands of the
-    #              transformation matrix
     def addTransformation(self, ctm):
         """
         Applies a transformation matrix to the page.

diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py
@@ -39,7 +39,7 @@
 except ImportError:  # Py3
     import builtins
 
-
+ERR_STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
 xrange_fn = getattr(builtins, "xrange", range)
 _basestring = getattr(builtins, "basestring", str)
 
@@ -122,7 +122,7 @@ def skipOverComment(stream):
 def readUntilRegex(stream, regex, ignore_eof=False):
     """
     Reads until the regular expression pattern matched (ignore the match)
-    Raise PdfStreamError on premature end-of-file.
+    :raises PdfStreamError: on premature end-of-file
     :param bool ignore_eof: If true, ignore end-of-line and return immediately
     """
     name = b_('')
@@ -133,7 +133,7 @@ def readUntilRegex(stream, regex, ignore_eof=False):
             if ignore_eof:
                 return name
             else:
-                raise PdfStreamError("Stream has ended unexpectedly")
+                raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
         m = regex.search(tok)
         if m is not None:
             name += tok[:m.start()]
@@ -242,7 +242,6 @@ def b_(s):
                     bc[s] = r
                 return r
             except Exception:
-                print(s)
                 r = s.encode('utf-8')
                 if len(s) < 2:
                     bc[s] = r

diff --git a/Resources/imagemagick-ASCII85Decode.pdf b/Resources/imagemagick-ASCII85Decode.pdf
diff --git a/Resources/imagemagick-CCITTFaxDecode.pdf b/Resources/imagemagick-CCITTFaxDecode.pdf
diff --git a/Resources/imagemagick-images.pdf b/Resources/imagemagick-images.pdf
diff --git a/Resources/imagemagick-lzw.pdf b/Resources/imagemagick-lzw.pdf
diff --git a/Resources/metadata.pdf b/Resources/metadata.pdf
diff --git a/Tests/test_basic_features.py b/Tests/test_basic_features.py
@@ -2,60 +2,60 @@
 
 import pytest
 
-from PyPDF2 import PdfFileWriter, PdfFileReader
-from PyPDF2.utils import PdfReadError
+from PyPDF2 import PdfFileReader, PdfFileWriter
 from PyPDF2.pdf import convertToInt
+from PyPDF2.utils import PdfReadError
 
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
 PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
 RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")
 
 
 def test_basic_features():
-    output = PdfFileWriter()
-    document1 = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
-    input1 = PdfFileReader(document1)
+    writer = PdfFileWriter()
+    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
+    reader = PdfFileReader(pdf_path)
 
     # print how many pages input1 has:
-    print("document1.pdf has %d pages." % input1.getNumPages())
+    print("document1.pdf has %d pages." % reader.getNumPages())
 
     # add page 1 from input1 to output document, unchanged
-    output.addPage(input1.getPage(0))
+    writer.addPage(reader.getPage(0))
 
     # add page 2 from input1, but rotated clockwise 90 degrees
-    output.addPage(input1.getPage(0).rotateClockwise(90))
+    writer.addPage(reader.getPage(0).rotateClockwise(90))
 
     # add page 3 from input1, rotated the other way:
-    output.addPage(input1.getPage(0).rotateCounterClockwise(90))
+    writer.addPage(reader.getPage(0).rotateCounterClockwise(90))
     # alt: output.addPage(input1.getPage(0).rotateClockwise(270))
 
     # add page 4 from input1, but first add a watermark from another PDF:
-    page4 = input1.getPage(0)
-    watermark_pdf = document1
+    page4 = reader.getPage(0)
+    watermark_pdf = pdf_path
     watermark = PdfFileReader(watermark_pdf)
     page4.mergePage(watermark.getPage(0))
-    output.addPage(page4)
+    writer.addPage(page4)
 
     # add page 5 from input1, but crop it to half size:
-    page5 = input1.getPage(0)
+    page5 = reader.getPage(0)
     page5.mediaBox.upperRight = (
         page5.mediaBox.getUpperRight_x() / 2,
         page5.mediaBox.getUpperRight_y() / 2,
     )
-    output.addPage(page5)
+    writer.addPage(page5)
 
     # add some Javascript to launch the print window on opening this PDF.
     # the password dialog may prevent the print dialog from being shown,
     # comment the the encription lines, if that's the case, to try this out
-    output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
+    writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
 
     # encrypt your new PDF and add a password
     password = "secret"
-    output.encrypt(password)
+    writer.encrypt(password)
 
     # finally, write "output" to PyPDF2-output.pdf
     with open("PyPDF2-output.pdf", "wb") as outputStream:
-        output.write(outputStream)
+        writer.write(outputStream)
 
 
 def test_convertToInt():

diff --git a/Tests/test_javascript.py b/Tests/test_javascript.py
@@ -1,4 +1,5 @@
 import os
+
 import pytest
 
 from PyPDF2 import PdfFileReader, PdfFileWriter
@@ -8,21 +9,28 @@
 PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
 RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")
 
+
 @pytest.fixture
 def pdf_file_writer():
-    ipdf = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
+    reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
     pdf_file_writer = PdfFileWriter()
-    pdf_file_writer.appendPagesFromReader(ipdf)
+    pdf_file_writer.appendPagesFromReader(reader)
     yield pdf_file_writer
 
+
 def test_add_js(pdf_file_writer):
-    pdf_file_writer.addJS(
-        "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
-    )
+    pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
+
+    assert (
+        "/Names" in pdf_file_writer._root_object
+    ), "addJS should add a name catalog in the root object."
+    assert (
+        "/JavaScript" in pdf_file_writer._root_object["/Names"]
+    ), "addJS should add a JavaScript name tree under the name catalog."
+    assert (
+        "/OpenAction" in pdf_file_writer._root_object
+    ), "addJS should add an OpenAction to the catalog."
 
-    assert "/Names" in pdf_file_writer._root_object, "addJS should add a name catalog in the root object."
-    assert "/JavaScript" in pdf_file_writer._root_object["/Names"], "addJS should add a JavaScript name tree under the name catalog."
-    assert "/OpenAction" in pdf_file_writer._root_object, "addJS should add an OpenAction to the catalog."
 
 def test_overwrite_js(pdf_file_writer):
     def get_javascript_name():
@@ -31,14 +39,12 @@ def get_javascript_name():
         assert "/Names" in pdf_file_writer._root_object["/Names"]["/JavaScript"]
         return pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][0]
 
-    pdf_file_writer.addJS(
-        "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
-    )
+    pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
     first_js = get_javascript_name()
 
-    pdf_file_writer.addJS(
-        "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
-    )
+    pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
     second_js = get_javascript_name()
 
-    assert first_js != second_js, "addJS should overwrite the previous script in the catalog."
+    assert (
+        first_js != second_js
+    ), "addJS should overwrite the previous script in the catalog."