Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: New LZW decoding implementation #2887

Merged
merged 14 commits into from
Oct 3, 2024
131 changes: 123 additions & 8 deletions pypdf/_codecs/_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
the module should not do any PDF parsing.
"""

import io
from abc import ABC, abstractmethod
from typing import Dict, List

Expand Down Expand Up @@ -47,7 +48,7 @@ class LzwCodec(Codec):

def _initialize_encoding_table(self) -> None:
"""Initialize the encoding table and state to initial conditions."""
self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
self.encoding_table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
self.next_code = self.EOD_MARKER + 1
self.bits_per_code = self.INITIAL_BITS_PER_CODE
self.max_code_value = (1 << self.bits_per_code) - 1
Expand Down Expand Up @@ -78,16 +79,16 @@ def encode(self, data: bytes) -> bytes:
for byte in data:
next_sequence = current_sequence + bytes([byte])

if next_sequence in self.table:
if next_sequence in self.encoding_table:
# Extend current sequence if already in the table
current_sequence = next_sequence
else:
# Output code for the current sequence
result_codes.append(self.table[current_sequence])
result_codes.append(self.encoding_table[current_sequence])

# Add the new sequence to the table if there's room
if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
self.table[next_sequence] = self.next_code
self.encoding_table[next_sequence] = self.next_code
self._increase_next_code()
else:
# If the table is full, emit a clear-table command
Expand All @@ -99,7 +100,7 @@ def encode(self, data: bytes) -> bytes:

# Ensure everything actually is encoded
if current_sequence:
result_codes.append(self.table[current_sequence])
result_codes.append(self.encoding_table[current_sequence])
result_codes.append(self.EOD_MARKER)

return self._pack_codes_into_bytes(result_codes)
Expand Down Expand Up @@ -138,8 +139,122 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:

return bytes(output)

def _initialize_decoding_table(self) -> None:
self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
b""
] * (4096 - self.CLEAR_TABLE_MARKER)
self._table_index = self.EOD_MARKER + 1
self._bits_to_get = 9

def _next_code_decode(self, data: bytes) -> int:
self._next_data: int
try:
while self._next_bits < self._bits_to_get:
self._next_data = (self._next_data << 8) | (
data[self._byte_pointer] & 0xFF
)
self._byte_pointer += 1
self._next_bits += 8

code = (
self._next_data >> (self._next_bits - self._bits_to_get)
) & self._and_table[self._bits_to_get - 9]
self._next_bits -= self._bits_to_get

return code
except IndexError:
return self.EOD_MARKER

# The following method has been converted to Python from PDFsharp:
# https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
#
# Original license:
#
# -------------------------------------------------------------------------
# Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
# Germany
#
# http://docs.pdfsharp.net
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# --------------------------------------------------------------------------
def decode(self, data: bytes) -> bytes:
    """
    Decode data using LZW.

    Converted to Python from PDFsharp's ``LzwDecode.cs``:
    https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

    Args:
        data: The LZW-encoded byte string.

    Returns:
        The decoded bytes. Decoding stops at the first end-of-data marker
        returned by ``_next_code_decode``.
    """
    # Bit masks for 9, 10, 11 and 12 bit wide codes.
    self._and_table = [511, 1023, 2047, 4095]
    self._byte_pointer = 0
    self._next_data = 0
    self._next_bits = 0
    # Sets decoding_table, _table_index and _bits_to_get.
    self._initialize_decoding_table()

    output_stream = io.BytesIO()
    # The clear-table marker doubles as "no previous code": no table entry
    # is derived from it, which also covers streams that do not start with
    # an explicit clear-table marker.
    old_code = self.CLEAR_TABLE_MARKER

    while True:
        code = self._next_code_decode(data)
        if code == self.EOD_MARKER:
            break

        if code == self.CLEAR_TABLE_MARKER:
            self._initialize_decoding_table()
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break
            output_stream.write(self.decoding_table[code])
        elif code < self._table_index:
            string = self.decoding_table[code]
            output_stream.write(string)
            if old_code != self.CLEAR_TABLE_MARKER:
                self._add_entry_decode(self.decoding_table[old_code], string[0])
        else:
            # The code is not in the table and not one of the special codes:
            # it must be the previous sequence plus its own first byte.
            previous = self.decoding_table[old_code]
            string = previous + previous[:1]
            output_stream.write(string)
            self._add_entry_decode(previous, string[0])
        old_code = code

    return output_stream.getvalue()

def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
new_string = old_string + bytes([new_char])
self.decoding_table[self._table_index] = new_string
self._table_index += 1

# Update the number of bits to get based on the table index
if self._table_index == 511:
self._bits_to_get = 10
elif self._table_index == 1023:
self._bits_to_get = 11
elif self._table_index == 2047:
self._bits_to_get = 12
78 changes: 2 additions & 76 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._codecs._codecs import LzwCodec as _LzwCodec
from ._utils import (
WHITESPACES_AS_BYTES,
deprecate,
deprecation_no_replacement,
logger_warning,
ord_,
)
from .constants import CcittFaxDecodeParameters as CCITT
from .constants import FilterTypeAbbreviations as FTA
Expand Down Expand Up @@ -366,89 +366,15 @@ def decode(


class LZWDecode:
stefan6419846 marked this conversation as resolved.
Show resolved Hide resolved
"""
Taken from:

https://github.com/katjas/PDFrenderer/blob/master/src/com/sun/pdfview/decode/LZWDecode.java
"""

class Decoder:
STOP = 257
CLEARDICT = 256

def __init__(self, data: bytes) -> None:
self.data = data
self.bytepos = 0
self.bitpos = 0
self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256)
self.reset_dict()

def reset_dict(self) -> None:
self.dictlen = 258
self.bitspercode = 9

def next_code(self) -> int:
fillbits = self.bitspercode
value = 0
while fillbits > 0:
if self.bytepos >= len(self.data):
return -1
nextbits = ord_(self.data[self.bytepos])
bitsfromhere = 8 - self.bitpos
bitsfromhere = min(bitsfromhere, fillbits)
value |= (
(nextbits >> (8 - self.bitpos - bitsfromhere))
& (0xFF >> (8 - bitsfromhere))
) << (fillbits - bitsfromhere)
fillbits -= bitsfromhere
self.bitpos += bitsfromhere
if self.bitpos >= 8:
self.bitpos = 0
self.bytepos = self.bytepos + 1
return value

def decode(self) -> bytes:
"""
TIFF 6.0 specification explains in sufficient details the steps to
implement the LZW encode() and decode() algorithms.

algorithm derived from:
http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
and the PDFReference

Raises:
PdfReadError: If the stop code is missing
"""
cW = self.CLEARDICT
baos = b""
while True:
pW = cW
cW = self.next_code()
if cW == -1:
raise PdfReadError("Missed the stop code in LZWDecode!")
if cW == self.STOP:
break
elif cW == self.CLEARDICT:
self.reset_dict()
elif pW == self.CLEARDICT:
baos += self.dict[cW]
else:
if cW < self.dictlen:
baos += self.dict[cW]
p = self.dict[pW] + self.dict[cW][0:1]
self.dict[self.dictlen] = p
self.dictlen += 1
else:
p = self.dict[pW] + self.dict[pW][0:1]
baos += p
self.dict[self.dictlen] = p
self.dictlen += 1
if (
self.dictlen >= (1 << self.bitspercode) - 1
and self.bitspercode < 12
):
self.bitspercode += 1
return baos
return _LzwCodec().decode(self.data)

@staticmethod
def _decodeb(
Expand Down
15 changes: 15 additions & 0 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,18 @@ def test_encode_lzw(plain, expected_encoded):
codec = LzwCodec()
actual_encoded = codec.encode(plain)
assert actual_encoded == expected_encoded


@pytest.mark.parametrize(
    ("encoded", "expected_decoded"),
    [
        # _pack_codes_into_bytes([256, 65, 66, 67, 68, 256, 256, 69, 70, 71, 72, 257])
        (b"\x80\x10HD2$\x02\x00E#\x11\xc9\x10\x10", b"ABCDEFGH"),  # Clear twice.
        # _pack_codes_into_bytes([65, 66, 67, 68, 257])
        (b" \x90\x88dH\x08", b"ABCD"),  # No explicit initial clear marker.
    ],
)
def test_decode_lzw(encoded, expected_decoded):
    """Decoding packed LZW codes yields the expected plain bytes."""
    assert LzwCodec().decode(encoded) == expected_decoded
5 changes: 2 additions & 3 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,7 @@ def test_decompress_zlib_error(caplog):
def test_lzw_decode_neg1():
    """
    Text extraction succeeds for page index 47 of ``tika-921632.pdf``.

    The earlier expectation that this page raises
    "Missed the stop code in LZWDecode!" contradicts the assertion below
    and has been dropped; only the successful extraction is checked.
    """
    reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf")))
    page = reader.pages[47]
    assert page.extract_text().startswith("Chapter 2")


@pytest.mark.enable_socket()
Expand All @@ -249,6 +247,7 @@ def test_issue_399():
@pytest.mark.enable_socket()
def test_image_without_pillow(tmp_path):
import os

name = "tika-914102.pdf"
pdf_path = Path(__file__).parent / "pdf_cache" / name
pdf_path_str = str(pdf_path.resolve()).replace("\\", "/")
Expand Down
Loading