Skip to content

Commit

Permalink
MAINT: New LZW decoding implementation (#2887)
Browse files Browse the repository at this point in the history
MAINT: New LZW decoding implementation

---------

Co-authored-by: Stefan <[email protected]>
  • Loading branch information
MartinThoma and stefan6419846 authored Oct 3, 2024
1 parent d5233a0 commit e825ac0
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 87 deletions.
131 changes: 123 additions & 8 deletions pypdf/_codecs/_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
the module should not do any PDF parsing.
"""

import io
from abc import ABC, abstractmethod
from typing import Dict, List

Expand Down Expand Up @@ -47,7 +48,7 @@ class LzwCodec(Codec):

def _initialize_encoding_table(self) -> None:
"""Initialize the encoding table and state to initial conditions."""
self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
self.encoding_table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
self.next_code = self.EOD_MARKER + 1
self.bits_per_code = self.INITIAL_BITS_PER_CODE
self.max_code_value = (1 << self.bits_per_code) - 1
Expand Down Expand Up @@ -78,16 +79,16 @@ def encode(self, data: bytes) -> bytes:
for byte in data:
next_sequence = current_sequence + bytes([byte])

if next_sequence in self.table:
if next_sequence in self.encoding_table:
# Extend current sequence if already in the table
current_sequence = next_sequence
else:
# Output code for the current sequence
result_codes.append(self.table[current_sequence])
result_codes.append(self.encoding_table[current_sequence])

# Add the new sequence to the table if there's room
if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
self.table[next_sequence] = self.next_code
self.encoding_table[next_sequence] = self.next_code
self._increase_next_code()
else:
# If the table is full, emit a clear-table command
Expand All @@ -99,7 +100,7 @@ def encode(self, data: bytes) -> bytes:

# Ensure everything actually is encoded
if current_sequence:
result_codes.append(self.table[current_sequence])
result_codes.append(self.encoding_table[current_sequence])
result_codes.append(self.EOD_MARKER)

return self._pack_codes_into_bytes(result_codes)
Expand Down Expand Up @@ -138,8 +139,122 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:

return bytes(output)

def _initialize_decoding_table(self) -> None:
    """
    Reset the decoder dictionary and the code-width state.

    Every slot below ``CLEAR_TABLE_MARKER`` holds its own single-byte value;
    the table is then padded with empty byte strings up to 4096 entries.
    The next free slot is the one right after ``EOD_MARKER`` and codes are
    read with 9 bits again.
    """
    literal_count = self.CLEAR_TABLE_MARKER
    fresh_table = [bytes((value,)) for value in range(literal_count)]
    # Placeholder slots for the marker codes and the entries learned later.
    fresh_table.extend([b""] * (4096 - literal_count))
    self.decoding_table = fresh_table
    self._table_index = self.EOD_MARKER + 1
    self._bits_to_get = 9

def _next_code_decode(self, data: bytes) -> int:
    """
    Read the next code of ``self._bits_to_get`` bits from ``data``.

    Whole bytes are shifted into the ``self._next_data`` bit buffer until it
    holds enough bits, then the topmost pending bits are returned as the code.

    Args:
        data: The complete LZW-encoded byte string being decoded.

    Returns:
        The next code, or ``EOD_MARKER`` if ``data`` is exhausted before a
        full code could be read.
    """
    bits_needed = self._bits_to_get
    try:
        while self._next_bits < bits_needed:
            self._next_data = (self._next_data << 8) | data[self._byte_pointer]
            self._byte_pointer += 1
            self._next_bits += 8
    except IndexError:
        # Ran out of input: treat a truncated stream like an explicit end.
        return self.EOD_MARKER

    self._next_bits -= bits_needed
    code = (self._next_data >> self._next_bits) & self._and_table[bits_needed - 9]

    # Discard the bits that have been consumed. Without this the buffer is an
    # ever-growing Python int and every shift costs time proportional to the
    # amount of data read so far, which makes large streams quadratic.
    self._next_data &= (1 << self._next_bits) - 1

    return code

# The following method has been converted to Python from PDFsharp:
# https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
#
# Original license:
#
# -------------------------------------------------------------------------
# Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
# Germany
#
# http://docs.pdfsharp.net
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# --------------------------------------------------------------------------
def decode(self, data: bytes) -> bytes:
    """
    Decode data using LZW.

    The algorithm was converted to Python from PDFsharp:
    https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

    Args:
        data: The LZW-encoded byte string.

    Returns:
        The decoded bytes. Decoding stops at the first ``EOD_MARKER``;
        ``_next_code_decode`` also reports that marker when the input runs
        out, so a stream without an explicit end marker yields everything
        decoded up to that point.
    """
    # Bit masks for 9- to 12-bit codes, indexed by ``bits - 9``.
    self._and_table = [511, 1023, 2047, 4095]
    self._byte_pointer = 0
    self._next_data = 0
    self._next_bits = 0
    # Sets ``decoding_table``, ``_table_index`` and ``_bits_to_get``.
    self._initialize_decoding_table()

    output_stream = io.BytesIO()
    # CLEAR_TABLE_MARKER doubles as "there is no previous code yet".
    old_code = self.CLEAR_TABLE_MARKER

    while True:
        code = self._next_code_decode(data)
        if code == self.EOD_MARKER:
            break

        if code == self.CLEAR_TABLE_MARKER:
            self._initialize_decoding_table()
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break
            output_stream.write(self.decoding_table[code])
            old_code = code
        elif code < self._table_index:
            string = self.decoding_table[code]
            output_stream.write(string)
            if old_code != self.CLEAR_TABLE_MARKER:
                # Learn: previous string + first byte of the current one.
                self._add_entry_decode(self.decoding_table[old_code], string[0])
            old_code = code
        else:
            # The code is not in the table and not one of the special codes:
            # it has to be the previous string extended by its own first byte.
            # NOTE(review): if there is no previous code yet, the table slot
            # for CLEAR_TABLE_MARKER is empty, ``string`` is b"" and
            # ``string[0]`` raises IndexError - confirm the desired handling
            # of such corrupt streams.
            string = (
                self.decoding_table[old_code] + self.decoding_table[old_code][:1]
            )
            output_stream.write(string)
            self._add_entry_decode(self.decoding_table[old_code], string[0])
            old_code = code

    return output_stream.getvalue()

def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
    """
    Append ``old_string + new_char`` to the decoding table.

    Args:
        old_string: The byte string of the previously decoded code.
        new_char: The byte value (0-255) to append to ``old_string``.

    If the table is already full, the entry is dropped instead of raising
    ``IndexError``; this keeps streams decodable whose producer did not emit
    a clear-table code in time.
    """
    if self._table_index >= len(self.decoding_table):
        # Table is full: nothing more can be learned until the next reset.
        return

    self.decoding_table[self._table_index] = old_string + bytes([new_char])
    self._table_index += 1

    # Widen the codes once the index reaches 2**bits - 1, i.e. one entry
    # before the current width would be exhausted.
    if self._table_index == 511:
        self._bits_to_get = 10
    elif self._table_index == 1023:
        self._bits_to_get = 11
    elif self._table_index == 2047:
        self._bits_to_get = 12
78 changes: 2 additions & 76 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._codecs._codecs import LzwCodec as _LzwCodec
from ._utils import (
WHITESPACES_AS_BYTES,
deprecate,
deprecation_no_replacement,
logger_warning,
ord_,
)
from .constants import CcittFaxDecodeParameters as CCITT
from .constants import FilterTypeAbbreviations as FTA
Expand Down Expand Up @@ -366,89 +366,15 @@ def decode(


class LZWDecode:
"""
Taken from:
https://github.com/katjas/PDFrenderer/blob/master/src/com/sun/pdfview/decode/LZWDecode.java
"""

class Decoder:
    """
    Compatibility wrapper around the shared LZW codec.

    The class keeps its historic constructor and ``decode()`` entry point,
    but the actual work is done by ``_LzwCodec``.
    """

    # LZW end-of-data and clear-table codes, kept for existing users of
    # these class attributes.
    STOP = 257
    CLEARDICT = 256

    def __init__(self, data: bytes) -> None:
        """
        Store the encoded input.

        Args:
            data: The LZW-encoded byte string.
        """
        self.data = data

    def decode(self) -> bytes:
        """
        Decode the stored data.

        Returns:
            The decoded bytes as produced by ``_LzwCodec.decode``.
        """
        return _LzwCodec().decode(self.data)

@staticmethod
def _decodeb(
Expand Down
15 changes: 15 additions & 0 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,18 @@ def test_encode_lzw(plain, expected_encoded):
codec = LzwCodec()
actual_encoded = codec.encode(plain)
assert actual_encoded == expected_encoded


@pytest.mark.parametrize(
    ("encoded", "expected_decoded"),
    [
        # Table is cleared twice in a row in the middle of the stream:
        # _pack_codes_into_bytes([256, 65, 66, 67, 68, 256, 256, 69, 70, 71, 72, 257])
        (b"\x80\x10HD2$\x02\x00E#\x11\xc9\x10\x10", b"ABCDEFGH"),
        # Stream does not start with an explicit clear marker:
        # _pack_codes_into_bytes([65, 66, 67, 68, 257])
        (b" \x90\x88dH\x08", b"ABCD"),
    ],
)
def test_decode_lzw(encoded, expected_decoded):
    """Decoding each encoded sample must yield the expected plain bytes."""
    assert LzwCodec().decode(encoded) == expected_decoded
5 changes: 2 additions & 3 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,7 @@ def test_decompress_zlib_error(caplog):
def test_lzw_decode_neg1():
    """Text extraction of the page must succeed and yield the chapter text."""
    reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf")))
    page = reader.pages[47]
    # This page used to fail with "Missed the stop code in LZWDecode!";
    # the expectation is now that its text is extracted instead.
    assert page.extract_text().startswith("Chapter 2")


@pytest.mark.enable_socket()
Expand All @@ -249,6 +247,7 @@ def test_issue_399():
@pytest.mark.enable_socket()
def test_image_without_pillow(tmp_path):
import os

name = "tika-914102.pdf"
pdf_path = Path(__file__).parent / "pdf_cache" / name
pdf_path_str = str(pdf_path.resolve()).replace("\\", "/")
Expand Down

0 comments on commit e825ac0

Please sign in to comment.