Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: New LZW decoding implementation #2887

Merged
merged 14 commits into from
Oct 3, 2024
100 changes: 92 additions & 8 deletions pypdf/_codecs/_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
the module should not do any PDF parsing.
"""

import io
from abc import ABC, abstractmethod
from typing import Dict, List

Expand Down Expand Up @@ -47,7 +48,7 @@ class LzwCodec(Codec):

def _initialize_encoding_table(self) -> None:
"""Initialize the encoding table and state to initial conditions."""
self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
self.encoding_table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
self.next_code = self.EOD_MARKER + 1
self.bits_per_code = self.INITIAL_BITS_PER_CODE
self.max_code_value = (1 << self.bits_per_code) - 1
Expand Down Expand Up @@ -78,16 +79,16 @@ def encode(self, data: bytes) -> bytes:
for byte in data:
next_sequence = current_sequence + bytes([byte])

if next_sequence in self.table:
if next_sequence in self.encoding_table:
# Extend current sequence if already in the table
current_sequence = next_sequence
else:
# Output code for the current sequence
result_codes.append(self.table[current_sequence])
result_codes.append(self.encoding_table[current_sequence])

# Add the new sequence to the table if there's room
if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
self.table[next_sequence] = self.next_code
self.encoding_table[next_sequence] = self.next_code
self._increase_next_code()
else:
# If the table is full, emit a clear-table command
Expand All @@ -99,7 +100,7 @@ def encode(self, data: bytes) -> bytes:

# Ensure everything actually is encoded
if current_sequence:
result_codes.append(self.table[current_sequence])
result_codes.append(self.encoding_table[current_sequence])
result_codes.append(self.EOD_MARKER)

return self._pack_codes_into_bytes(result_codes)
Expand Down Expand Up @@ -138,8 +139,91 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:

return bytes(output)

def _initialize_decoding_table(self) -> None:
    """Reset the decoding table and the decoder's code-width state.

    The table is a fixed list of 4096 slots: the first
    ``CLEAR_TABLE_MARKER`` slots hold the single-byte strings, every
    remaining slot starts out as ``b""``. New entries are written starting
    right after ``EOD_MARKER`` and codes are read 9 bits wide again.
    """
    single_bytes = [bytes([value]) for value in range(self.CLEAR_TABLE_MARKER)]
    unused_slots = [b""] * (4096 - self.CLEAR_TABLE_MARKER)
    self.decoding_table = single_bytes + unused_slots
    # First free slot comes after the two reserved marker codes.
    self._table_index = self.EOD_MARKER + 1
    self._bits_to_get = 9

def _next_code_decode(self, data: bytes) -> int:
    """Read the next variable-width code from ``data``.

    Bytes are appended to an integer bit buffer (``self._next_data``) until
    it holds at least ``self._bits_to_get`` unread bits; the code is then
    taken from the most significant unread bits.

    Args:
        data: The complete encoded byte string being decoded. The read
            position is kept in ``self._byte_pointer``.

    Returns:
        The next code, ``self._bits_to_get`` bits wide, or
        ``self.EOD_MARKER`` if ``data`` runs out before a whole code could
        be read.
    """
    self._next_data: int
    bits_wanted = self._bits_to_get

    # Refill the bit buffer one byte at a time until a whole code fits.
    while self._next_bits < bits_wanted:
        if self._byte_pointer >= len(data):
            # Truncated stream: report it as the end-of-data marker.
            return self.EOD_MARKER
        self._next_data = (self._next_data << 8) | data[self._byte_pointer]
        self._byte_pointer += 1
        self._next_bits += 8

    self._next_bits -= bits_wanted
    code = (self._next_data >> self._next_bits) & ((1 << bits_wanted) - 1)

    # Discard the bits that were just consumed. Without this the buffer
    # grows to hold the entire input and each shift gets slower and slower.
    self._next_data &= (1 << self._next_bits) - 1
    return code

def decode(self, data: bytes) -> bytes:
    """Decode data using LZW.

    The implementation was converted to Python from PDFsharp:
    https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

    Args:
        data: The LZW-encoded bytes.

    Returns:
        The decoded bytes. Decoding stops at the first end-of-data marker
        returned by ``_next_code_decode``.
    """
    # Bit masks for 9- to 12-bit codes, indexed by ``bits - 9``.
    self._and_table = [511, 1023, 2047, 4095]
    # Reader state: position in ``data`` and the pending bit buffer.
    self._byte_pointer = 0
    self._next_data = 0
    self._next_bits = 0
    # Sets ``decoding_table``, ``_table_index`` and ``_bits_to_get``.
    self._initialize_decoding_table()

    output_stream = io.BytesIO()
    old_code = self.CLEAR_TABLE_MARKER

    while True:
        code = self._next_code_decode(data)
        if code == self.EOD_MARKER:
            break

        if code == self.CLEAR_TABLE_MARKER:
            # Start over with a fresh table; the code that follows is
            # written out directly and adds no table entry.
            self._initialize_decoding_table()
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break
            output_stream.write(self.decoding_table[code])
            old_code = code
        elif code < self._table_index:
            # Known code: emit its string and register
            # "previous string + first byte of this string".
            string = self.decoding_table[code]
            output_stream.write(string)
            if old_code != self.CLEAR_TABLE_MARKER:
                self._add_entry_decode(self.decoding_table[old_code], string[0])
            old_code = code
        else:
            # The code is not in the table and not one of the special
            # codes: it must be the previous string followed by its own
            # first byte.
            # NOTE(review): if this branch is hit before any regular code
            # was read, the previous string is empty and ``string[0]``
            # raises IndexError - confirm whether callers need a
            # dedicated error here.
            string = (
                self.decoding_table[old_code] + self.decoding_table[old_code][:1]
            )
            output_stream.write(string)
            self._add_entry_decode(self.decoding_table[old_code], string[0])
            old_code = code

    return output_stream.getvalue()

def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
    """Append ``old_string + bytes([new_char])`` to the decoding table.

    Args:
        old_string: The string of the previously decoded code.
        new_char: The byte value (0-255) to append to ``old_string``.

    If the table has no free slot left, the entry is dropped instead of
    raising ``IndexError``; this happens when a stream keeps sending codes
    without a clear-table code after the table has filled up.
    """
    if self._table_index >= len(self.decoding_table):
        # Table is full: nothing can be stored, keep decoding with what
        # is already known.
        return

    self.decoding_table[self._table_index] = old_string + bytes([new_char])
    self._table_index += 1

    # Update the number of bits to get based on the table index. The
    # switch happens at 2**n - 1 rather than 2**n, i.e. one entry before
    # the current width could no longer represent the index.
    widths_by_index = {511: 10, 1023: 11, 2047: 12}
    new_width = widths_by_index.get(self._table_index)
    if new_width is not None:
        self._bits_to_get = new_width
130 changes: 2 additions & 128 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,11 @@
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._codecs._codecs import LzwCodec
stefan6419846 marked this conversation as resolved.
Show resolved Hide resolved
from ._utils import (
WHITESPACES_AS_BYTES,
deprecate,
deprecation_no_replacement,
logger_warning,
ord_,
)
from .constants import CcittFaxDecodeParameters as CCITT
from .constants import FilterTypeAbbreviations as FTA
Expand Down Expand Up @@ -365,131 +364,6 @@ def decode(
return b"".join(lst)


class LZWDecode:
    """
    Taken from:

    https://github.com/katjas/PDFrenderer/blob/master/src/com/sun/pdfview/decode/LZWDecode.java
    """

    class Decoder:
        """Stateful decoder for a single LZW-encoded byte string."""

        STOP = 257
        CLEARDICT = 256

        def __init__(self, data: bytes) -> None:
            self.data = data
            # Read position: current byte and bit offset inside that byte.
            self.bytepos = 0
            self.bitpos = 0
            # 4096 slots: single-byte strings first, the rest empty.
            self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256)
            self.reset_dict()

        def reset_dict(self) -> None:
            """Forget all learned entries and return to 9-bit codes."""
            self.dictlen = 258
            self.bitspercode = 9

        def next_code(self) -> int:
            """
            Read the next ``self.bitspercode``-bit code, MSB first.

            Returns:
                The code, or -1 if the data ends before the code is complete.
            """
            fillbits = self.bitspercode
            value = 0
            while fillbits > 0:
                if self.bytepos >= len(self.data):
                    return -1
                nextbits = ord_(self.data[self.bytepos])
                # Take as many bits as the current byte still offers,
                # but never more than the code still needs.
                bitsfromhere = 8 - self.bitpos
                bitsfromhere = min(bitsfromhere, fillbits)
                value |= (
                    (nextbits >> (8 - self.bitpos - bitsfromhere))
                    & (0xFF >> (8 - bitsfromhere))
                ) << (fillbits - bitsfromhere)
                fillbits -= bitsfromhere
                self.bitpos += bitsfromhere
                if self.bitpos >= 8:
                    self.bitpos = 0
                    self.bytepos = self.bytepos + 1
            return value

        def decode(self) -> bytes:
            """
            TIFF 6.0 specification explains in sufficient details the steps to
            implement the LZW encode() and decode() algorithms.

            algorithm derived from:
            http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
            and the PDFReference

            Returns:
                The decoded bytes.

            Raises:
                PdfReadError: If the stop code is missing
            """
            cW = self.CLEARDICT
            # Collect the pieces and join once at the end; repeated
            # ``bytes +=`` copies the whole output on every append.
            chunks = []
            while True:
                pW = cW
                cW = self.next_code()
                if cW == -1:
                    raise PdfReadError("Missed the stop code in LZWDecode!")
                if cW == self.STOP:
                    break
                elif cW == self.CLEARDICT:
                    self.reset_dict()
                elif pW == self.CLEARDICT:
                    # First code after a reset: emitted as is, no new entry.
                    chunks.append(self.dict[cW])
                else:
                    if cW < self.dictlen:
                        chunks.append(self.dict[cW])
                        p = self.dict[pW] + self.dict[cW][0:1]
                        self.dict[self.dictlen] = p
                        self.dictlen += 1
                    else:
                        # Code not in the dictionary yet: previous string
                        # followed by its own first byte.
                        p = self.dict[pW] + self.dict[pW][0:1]
                        chunks.append(p)
                        self.dict[self.dictlen] = p
                        self.dictlen += 1
                    if (
                        self.dictlen >= (1 << self.bitspercode) - 1
                        and self.bitspercode < 12
                    ):
                        self.bitspercode += 1
            return b"".join(chunks)

    @staticmethod
    def _decodeb(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an LZW encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values.

        Returns:
            decoded data.
        """
        # decode_parms is unused here
        return LZWDecode.Decoder(data).decode()

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> str:  # deprecated
        """
        Decode an LZW encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values.

        Returns:
            decoded data, as a latin-1 decoded ``str``.
        """
        # decode_parms is unused here
        deprecate("LZWDecode.decode will return bytes instead of str in pypdf 6.0.0")
        return LZWDecode.Decoder(data).decode().decode("latin-1")


class ASCII85Decode:
"""Decodes string ASCII85-encoded data into a byte format."""

Expand Down Expand Up @@ -698,7 +572,7 @@ def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject
elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL):
data = RunLengthDecode.decode(data)
elif filter_type in (FT.LZW_DECODE, FTA.LZW):
data = LZWDecode._decodeb(data, params)
data = LzwCodec().decode(data)
elif filter_type in (FT.ASCII_85_DECODE, FTA.A85):
data = ASCII85Decode.decode(data)
elif filter_type == FT.DCT_DECODE:
Expand Down
5 changes: 2 additions & 3 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,7 @@ def test_decompress_zlib_error(caplog):
def test_lzw_decode_neg1():
    """Text extraction from page index 47 of tika-921632.pdf must succeed.

    This page used to be expected to fail with
    "Missed the stop code in LZWDecode!"; it is now expected to yield text
    starting with "Chapter 2".
    """
    reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf")))
    page = reader.pages[47]
    assert page.extract_text().startswith("Chapter 2")


@pytest.mark.enable_socket()
Expand All @@ -249,6 +247,7 @@ def test_issue_399():
@pytest.mark.enable_socket()
def test_image_without_pillow(tmp_path):
import os

name = "tika-914102.pdf"
pdf_path = Path(__file__).parent / "pdf_cache" / name
pdf_path_str = str(pdf_path.resolve()).replace("\\", "/")
Expand Down
Loading