Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: New LZW decoding implementation #2887

Merged
merged 14 commits into from
Oct 3, 2024
124 changes: 120 additions & 4 deletions pypdf/_codecs/_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
the module should not do any PDF parsing.
"""

import io
from abc import ABC, abstractmethod
from typing import Dict, List

Expand Down Expand Up @@ -138,8 +139,123 @@

return bytes(output)

def decode(self, data: bytes) -> bytes:
"""Decode data using LZW."""
from ..filters import LZWDecode
def _next_code(self, data: bytes) -> int:
stefan6419846 marked this conversation as resolved.
Show resolved Hide resolved
self.bitpos: int
self._next_bits: int

Check warning on line 144 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L143-L144

Added lines #L143 - L144 were not covered by tests

fillbits = self.bits_per_code
value = 0

Check warning on line 147 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L146-L147

Added lines #L146 - L147 were not covered by tests

while fillbits > 0:
if self._byte_pointer >= len(data):
return -1

Check warning on line 151 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L151

Added line #L151 was not covered by tests

nextbits = data[self._byte_pointer]
bits_available = 8 - self.bitpos
bits_to_use = min(bits_available, fillbits)

Check warning on line 155 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L153-L155

Added lines #L153 - L155 were not covered by tests

value |= (

Check warning on line 157 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L157

Added line #L157 was not covered by tests
(nextbits >> (8 - self.bitpos - bits_to_use))
& (0xFF >> (8 - bits_to_use))
) << (fillbits - bits_to_use)

fillbits -= bits_to_use
self.bitpos += bits_to_use

Check warning on line 163 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L162-L163

Added lines #L162 - L163 were not covered by tests

if self.bitpos == 8:
self.bitpos = 0
self._byte_pointer += 1

Check warning on line 167 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L166-L167

Added lines #L166 - L167 were not covered by tests

return value

Check warning on line 169 in pypdf/_codecs/_codecs.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_codecs/_codecs.py#L169

Added line #L169 was not covered by tests

return LZWDecode.Decoder(data).decode()
def next_code_decode(self) -> int:
    """
    Read the next code of ``self._bits_to_get`` bits from ``self._data``.

    Whole bytes are shifted into the ``self._next_data`` accumulator until
    it holds at least one full code; the code is then taken from the
    high-order end of the buffered bits.

    Returns:
        The next code, or ``257`` (the end-of-data code) when the input is
        missing, empty or exhausted before a full code could be read.
    """
    self._next_data: int
    data = self._data
    if not data:
        # Nothing to read: report end of data rather than failing an assert
        # (asserts are stripped under ``python -O``).
        return 257

    bits_wanted = self._bits_to_get
    while self._next_bits < bits_wanted:
        # Explicit bounds check instead of a blanket ``except IndexError``,
        # which could also hide unrelated indexing bugs.
        if self._byte_pointer >= len(data):
            return 257  # End of data
        self._next_data = (self._next_data << 8) | data[self._byte_pointer]
        self._byte_pointer += 1
        self._next_bits += 8

    self._next_bits -= bits_wanted
    code = (self._next_data >> self._next_bits) & ((1 << bits_wanted) - 1)

    # Keep only the bits that have not been consumed yet. Without this the
    # accumulator grows by 8 bits per input byte and every shift gets
    # slower, which is quadratic on large streams.
    self._next_data &= (1 << self._next_bits) - 1

    return code

def decode(self, data: bytes) -> bytes:
    """
    Decode LZW-compressed ``data``.

    The following code was converted to Python from the following code:
    https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

    Args:
        data: The LZW-encoded byte string.

    Returns:
        The decoded bytes. Empty input decodes to ``b""``. Decoding stops
        at the end-of-data code (257), which is also what the code reader
        reports when the input runs out.

    Raises:
        Exception: If the data starts with the bytes ``00 01``, which this
            decoder rejects as an unsupported LZW flavor.
    """
    if not data:
        # Nothing to decode; also avoids indexing into an empty buffer.
        return b""

    # Slice instead of indexing so a one-byte input cannot raise IndexError.
    if data[:2] == b"\x00\x01":
        raise Exception("LZW flavor not supported.")

    self._and_table = [511, 1023, 2047, 4095]
    self._string_table: list[bytes]
    self.initialize_decoding_table()
    self._data = data
    self._byte_pointer = 0
    self._next_data = 0
    self._next_bits = 0
    old_code = 256  # Start with a reset code

    output_stream = io.BytesIO()

    while True:
        code = self.next_code_decode()
        if code == 257:  # End of data code
            break

        if code == 256:  # Clear code
            self.initialize_decoding_table()
            code = self.next_code_decode()
            if code == 257:
                break
            output_stream.write(self._string_table[code])
            old_code = code
        elif code < self._table_index:
            string = self._string_table[code]
            output_stream.write(string)
            # No previous string exists before the first real code, so
            # there is nothing to extend yet.
            if old_code != 256:
                self.add_entry_decode(self._string_table[old_code], string[0])
            old_code = code
        else:
            # Special case: code not in the table
            string = self._string_table[old_code] + self._string_table[old_code][:1]
            output_stream.write(string)
            self.add_entry_decode(self._string_table[old_code], string[0])
            old_code = code

    return output_stream.getvalue()

def initialize_decoding_table(self) -> None:
    """
    Reset the decoder's string table to its initial state.

    The table gets 4096 slots: the first 256 hold the single-byte strings
    ``b"\\x00"`` .. ``b"\\xff"`` and the rest are empty. New entries start
    at index 258 and the code width goes back to 9 bits.
    """
    table = [bytes((value,)) for value in range(256)]
    table.extend([b""] * (4096 - 256))
    self._string_table = table
    self._bits_to_get = 9
    self._table_index = 258

def add_entry_decode(self, old_string: bytes, new_char: int) -> None:
    """
    Store ``old_string`` extended by one byte as the next table entry.

    Args:
        old_string: The string the new entry is based on.
        new_char: Value of the byte to append to ``old_string``.

    When the table has no free slot left, the entry is dropped and the
    decoder state is left untouched.
    """
    if self._table_index >= len(self._string_table):
        # Table is full. Dropping the entry lets decoding continue instead
        # of failing with IndexError on an out-of-range slot.
        # NOTE(review): presumably only malformed streams get here, as an
        # encoder is expected to send a clear code first - confirm.
        return

    self._string_table[self._table_index] = old_string + bytes([new_char])
    self._table_index += 1

    # Update the number of bits to get based on the table index
    if self._table_index == 511:
        self._bits_to_get = 10
    elif self._table_index == 1023:
        self._bits_to_get = 11
    elif self._table_index == 2047:
        self._bits_to_get = 12
130 changes: 2 additions & 128 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,11 @@
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._codecs._codecs import LzwCodec
stefan6419846 marked this conversation as resolved.
Show resolved Hide resolved
from ._utils import (
WHITESPACES_AS_BYTES,
deprecate,
deprecation_no_replacement,
logger_warning,
ord_,
)
from .constants import CcittFaxDecodeParameters as CCITT
from .constants import FilterTypeAbbreviations as FTA
Expand Down Expand Up @@ -365,131 +364,6 @@ def decode(
return b"".join(lst)


class LZWDecode:
    """
    Taken from:

    https://github.com/katjas/PDFrenderer/blob/master/src/com/sun/pdfview/decode/LZWDecode.java
    """

    class Decoder:
        """Bit-level LZW decoder holding the state for one data stream."""

        # Code that ends decoding.
        STOP = 257
        # Code that resets the dictionary to its initial state.
        CLEARDICT = 256

        def __init__(self, data: bytes) -> None:
            self.data = data
            # Read cursor: current byte and bit offset inside that byte.
            self.bytepos = 0
            self.bitpos = 0
            # 4096 slots; the first 256 hold the single-byte strings.
            self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256)
            self.reset_dict()

        def reset_dict(self) -> None:
            """Restart dictionary growth at entry 258 with 9-bit codes."""
            self.dictlen = 258
            self.bitspercode = 9

        def next_code(self) -> int:
            """
            Read the next ``self.bitspercode``-bit code from the data.

            Returns:
                The code, or ``-1`` if the data ends before the code is
                complete.
            """
            fillbits = self.bitspercode
            value = 0
            while fillbits > 0:
                if self.bytepos >= len(self.data):
                    return -1
                nextbits = ord_(self.data[self.bytepos])
                # Take what is left in this byte, but no more than needed.
                bitsfromhere = 8 - self.bitpos
                bitsfromhere = min(bitsfromhere, fillbits)
                value |= (
                    (nextbits >> (8 - self.bitpos - bitsfromhere))
                    & (0xFF >> (8 - bitsfromhere))
                ) << (fillbits - bitsfromhere)
                fillbits -= bitsfromhere
                self.bitpos += bitsfromhere
                if self.bitpos >= 8:
                    self.bitpos = 0
                    self.bytepos = self.bytepos + 1
            return value

        def decode(self) -> bytes:
            """
            TIFF 6.0 specification explains in sufficient details the steps to
            implement the LZW encode() and decode() algorithms.

            algorithm derived from:
            http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
            and the PDFReference

            Returns:
                The decoded bytes.

            Raises:
                PdfReadError: If the stop code is missing
            """
            cW = self.CLEARDICT
            # Accumulate into a bytearray: repeated ``bytes +=`` copies the
            # whole output on every append.
            baos = bytearray()
            while True:
                pW = cW
                cW = self.next_code()
                if cW == -1:
                    raise PdfReadError("Missed the stop code in LZWDecode!")
                if cW == self.STOP:
                    break
                elif cW == self.CLEARDICT:
                    self.reset_dict()
                elif pW == self.CLEARDICT:
                    # First code after a reset: emit it, nothing to extend yet.
                    baos += self.dict[cW]
                else:
                    if cW < self.dictlen:
                        baos += self.dict[cW]
                        p = self.dict[pW] + self.dict[cW][0:1]
                    else:
                        # Code not in the dictionary yet: it is the previous
                        # string extended by its own first byte.
                        p = self.dict[pW] + self.dict[pW][0:1]
                        baos += p
                    self.dict[self.dictlen] = p
                    self.dictlen += 1
                    if (
                        self.dictlen >= (1 << self.bitspercode) - 1
                        and self.bitspercode < 12
                    ):
                        self.bitspercode += 1
            return bytes(baos)

    @staticmethod
    def _decodeb(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an LZW encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values.

        Returns:
            decoded data.
        """
        # decode_parms is unused here
        return LZWDecode.Decoder(data).decode()

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> str:  # deprecated
        """
        Decode an LZW encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values.

        Returns:
            decoded data, converted to ``str`` using latin-1.
        """
        # decode_parms is unused here
        deprecate("LZWDecode.decode will return bytes instead of str in pypdf 6.0.0")
        return LZWDecode.Decoder(data).decode().decode("latin-1")


class ASCII85Decode:
"""Decodes string ASCII85-encoded data into a byte format."""

Expand Down Expand Up @@ -698,7 +572,7 @@ def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject
elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL):
data = RunLengthDecode.decode(data)
elif filter_type in (FT.LZW_DECODE, FTA.LZW):
data = LZWDecode._decodeb(data, params)
data = LzwCodec().decode(data)
elif filter_type in (FT.ASCII_85_DECODE, FTA.A85):
data = ASCII85Decode.decode(data)
elif filter_type == FT.DCT_DECODE:
Expand Down
5 changes: 2 additions & 3 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,7 @@ def test_decompress_zlib_error(caplog):
def test_lzw_decode_neg1():
reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf")))
page = reader.pages[47]
with pytest.raises(PdfReadError) as exc:
page.extract_text()
assert exc.value.args[0] == "Missed the stop code in LZWDecode!"
assert page.extract_text().startswith("Chapter 2")


@pytest.mark.enable_socket()
Expand All @@ -249,6 +247,7 @@ def test_issue_399():
@pytest.mark.enable_socket()
def test_image_without_pillow(tmp_path):
import os

name = "tika-914102.pdf"
pdf_path = Path(__file__).parent / "pdf_cache" / name
pdf_path_str = str(pdf_path.resolve()).replace("\\", "/")
Expand Down
Loading