Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add decode_as_image() to ContentStreams #2615

Merged
merged 10 commits into from
Jun 9, 2024
25 changes: 25 additions & 0 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,31 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
retval._data = FlateDecode.encode(b_(self._data), level)
return retval

def decode_as_image(self) -> Any:
    """
    Try to decode the stream object as an image.

    A warning is logged when ``/Subtype`` is not ``/Image``; decoding is
    attempted regardless of that check.

    Returns:
        A PIL image if proper decoding has been found, ``None`` when the
        decoder reports no extension.

    Raises:
        Exception: Any error raised during decoding (for example due to
            an invalid object) is propagated to the caller.
            It is recommended to catch exceptions to prevent
            stops in your program.
    """
    from ..filters import _xobj_to_image

    looks_like_image = self.get("/Subtype", "") == "/Image"
    if not looks_like_image:
        # Identify the object by its indirect reference when that attribute
        # exists; otherwise fall back to the object's own repr.
        try:
            warning = f"{self.indirect_reference} does not seems to be an Image"  # pragma: no cover
        except AttributeError:
            warning = f"{self.__repr__()} object does not seems to be an Image"  # pragma: no cover
        logger_warning(warning, __name__)

    # The middle element (the raw byte stream) is not needed here.
    extension, _, image = _xobj_to_image(self)
    if extension is None:
        return None  # pragma: no cover
    return image


class DecodedStreamObject(StreamObject):
pass
Expand Down
21 changes: 21 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,3 +441,24 @@ def test_inline_image_extraction():
name = "iss2598d.png"
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
assert image_similarity(reader.pages[0].images[0].image, img) == 1


@pytest.mark.enable_socket()
def test_extract_image_from_object(caplog):
    """
    Check ``decode_as_image()`` on an image XObject and on a non-image stream.

    The image XObject must decode to a PIL image. The page content stream is
    not an image: decoding it must raise and log a warning, both with its
    default ``indirect_reference`` and with an overridden one.
    """
    url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf"
    name = "iss2613.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    # Image XObject nested in the resources of a pattern.
    image = reader.pages[0]["/Resources"]["/Pattern"]["/P1"]["/Resources"]["/XObject"][
        "/X1"
    ].decode_as_image()
    assert isinstance(image, Image.Image)

    # Fetch the content stream outside of pytest.raises so that a failure
    # while fetching it cannot be mistaken for the expected decoding error.
    co = reader.pages[0].get_contents()
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "does not seems to be an Image" in caplog.text

    # Reuse the same object: fetching it again would discard the override
    # below and merely repeat the previous check.
    caplog.clear()
    co.indirect_reference = "for_test"
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "for_test does not seems to be an Image" in caplog.text
Loading