Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add decode_as_image() to ContentStreams #2615

Merged
merged 10 commits into from
Jun 9, 2024
22 changes: 22 additions & 0 deletions docs/user/extract-images.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,25 @@ for image_file_object in page.images:
fp.write(image_file_object.data)
count += 1
```

# Other images

Some other objects can contain images, such as stamp annotations.

For example, this document contains such stamps:
[test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved

You can extract the image from the annotation with the following code:

```python
from pypdf import PdfReader

reader = PdfReader("test_stamp.pdf")

# Walk from the first annotation of the first page down to the image
# XObject stored in its normal appearance stream.
annotation = reader.pages[0]["/Annots"][0].get_object()
xobjects = annotation["/AP"]["/N"]["/Resources"]["/XObject"]
im = xobjects["/Im4"].decode_as_image()

im.show()
```
25 changes: 25 additions & 0 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,31 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
retval._data = FlateDecode.encode(b_(self._data), level)
return retval

def decode_as_image(self) -> Any:
    """
    Attempt to interpret this stream object as an image.

    A warning is logged when the ``/Subtype`` entry is not ``/Image``;
    decoding is attempted regardless.

    Returns:
        The image produced by the decoder (a PIL image when decoding
        succeeds), or ``None`` when the decoder reports no extension.

    Raises:
        Exception: Any error raised during decoding (for instance because
            the object is not a valid image) propagates to the caller.
            Wrapping the call in ``try``/``except`` is recommended so that
            a bad object does not stop your program.
    """
    from ..filters import _xobj_to_image

    not_an_image = self.get("/Subtype", "") != "/Image"
    if not_an_image:
        try:
            warning = f"{self.indirect_reference} does not seem to be an Image"  # pragma: no cover
        except AttributeError:
            warning = f"{self.__repr__()} object does not seem to be an Image"  # pragma: no cover
        logger_warning(warning, __name__)

    # Only the extension (as a success marker) and the image are needed;
    # the raw byte stream is ignored.
    extension, _byte_stream, image = _xobj_to_image(self)
    return None if extension is None else image


class DecodedStreamObject(StreamObject):
    # Adds no members of its own; everything is inherited from StreamObject.
    ...
Expand Down
21 changes: 21 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,3 +441,24 @@ def test_inline_image_extraction():
name = "iss2598d.png"
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
assert image_similarity(reader.pages[0].images[0].image, img) == 1


@pytest.mark.enable_socket()
def test_extract_image_from_object(caplog):
    """
    Check decode_as_image() on an image XObject and on a non-image stream.

    The image XObject must decode to a PIL image. The page content stream
    is not an image: decoding it must raise and log a warning, both with
    its original ``indirect_reference`` and with an overridden one.
    """
    url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf"
    name = "iss2613.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    image = reader.pages[0]["/Resources"]["/Pattern"]["/P1"]["/Resources"]["/XObject"][
        "/X1"
    ].decode_as_image()
    assert isinstance(image, Image.Image)

    # Fetch the content stream outside pytest.raises so that only
    # decode_as_image() can satisfy the expected exception.
    co = reader.pages[0].get_contents()
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "does not seem to be an Image" in caplog.text

    caplog.clear()
    co.indirect_reference = "for_test"
    # Reuse the same object: rebinding `co` to a fresh get_contents() result
    # could discard the override set just above.
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "for_test does not seem to be an Image" in caplog.text
Loading