Skip to content

Commit

Permalink
Corrections for 4186
Browse files Browse the repository at this point in the history
Extracting JPEG-CMYK images consistently requires inverting the colors.
We have taken this opportunity to reuse, as much as possible, the code that creates the image dictionaries in 'Document.extract_image' and the image blocks in the Python version of text extraction.
  • Loading branch information
JorjMcKie authored and julian-smith-artifex-com committed Jan 8, 2025
1 parent 653617c commit e6092aa
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 107 deletions.
152 changes: 51 additions & 101 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4196,8 +4196,7 @@ def extract_image(self, xref):
raise ValueError("document closed or encrypted")

pdf = _as_pdf_document(self)
img_type = 0
smask = 0

if not _INRANGE(xref, 1, mupdf.pdf_xref_len(pdf)-1):
raise ValueError( MSG_BAD_XREF)

Expand All @@ -4210,65 +4209,15 @@ def extract_image(self, xref):
o = mupdf.pdf_dict_geta(obj, PDF_NAME('SMask'), PDF_NAME('Mask'))
if o.m_internal:
smask = mupdf.pdf_to_num(o)

if mupdf.pdf_is_jpx_image(obj):
img_type = mupdf.FZ_IMAGE_JPX
res = mupdf.pdf_load_stream(obj)
ext = "jpx"
if JM_is_jbig2_image(obj):
img_type = mupdf.FZ_IMAGE_JBIG2
res = mupdf.pdf_load_stream(obj)
ext = "jb2"
res = mupdf.pdf_load_raw_stream(obj)
if img_type == mupdf.FZ_IMAGE_UNKNOWN:
res = mupdf.pdf_load_raw_stream(obj)
_, c = mupdf.fz_buffer_storage(res)
#log( '{=_ c}')
img_type = mupdf.fz_recognize_image_format(c)
ext = JM_image_extension(img_type)
if img_type == mupdf.FZ_IMAGE_UNKNOWN:
res = None
img = mupdf.pdf_load_image(pdf, obj)
ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal)
if (ll_cbuf
and ll_cbuf.params.type not in (
mupdf.FZ_IMAGE_RAW,
mupdf.FZ_IMAGE_FAX,
mupdf.FZ_IMAGE_FLATE,
mupdf.FZ_IMAGE_LZW,
mupdf.FZ_IMAGE_RLD,
)
):
img_type = ll_cbuf.params.type
ext = JM_image_extension(img_type)
res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer))
else:
res = mupdf.fz_new_buffer_from_image_as_png(
img,
mupdf.FzColorParams(mupdf.fz_default_color_params),
)
ext = "png"
else:
img = mupdf.fz_new_image_from_buffer(res)

xres, yres = mupdf.fz_image_resolution(img)
width = img.w()
height = img.h()
colorspace = img.n()
bpc = img.bpc()
cs_name = mupdf.fz_colorspace_name(img.colorspace())
smask = 0

# load the image
img = mupdf.pdf_load_image(pdf, obj)
rc = dict()
rc[ dictkey_ext] = ext
rc[ dictkey_smask] = smask
rc[ dictkey_width] = width
rc[ dictkey_height] = height
rc[ dictkey_colorspace] = colorspace
rc[ dictkey_bpc] = bpc
rc[ dictkey_xres] = xres
rc[ dictkey_yres] = yres
rc[ dictkey_cs_name] = cs_name
rc[ dictkey_image] = JM_BinFromBuffer(res)
_make_image_dict(img, rc)
rc[dictkey_smask] = smask
rc[dictkey_cs_name] = mupdf.fz_colorspace_name(img.colorspace())
return rc

def ez_save(
Expand Down Expand Up @@ -16323,19 +16272,6 @@ def JM_irect_from_py(r):
f[i] = FZ_MAX_INF_RECT
return mupdf.fz_make_irect(f[0], f[1], f[2], f[3])


# Stub JBIG2 check: unconditionally returns 0, so callers never see an image
# reported as JBIG2. The `dict_` argument is not used by the live code.
def JM_is_jbig2_image(dict_):
# fixme: should we remove this function?
return 0
# Unreachable reference code (C syntax) kept below: a Filter-based test that
# would return 1 when the Filter entry, or any element of a Filter array,
# equals JBIG2Decode, and 0 otherwise.
#filter_ = pdf_dict_get(ctx, dict_, PDF_NAME(Filter));
#if (pdf_name_eq(ctx, filter_, PDF_NAME(JBIG2Decode)))
#    return 1;
#n = pdf_array_len(ctx, filter_);
#for (i = 0; i < n; i++)
#    if (pdf_name_eq(ctx, pdf_array_get(ctx, filter_, i), PDF_NAME(JBIG2Decode)))
#        return 1;
#return 0;

def JM_listbox_value( annot):
'''
ListBox retrieve value
Expand Down Expand Up @@ -16533,38 +16469,52 @@ def __str__(self):
line_dict[dictkey_spans] = span_list
return line_rect

def _make_image_dict(img, img_dict):
"""Populate a dictionary with information extracted from a given image.

def JM_make_image_block(block, block_dict):
image = block.i_image()
n = mupdf.fz_colorspace_n(image.colorspace())
w = image.w()
h = image.h()
type_ = mupdf.FZ_IMAGE_UNKNOWN
# fz_compressed_image_buffer() is not available because
# `fz_compressed_buffer` is not copyable.
ll_fz_compressed_buffer = mupdf.ll_fz_compressed_image_buffer(image.m_internal)
if ll_fz_compressed_buffer:
type_ = ll_fz_compressed_buffer.params.type
if type_ < mupdf.FZ_IMAGE_BMP or type_ == mupdf.FZ_IMAGE_JBIG2:
type_ = mupdf.FZ_IMAGE_UNKNOWN
bytes_ = None
if ll_fz_compressed_buffer and type_ != mupdf.FZ_IMAGE_UNKNOWN:
buf = mupdf.FzBuffer( mupdf.ll_fz_keep_buffer( ll_fz_compressed_buffer.buffer))
ext = JM_image_extension(type_)
else:
buf = mupdf.fz_new_buffer_from_image_as_png(image, mupdf.FzColorParams())
Used by 'Document.extract_image' and by 'JM_make_image_block'.
Both of these functions will add some more specific information.
"""
img_type = img.fz_compressed_image_type()
ext = JM_image_extension(img_type)

# compressed image buffer if present, else None
ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal)

if (0
or not ll_cbuf
or img_type in (mupdf.FZ_IMAGE_JBIG2, mupdf.FZ_IMAGE_UNKNOWN)
or img_type < mupdf.FZ_IMAGE_BMP
):
# not an image with a compressed buffer: convert to PNG
res = mupdf.fz_new_buffer_from_image_as_png(
img,
mupdf.FzColorParams(mupdf.fz_default_color_params),
)
ext = "png"
bytes_ = JM_BinFromBuffer(buf)
block_dict[ dictkey_width] = w
block_dict[ dictkey_height] = h
block_dict[ dictkey_ext] = ext
block_dict[ dictkey_colorspace] = n
block_dict[ dictkey_xres] = image.xres()
block_dict[ dictkey_yres] = image.yres()
block_dict[ dictkey_bpc] = image.bpc()
block_dict[ dictkey_matrix] = JM_py_from_matrix(block.i_transform())
block_dict[ dictkey_size] = len(bytes_)
block_dict[ dictkey_image] = bytes_
elif ext == "jpeg" and img.n() == 4:
# JPEG with CMYK: invert colors
res = mupdf.fz_new_buffer_from_image_as_jpeg(
img, mupdf.FzColorParams(mupdf.fz_default_color_params), 95, 1)
else:
# copy the compressed buffer
res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer))

bytes_ = JM_BinFromBuffer(res)
img_dict[dictkey_width] = img.w()
img_dict[dictkey_height] = img.h()
img_dict[dictkey_ext] = ext
img_dict[dictkey_colorspace] = img.n()
img_dict[dictkey_xres] = img.xres()
img_dict[dictkey_yres] = img.yres()
img_dict[dictkey_bpc] = img.bpc()
img_dict[dictkey_size] = len(bytes_)
img_dict[dictkey_image] = bytes_

# Fill `block_dict` for an image block: the shared image fields are written by
# _make_image_dict(), then the block's transform is stored under dictkey_matrix.
def JM_make_image_block(block, block_dict):
img = block.i_image()
# writes its entries directly into block_dict; its return value is not used
_make_image_dict(img, block_dict)
# the block transform is converted with JM_py_from_matrix before being stored
block_dict[dictkey_matrix] = JM_py_from_matrix(block.i_transform())


def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
Expand Down
20 changes: 14 additions & 6 deletions src/extra.i
Original file line number Diff line number Diff line change
Expand Up @@ -3511,22 +3511,30 @@ void JM_make_image_block(fz_stext_block *block, PyObject *block_dict)
int n = fz_colorspace_n(ctx, image->colorspace);
int w = image->w;
int h = image->h;
const char *ext = NULL;
const char *ext = "";
int type = FZ_IMAGE_UNKNOWN;
if (buffer)
if (buffer) {
type = buffer->params.type;
ext = JM_image_extension(type);
}
if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2)
type = FZ_IMAGE_UNKNOWN;
PyObject *bytes = NULL;
fz_var(bytes);
fz_try(ctx) {
if (buffer && type != FZ_IMAGE_UNKNOWN) {
buf = buffer->buffer;
ext = JM_image_extension(type);
} else {
if (!buffer || type == FZ_IMAGE_UNKNOWN)
{
buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params);
ext = "png";
}
else if (n == 4 && strcmp(ext, "jpeg") == 0) // JPEG CMYK needs another step
{
buf = freebuf = fz_new_buffer_from_image_as_jpeg(ctx, image, fz_default_color_params, 95, 1);
}
else
{
buf = buffer->buffer;
}
bytes = JM_BinFromBuffer(buf);
}
fz_always(ctx) {
Expand Down

0 comments on commit e6092aa

Please sign in to comment.