Skip to content

Commit

Permalink
Re-Implement select method
Browse files Browse the repository at this point in the history
This re-implements Document method "select()" based on new MuPDF function "pdf_rearrange_pages()".
This is a more complete (and faster) implementation of what needs to be done here: not only are the pages rearranged, but consequential changes are also made to the table of contents, to links pointing to removed pages, and to affected entries in the Optional Content definitions.

Update __init__.py
  • Loading branch information
JorjMcKie authored and julian-smith-artifex-com committed Feb 17, 2024
1 parent 0fd9594 commit ff1bdbb
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 447 deletions.
12 changes: 12 additions & 0 deletions changes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@ Change Log
==========


**Changes in version 1.23.23 (2024-02-14)**

* Fixed issues:

* **Fixed** `3150 <https://github.com/pymupdf/PyMuPDF/issues/3150>`_: doc.select() hangs on this doc.


* Other:

* Replaced major code portions previously supporting `Document.select()` with MuPDF function `pdf_rearrange_pages()`, which is faster and performs that task more thoroughly.


**Changes in version 1.23.22 (2024-02-12)**

* Fixed issues:
Expand Down
131 changes: 13 additions & 118 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5385,16 +5385,21 @@ def select(self, pyliste):
raise ValueError("is no PDF")
if not hasattr(pyliste, "__getitem__"):
raise ValueError("sequence required")
if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)):

valid_range = range(len(self))
if (len(pyliste) == 0
or min(pyliste) not in valid_range
or max(pyliste) not in valid_range
):
raise ValueError("bad page number(s)")
# preparatory stuff:
# (1) get underlying pdf document,
# (2) transform Python list into integer array

# get underlying pdf document,
pdf = _as_pdf_document(self)
# call retainpages (code copy of fz_clean_file.c)
retainpages(pdf, pyliste)
if pdf.m_internal.rev_page_map:
mupdf.ll_pdf_drop_page_tree(pdf.m_internal)

# create page sub-pdf via extra.rearrange_pages2
extra.rearrange_pages2(pdf, tuple(pyliste))

# remove any existing pages with their kids
self._reset_page_refs()

def set_language(self, language=None):
Expand Down Expand Up @@ -20862,116 +20867,6 @@ def repair_mono_font(page: "Page", font: "Font") -> None:
log("Cannot set width for '%s' in xref %i" % (font.name, xref))


def retainpage(doc, parent, kids, page):
    '''
    Add one page to the rebuilt page tree.

    Looks up the page object for page number `page` in `doc`, flattens its
    inheritable page items, makes `parent` its /Parent and appends it to
    the `kids` array.
    '''
    page_obj = mupdf.pdf_lookup_page_obj(doc, page)
    # Flatten inheritable items first; presumably so the page no longer
    # depends on its old ancestors once /Parent is replaced below.
    mupdf.pdf_flatten_inheritable_page_items(page_obj)
    parent_key = PDF_NAME('Parent')
    mupdf.pdf_dict_put(page_obj, parent_key, parent)
    # The new kids array collects the retained pages in call order.
    mupdf.pdf_array_push(kids, page_obj)


def retainpages(doc, liste):
    '''
    Rebuild the PDF so that only the specified pages are retained.

    This is called by PyMuPDF.

    Args:
        doc: the PDF document, as accepted by the mupdf.pdf_* functions.
        liste: indexable sequence of page numbers to retain. Pages are
            pushed to the new /Kids array in the order given here.

    Steps performed:
        1. Replace the catalog by a reduced one (Type, Pages and, if
           present, Outlines and OCProperties).
        2. Rebuild the /Kids array and /Count of the root Pages object.
        3. Keep only those named destinations that point to retained pages.
        4. Delete Link annotations whose destination is no longer valid.
        5. Strip the outlines; drop them if nothing is left.

    An out-of-range page number is reported via
    RAISEPY(MSG_BAD_PAGENO, PyExc_ValueError).
    '''
    argc = len(liste)
    pagecount = mupdf.pdf_count_pages(doc)

    # Keep only pages/type and (reduced) dest entries to avoid
    # references to dropped pages
    oldroot = mupdf.pdf_dict_get(mupdf.pdf_trailer(doc), PDF_NAME('Root'))
    pages = mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages'))
    olddests = mupdf.pdf_load_name_tree(doc, PDF_NAME('Dests'))
    outlines = mupdf.pdf_dict_get(oldroot, PDF_NAME('Outlines'))
    ocproperties = mupdf.pdf_dict_get(oldroot, PDF_NAME('OCProperties'))
    names_list = None

    root = mupdf.pdf_new_dict(doc, 3)
    mupdf.pdf_dict_put(root, PDF_NAME('Type'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Type')))
    mupdf.pdf_dict_put(root, PDF_NAME('Pages'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages')))
    if outlines.m_internal:
        mupdf.pdf_dict_put(root, PDF_NAME('Outlines'), outlines)
    if ocproperties.m_internal:
        mupdf.pdf_dict_put(root, PDF_NAME('OCProperties'), ocproperties)

    mupdf.pdf_update_object(doc, mupdf.pdf_to_num(oldroot), root)

    # Create a new kids array with only the pages we want to keep
    kids = mupdf.pdf_new_array(doc, 1)

    # Retain pages specified
    for idx in range(argc):
        i = liste[idx]
        if i < 0 or i >= pagecount:
            RAISEPY(MSG_BAD_PAGENO, PyExc_ValueError)
        retainpage(doc, pages, kids, i)

    # Update page count and kids array
    countobj = mupdf.pdf_new_int(mupdf.pdf_array_len(kids))
    mupdf.pdf_dict_put(pages, PDF_NAME('Count'), countobj)
    mupdf.pdf_dict_put(pages, PDF_NAME('Kids'), kids)

    # From here on, pagecount refers to the reduced document.
    pagecount = mupdf.pdf_count_pages(doc)
    page_object_nums = []
    for i in range(pagecount):
        pageref = mupdf.pdf_lookup_page_obj(doc, i)
        page_object_nums.append(mupdf.pdf_to_num(pageref))

    # If we had an old Dests tree (now reformed as an olddests dictionary),
    # keep any entries in there that point to valid pages.
    # This may mean we keep more than we need, but it is safe at least.
    if olddests:
        names = mupdf.pdf_new_dict(doc, 1)
        dests = mupdf.pdf_new_dict(doc, 1)
        len_ = mupdf.pdf_dict_len(olddests)

        names_list = mupdf.pdf_new_array(doc, 32)

        for i in range(len_):
            key = mupdf.pdf_dict_get_key(olddests, i)
            val = mupdf.pdf_dict_get_val(olddests, i)
            dest = mupdf.pdf_dict_get(val, PDF_NAME('D'))

            # The destination array is either stored under /D or is the
            # value itself; its first item is the target page.
            dest = mupdf.pdf_array_get(dest if dest.m_internal else val, 0)
            # fixme: need dest_is_valid_page.
            if dest_is_valid_page(dest, page_object_nums, pagecount):
                key_name = mupdf.pdf_to_name(key)
                key_str = mupdf.pdf_new_string(key_name, len(key_name))
                mupdf.pdf_array_push(names_list, key_str)
                mupdf.pdf_array_push(names_list, val)

        mupdf.pdf_dict_put(dests, PDF_NAME('Names'), names_list)
        mupdf.pdf_dict_put(names, PDF_NAME('Dests'), dests)
        mupdf.pdf_dict_put(root, PDF_NAME('Names'), names)

    # Edit each pages /Annot list to remove any links pointing to nowhere.
    for i in range(pagecount):
        pageref = mupdf.pdf_lookup_page_obj(doc, i)
        annots = mupdf.pdf_dict_get(pageref, PDF_NAME('Annots'))
        len_ = mupdf.pdf_array_len(annots)
        j = 0
        while j < len_:
            o = mupdf.pdf_array_get(annots, j)
            is_link = mupdf.pdf_name_eq(
                    mupdf.pdf_dict_get(o, PDF_NAME('Subtype')),
                    PDF_NAME('Link'),
                    )
            if is_link and not dest_is_valid(o, pagecount, page_object_nums, names_list):
                # Remove this annotation. The next item moves into slot j,
                # so j must not advance here.
                mupdf.pdf_array_delete(annots, j)
                len_ -= 1
            else:
                # Always advance past annotations that are kept. The former
                # 'continue' for non-Link annotations skipped the increment
                # and made this loop spin forever.
                j += 1

    if strip_outlines(doc, outlines, pagecount, page_object_nums, names_list) == 0:
        mupdf.pdf_dict_del(root, PDF_NAME('Outlines'))


def sRGB_to_pdf(srgb: int) -> tuple:
"""Convert sRGB color code to a PDF color triple.

Expand Down
17 changes: 17 additions & 0 deletions src/extra.i
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,21 @@ PyObject* JM_EscapeStrFromBuffer(fz_buffer* buff)
return val;
}

/*
 * Rearrange the pages of a PDF according to a Python tuple of page numbers.
 *
 * doc:       the PDF document to modify.
 * new_pages: Python tuple of integers; item i becomes element i of the int
 *            array handed to mupdf::pdf_rearrange_pages().
 *
 * If the tuple cannot be read, an item cannot be converted to an integer,
 * or memory cannot be allocated, a Python exception is set and the function
 * returns without calling mupdf::pdf_rearrange_pages().
 * NOTE(review): confirm that the generated wrapper reports a pending Python
 * exception for this void function.
 *
 * An exception thrown by mupdf::pdf_rearrange_pages() is passed on to the
 * caller after the temporary array has been released.
 */
void rearrange_pages2(
        mupdf::PdfDocument& doc,
        PyObject *new_pages
        )
{
    Py_ssize_t count = PyTuple_Size(new_pages);
    if (count < 0) {
        // PyTuple_Size() failed; the Python exception is already set.
        return;
    }
    int len = (int) count;

    // Request at least one element so that NULL always means "out of memory".
    int *pages = (int *) malloc((size_t) (len > 0 ? len : 1) * sizeof(int));
    if (!pages) {
        PyErr_NoMemory();
        return;
    }

    for (int i = 0; i < len; i++) {
        long pno = PyLong_AsLong(PyTuple_GET_ITEM(new_pages, (Py_ssize_t) i));
        if (pno == -1 && PyErr_Occurred()) {
            // Conversion failed: do not pass a bogus -1 page number on.
            free(pages);
            return;
        }
        pages[i] = (int) pno;
    }

    try {
        mupdf::pdf_rearrange_pages(doc, len, pages);
    }
    catch (...) {
        // Do not leak the array when MuPDF reports an error.
        free(pages);
        throw;
    }
    free(pages);
}


//----------------------------------------------------------------------------
// Deep-copies a source page to the target.
Expand Down Expand Up @@ -4515,3 +4530,5 @@ fz_image* fz_new_image_from_compressed_buffer(
fz_compressed_buffer *buffer,
fz_image *mask
);

void rearrange_pages2( mupdf::PdfDocument& doc, PyObject *new_pages);
17 changes: 12 additions & 5 deletions src_classic/fitz_old.i
Original file line number Diff line number Diff line change
Expand Up @@ -2297,7 +2297,8 @@ if not self.is_pdf:
if not hasattr(pyliste, "__getitem__"):
raise ValueError("sequence required")
if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)):
raise ValueError("bad page number(s)")%}
raise ValueError("bad page number(s)")
pyliste = tuple(pyliste)%}
%pythonappend select %{self._reset_page_refs()%}
PyObject *select(PyObject *pyliste)
{
Expand All @@ -2306,17 +2307,23 @@ if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not
// (2) transform Python list into integer array

pdf_document *pdf = pdf_specifics(gctx, (fz_document *) $self);
int *pages = NULL;
fz_try(gctx) {
// call retainpages (code copy of fz_clean_file.c)
globals glo = {0};
glo.ctx = gctx;
glo.doc = pdf;
retainpages(gctx, &glo, pyliste);
int i, len = (int) PyTuple_Size(pyliste);
pages = fz_realloc_array(gctx, pages, len, int);
for (i = 0; i < len; i++) {
pages[i] = (int) PyLong_AsLong(PyTuple_GET_ITEM(pyliste, (Py_ssize_t) i));
}
pdf_rearrange_pages(gctx, pdf, len, pages);
if (pdf->rev_page_map)
{
pdf_drop_page_tree(gctx, pdf);
}
}
fz_always(gctx) {
fz_free(gctx, pages);
}
fz_catch(gctx) {
return NULL;
}
Expand Down
Loading

0 comments on commit ff1bdbb

Please sign in to comment.