Skip to content

Commit

Permalink
apply op pages should work!
Browse files Browse the repository at this point in the history
  • Loading branch information
ciur committed Nov 10, 2024
1 parent 8409560 commit cb6ccae
Show file tree
Hide file tree
Showing 8 changed files with 316 additions and 115 deletions.
29 changes: 28 additions & 1 deletion papermerge/core/features/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from fastapi import FastAPI
from fastapi.testclient import TestClient

from core.features.document.schema import ExtractPagesIn
from core.types import OCRStatusEnum
from papermerge.core import constants
from papermerge.core.features.auth.scopes import SCOPES
Expand Down Expand Up @@ -85,16 +86,42 @@ def _maker(
parent: orm.Folder,
ocr_status: OCRStatusEnum = OCRStatusEnum.unknown,
lang: str = "deu",
):
) -> doc_schema.Document:
attrs = doc_schema.NewDocument(
title=title, parent_id=parent.id, ocr_status=ocr_status, lang=lang
)
doc, _ = doc_dbapi.create_document(db_session, attrs, user.id)

if doc is None:
raise Exception("Document was not created")

return doc

return _maker


@pytest.fixture
def three_pages_pdf(make_document, db_session, user) -> doc_schema.Document:
    """Provide a document in the user's home folder with a PDF uploaded to it.

    The document is created through the `make_document` fixture; the bytes
    of the `three-pages.pdf` resource are then attached to it with
    `doc_dbapi.upload`.
    """
    document: doc_schema.Document = make_document(
        title="thee-pages.pdf", user=user, parent=user.home_folder
    )
    pdf_path = RESOURCES / "three-pages.pdf"

    # Read the whole resource up front; the upload gets an in-memory stream.
    with open(pdf_path, "rb") as pdf_file:
        payload = pdf_file.read()

    doc_dbapi.upload(
        db_session,
        document_id=document.id,
        content=io.BytesIO(payload),
        file_name="three-pages.pdf",
        size=os.stat(pdf_path).st_size,
        content_type=ContentType.APPLICATION_PDF,
    )

    return document


@pytest.fixture
def make_document_with_pages(db_session: Session):
"""Creates a document with one version
Expand Down
167 changes: 99 additions & 68 deletions papermerge/core/features/document/db/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from typing import Tuple

from sqlalchemy import delete, func, insert, select, text, update
from sqlalchemy import delete, func, insert, select, text, update, Select
from sqlalchemy.exc import IntegrityError

from papermerge.core.db.engine import Session
Expand All @@ -30,73 +30,6 @@
logger = logging.getLogger(__name__)


def get_last_doc_ver(
    db_session: Session,
    doc_id: uuid.UUID,  # noqa
    user_id: uuid.UUID,
) -> orm.DocumentVersion:
    """Fetch the highest-numbered version of the document `doc_id`.

    Only versions whose parent document has a matching `user_id` are
    considered. Versions are sorted by `number` in descending order and
    the first row is taken; `.one()` raises if there is no such row.
    """
    matching_versions = (
        select(orm.DocumentVersion)
        .join(orm.Document)
        .where(
            orm.DocumentVersion.document_id == doc_id,
            orm.Document.user_id == user_id,
        )
    )
    newest_first = matching_versions.order_by(orm.DocumentVersion.number.desc())

    return db_session.scalars(newest_first.limit(1)).one()


def get_first_page(
    db_session: Session,
    doc_ver_id: uuid.UUID,
) -> orm.Page:
    """Fetch the lowest-numbered page of the document version `doc_ver_id`.

    Pages are sorted by `number` in ascending order and the first row is
    taken; `.one()` raises if the version has no pages.
    """
    lowest_number_first = (
        select(orm.Page)
        .where(orm.Page.document_version_id == doc_ver_id)
        .order_by(orm.Page.number.asc())
        .limit(1)
    )

    # NOTE(review): entering the session as a context manager presumably
    # closes it on exit (SQLAlchemy `Session.__exit__`) - confirm this is
    # intended for a caller-supplied session.
    with db_session as session:  # noqa
        return session.scalars(lowest_number_first).one()


def get_doc_ver(
    db_session: Session,
    id: uuid.UUID,
    user_id: uuid.UUID,  # noqa
) -> schema.DocumentVersion:
    """Load the document version with the given `id` as a schema model.

    The version is matched by its own id (not by document id) and only if
    its parent document has a matching `user_id`. `.one()` raises when
    there is no such row. The ORM row is converted with
    `schema.DocumentVersion.model_validate` before being returned.
    """
    lookup = (
        select(orm.DocumentVersion)
        .join(orm.Document)
        .where(orm.Document.user_id == user_id, orm.DocumentVersion.id == id)
    )
    row = db_session.scalars(lookup).one()

    return schema.DocumentVersion.model_validate(row)


def count_docs(session: Session) -> int:
stmt = select(func.count()).select_from(orm.Document)

Expand Down Expand Up @@ -834,3 +767,101 @@ def get_doc_ver_pages(db_session: Session, doc_ver_id: uuid.UUID) -> list[schema
models = [schema.Page.model_validate(db_page) for db_page in db_pages]

return models


def get_last_doc_ver(
    db_session: Session,
    doc_id: uuid.UUID,  # noqa
    user_id: uuid.UUID,
) -> orm.DocumentVersion:
    """Fetch the highest-numbered version of the document `doc_id`.

    Only versions whose parent document has a matching `user_id` are
    considered. Versions are sorted by `number` in descending order and
    the first row is taken; `.one()` raises if there is no such row.
    """
    matching_versions = (
        select(orm.DocumentVersion)
        .join(orm.Document)
        .where(
            orm.DocumentVersion.document_id == doc_id,
            orm.Document.user_id == user_id,
        )
    )
    newest_first = matching_versions.order_by(orm.DocumentVersion.number.desc())

    return db_session.scalars(newest_first.limit(1)).one()


def get_first_page(
    db_session: Session,
    doc_ver_id: uuid.UUID,
) -> orm.Page:
    """Fetch the lowest-numbered page of the document version `doc_ver_id`.

    Pages are sorted by `number` in ascending order and the first row is
    taken; `.one()` raises if the version has no pages.
    """
    lowest_number_first = (
        select(orm.Page)
        .where(orm.Page.document_version_id == doc_ver_id)
        .order_by(orm.Page.number.asc())
        .limit(1)
    )

    # NOTE(review): entering the session as a context manager presumably
    # closes it on exit (SQLAlchemy `Session.__exit__`) - confirm this is
    # intended for a caller-supplied session.
    with db_session as session:  # noqa
        return session.scalars(lowest_number_first).one()


def get_doc_ver(
    db_session: Session,
    id: uuid.UUID,
    user_id: uuid.UUID,  # noqa
) -> schema.DocumentVersion:
    """Load the document version with the given `id` as a schema model.

    The version is matched by its own id (not by document id) and only if
    its parent document has a matching `user_id`. `.one()` raises when
    there is no such row. The ORM row is converted with
    `schema.DocumentVersion.model_validate` before being returned.
    """
    lookup = (
        select(orm.DocumentVersion)
        .join(orm.Document)
        .where(orm.Document.user_id == user_id, orm.DocumentVersion.id == id)
    )
    row = db_session.scalars(lookup).one()

    return schema.DocumentVersion.model_validate(row)


def select_last_doc_ver(document_id: uuid.UUID, user_id: uuid.UUID) -> Select:
    """Build, without executing, a SELECT for the id of a document's last version.

    The statement picks `DocumentVersion.id` for versions of `document_id`
    whose parent document has a matching `user_id`, sorted by version
    `number` descending and limited to one row.
    """
    version_filters = (
        orm.DocumentVersion.document_id == document_id,
        orm.Document.user_id == user_id,
    )

    return (
        select(orm.DocumentVersion.id)
        .join(orm.Document)
        .where(*version_filters)
        .order_by(orm.DocumentVersion.number.desc())
        .limit(1)
    )


def get_last_ver_pages(
    db_session: Session, document_id: uuid.UUID, user_id: uuid.UUID
) -> list[orm.Page]:
    """Return every page of the document's last version, ordered by page number.

    The last version is resolved through `select_last_doc_ver`, used here
    as a subquery, so pages are filtered on that subquery's `id` column.
    """
    last_ver = select_last_doc_ver(document_id=document_id, user_id=user_id).subquery()
    pages_of_last_ver = select(orm.Page).where(
        orm.Page.document_version_id == last_ver.c.id
    )
    ordered_pages = pages_of_last_ver.order_by(orm.Page.number)

    return db_session.scalars(ordered_pages).all()
5 changes: 5 additions & 0 deletions papermerge/core/features/document/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ class Tag(BaseModel):
model_config = ConfigDict(from_attributes=True)


# Minimal page model: carries only a page's id and its number.
class MicroPage(BaseModel):
    id: UUID  # page identifier
    number: int  # page number


class Page(BaseModel):
id: UUID
number: int
Expand Down
67 changes: 67 additions & 0 deletions papermerge/core/features/document/tests/test_dbapi_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,3 +543,70 @@ def test_document_upload_txt(make_document, user, db_session):
assert fresh_doc is None
error: err_schema.Error
assert len(error.messages) == 1


def test_get_last_ver_pages(db_session, make_document, user):
    """`get_last_ver_pages` returns the pages of the highest-numbered version."""
    ### Arrange ###

    # version numbering starts with 1, so a freshly created
    # doc has exactly one version - "version.number = 1"
    doc = make_document(title="basic.pdf", user=user, parent=user.home_folder)

    # each bump adds one version: after three bumps the doc has four versions
    for _ in range(3):
        dbapi.version_bump(db_session, doc_id=doc.id, user_id=user.id, page_count=3)

    ### Act ###
    pages = dbapi.get_last_ver_pages(db_session, document_id=doc.id, user_id=user.id)

    ### Assert ###
    # fetch the last version directly; the last doc version has number 4
    last_ver = db_session.execute(
        select(docs_orm.DocumentVersion).where(
            docs_orm.DocumentVersion.document_id == doc.id,
            docs_orm.DocumentVersion.number == 4,
        )
    ).scalar()

    assert len(pages) == 3
    assert last_ver.number == 4
    assert [page.number for page in pages] == [1, 2, 3]
    assert pages[0].document_version_id == last_ver.id


def test_subsequent_updates_over_pages_returned_by_get_last_ver_pages(
    db_session, make_document, user
):
    """
    === SqlAlchemy learning playground ===

    This scenario does not test a particular function; it only validates
    an assumption about how `db_session` works.

    Question: if an ORM object (`orm.Page`) is updated by attribute
    assignment, is that change reflected in subsequent queries issued
    through the same `db_session`? In other words, is the change made
    at (1) visible at (2)?

    As the assertion below shows, the answer is yes.

    Note the subtle part: `dbapi.get_last_ver_pages` returns a list of
    pages, and updating any member of that list is reflected in the page
    retrieved via the same `db_session`.
    """
    doc = make_document(title="basic.pdf", user=user, parent=user.home_folder)
    dbapi.version_bump(db_session, doc_id=doc.id, user_id=user.id, page_count=3)

    first_page = dbapi.get_last_ver_pages(
        db_session, document_id=doc.id, user_id=user.id
    )[0]
    first_page.text = "coco"  # (1)

    lookup = select(docs_orm.Page).where(docs_orm.Page.id == first_page.id)
    fresh_page = db_session.execute(lookup).scalar()  # (2)

    # Does (2) contain the change from (1)?
    assert fresh_page.text == "coco"
55 changes: 49 additions & 6 deletions papermerge/core/features/page_mngm/db/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from celery import current_app
from pikepdf import Pdf
from sqlalchemy import select
from sqlalchemy import select, func

from papermerge.core import constants as const
from papermerge.core.pathlib import abs_page_path
Expand Down Expand Up @@ -52,17 +52,60 @@ def collect_text_streams(
return result


def apply_pages_op(items: List[schema.PageAndRotOp]) -> List[schema.Document]:
pages = Page.objects.filter(pk__in=[item.page.id for item in items])
old_version = pages.first().document_version
def apply_pages_op(
db_session, items: List[schema.PageAndRotOp], user_id: uuid.UUID
) -> List[schema.Document]:
"""Apply operations (operation = transformation) on the document
It is assumed that all pages are part of the same document version.
Apply operation means following:
- create new document version
- copy to new document version only selected pages (i.e. the pages
identified by `List[schema.PageAndRotOp]`)
- The input list (the `items: List[schema.PageAndRotOp]`) may also
have angle != 0 - in such case page is also rotated
Note that "copy to new document version" has to parts:
- recreate the 'page' models (and copy text from old one to new ones)
- recreate pdf file (and copy its pages from old one to new ones)
`ValueError` exception will be raised if input pages do not belong
to the same document.
"""
# input validation, check if all pages belong to the same document
stmt = select(func.count(orm.DocumentVersion.id)).where(
orm.Page.id.in_(p.page.id for p in items),
orm.Page.document_version_id == orm.DocumentVersion.id,
)
doc_ver_count = db_session.execute(stmt).scalar()

if doc_ver_count > 1:
raise ValueError("Apply pages op: input pages belong to multiple documents")

pages = db_session.execute(
select(orm.Page).where(orm.Page.id.in_(item.page.id for item in items))
).scalars()

pages = pages.all()

old_version = db_session.execute(
select(orm.DocumentVersion)
.where(orm.DocumentVersion.id == pages[0].document_version_id)
.limit(1)
).scalar()

doc = old_version.document
new_version = doc.version_bump(page_count=len(items))
new_version = doc_dbapi.version_bump(
db_session, doc_id=doc.id, user_id=user_id, page_count=len(items)
)

copy_pdf_pages(src=old_version.file_path, dst=new_version.file_path, items=items)

copy_text_field(
src=old_version, dst=new_version, page_numbers=[p.number for p in pages]
db_session,
src=old_version,
dst=new_version,
page_numbers=[p.number for p in pages],
)

notify_version_update(
Expand Down
Loading

0 comments on commit cb6ccae

Please sign in to comment.