Skip to content

Commit

Permalink
chore: linting for ocr changes
Browse files: browse the repository at this point in the history
  • Loading branch information
benjamin-awd committed Sep 5, 2024
1 parent bb77437 commit 4254962
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 21 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ disable_error_code = [

[[tool.mypy.overrides]]
module = [
"fitz",
"pymupdf",
"ocrmypdf",
"ocrmypdf.exceptions",
"pdftotext",
"pdf2john",
]
Expand Down
2 changes: 1 addition & 1 deletion src/monopoly/banks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@

logger = logging.getLogger(__name__)

__all__ = ["BankDetector", "BankBase", *banks]
__all__ = ["BankDetector", "BankBase", *[bank.__name__ for bank in banks]]
4 changes: 2 additions & 2 deletions src/monopoly/banks/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from typing import Any

from monopoly.config import PdfConfig, StatementConfig
from monopoly.identifiers import Identifier

logger = logging.getLogger(__name__)

Expand All @@ -16,7 +16,7 @@ class BankBase:

statement_configs: list[StatementConfig]
pdf_config: PdfConfig = PdfConfig()
identifiers: list[Identifier]
identifiers: list[list[Any]]

def __init_subclass__(cls, **kwargs) -> None:
if not hasattr(cls, "statement_configs"):
Expand Down
4 changes: 1 addition & 3 deletions src/monopoly/banks/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ def metadata_items(self) -> list[Any]:

return identifiers

def detect_bank(
self, banks: list[Type["BankBase"]] = None
) -> Type["BankBase"] | None:
def detect_bank(self, banks: list[Type["BankBase"]]) -> Type["BankBase"] | None:
"""
Reads the encryption metadata or actual metadata (if the PDF is not encrypted),
and checks for a bank based on unique identifiers.
Expand Down
4 changes: 1 addition & 3 deletions src/monopoly/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,4 @@ class PdfConfig:

page_range: tuple[Optional[int], Optional[int]] = (None, None)
page_bbox: Optional[tuple[float, float, float, float]] = None
ocr_identifiers: list[Optional[MetadataIdentifier]] = field(
default_factory=list[None]
)
ocr_identifiers: Optional[list[MetadataIdentifier]] = None
9 changes: 3 additions & 6 deletions src/monopoly/identifiers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from dataclasses import dataclass, fields
from dataclasses import fields

from pydantic.dataclasses import dataclass


@dataclass
Expand All @@ -16,11 +18,6 @@ class MetadataIdentifier(Identifier):
subject: str = ""
creator: str = ""
producer: str = ""
keywords: str = ""
creationDate: str = ""
modDate: str = ""
trapped: str = ""
encryption: dict = None

def matches(self, other: "MetadataIdentifier") -> bool:
"""Check for partial matches on all string fields."""
Expand Down
10 changes: 6 additions & 4 deletions src/monopoly/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@
from functools import cached_property, lru_cache
from io import BytesIO
from pathlib import Path
from typing import Optional
from typing import TYPE_CHECKING, Optional, Type

import pdftotext
from pydantic import SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict
from pymupdf import TEXTFLAGS_TEXT, Document, Page

from monopoly.banks import BankBase
from monopoly.identifiers import MetadataIdentifier

if TYPE_CHECKING:
from monopoly.banks import BankBase

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -117,7 +119,7 @@ def raw_text(self) -> str:
class PdfParser:
def __init__(
self,
bank: BankBase,
bank: Type["BankBase"],
document: PdfDocument,
):
"""
Expand All @@ -144,7 +146,7 @@ def page_bbox(self):

@cached_property
def ocr_identifiers(self):
return self.pdf_config.ocr_identifiers
return self.pdf_config.ocr_identifiers or []

@lru_cache
def get_pages(self) -> list[PdfPage]:
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_bank_identifier/test_auto_detect_bank.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def test_detect_bank_with_not_matching_text_identifier(
mock_banks_list = [MockBankTwo, MockBankWithMultipleTextIdentifier]
monkeypatch.setattr("monopoly.banks.banks", mock_banks_list)

assert not metadata_analyzer.detect_bank()
assert not metadata_analyzer.detect_bank(mock_banks_list)


@patch.object(PdfDocument, "raw_text", new_callable=PropertyMock)
Expand Down

0 comments on commit 4254962

Please sign in to comment.