diff --git a/pyproject.toml b/pyproject.toml index 343e3082..b19ee601 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,9 @@ disable_error_code = [ [[tool.mypy.overrides]] module = [ - "fitz", + "pymupdf", + "ocrmypdf", + "ocrmypdf.exceptions", "pdftotext", "pdf2john", ] diff --git a/src/monopoly/banks/base.py b/src/monopoly/banks/base.py index 4def5022..7120ba28 100644 --- a/src/monopoly/banks/base.py +++ b/src/monopoly/banks/base.py @@ -1,7 +1,7 @@ import logging +from typing import Any from monopoly.config import PdfConfig, StatementConfig -from monopoly.identifiers import Identifier logger = logging.getLogger(__name__) @@ -16,7 +16,7 @@ class BankBase: statement_configs: list[StatementConfig] pdf_config: PdfConfig = PdfConfig() - identifiers: list[Identifier] + identifiers: list[list[Any]] def __init_subclass__(cls, **kwargs) -> None: if not hasattr(cls, "statement_configs"): diff --git a/src/monopoly/banks/detector.py b/src/monopoly/banks/detector.py index e1526b06..3cf4295e 100644 --- a/src/monopoly/banks/detector.py +++ b/src/monopoly/banks/detector.py @@ -30,9 +30,7 @@ def metadata_items(self) -> list[Any]: return identifiers - def detect_bank( - self, banks: list[Type["BankBase"]] = None - ) -> Type["BankBase"] | None: + def detect_bank(self, banks: list[Type["BankBase"]]) -> Type["BankBase"] | None: """ Reads the encryption metadata or actual metadata (if the PDF is not encrypted), and checks for a bank based on unique identifiers. diff --git a/src/monopoly/config.py b/src/monopoly/config.py index 29c3f51c..365ab09e 100644 --- a/src/monopoly/config.py +++ b/src/monopoly/config.py @@ -71,6 +71,4 @@ class PdfConfig: page_range: tuple[Optional[int], Optional[int]] = (None, None) page_bbox: Optional[tuple[float, float, float, float]] = None - ocr_identifiers: list[Optional[MetadataIdentifier]] = field( - default_factory=list[None] - ) + ocr_identifiers: Optional[list[MetadataIdentifier]] = None diff --git a/src/monopoly/pdf.py b/src/monopoly/pdf.py index 09e38575..e1e2e8d7 100644 --- a/src/monopoly/pdf.py +++ b/src/monopoly/pdf.py @@ -3,7 +3,7 @@ from functools import cached_property, lru_cache from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Type import pdftotext from pydantic import SecretStr @@ -119,7 +119,7 @@ def raw_text(self) -> str: class PdfParser: def __init__( self, - bank: "BankBase", + bank: Type["BankBase"], document: PdfDocument, ): """ @@ -146,7 +146,7 @@ def page_bbox(self): @cached_property def ocr_identifiers(self): - return self.pdf_config.ocr_identifiers + return self.pdf_config.ocr_identifiers or [] @lru_cache def get_pages(self) -> list[PdfPage]: diff --git a/tests/unit/test_bank_identifier/test_auto_detect_bank.py b/tests/unit/test_bank_identifier/test_auto_detect_bank.py index 31c82a0b..98c7fc07 100644 --- a/tests/unit/test_bank_identifier/test_auto_detect_bank.py +++ b/tests/unit/test_bank_identifier/test_auto_detect_bank.py @@ -135,7 +135,7 @@ def test_detect_bank_with_not_matching_text_identifier( mock_banks_list = [MockBankTwo, MockBankWithMultipleTextIdentifier] monkeypatch.setattr("monopoly.banks.banks", mock_banks_list) - assert not metadata_analyzer.detect_bank() + assert not metadata_analyzer.detect_bank(mock_banks_list) @patch.object(PdfDocument, "raw_text", new_callable=PropertyMock)