chore: linting for ocr changes

benjamin-awd · Sep 5, 2024 · 4254962 · 4254962
1 parent bb77437
commit 4254962
Show file tree

Hide file tree

Showing 8 changed files with 18 additions and 21 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -87,7 +87,9 @@ disable_error_code = [
 
 [[tool.mypy.overrides]]
 module = [
-    "fitz",
+    "pymupdf",
+    "ocrmypdf",
+    "ocrmypdf.exceptions",
     "pdftotext",
     "pdf2john",
 ]

diff --git a/src/monopoly/banks/__init__.py b/src/monopoly/banks/__init__.py
@@ -23,4 +23,4 @@
 
 logger = logging.getLogger(__name__)
 
-__all__ = ["BankDetector", "BankBase", *banks]
+__all__ = ["BankDetector", "BankBase", *[bank.__name__ for bank in banks]]
diff --git a/src/monopoly/banks/base.py b/src/monopoly/banks/base.py
@@ -1,7 +1,7 @@
 import logging
+from typing import Any
 
 from monopoly.config import PdfConfig, StatementConfig
-from monopoly.identifiers import Identifier
 
 logger = logging.getLogger(__name__)
 
@@ -16,7 +16,7 @@ class BankBase:
 
     statement_configs: list[StatementConfig]
     pdf_config: PdfConfig = PdfConfig()
-    identifiers: list[Identifier]
+    identifiers: list[list[Any]]
 
     def __init_subclass__(cls, **kwargs) -> None:
         if not hasattr(cls, "statement_configs"):

diff --git a/src/monopoly/banks/detector.py b/src/monopoly/banks/detector.py
@@ -30,9 +30,7 @@ def metadata_items(self) -> list[Any]:
 
         return identifiers
 
-    def detect_bank(
-        self, banks: list[Type["BankBase"]] = None
-    ) -> Type["BankBase"] | None:
+    def detect_bank(self, banks: list[Type["BankBase"]]) -> Type["BankBase"] | None:
         """
         Reads the encryption metadata or actual metadata (if the PDF is not encrypted),
         and checks for a bank based on unique identifiers.

diff --git a/src/monopoly/config.py b/src/monopoly/config.py
@@ -71,6 +71,4 @@ class PdfConfig:
 
     page_range: tuple[Optional[int], Optional[int]] = (None, None)
     page_bbox: Optional[tuple[float, float, float, float]] = None
-    ocr_identifiers: list[Optional[MetadataIdentifier]] = field(
-        default_factory=list[None]
-    )
+    ocr_identifiers: Optional[list[MetadataIdentifier]] = None
diff --git a/src/monopoly/identifiers.py b/src/monopoly/identifiers.py
@@ -1,4 +1,6 @@
-from dataclasses import dataclass, fields
+from dataclasses import fields
+
+from pydantic.dataclasses import dataclass
 
 
 @dataclass
@@ -16,11 +18,6 @@ class MetadataIdentifier(Identifier):
     subject: str = ""
     creator: str = ""
     producer: str = ""
-    keywords: str = ""
-    creationDate: str = ""
-    modDate: str = ""
-    trapped: str = ""
-    encryption: dict = None
 
     def matches(self, other: "MetadataIdentifier") -> bool:
         """Check for partial matches on all string fields."""

diff --git a/src/monopoly/pdf.py b/src/monopoly/pdf.py
@@ -3,16 +3,18 @@
 from functools import cached_property, lru_cache
 from io import BytesIO
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, Optional, Type
 
 import pdftotext
 from pydantic import SecretStr
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from pymupdf import TEXTFLAGS_TEXT, Document, Page
 
-from monopoly.banks import BankBase
 from monopoly.identifiers import MetadataIdentifier
 
+if TYPE_CHECKING:
+    from monopoly.banks import BankBase
+
 logger = logging.getLogger(__name__)
 
 
@@ -117,7 +119,7 @@ def raw_text(self) -> str:
 class PdfParser:
     def __init__(
         self,
-        bank: BankBase,
+        bank: Type["BankBase"],
         document: PdfDocument,
     ):
         """
@@ -144,7 +146,7 @@ def page_bbox(self):
 
     @cached_property
     def ocr_identifiers(self):
-        return self.pdf_config.ocr_identifiers
+        return self.pdf_config.ocr_identifiers or []
 
     @lru_cache
     def get_pages(self) -> list[PdfPage]:

diff --git a/tests/unit/test_bank_identifier/test_auto_detect_bank.py b/tests/unit/test_bank_identifier/test_auto_detect_bank.py
@@ -135,7 +135,7 @@ def test_detect_bank_with_not_matching_text_identifier(
     mock_banks_list = [MockBankTwo, MockBankWithMultipleTextIdentifier]
     monkeypatch.setattr("monopoly.banks.banks", mock_banks_list)
 
-    assert not metadata_analyzer.detect_bank()
+    assert not metadata_analyzer.detect_bank(mock_banks_list)
 
 
 @patch.object(PdfDocument, "raw_text", new_callable=PropertyMock)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -23,4 +23,4 @@

		logger = logging.getLogger(__name__)

		-__all__ = ["BankDetector", "BankBase", *banks]
		+__all__ = ["BankDetector", "BankBase", *[bank.__name__ for bank in banks]]