refactor(pipeline): move parser instantiation logic to CLI

benjamin-awd · Sep 5, 2024 · 88daff3 · 88daff3
1 parent b7e4638
commit 88daff3
Show file tree

Hide file tree

Showing 9 changed files with 56 additions and 28 deletions.
diff --git a/src/monopoly/cli.py b/src/monopoly/cli.py
@@ -126,16 +126,18 @@ def process_statement(
     # pylint: disable=import-outside-toplevel, too-many-locals
     from monopoly.banks import BankDetector, banks
     from monopoly.generic import GenericBank
-    from monopoly.pdf import PdfDocument
+    from monopoly.pdf import PdfDocument, PdfParser
     from monopoly.pipeline import Pipeline
 
     try:
         document = PdfDocument(file)
         analyzer = BankDetector(document)
         bank = analyzer.detect_bank(banks) or GenericBank
+        parser = PdfParser(bank, document)
+        pages = parser.get_pages()
 
         pipeline = Pipeline(file, bank=bank)
-        statement = pipeline.extract(safety_check=safety_check)
+        statement = pipeline.extract(pages, safety_check=safety_check)
         transactions = pipeline.transform(statement)
 
         if print_df:

diff --git a/src/monopoly/examples/single_statement.py b/src/monopoly/examples/single_statement.py
@@ -1,3 +1,5 @@
+from monopoly.banks import ExampleBank
+from monopoly.pdf import PdfParser
 from monopoly.pipeline import Pipeline
 
 
@@ -6,28 +8,31 @@ def example():
     a single bank statement
 
     You can pass in the bank class if you want to specify a specific bank,
-    or ignore the bank argument and let the Pipeline try to automatically
-    detect the bank.
+    or use the BankDetector class to try to detect the bank automatically.
     """
     pipeline = Pipeline(
-        file_path="src/monopoly/examples/example_statement.pdf",
-        # bank=ExampleBank
+        file_path="src/monopoly/examples/example_statement.pdf", bank=ExampleBank
     )
+    parser = PdfParser(pipeline.bank, pipeline.document)
+    pages = parser.get_pages()
 
     # This runs pdftotext on the PDF and
     # extracts transactions as raw text
-    statement = pipeline.extract()
+    statement = pipeline.extract(pages)
 
     # Dates are converted into an ISO 8601 date format
     transactions = pipeline.transform(statement)
 
     # Parsed transactions written to a CSV file in the "examples" directory
-    pipeline.load(
+    file_path = pipeline.load(
         transactions=transactions,
         statement=statement,
         output_directory="src/monopoly/examples",
     )
 
+    with open(file_path) as file:
+        print(file.read()[0:248])
+
 
 if __name__ == "__main__":
     example()
diff --git a/src/monopoly/pdf.py b/src/monopoly/pdf.py
@@ -145,8 +145,12 @@ def page_bbox(self):
         return self.pdf_config.page_bbox
 
     @cached_property
-    def ocr_identifiers(self):
-        return self.pdf_config.ocr_identifiers or []
+    def ocr_available(self):
+        if ids := self.pdf_config.ocr_identifiers:
+            for identifiers in ids:
+                if self.metadata_identifier.matches(identifiers):
+                    return True
+        return False
 
     @lru_cache
     def get_pages(self) -> list[PdfPage]:
@@ -165,10 +169,6 @@ def get_pages(self) -> list[PdfPage]:
                 page.set_cropbox(cropbox)
             page = self._remove_vertical_text(page)
 
-        for identifier in self.ocr_identifiers:
-            if self.metadata_identifier.matches(identifier):
-                document = self._apply_ocr(document)
-
         # certain statements require garbage collection, so that duplicate objects
         # do not cause pdftotext to fail due to missing xrefs/null values
         # however, setting `garbage=2` may cause issues with other statements
@@ -217,7 +217,7 @@ def _remove_vertical_text(page: Page):
         return page
 
     @staticmethod
-    def _apply_ocr(document: PdfDocument) -> PdfDocument:
+    def apply_ocr(document: PdfDocument) -> PdfDocument:
         # pylint: disable=import-outside-toplevel
         try:
             from ocrmypdf import Verbosity, configure_logging, ocr

diff --git a/src/monopoly/pipeline.py b/src/monopoly/pipeline.py
@@ -10,7 +10,7 @@
 from monopoly.config import DateOrder
 from monopoly.generic import GenericBank, GenericStatementHandler
 from monopoly.handler import StatementHandler
-from monopoly.pdf import PdfDocument, PdfPage, PdfParser
+from monopoly.pdf import PdfDocument, PdfPage
 from monopoly.statements import BaseStatement, Transaction
 from monopoly.write import generate_name
 
@@ -50,11 +50,9 @@ def create_handler(bank: Type[BankBase], pages: list[PdfPage]) -> StatementHandl
         logger.debug("Using statement handler with bank: %s", bank.__name__)
         return StatementHandler(bank, pages)
 
-    def extract(self, safety_check=True) -> BaseStatement:
+    def extract(self, pages: list[PdfPage], safety_check=True) -> BaseStatement:
         """Extracts transactions from the statement, and performs
         a safety check to make sure that total transactions add up"""
-        parser = PdfParser(self.bank, self.document)
-        pages = parser.get_pages()
         handler = self.create_handler(self.bank, pages)
         statement = handler.get_statement()
         transactions = statement.get_transactions()

diff --git a/tests/integration/banks/test_banks_credit.py b/tests/integration/banks/test_banks_credit.py
@@ -7,6 +7,7 @@
 
 from monopoly.banks import Citibank, Dbs, Hsbc, Maybank, Ocbc, StandardChartered
 from monopoly.banks.base import BankBase
+from monopoly.pdf import PdfParser
 from monopoly.pipeline import Pipeline
 from monopoly.statements import CreditStatement
 
@@ -33,7 +34,10 @@ def test_bank_credit_statements(
     bank_name = bank.credit_config.bank_name
     test_directory = Path(__file__).parent / bank_name / "credit"
     pipeline = Pipeline(test_directory / "input.pdf", bank=bank)
-    statement: CreditStatement = pipeline.extract()
+
+    parser = PdfParser(bank, pipeline.document)
+    pages = parser.get_pages()
+    statement: CreditStatement = pipeline.extract(pages)
 
     # check raw data
     expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")

diff --git a/tests/integration/banks/test_banks_debit.py b/tests/integration/banks/test_banks_debit.py
@@ -6,6 +6,7 @@
 from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv
 
 from monopoly.banks import BankBase, Dbs, Maybank, Ocbc
+from monopoly.pdf import PdfParser
 from monopoly.pipeline import Pipeline
 from monopoly.statements import DebitStatement
 
@@ -30,7 +31,10 @@ def test_bank_debit_statements(
     bank_name = bank.debit_config.bank_name
     test_directory = Path(__file__).parent / bank_name / "debit"
     pipeline = Pipeline(test_directory / "input.pdf", bank=bank)
-    statement: DebitStatement = pipeline.extract()
+
+    parser = PdfParser(bank, pipeline.document)
+    pages = parser.get_pages()
+    statement: DebitStatement = pipeline.extract(pages)
 
     # check raw data
     expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")

diff --git a/tests/integration/banks/test_banks_generic_credit.py b/tests/integration/banks/test_banks_generic_credit.py
@@ -5,9 +5,9 @@
 from test_utils.skip import skip_if_encrypted
 from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv
 
-from monopoly.banks import Citibank, Dbs, Maybank, Ocbc, StandardChartered
-from monopoly.banks.base import BankBase
+from monopoly.banks import BankBase, Citibank, Dbs, Maybank, Ocbc, StandardChartered
 from monopoly.constants import Columns
+from monopoly.pdf import PdfParser
 from monopoly.pipeline import Pipeline
 from monopoly.statements import CreditStatement
 
@@ -33,7 +33,10 @@ def test_bank_credit_statements(
     bank_name = bank.credit_config.bank_name
     test_directory = Path(__file__).parent / bank_name / "credit"
     pipeline = Pipeline(test_directory / "input.pdf")
-    statement: CreditStatement = pipeline.extract()
+
+    parser = PdfParser(bank, pipeline.document)
+    pages = parser.get_pages()
+    statement: CreditStatement = pipeline.extract(pages)
 
     # check raw data
     expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")

diff --git a/tests/integration/banks/test_banks_generic_debit.py b/tests/integration/banks/test_banks_generic_debit.py
@@ -6,6 +6,7 @@
 from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv
 
 from monopoly.banks import BankBase, Dbs, Maybank, Ocbc
+from monopoly.pdf import PdfParser
 from monopoly.pipeline import Pipeline
 from monopoly.statements import DebitStatement
 
@@ -36,7 +37,10 @@ def test_bank_debit_statements(
     bank_name = bank.debit_config.bank_name
     test_directory = Path(__file__).parent / bank_name / "debit"
     pipeline = Pipeline(test_directory / "input.pdf")
-    statement: DebitStatement = pipeline.extract()
+
+    parser = PdfParser(bank, pipeline.document)
+    pages = parser.get_pages()
+    statement: DebitStatement = pipeline.extract(pages)
 
     # check raw data
     expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")

diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
@@ -3,6 +3,7 @@
 import pytest
 
 from monopoly.banks import Dbs, ExampleBank
+from monopoly.pdf import PdfParser
 from monopoly.pipeline import Pipeline
 
 
@@ -24,16 +25,21 @@ def test_pipeline_initialization_with_bytes(pdf_file_bytes):
 def test_pipeline_with_bank():
     file_path = Path("src/monopoly/examples/example_statement.pdf")
     pipeline = Pipeline(file_path=file_path, bank=ExampleBank)
-    transactions = pipeline.extract().transactions
+    parser = PdfParser(pipeline.bank, pipeline.document)
+    pages = parser.get_pages()
+    transactions = pipeline.extract(pages).transactions
     assert len(transactions) == 53
     assert transactions[0].description == "LAST MONTH'S BALANCE"
 
 
 def test_pipeline_with_bad_bank():
     file_path = Path("src/monopoly/examples/example_statement.pdf")
     pipeline = Pipeline(file_path=file_path, bank=Dbs)
+    parser = PdfParser(pipeline.bank, pipeline.document)
+    pages = parser.get_pages()
+
     with pytest.raises(ValueError, match="No transactions found"):
-        pipeline.extract()
+        pipeline.extract(pages)
 
 
 def test_pipeline_initialization_with_file_path():
@@ -62,6 +68,8 @@ def test_pipeline_initialization_with_neither_raises_error():
 
 def test_pipeline_bytes_etl(pdf_file_bytes):
     pipeline = Pipeline(file_bytes=pdf_file_bytes, bank=ExampleBank)
-    statement = pipeline.extract()
+    parser = PdfParser(pipeline.bank, pipeline.document)
+    pages = parser.get_pages()
+    statement = pipeline.extract(pages)
     transactions = pipeline.transform(statement)
     assert len(transactions) == 53