diff --git a/src/monopoly/cli.py b/src/monopoly/cli.py index 3b19ce27..72962dfd 100644 --- a/src/monopoly/cli.py +++ b/src/monopoly/cli.py @@ -126,16 +126,18 @@ def process_statement( # pylint: disable=import-outside-toplevel, too-many-locals from monopoly.banks import BankDetector, banks from monopoly.generic import GenericBank - from monopoly.pdf import PdfDocument + from monopoly.pdf import PdfDocument, PdfParser from monopoly.pipeline import Pipeline try: document = PdfDocument(file) analyzer = BankDetector(document) bank = analyzer.detect_bank(banks) or GenericBank + parser = PdfParser(bank, document) + pages = parser.get_pages() pipeline = Pipeline(file, bank=bank) - statement = pipeline.extract(safety_check=safety_check) + statement = pipeline.extract(pages, safety_check=safety_check) transactions = pipeline.transform(statement) if print_df: diff --git a/src/monopoly/examples/single_statement.py b/src/monopoly/examples/single_statement.py index 3e3656bd..a4a77207 100644 --- a/src/monopoly/examples/single_statement.py +++ b/src/monopoly/examples/single_statement.py @@ -1,3 +1,5 @@ +from monopoly.banks import ExampleBank +from monopoly.pdf import PdfParser from monopoly.pipeline import Pipeline @@ -6,28 +8,31 @@ def example(): a single bank statement You can pass in the bank class if you want to specify a specific bank, - or ignore the bank argument and let the Pipeline try to automatically - detect the bank. + or use the BankDetector class to try to detect the bank automatically. """ pipeline = Pipeline( - file_path="src/monopoly/examples/example_statement.pdf", - # bank=ExampleBank + file_path="src/monopoly/examples/example_statement.pdf", bank=ExampleBank ) + parser = PdfParser(pipeline.bank, pipeline.document) + pages = parser.get_pages() # This runs pdftotext on the PDF and # extracts transactions as raw text - statement = pipeline.extract() + statement = pipeline.extract(pages) # Dates are converted into an ISO 8601 date format transactions = pipeline.transform(statement) # Parsed transactions writen to a CSV file in the "example" directory - pipeline.load( + file_path = pipeline.load( transactions=transactions, statement=statement, output_directory="src/monopoly/examples", ) + with open(file_path) as file: + print(file.read()[0:248]) + if __name__ == "__main__": example() diff --git a/src/monopoly/pdf.py b/src/monopoly/pdf.py index e1e2e8d7..297aebd0 100644 --- a/src/monopoly/pdf.py +++ b/src/monopoly/pdf.py @@ -145,8 +145,12 @@ def page_bbox(self): return self.pdf_config.page_bbox @cached_property - def ocr_identifiers(self): - return self.pdf_config.ocr_identifiers or [] + def ocr_available(self): + if ids := self.pdf_config.ocr_identifiers: + for identifiers in ids: + if self.metadata_identifier.matches(identifiers): + return True + return False @lru_cache def get_pages(self) -> list[PdfPage]: @@ -165,10 +169,6 @@ def get_pages(self) -> list[PdfPage]: page.set_cropbox(cropbox) page = self._remove_vertical_text(page) - for identifier in self.ocr_identifiers: - if self.metadata_identifier.matches(identifier): - document = self._apply_ocr(document) - # certain statements requsire garbage collection, so that duplicate objects # do not cause pdftotext to fail due to missing xrefs/null values # however, setting `garbage=2` may cause issues with other statements @@ -217,7 +217,7 @@ def _remove_vertical_text(page: Page): return page @staticmethod - def _apply_ocr(document: PdfDocument) -> PdfDocument: + def apply_ocr(document: PdfDocument) -> PdfDocument: # pylint: disable=import-outside-toplevel try: from ocrmypdf import Verbosity, configure_logging, ocr diff --git a/src/monopoly/pipeline.py b/src/monopoly/pipeline.py index 9eec4731..38c0ce01 100644 --- a/src/monopoly/pipeline.py +++ b/src/monopoly/pipeline.py @@ -10,7 +10,7 @@ from monopoly.config import DateOrder from monopoly.generic import GenericBank, GenericStatementHandler from monopoly.handler import StatementHandler -from monopoly.pdf import PdfDocument, PdfPage, PdfParser +from monopoly.pdf import PdfDocument, PdfPage from monopoly.statements import BaseStatement, Transaction from monopoly.write import generate_name @@ -50,11 +50,9 @@ def create_handler(bank: Type[BankBase], pages: list[PdfPage]) -> StatementHandl logger.debug("Using statement handler with bank: %s", bank.__name__) return StatementHandler(bank, pages) - def extract(self, safety_check=True) -> BaseStatement: + def extract(self, pages: list[PdfPage], safety_check=True) -> BaseStatement: """Extracts transactions from the statement, and performs a safety check to make sure that total transactions add up""" - parser = PdfParser(self.bank, self.document) - pages = parser.get_pages() handler = self.create_handler(self.bank, pages) statement = handler.get_statement() transactions = statement.get_transactions() diff --git a/tests/integration/banks/test_banks_credit.py b/tests/integration/banks/test_banks_credit.py index 79b9c89d..1aca49c1 100644 --- a/tests/integration/banks/test_banks_credit.py +++ b/tests/integration/banks/test_banks_credit.py @@ -7,6 +7,7 @@ from monopoly.banks import Citibank, Dbs, Hsbc, Maybank, Ocbc, StandardChartered from monopoly.banks.base import BankBase +from monopoly.pdf import PdfParser from monopoly.pipeline import Pipeline from monopoly.statements import CreditStatement @@ -33,7 +34,10 @@ def test_bank_credit_statements( bank_name = bank.credit_config.bank_name test_directory = Path(__file__).parent / bank_name / "credit" pipeline = Pipeline(test_directory / "input.pdf", bank=bank) - statement: CreditStatement = pipeline.extract() + + parser = PdfParser(bank, pipeline.document) + pages = parser.get_pages() + statement: CreditStatement = pipeline.extract(pages) # check raw data expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv") diff --git a/tests/integration/banks/test_banks_debit.py b/tests/integration/banks/test_banks_debit.py index 012c6d0e..3f2391ba 100644 --- a/tests/integration/banks/test_banks_debit.py +++ b/tests/integration/banks/test_banks_debit.py @@ -6,6 +6,7 @@ from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv from monopoly.banks import BankBase, Dbs, Maybank, Ocbc +from monopoly.pdf import PdfParser from monopoly.pipeline import Pipeline from monopoly.statements import DebitStatement @@ -30,7 +31,10 @@ def test_bank_debit_statements( bank_name = bank.debit_config.bank_name test_directory = Path(__file__).parent / bank_name / "debit" pipeline = Pipeline(test_directory / "input.pdf", bank=bank) - statement: DebitStatement = pipeline.extract() + + parser = PdfParser(bank, pipeline.document) + pages = parser.get_pages() + statement: DebitStatement = pipeline.extract(pages) # check raw data expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv") diff --git a/tests/integration/banks/test_banks_generic_credit.py b/tests/integration/banks/test_banks_generic_credit.py index ab2b1e75..bc028051 100644 --- a/tests/integration/banks/test_banks_generic_credit.py +++ b/tests/integration/banks/test_banks_generic_credit.py @@ -5,9 +5,9 @@ from test_utils.skip import skip_if_encrypted from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv -from monopoly.banks import Citibank, Dbs, Maybank, Ocbc, StandardChartered -from monopoly.banks.base import BankBase +from monopoly.banks import BankBase, Citibank, Dbs, Maybank, Ocbc, StandardChartered from monopoly.constants import Columns +from monopoly.pdf import PdfParser from monopoly.pipeline import Pipeline from monopoly.statements import CreditStatement @@ -33,7 +33,10 @@ def test_bank_credit_statements( bank_name = bank.credit_config.bank_name test_directory = Path(__file__).parent / bank_name / "credit" pipeline = Pipeline(test_directory / "input.pdf") - statement: CreditStatement = pipeline.extract() + + parser = PdfParser(bank, pipeline.document) + pages = parser.get_pages() + statement: CreditStatement = pipeline.extract(pages) # check raw data expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv") diff --git a/tests/integration/banks/test_banks_generic_debit.py b/tests/integration/banks/test_banks_generic_debit.py index 80ab2059..82a2ffe1 100644 --- a/tests/integration/banks/test_banks_generic_debit.py +++ b/tests/integration/banks/test_banks_generic_debit.py @@ -6,6 +6,7 @@ from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv from monopoly.banks import BankBase, Dbs, Maybank, Ocbc +from monopoly.pdf import PdfParser from monopoly.pipeline import Pipeline from monopoly.statements import DebitStatement @@ -36,7 +37,10 @@ def test_bank_debit_statements( bank_name = bank.debit_config.bank_name test_directory = Path(__file__).parent / bank_name / "debit" pipeline = Pipeline(test_directory / "input.pdf") - statement: DebitStatement = pipeline.extract() + + parser = PdfParser(bank, pipeline.document) + pages = parser.get_pages() + statement: DebitStatement = pipeline.extract(pages) # check raw data expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv") diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py index 321315bf..42fcb732 100644 --- a/tests/integration/test_pipeline.py +++ b/tests/integration/test_pipeline.py @@ -3,6 +3,7 @@ import pytest from monopoly.banks import Dbs, ExampleBank +from monopoly.pdf import PdfParser from monopoly.pipeline import Pipeline @@ -24,7 +25,9 @@ def test_pipeline_initialization_with_bytes(pdf_file_bytes): def test_pipeline_with_bank(): file_path = Path("src/monopoly/examples/example_statement.pdf") pipeline = Pipeline(file_path=file_path, bank=ExampleBank) - transactions = pipeline.extract().transactions + parser = PdfParser(pipeline.bank, pipeline.document) + pages = parser.get_pages() + transactions = pipeline.extract(pages).transactions assert len(transactions) == 53 assert transactions[0].description == "LAST MONTH'S BALANCE" @@ -32,8 +35,11 @@ def test_pipeline_with_bank(): def test_pipeline_with_bad_bank(): file_path = Path("src/monopoly/examples/example_statement.pdf") pipeline = Pipeline(file_path=file_path, bank=Dbs) + parser = PdfParser(pipeline.bank, pipeline.document) + pages = parser.get_pages() + with pytest.raises(ValueError, match="No transactions found"): - pipeline.extract() + pipeline.extract(pages) def test_pipeline_initialization_with_file_path(): @@ -62,6 +68,8 @@ def test_pipeline_initialization_with_neither_raises_error(): def test_pipeline_bytes_etl(pdf_file_bytes): pipeline = Pipeline(file_bytes=pdf_file_bytes, bank=ExampleBank) - statement = pipeline.extract() + parser = PdfParser(pipeline.bank, pipeline.document) + pages = parser.get_pages() + statement = pipeline.extract(pages) transactions = pipeline.transform(statement) assert len(transactions) == 53