Skip to content

Commit

Permalink
refactor(pipeline): move parser instantiation logic to CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
benjamin-awd committed Sep 5, 2024
1 parent b7e4638 commit 88daff3
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 28 deletions.
6 changes: 4 additions & 2 deletions src/monopoly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,18 @@ def process_statement(
# pylint: disable=import-outside-toplevel, too-many-locals
from monopoly.banks import BankDetector, banks
from monopoly.generic import GenericBank
from monopoly.pdf import PdfDocument
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline

try:
document = PdfDocument(file)
analyzer = BankDetector(document)
bank = analyzer.detect_bank(banks) or GenericBank
parser = PdfParser(bank, document)
pages = parser.get_pages()

pipeline = Pipeline(file, bank=bank)
statement = pipeline.extract(safety_check=safety_check)
statement = pipeline.extract(pages, safety_check=safety_check)
transactions = pipeline.transform(statement)

if print_df:
Expand Down
17 changes: 11 additions & 6 deletions src/monopoly/examples/single_statement.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from monopoly.banks import ExampleBank
from monopoly.pdf import PdfParser
from monopoly.pipeline import Pipeline


Expand All @@ -6,28 +8,31 @@ def example():
a single bank statement
You can pass in the bank class if you want to specify a specific bank,
or ignore the bank argument and let the Pipeline try to automatically
detect the bank.
or use the BankDetector class to try to detect the bank automatically.
"""
pipeline = Pipeline(
file_path="src/monopoly/examples/example_statement.pdf",
# bank=ExampleBank
file_path="src/monopoly/examples/example_statement.pdf", bank=ExampleBank
)
parser = PdfParser(pipeline.bank, pipeline.document)
pages = parser.get_pages()

# This runs pdftotext on the PDF and
# extracts transactions as raw text
statement = pipeline.extract()
statement = pipeline.extract(pages)

# Dates are converted into an ISO 8601 date format
transactions = pipeline.transform(statement)

# Parsed transactions are written to a CSV file in the "examples" directory
pipeline.load(
file_path = pipeline.load(
transactions=transactions,
statement=statement,
output_directory="src/monopoly/examples",
)

with open(file_path) as file:
print(file.read()[0:248])


if __name__ == "__main__":
example()
14 changes: 7 additions & 7 deletions src/monopoly/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,12 @@ def page_bbox(self):
return self.pdf_config.page_bbox

@cached_property
def ocr_identifiers(self):
return self.pdf_config.ocr_identifiers or []
def ocr_available(self):
if ids := self.pdf_config.ocr_identifiers:
for identifiers in ids:
if self.metadata_identifier.matches(identifiers):
return True
return False

@lru_cache
def get_pages(self) -> list[PdfPage]:
Expand All @@ -165,10 +169,6 @@ def get_pages(self) -> list[PdfPage]:
page.set_cropbox(cropbox)
page = self._remove_vertical_text(page)

for identifier in self.ocr_identifiers:
if self.metadata_identifier.matches(identifier):
document = self._apply_ocr(document)

# certain statements require garbage collection, so that duplicate objects
# do not cause pdftotext to fail due to missing xrefs/null values
# however, setting `garbage=2` may cause issues with other statements
Expand Down Expand Up @@ -217,7 +217,7 @@ def _remove_vertical_text(page: Page):
return page

@staticmethod
def _apply_ocr(document: PdfDocument) -> PdfDocument:
def apply_ocr(document: PdfDocument) -> PdfDocument:
# pylint: disable=import-outside-toplevel
try:
from ocrmypdf import Verbosity, configure_logging, ocr
Expand Down
6 changes: 2 additions & 4 deletions src/monopoly/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from monopoly.config import DateOrder
from monopoly.generic import GenericBank, GenericStatementHandler
from monopoly.handler import StatementHandler
from monopoly.pdf import PdfDocument, PdfPage, PdfParser
from monopoly.pdf import PdfDocument, PdfPage
from monopoly.statements import BaseStatement, Transaction
from monopoly.write import generate_name

Expand Down Expand Up @@ -50,11 +50,9 @@ def create_handler(bank: Type[BankBase], pages: list[PdfPage]) -> StatementHandl
logger.debug("Using statement handler with bank: %s", bank.__name__)
return StatementHandler(bank, pages)

def extract(self, safety_check=True) -> BaseStatement:
def extract(self, pages: list[PdfPage], safety_check=True) -> BaseStatement:
"""Extracts transactions from the statement, and performs
a safety check to make sure that total transactions add up"""
parser = PdfParser(self.bank, self.document)
pages = parser.get_pages()
handler = self.create_handler(self.bank, pages)
statement = handler.get_statement()
transactions = statement.get_transactions()
Expand Down
6 changes: 5 additions & 1 deletion tests/integration/banks/test_banks_credit.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from monopoly.banks import Citibank, Dbs, Hsbc, Maybank, Ocbc, StandardChartered
from monopoly.banks.base import BankBase
from monopoly.pdf import PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import CreditStatement

Expand All @@ -33,7 +34,10 @@ def test_bank_credit_statements(
bank_name = bank.credit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "credit"
pipeline = Pipeline(test_directory / "input.pdf", bank=bank)
statement: CreditStatement = pipeline.extract()

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: CreditStatement = pipeline.extract(pages)

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
6 changes: 5 additions & 1 deletion tests/integration/banks/test_banks_debit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv

from monopoly.banks import BankBase, Dbs, Maybank, Ocbc
from monopoly.pdf import PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import DebitStatement

Expand All @@ -30,7 +31,10 @@ def test_bank_debit_statements(
bank_name = bank.debit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "debit"
pipeline = Pipeline(test_directory / "input.pdf", bank=bank)
statement: DebitStatement = pipeline.extract()

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: DebitStatement = pipeline.extract(pages)

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
9 changes: 6 additions & 3 deletions tests/integration/banks/test_banks_generic_credit.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from test_utils.skip import skip_if_encrypted
from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv

from monopoly.banks import Citibank, Dbs, Maybank, Ocbc, StandardChartered
from monopoly.banks.base import BankBase
from monopoly.banks import BankBase, Citibank, Dbs, Maybank, Ocbc, StandardChartered
from monopoly.constants import Columns
from monopoly.pdf import PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import CreditStatement

Expand All @@ -33,7 +33,10 @@ def test_bank_credit_statements(
bank_name = bank.credit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "credit"
pipeline = Pipeline(test_directory / "input.pdf")
statement: CreditStatement = pipeline.extract()

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: CreditStatement = pipeline.extract(pages)

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
6 changes: 5 additions & 1 deletion tests/integration/banks/test_banks_generic_debit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv

from monopoly.banks import BankBase, Dbs, Maybank, Ocbc
from monopoly.pdf import PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import DebitStatement

Expand Down Expand Up @@ -36,7 +37,10 @@ def test_bank_debit_statements(
bank_name = bank.debit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "debit"
pipeline = Pipeline(test_directory / "input.pdf")
statement: DebitStatement = pipeline.extract()

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: DebitStatement = pipeline.extract(pages)

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
14 changes: 11 additions & 3 deletions tests/integration/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest

from monopoly.banks import Dbs, ExampleBank
from monopoly.pdf import PdfParser
from monopoly.pipeline import Pipeline


Expand All @@ -24,16 +25,21 @@ def test_pipeline_initialization_with_bytes(pdf_file_bytes):
def test_pipeline_with_bank():
file_path = Path("src/monopoly/examples/example_statement.pdf")
pipeline = Pipeline(file_path=file_path, bank=ExampleBank)
transactions = pipeline.extract().transactions
parser = PdfParser(pipeline.bank, pipeline.document)
pages = parser.get_pages()
transactions = pipeline.extract(pages).transactions
assert len(transactions) == 53
assert transactions[0].description == "LAST MONTH'S BALANCE"


def test_pipeline_with_bad_bank():
file_path = Path("src/monopoly/examples/example_statement.pdf")
pipeline = Pipeline(file_path=file_path, bank=Dbs)
parser = PdfParser(pipeline.bank, pipeline.document)
pages = parser.get_pages()

with pytest.raises(ValueError, match="No transactions found"):
pipeline.extract()
pipeline.extract(pages)


def test_pipeline_initialization_with_file_path():
Expand Down Expand Up @@ -62,6 +68,8 @@ def test_pipeline_initialization_with_neither_raises_error():

def test_pipeline_bytes_etl(pdf_file_bytes):
pipeline = Pipeline(file_bytes=pdf_file_bytes, bank=ExampleBank)
statement = pipeline.extract()
parser = PdfParser(pipeline.bank, pipeline.document)
pages = parser.get_pages()
statement = pipeline.extract(pages)
transactions = pipeline.transform(statement)
assert len(transactions) == 53

0 comments on commit 88daff3

Please sign in to comment.