Skip to content

Commit

Permalink
feat(generic): support inconsistent header spacing across pages
Browse files (browse the repository at this point in the history)
  • Loading branch information
benjamin-awd committed Sep 25, 2024
1 parent 0abb3b0 commit 99981e2
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 13 deletions.
5 changes: 3 additions & 2 deletions src/monopoly/generic/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,10 @@ def get_debit_statement_header_line(self, lines_before_first_transaction) -> str
if line:
# assume that the header always starts with `date` or `DATE`
result = header_pattern.search(line)

if result:
- escaped_result = f"({re.escape(result.string)})"
+ generalized_result = re.sub(r"\s{3,}", r"\\s+", result.string)
+ escaped_result = re.escape(generalized_result)
+ escaped_result = escaped_result.replace("\\\\s\\+", r"\s+")
logger.debug("Found header statement: %s", escaped_result)
return escaped_result

Expand Down
3 changes: 2 additions & 1 deletion src/monopoly/statements/debit_statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,10 @@ def get_header_pos(self, column_name: str, page_number: int) -> int:
continue
return header_start_pos + len(column_name)

- raise ValueError(
+ logger.debug(
f"Debit header {column_name} cannot be found on page {page_number}"
)
+ return -1

@lru_cache
def perform_safety_check(self) -> bool:
Expand Down
8 changes: 2 additions & 6 deletions tests/integration/banks/test_banks_generic_debit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv

from monopoly.banks import BankBase, Dbs, Maybank, Ocbc
+ from monopoly.generic import GenericBank
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import DebitStatement
Expand All @@ -17,17 +18,11 @@
]


- @pytest.fixture
- def no_banks(monkeypatch):
- monkeypatch.setattr("monopoly.banks.banks", [])


@skip_if_encrypted
@pytest.mark.parametrize(
"bank, expected_debit_sum, expected_credit_sum, statement_date",
test_cases,
)
- @pytest.mark.usefixtures("no_banks")
def test_bank_debit_statements(
bank: BankBase,
expected_debit_sum: float,
Expand All @@ -38,6 +33,7 @@ def test_bank_debit_statements(

document = PdfDocument(test_directory / "input.pdf")
parser = PdfParser(bank, document)
+ parser.bank = GenericBank
pipeline = Pipeline(parser)
statement: DebitStatement = pipeline.extract()

Expand Down
9 changes: 5 additions & 4 deletions tests/unit/generic/test_date_pattern_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,14 +349,15 @@ def __init__(self, lines):
pages = [MockPdfPage(lines_before_first_transaction)]
date_pattern_analyzer.pages = pages
date_pattern_analyzer.matches = [MockDateMatch(line_number=22, page_number=0)]
- expected_header = re.escape(
- " DATE DETAILS OF TRANSACTIONS WITHDRAWAL($) DEPOSIT($) BALANCE($)"
+ expected_header = (
+ "\\ DATE\\s+DETAILS\\ OF\\ TRANSACTIONS\\s+WITHDRAWAL\\(\\$\\)"
+ + "\\s+DEPOSIT\\(\\$\\)\\s+BALANCE\\(\\$\\)"
)

result = date_pattern_analyzer.get_debit_statement_header_line(
- frozenset(lines_before_first_transaction)
+ lines_before_first_transaction
)
- assert result == f"({expected_header})"
+ assert result == expected_header


def test_check_if_multiline(date_pattern_analyzer: DatePatternAnalyzer):
Expand Down

0 comments on commit 99981e2

Please sign in to comment.