Skip to content

Commit

Permalink
refactor(pipeline): allow custom document to be passed
Browse files — browse the repository at this point in the history
  • Loading branch information
benjamin-awd committed Sep 5, 2024
1 parent 88daff3 commit a964bea
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 83 deletions.
5 changes: 2 additions & 3 deletions src/monopoly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,9 @@ def process_statement(
analyzer = BankDetector(document)
bank = analyzer.detect_bank(banks) or GenericBank
parser = PdfParser(bank, document)
pages = parser.get_pages()
pipeline = Pipeline(parser)

pipeline = Pipeline(file, bank=bank)
statement = pipeline.extract(pages, safety_check=safety_check)
statement = pipeline.extract(safety_check=safety_check)
transactions = pipeline.transform(statement)

if print_df:
Expand Down
14 changes: 6 additions & 8 deletions src/monopoly/examples/single_statement.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from monopoly.banks import ExampleBank
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline


Expand All @@ -10,15 +10,13 @@ def example():
You can pass in the bank class if you want to specify a specific bank,
or use the BankDetector class to try to detect the bank automatically.
"""
pipeline = Pipeline(
file_path="src/monopoly/examples/example_statement.pdf", bank=ExampleBank
)
parser = PdfParser(pipeline.bank, pipeline.document)
pages = parser.get_pages()
document = PdfDocument(file_path="src/monopoly/examples/example_statement.pdf")
parser = PdfParser(ExampleBank, document)
pipeline = Pipeline(parser)

# This runs pdftotext on the PDF and
# extracts transactions as raw text
statement = pipeline.extract(pages)
statement = pipeline.extract()

# Dates are converted into an ISO 8601 date format
transactions = pipeline.transform(statement)
Expand All @@ -30,7 +28,7 @@ def example():
output_directory="src/monopoly/examples",
)

with open(file_path) as file:
with open(file_path, encoding="utf8") as file:
print(file.read()[0:248])


Expand Down
8 changes: 8 additions & 0 deletions src/monopoly/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ def __init__(
self.file_bytes = file_bytes
self.passwords = passwords or PdfPasswords().pdf_passwords

if not any([self.file_path, self.file_bytes]):
raise RuntimeError("Either `file_path` or `file_bytes` must be passed")

if self.file_path and self.file_bytes:
raise RuntimeError(
"Only one of `file_path` or `file_bytes` should be passed"
)

args = {"filename": self.file_path, "stream": self.file_bytes}
super().__init__(**args)

Expand Down
29 changes: 10 additions & 19 deletions src/monopoly/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from monopoly.config import DateOrder
from monopoly.generic import GenericBank, GenericStatementHandler
from monopoly.handler import StatementHandler
from monopoly.pdf import PdfDocument, PdfPage
from monopoly.pdf import PdfDocument, PdfPage, PdfParser
from monopoly.statements import BaseStatement, Transaction
from monopoly.write import generate_name

Expand All @@ -22,25 +22,17 @@ class Pipeline:

def __init__(
self,
file_path: Optional[Path] = None,
file_bytes: Optional[bytes] = None,
parser: PdfParser,
document: Optional[PdfDocument] = None,
passwords: Optional[list[SecretStr]] = None,
bank: Type[BankBase] = GenericBank,
):
self.file_path = file_path
self.file_bytes = file_bytes
self.document = document
self.passwords = passwords
self.bank = bank
pages = parser.get_pages()
if not document:
document = parser.document

if not any([self.file_path, self.file_bytes]):
raise RuntimeError("Either `file_path` or `file_bytes` must be passed")

if self.file_path and self.file_bytes:
raise RuntimeError(
"Only one of `file_path` or `file_bytes` should be passed"
)

self.document = PdfDocument(file_path, file_bytes, passwords)
self.handler = self.create_handler(parser.bank, pages)

@staticmethod
def create_handler(bank: Type[BankBase], pages: list[PdfPage]) -> StatementHandler:
Expand All @@ -50,11 +42,10 @@ def create_handler(bank: Type[BankBase], pages: list[PdfPage]) -> StatementHandl
logger.debug("Using statement handler with bank: %s", bank.__name__)
return StatementHandler(bank, pages)

def extract(self, pages: list[PdfPage], safety_check=True) -> BaseStatement:
def extract(self, safety_check=True) -> BaseStatement:
"""Extracts transactions from the statement, and performs
a safety check to make sure that total transactions add up"""
handler = self.create_handler(self.bank, pages)
statement = handler.get_statement()
statement = self.handler.get_statement()
transactions = statement.get_transactions()

if not transactions:
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def mock_env():

@pytest.fixture
def pdf_document():
# Fixture that yields a PdfDocument for tests.
# NOTE(review): this hunk appears to show the removed and added `yield` lines
# together (diff markers were lost). PdfDocument.__init__ now raises
# RuntimeError when neither `file_path` nor `file_bytes` is given, which is
# why the fixture passes the bundled example statement path.
yield PdfDocument()
yield PdfDocument(file_path="src/monopoly/examples/example_statement.pdf")


@pytest.fixture
Expand Down
10 changes: 5 additions & 5 deletions tests/integration/banks/test_banks_credit.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from monopoly.banks import Citibank, Dbs, Hsbc, Maybank, Ocbc, StandardChartered
from monopoly.banks.base import BankBase
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import CreditStatement

Expand All @@ -33,11 +33,11 @@ def test_bank_credit_statements(
):
bank_name = bank.credit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "credit"
pipeline = Pipeline(test_directory / "input.pdf", bank=bank)

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: CreditStatement = pipeline.extract(pages)
document = PdfDocument(test_directory / "input.pdf")
parser = PdfParser(bank, document)
pipeline = Pipeline(parser)
statement: CreditStatement = pipeline.extract()

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
10 changes: 5 additions & 5 deletions tests/integration/banks/test_banks_debit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv

from monopoly.banks import BankBase, Dbs, Maybank, Ocbc
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import DebitStatement

Expand All @@ -30,11 +30,11 @@ def test_bank_debit_statements(
):
bank_name = bank.debit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "debit"
pipeline = Pipeline(test_directory / "input.pdf", bank=bank)

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: DebitStatement = pipeline.extract(pages)
document = PdfDocument(test_directory / "input.pdf")
parser = PdfParser(bank, document)
pipeline = Pipeline(parser)
statement: DebitStatement = pipeline.extract()

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
10 changes: 5 additions & 5 deletions tests/integration/banks/test_banks_generic_credit.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from monopoly.banks import BankBase, Citibank, Dbs, Maybank, Ocbc, StandardChartered
from monopoly.constants import Columns
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import CreditStatement

Expand All @@ -32,11 +32,11 @@ def test_bank_credit_statements(
):
bank_name = bank.credit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "credit"
pipeline = Pipeline(test_directory / "input.pdf")

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: CreditStatement = pipeline.extract(pages)
document = PdfDocument(test_directory / "input.pdf")
parser = PdfParser(bank, document)
pipeline = Pipeline(parser)
statement: CreditStatement = pipeline.extract()

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
10 changes: 5 additions & 5 deletions tests/integration/banks/test_banks_generic_debit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from test_utils.transactions import get_transactions_as_dict, read_transactions_from_csv

from monopoly.banks import BankBase, Dbs, Maybank, Ocbc
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline
from monopoly.statements import DebitStatement

Expand Down Expand Up @@ -36,11 +36,11 @@ def test_bank_debit_statements(
):
bank_name = bank.debit_config.bank_name
test_directory = Path(__file__).parent / bank_name / "debit"
pipeline = Pipeline(test_directory / "input.pdf")

parser = PdfParser(bank, pipeline.document)
pages = parser.get_pages()
statement: DebitStatement = pipeline.extract(pages)
document = PdfDocument(test_directory / "input.pdf")
parser = PdfParser(bank, document)
pipeline = Pipeline(parser)
statement: DebitStatement = pipeline.extract()

# check raw data
expected_raw_transactions = read_transactions_from_csv(test_directory, "raw.csv")
Expand Down
15 changes: 15 additions & 0 deletions tests/integration/test_pdf_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@
fixture_directory = Path(__file__).parent / "fixtures"


def test_document_initialization_with_both_raises_error():
    """PdfDocument must reject being given both a file path and raw bytes.

    The constructor raises RuntimeError when `file_path` and `file_bytes`
    are supplied together; the message is matched verbatim.
    """
    file_path = Path("src/monopoly/examples/example_statement.pdf")
    with raises(
        RuntimeError, match="Only one of `file_path` or `file_bytes` should be passed"
    ):
        # The byte payload is a dummy: validation fails before it is parsed.
        PdfDocument(file_path=file_path, file_bytes=b"123")


def test_document_initialization_with_neither_raises_error():
    """PdfDocument must reject construction with no input source.

    The constructor raises RuntimeError when neither `file_path` nor
    `file_bytes` is supplied; the message is matched verbatim.
    """
    with raises(
        RuntimeError, match="Either `file_path` or `file_bytes` must be passed"
    ):
        PdfDocument()


def test_can_open_file_stream():
with open(fixture_directory / "4_pages_blank.pdf", "rb") as file:
pdf_document = PdfDocument(file_bytes=file.read())
Expand Down
50 changes: 18 additions & 32 deletions tests/integration/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from monopoly.banks import Dbs, ExampleBank
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfDocument, PdfParser
from monopoly.pipeline import Pipeline


Expand All @@ -14,18 +14,11 @@ def pdf_file_bytes():
yield f.read()


def test_pipeline_initialization_with_bytes(pdf_file_bytes):
try:
pipeline = Pipeline(file_bytes=pdf_file_bytes)
assert pipeline is not None
except RuntimeError as e:
pytest.fail(f"Pipeline initialization failed with RuntimeError: {e}")


def test_pipeline_with_bank():
file_path = Path("src/monopoly/examples/example_statement.pdf")
pipeline = Pipeline(file_path=file_path, bank=ExampleBank)
parser = PdfParser(pipeline.bank, pipeline.document)
document = PdfDocument(file_path)
parser = PdfParser(ExampleBank, document)
pipeline = Pipeline(parser)
pages = parser.get_pages()
transactions = pipeline.extract(pages).transactions
assert len(transactions) == 53
Expand All @@ -34,8 +27,9 @@ def test_pipeline_with_bank():

def test_pipeline_with_bad_bank():
file_path = Path("src/monopoly/examples/example_statement.pdf")
pipeline = Pipeline(file_path=file_path, bank=Dbs)
parser = PdfParser(pipeline.bank, pipeline.document)
document = PdfDocument(file_path)
parser = PdfParser(Dbs, document)
pipeline = Pipeline(parser)
pages = parser.get_pages()

with pytest.raises(ValueError, match="No transactions found"):
Expand All @@ -45,30 +39,22 @@ def test_pipeline_with_bad_bank():
def test_pipeline_initialization_with_file_path():
# Builds a Pipeline from the example statement on disk, runs extract and
# transform, and expects 53 transactions; a RuntimeError fails the test.
# NOTE(review): this hunk appears to show removed and added lines together
# (diff markers were lost) — the `Pipeline(file_path=...)` / `is not None`
# pair looks like the pre-refactor body; confirm against the repository.
file_path = Path("src/monopoly/examples/example_statement.pdf")
try:
pipeline = Pipeline(file_path=file_path)
assert pipeline is not None
document = PdfDocument(file_path)
parser = PdfParser(ExampleBank, document)
pipeline = Pipeline(parser)
pages = parser.get_pages()
# NOTE(review): Pipeline.extract is now declared as
# `extract(self, safety_check=True)`, so `pages` is bound to `safety_check`
# here rather than being used as page input. Presumably this should be
# `pipeline.extract()` — confirm and drop the argument.
statement = pipeline.extract(pages)
transactions = pipeline.transform(statement)
assert len(transactions) == 53
except RuntimeError as e:
pytest.fail(f"Pipeline initialization failed with RuntimeError: {e}")


def test_pipeline_initialization_with_both_raises_error(pdf_file_bytes):
file_path = Path("src/monopoly/examples/example_statement.pdf")
with pytest.raises(
RuntimeError, match="Only one of `file_path` or `file_bytes` should be passed"
):
Pipeline(file_path=file_path, file_bytes=pdf_file_bytes)


def test_pipeline_initialization_with_neither_raises_error():
with pytest.raises(
RuntimeError, match="Either `file_path` or `file_bytes` must be passed"
):
Pipeline()


def test_pipeline_bytes_etl(pdf_file_bytes):
pipeline = Pipeline(file_bytes=pdf_file_bytes, bank=ExampleBank)
parser = PdfParser(pipeline.bank, pipeline.document)
document = PdfDocument(file_bytes=pdf_file_bytes)
parser = PdfParser(ExampleBank, document)
pipeline = Pipeline(parser)

pages = parser.get_pages()
statement = pipeline.extract(pages)
transactions = pipeline.transform(statement)
Expand Down

0 comments on commit a964bea

Please sign in to comment.