Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perf: optimize postal_parse_address implementation #49

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmarks/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
db/
Binary file added benchmarks/apoc_addresses_1M.parquet
Binary file not shown.
56 changes: 56 additions & 0 deletions benchmarks/parse_address.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import datetime
import os
from pathlib import Path
from time import time
from typing import Callable

import ibis
import ibis.expr.types as ir
import pandas as pd
from ibis import _, Table

from mismo.lib.geo import postal_parse_address
from mismo.lib.geo.tests.test_postal_benchmark import noop, postal_only, postal_parse_address__direct_import, postal_parse_address__initial_impl, python_only


# Directory containing this benchmark script; input data is resolved against it.
_CURRENT_DIR = Path(__file__).parent
# Location reserved for on-disk benchmark databases.
_DB_DIR = _CURRENT_DIR / 'db'


def _prepare_db_table(benchmark_id: str, db_name: str) -> Table:
    """Load ``apoc_addresses_1M.parquet`` and wrap it in an in-memory ibis table.

    ``benchmark_id`` and ``db_name`` are currently unused: they only feed the
    disabled on-disk DuckDB variant sketched in the note below.
    """
    # NOTE: disabled alternative that persisted the data in a DuckDB file:
    #   db_file = Path(_DB_DIR, benchmark_id, db_name)
    #   os.makedirs(db_file.parent, exist_ok=True)
    #   con = ibis.duckdb.connect(db_file)
    #   t = con.create_table(db_name, apoc_data)
    parquet_path = _CURRENT_DIR / 'apoc_addresses_1M.parquet'
    return ibis.memtable(pd.read_parquet(parquet_path))


def run_benchmark(benchmark_id: str, parse_fn: Callable[..., ir.Value]) -> None:
    """Time one address-parsing function and print how long it took.

    Args:
        benchmark_id: Identifier of the current benchmark run, forwarded to
            ``_prepare_db_table``.
        parse_fn: Callable applied to the ``full_address`` column of the
            input table; its ``__name__`` labels the printed result.
    """
    # Function-local import: ``perf_counter`` is a monotonic, high-resolution
    # clock intended for measuring durations, unlike wall-clock ``time()``,
    # which can jump when the system clock is adjusted.
    from time import perf_counter

    input_table = _prepare_db_table(benchmark_id, f"{parse_fn.__name__}.ddb")
    # Cache the input before starting the clock so that loading it is not
    # attributed to ``parse_fn``.
    input_table = input_table.cache()

    start = perf_counter()
    res = parse_fn(input_table.full_address)
    # The cached result is never read; the call is only here to make the
    # backend run the expression (assumes ``.cache()`` executes eagerly —
    # see the ibis docs). The reference is kept so the cached table stays
    # alive until the measurement is finished.
    persisted = res.as_table().cache()
    end = perf_counter()
    print(f"{parse_fn.__name__:<35} took {end - start:>8.4f} seconds")


def main():
    """Benchmark every address-parsing variant once, in a fixed order."""
    # Windows does not allow ':' in file names
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    benchmark_id = timestamp.replace(":", "-")

    variants = (
        noop,
        python_only,
        postal_only,
        postal_parse_address,
        postal_parse_address__direct_import,
        postal_parse_address__initial_impl,
    )
    for parse_fn in variants:
        run_benchmark(benchmark_id, parse_fn)


# Only run the benchmarks when this file is executed as a script.
if __name__ == '__main__':
    main()
69 changes: 54 additions & 15 deletions mismo/lib/geo/_address.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

from collections import defaultdict
import re

import ibis
Expand Down Expand Up @@ -295,24 +294,64 @@ def postal_parse_address(address_string: ir.StringValue) -> ir.StructValue:
from postal.parser import parse_address as _parse_address

@ibis.udf.scalar.python(signature=((str,), ADDRESS_SCHEMA))
def udf(address_string: str | None) -> dict[str, str] | None:
# remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
def udf(address_string: str | None) -> dict[str, str | None] | None:
# TODO: remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
if address_string is None:
return None

# Initially, the key set of the `result` dict is given by the union of
# both the names of the fields in the `ADDRESS_SCHEMA` struct and
# the names of the pypostal fields we use.
# Later, the latter are popped to match the shape of `ADDRESS_SCHEMA`.

# NB: due to https://github.com/ibis-project/ibis/issues/9613
# the keys of the `result` dict returned at the end of this function
# must be sorted in the same order as they are declared in the
# `ADDRESS_SCHEMA` struct. Current workaround is to create the dict
# with all those keys in the proper order since the beginning.
result: dict[str, str | None] = {
"street1": None,
"street2": None,
"city": None,
"state": None,
"postal_code": None,
"country": None,

# Temporary keys used to store values returned by pypostal and
# popped before returning the dictionary
"house_number": None,
"road": None,
"unit": None,
"postcode": None
}

parsed_fields = _parse_address(address_string)
label_to_values = defaultdict(list)
for value, label in parsed_fields:
label_to_values[label].append(value)
renamed = {
"street1": label_to_values["house_number"] + label_to_values["road"],
"street2": label_to_values["unit"],
"city": label_to_values["city"],
"state": label_to_values["state"],
"postal_code": label_to_values["postcode"],
"country": label_to_values["country"],
}
# replace empty strings with None
return {k: " ".join(v) or None for k, v in renamed.items()}
# Pypostal returns more fields than the ones we actually need.
# Here `False` is used as a placeholder under the assumption that
# such a value is never returned by pypostal as a field value.
current = result.get(label, False)

# Keep only the fields declared when `result` is initialized.
# Pypostal fields can be repeated, in such case we concat their values.
if current is not False:
result[label] = value if current is None else f"{current} {value}"

# Hack to prepend "house_number" to "road"
house_number = result.pop("house_number")
if house_number is not None:
road = result["road"]
if road is None:
result["road"] = house_number
else:
result["road"] = f"{house_number} {road}"

# Modify `result` to match the shape of an `ADDRESS_SCHEMA`.
result["street1"] = result.pop("road")
result["street2"] = result.pop("unit")
result["postal_code"] = result.pop("postcode")

return result

return udf(address_string)

Expand Down
87 changes: 58 additions & 29 deletions mismo/lib/geo/tests/test_postal_benchmark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from collections import defaultdict
from pathlib import Path

import ibis
Expand Down Expand Up @@ -31,23 +32,30 @@


@udf
def noop(address_string: str | None) -> dict:
def noop(address_string: str | None) -> dict[str, None]:
return _NOOP_ADDRESS


@udf
def python_only(address_string: str | None) -> dict:
def python_only(address_string: str | None) -> dict[str, str | None] | None:
# remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
if address_string is None:
return None

result: dict[str, str | None] = {
"house_number": None,
"road": None,
"unit": None,
"city": None,
"state": None,
"postcode": None,
"country": None,
"street1": None,
"street2": None,
"city": None,
"state": None,
"postal_code": None,
"country": None,
"house_number": None,
"road": None,
"unit": None,
"postcode": None
}

# Fake 'parse_address' function that emits just one field ("street")
# Fake '_parse_address' function that emits just one field ("street")
# containing the whole address.
parsed_fields = (("street", address_string),)
for value, label in parsed_fields:
Expand All @@ -71,40 +79,40 @@ def python_only(address_string: str | None) -> dict:


@udf
def postal_only(address_string: str | None) -> dict:
_parse_address(address_string or "")
def postal_only(address_string: str | None) -> dict[str, None] | None:
# remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
if address_string is None:
return None

_parse_address(address_string)
return _NOOP_ADDRESS


@udf
def complete(address_string: str | None) -> dict | None:
@ibis.udf.scalar.python
def postal_parse_address__direct_import(address_string: str) -> ADDRESS_SCHEMA:
# TODO: remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
if address_string is None:
return None
# Initially, the keys match the names of pypostal fields we need.
# Later, this dict is modified to match the shape of an `ADDRESS_SCHEMA`.

result: dict[str, str | None] = {
"house_number": None,
"road": None,
"unit": None,
"street1": None,
"street2": None,
"city": None,
"state": None,
"postcode": None,
"postal_code": None,
"country": None,
"house_number": None,
"road": None,
"unit": None,
"postcode": None
}

parsed_fields = _parse_address(address_string)
for value, label in parsed_fields:
# Pypostal returns more fields than the ones we actually need.
# Here `False` is used as a placeholder under the assumption that
# such a value is never returned by pypostal as a field value.
current = result.get(label, False)

# Keep only the fields declared when `result` is initialized.
# Pypostal fields can be repeated, in such case we concat their values.
if current is not False:
result[label] = value if current is None else f"{current} {value}"

# Hack to prepend "house_number" to "road"
house_number = result.pop("house_number")
if house_number is not None:
road = result["road"]
Expand All @@ -113,14 +121,34 @@ def complete(address_string: str | None) -> dict | None:
else:
result["road"] = f"{house_number} {road}"

# Modify `result` in-place to match the shape of an `ADDRESS_SCHEMA`.
result["street1"] = result.pop("road")
result["street2"] = result.pop("unit")
result["postal_code"] = result.pop("postcode")

return result


# Benchmark variant that groups the parser output by label with a defaultdict
# and then assembles the address fields from those groups.
# NOTE(review): the name suggests this is the pre-optimization implementation
# of postal_parse_address kept as a timing baseline — confirm.
@udf
def postal_parse_address__initial_impl(address_string: str | None) -> dict[str, str | None] | None:
# remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
if address_string is None:
return None
parsed_fields = _parse_address(address_string)
# Collect every value seen for a label, in the order the parser emitted them.
label_to_values = defaultdict(list)
for value, label in parsed_fields:
label_to_values[label].append(value)
# Map parser labels onto the output field names; labels that were never
# seen yield an empty list through the defaultdict.
renamed = {
# street1 is the house_number values followed by the road values.
"street1": label_to_values["house_number"] + label_to_values["road"],
"street2": label_to_values["unit"],
"city": label_to_values["city"],
"state": label_to_values["state"],
"postal_code": label_to_values["postcode"],
"country": label_to_values["country"],
}
# replace empty strings with None
# (joining an empty list gives "", which is falsy, so `or None` applies)
return {k: " ".join(v) or None for k, v in renamed.items()}


def download_test_data() -> ir.Table:
# download test data from https://github.com/NickCrews/apoc-data/releases/tag/20240717-111158
URL_TEMPLATE = "https://github.com/NickCrews/apoc-data/releases/download/20240717-111158/income_{year}.csv"
Expand Down Expand Up @@ -162,8 +190,9 @@ def data(backend: ibis.BaseBackend) -> ir.Table:
noop,
python_only,
postal_only,
complete,
postal_parse_address,
postal_parse_address__direct_import,
postal_parse_address__initial_impl,
],
)
@pytest.mark.parametrize(
Expand Down
Loading