NickCrews · lmores · Jul 19, 2024 · Jul 19, 2024 · Jul 26, 2024
diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+db/
diff --git a/benchmarks/apoc_addresses_1M.parquet b/benchmarks/apoc_addresses_1M.parquet
diff --git a/benchmarks/parse_address.py b/benchmarks/parse_address.py
@@ -0,0 +1,56 @@
+import datetime
+import os
+from pathlib import Path
+from time import perf_counter, time
+from typing import Callable
+
+import ibis
+import ibis.expr.types as ir
+import pandas as pd
+from ibis import _, Table
+
+from mismo.lib.geo import postal_parse_address
+from mismo.lib.geo.tests.test_postal_benchmark import noop, postal_only, postal_parse_address__direct_import, postal_parse_address__initial_impl, python_only
+
+
+_CURRENT_DIR = Path(__file__).parent
+_DB_DIR = Path(_CURRENT_DIR, 'db')
+
+
+def _prepare_db_table(benchmark_id: str, db_name: str) -> Table:
+    """Read the `apoc_addresses_1M.parquet` sample and wrap it in an ibis memtable."""
+    apoc_file = Path(_CURRENT_DIR, 'apoc_addresses_1M.parquet')
+    apoc_data = pd.read_parquet(apoc_file)
+    # NOTE: the on-disk DuckDB variant is disabled, so `benchmark_id` and `db_name`
+    # are currently unused. It would create the directory `_DB_DIR / benchmark_id`,
+    # open `db_name` inside it with `ibis.duckdb.connect(db_file)` and load the
+    # data through `con.create_table(db_name, apoc_data)`.
+    t = ibis.memtable(apoc_data)
+
+    return t
+
+
+def run_benchmark(benchmark_id: str, parse_fn: Callable[..., ir.Value]) -> None:
+    """Time `parse_fn` on the `full_address` column and print the elapsed seconds."""
+    input_table = _prepare_db_table(benchmark_id, f"{parse_fn.__name__}.ddb").cache()
+    # perf_counter() is monotonic; time() can jump if the system clock is adjusted.
+    start = perf_counter()
+    # `.cache()` forces evaluation; the result is bound so it outlives the timed region.
+    persisted = parse_fn(input_table.full_address).as_table().cache()
+    print(f"{parse_fn.__name__:<35} took {perf_counter() - start:>8.4f} seconds")
+
+
+def main():
+    """Run every address-parsing variant once, in order, printing each timing."""
+    # Windows does not allow ':' in file names
+    benchmark_id = datetime.datetime.now(datetime.timezone.utc).isoformat().replace(":", "-")
+    for parse_fn in (
+        noop, python_only, postal_only,
+        postal_parse_address,
+        postal_parse_address__direct_import, postal_parse_address__initial_impl,
+    ):
+        run_benchmark(benchmark_id, parse_fn)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mismo/lib/geo/_address.py b/mismo/lib/geo/_address.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-from collections import defaultdict
 import re
 
 import ibis
@@ -295,24 +294,64 @@ def postal_parse_address(address_string: ir.StringValue) -> ir.StructValue:
         from postal.parser import parse_address as _parse_address
 
     @ibis.udf.scalar.python(signature=((str,), ADDRESS_SCHEMA))
-    def udf(address_string: str | None) -> dict[str, str] | None:
-        # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    def udf(address_string: str | None) -> dict[str, str | None] | None:
+        # TODO: remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
         if address_string is None:
             return None
+
+        # Initially, the key set of the `result` dict is given by the union of
+        # both the names of the fields in the `ADDRESS_SCHEMA` struct and
+        # the names of the pypostal fields we use.
+        # Later, the latter are popped to match the shape of `ADDRESS_SCHEMA`.
+
+        # NB: due to https://github.com/ibis-project/ibis/issues/9613
+        # the keys of the `result` dict returned at the end of this function
+        # must appear in the same order as they are declared in the
+        # `ADDRESS_SCHEMA` struct. The current workaround is to create the
+        # dict with all those keys in the proper order from the start.
+        result: dict[str, str | None] = {
+            "street1": None,
+            "street2": None,
+            "city": None,
+            "state": None,
+            "postal_code": None,
+            "country": None,
+
+            # Temporary keys used to store values returned by pypostal and
+            # popped before returning the dictionary
+            "house_number": None,
+            "road": None,
+            "unit": None,
+            "postcode": None
+        }
+
         parsed_fields = _parse_address(address_string)
-        label_to_values = defaultdict(list)
         for value, label in parsed_fields:
-            label_to_values[label].append(value)
-        renamed = {
-            "street1": label_to_values["house_number"] + label_to_values["road"],
-            "street2": label_to_values["unit"],
-            "city": label_to_values["city"],
-            "state": label_to_values["state"],
-            "postal_code": label_to_values["postcode"],
-            "country": label_to_values["country"],
-        }
-        # replace empty strings with None
-        return {k: " ".join(v) or None for k, v in renamed.items()}
+            # Pypostal returns more fields than the ones we actually need.
+            # Here `False` is used as a placeholder under the assumption that
+            # such a value is never returned by pypostal as a field value.
+            current = result.get(label, False)
+
+            # Keep only the fields declared when `result` is initialized.
+            # Pypostal fields can be repeated, in such case we concat their values.
+            if current is not False:
+                result[label] = value if current is None else f"{current} {value}"
+
+        # Hack to prepend "house_number" to "road"
+        house_number = result.pop("house_number")
+        if house_number is not None:
+            road = result["road"]
+            if road is None:
+                result["road"] = house_number
+            else:
+                result["road"] = f"{house_number} {road}"
+
+        # Modify `result` to match the shape of an `ADDRESS_SCHEMA`.
+        result["street1"] = result.pop("road")
+        result["street2"] = result.pop("unit")
+        result["postal_code"] = result.pop("postcode")
+
+        return result
 
     return udf(address_string)
 

diff --git a/mismo/lib/geo/tests/test_postal_benchmark.py b/mismo/lib/geo/tests/test_postal_benchmark.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections import defaultdict
 from pathlib import Path
 
 import ibis
@@ -31,23 +32,30 @@
 
 
 @udf
-def noop(address_string: str | None) -> dict:
+def noop(address_string: str | None) -> dict[str, None]:
     return _NOOP_ADDRESS
 
 
 @udf
-def python_only(address_string: str | None) -> dict:
+def python_only(address_string: str | None) -> dict[str, str | None] | None:
+    # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    if address_string is None:
+        return None
+
     result: dict[str, str | None] = {
-        "house_number": None,
-        "road": None,
-        "unit": None,
-        "city": None,
-        "state": None,
-        "postcode": None,
-        "country": None,
+            "street1": None,
+            "street2": None,
+            "city": None,
+            "state": None,
+            "postal_code": None,
+            "country": None,
+            "house_number": None,
+            "road": None,
+            "unit": None,
+            "postcode": None
     }
 
-    # Fake 'parse_address' function that emits just one field ("street")
+    # Fake '_parse_address' function that emits just one field ("street")
     # containing the whole address.
     parsed_fields = (("street", address_string),)
     for value, label in parsed_fields:
@@ -71,40 +79,40 @@ def python_only(address_string: str | None) -> dict:
 
 
 @udf
-def postal_only(address_string: str | None) -> dict:
-    _parse_address(address_string or "")
+def postal_only(address_string: str | None) -> dict[str, None] | None:
+    # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    if address_string is None:
+        return None
+
+    _parse_address(address_string)
     return _NOOP_ADDRESS
 
 
-@udf
-def complete(address_string: str | None) -> dict | None:
+@ibis.udf.scalar.python
+def postal_parse_address__direct_import(address_string: str) -> ADDRESS_SCHEMA:
+    # TODO: remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
     if address_string is None:
         return None
-    # Initially, the keys match the names of pypostal fields we need.
-    # Later, this dict is modified to match the shape of an `ADDRESS_SCHEMA`.
+
     result: dict[str, str | None] = {
-        "house_number": None,
-        "road": None,
-        "unit": None,
+        "street1": None,
+        "street2": None,
         "city": None,
         "state": None,
-        "postcode": None,
+        "postal_code": None,
         "country": None,
+        "house_number": None,
+        "road": None,
+        "unit": None,
+        "postcode": None
     }
 
     parsed_fields = _parse_address(address_string)
     for value, label in parsed_fields:
-        # Pypostal returns more fields than the ones we actually need.
-        # Here `False` is used as a placeholder under the assumption that
-        # such value is never returned by pypostal a field value.
         current = result.get(label, False)
-
-        # Keep only the fields declared when `result` is initialized.
-        # Pypostal fields can be repeated, in such case we concat their values.
         if current is not False:
             result[label] = value if current is None else f"{current} {value}"
 
-    # Hack to prepend "house_number" to "road"
     house_number = result.pop("house_number")
     if house_number is not None:
         road = result["road"]
@@ -113,14 +121,34 @@ def complete(address_string: str | None) -> dict | None:
         else:
             result["road"] = f"{house_number} {road}"
 
-    # Modify `result` in-place to match the shape of an `ADDRESS_SCHEMA`.
     result["street1"] = result.pop("road")
     result["street2"] = result.pop("unit")
     result["postal_code"] = result.pop("postcode")
 
     return result
 
 
+@udf
+def postal_parse_address__initial_impl(address_string: str | None) -> dict[str, str | None] | None:
+    # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    if address_string is None:
+        return None
+    parsed_fields = _parse_address(address_string)
+    label_to_values = defaultdict(list)
+    for value, label in parsed_fields:
+        label_to_values[label].append(value)
+    renamed = {
+        "street1": label_to_values["house_number"] + label_to_values["road"],
+        "street2": label_to_values["unit"],
+        "city": label_to_values["city"],
+        "state": label_to_values["state"],
+        "postal_code": label_to_values["postcode"],
+        "country": label_to_values["country"],
+    }
+    # replace empty strings with None
+    return {k: " ".join(v) or None for k, v in renamed.items()}
+
+
 def download_test_data() -> ir.Table:
     # download test data from https://github.com/NickCrews/apoc-data/releases/tag/20240717-111158
     URL_TEMPLATE = "https://github.com/NickCrews/apoc-data/releases/download/20240717-111158/income_{year}.csv"
@@ -162,8 +190,9 @@ def data(backend: ibis.BaseBackend) -> ir.Table:
         noop,
         python_only,
         postal_only,
-        complete,
         postal_parse_address,
+        postal_parse_address__direct_import,
+        postal_parse_address__initial_impl,
     ],
 )
 @pytest.mark.parametrize(