From 59fb4f2e0d00c89f143ec59064be71eb255775c0 Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Tue, 14 May 2024 00:30:47 +0200 Subject: [PATCH] feat: add automatic resolution finding logic --- CHANGELOG.md | 4 ++++ quackosm/_h3.py | 48 +++++++++++++++++++++++++++++-------- quackosm/pbf_file_reader.py | 11 ++++----- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9adf45f..ed52a9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Nodes intersection from `ST_Intersects` to an approximation using H3 cells for bigger scale transformations [#109](https://github.com/kraina-ai/quackosm/issues/109) + ## [0.8.1] - 2024-05-11 ### Added diff --git a/quackosm/_h3.py b/quackosm/_h3.py index 9d4b3ac..4f1a208 100644 --- a/quackosm/_h3.py +++ b/quackosm/_h3.py @@ -1,27 +1,55 @@ import platform from pathlib import Path +from typing import Optional import duckdb import pyarrow as pa import pyarrow.parquet as pq from h3ronpy.arrow.vector import ContainmentMode, wkb_to_cells from pooch import Decompress, retrieve -from shapely.geometry.base import BaseGeometry +from shapely.geometry.base import BaseGeometry, BaseMultipartGeometry + +START_H3_RESOLUTION = 4 +MAX_H3_RESOLUTION = 11 +MAX_H3_CELLS = 10_000_000 def _transform_geometry_filter_to_h3( - geometry: BaseGeometry, working_directory: Path, h3_resolution: int = 11 -) -> Path: + geometry: BaseGeometry, working_directory: Path, h3_resolution: Optional[int] = None +) -> tuple[Path, int]: """Fill geometry filter with H3 polygons and save them in a dedicated parquet file.""" result_file_path = working_directory / "geometry_filter_h3_indexes.parquet" - h3_indexes = wkb_to_cells( - [sub_geometry.wkb for sub_geometry in geometry.geoms], - resolution=h3_resolution, - containment_mode=ContainmentMode.Covers, - flatten=True, - ).unique() + if isinstance(geometry, BaseMultipartGeometry): + wkb = [sub_geometry.wkb for sub_geometry in geometry.geoms] + else: + wkb = [geometry.wkb] + + search_for_matching_resolution = False + if h3_resolution is None: + h3_resolution = START_H3_RESOLUTION + search_for_matching_resolution = True + + finished_searching = False + while not finished_searching: + h3_indexes = wkb_to_cells( + wkb, + resolution=h3_resolution, + containment_mode=ContainmentMode.Covers, + flatten=True, + ).unique() + + if not search_for_matching_resolution: + finished_searching = True + else: + number_of_cells = len(h3_indexes) + # Multiplying by 7, because thats the number of children in a parent cell. + if (number_of_cells * 7) > MAX_H3_CELLS or h3_resolution == MAX_H3_RESOLUTION: + finished_searching = True + else: + h3_resolution += 1 + pq.write_table(pa.table(dict(h3=h3_indexes)), result_file_path) - return result_file_path + return result_file_path, h3_resolution # Based on https://github.com/fusedio/udfs/blob/main/public/DuckDB_H3_Example/utils.py diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py index 8939058..e47ad94 100644 --- a/quackosm/pbf_file_reader.py +++ b/quackosm/pbf_file_reader.py @@ -111,7 +111,7 @@ def __init__( ] = None, parquet_compression: str = "snappy", osm_extract_source: Union[OsmExtractSource, str] = OsmExtractSource.geofabrik, - geometry_filter_intersection_h3_resolution: int = 10, + geometry_filter_intersection_h3_resolution: Optional[int] = None, verbosity_mode: Literal["silent", "transient", "verbose"] = "transient", allow_uncovered_geometry: bool = False, debug_memory: bool = False, @@ -149,7 +149,8 @@ def __init__( intersections. First H3 cells are generated to cover given geometry and then intersection is done based on H3 indexes, not using ST_Intersects. Higher resolution will result in finer approximation of the original geometry, but might require more - memory during operations. Defaults to 11. + memory during operations. If `None`, will automatically determine the appropriate + value to cover geometries using less than 10 million H3 cells. Defaults to `None`. verbosity_mode (Literal["silent", "transient", "verbose"], optional): Set progress verbosity mode. Can be one of: silent, transient and verbose. Silent disables output completely. Transient tracks progress, but removes output after finished. @@ -1109,7 +1110,7 @@ def _prefilter_elements_ids( # - select all from NI with tags filter filter_osm_node_ids_filter = self._generate_elements_filter(filter_osm_ids, "node") if is_intersecting: - h3_cells_path = _transform_geometry_filter_to_h3( + h3_cells_path, determined_h3_resolution = _transform_geometry_filter_to_h3( geometry=self.geometry_filter, working_directory=self.tmp_dir_path, h3_resolution=self.geometry_filter_intersection_h3_resolution, @@ -1122,9 +1123,7 @@ def _prefilter_elements_ids( sql_query=f""" SELECT DISTINCT id FROM ({nodes_valid_with_tags.sql_query()}) n SEMI JOIN '{h3_cells_path}' - ON h3_latlng_to_cell( - lat, lon, {self.geometry_filter_intersection_h3_resolution} - ) = h3 + ON h3_latlng_to_cell(lat, lon, {determined_h3_resolution}) = h3 """, file_path=self.tmp_dir_path / "nodes_intersecting_ids", )