From d32aa50eec94907ed526a611e52a97bcae8ff753 Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Wed, 3 Apr 2024 08:48:21 +0200 Subject: [PATCH] feat: add some QoL changes (#73) * fix: change tqdm kwargs * feat: add progress bar for combining multiple results files * feat: add option to suppress errors if geometry isn't fully covered by OSM extracts * chore: add changelog entry * chore: modify changelog entry * fix: change docs css * chore: modify cli example * feat: add automatic CLI docs generation * fix: change docstrings for osm extracts * fix: replace the typo --- CHANGELOG.md | 9 ++ docs/api/CLI.md | 3 + docs/assets/css/font.css | 2 +- docs/assets/css/jupyter.css | 15 +- docs/gen_cli_docs.py | 75 ++++++++++ examples/command_line_interface.ipynb | 57 +++++++- mkdocs.yml | 1 + quackosm/_exceptions.py | 7 + quackosm/_rich_progress.py | 30 +++- quackosm/cli.py | 93 ++++++------ quackosm/functions.py | 14 +- quackosm/osm_extracts/__init__.py | 85 +++++++++-- quackosm/pbf_file_reader.py | 201 ++++++++++++++++---------- tests/base/test_cli.py | 13 ++ tests/base/test_osm_extracts.py | 30 +++- tests/base/test_pbf_file_reader.py | 21 +++ 16 files changed, 510 insertions(+), 146 deletions(-) create mode 100644 docs/api/CLI.md create mode 100644 docs/gen_cli_docs.py create mode 100644 quackosm/_exceptions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 13a624e..61cc387 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Progress bars for final merge of multiple geoparquet files into a single file +- Option to allow provided geometry to not be fully covered by existing OSM extracts [#68](https://github.com/kraina-ai/quackosm/issues/68) + +### Fixed + +- Changed tqdm's kwargs for parallel OSM extracts checks + ## [0.5.1] - 2024-03-23 ### Fixed diff --git a/docs/api/CLI.md b/docs/api/CLI.md new file mode 100644 index 0000000..25af5a6 --- /dev/null +++ b/docs/api/CLI.md @@ -0,0 +1,3 @@ +# CLI Reference + +Text below is a captured `--help` command output. diff --git a/docs/assets/css/font.css b/docs/assets/css/font.css index 6b67484..67b92ac 100644 --- a/docs/assets/css/font.css +++ b/docs/assets/css/font.css @@ -1,5 +1,5 @@ @import url("https://fonts.googleapis.com/css2?family=Noto+Sans+Mono&display=swap"); -@import url("https://fonts.googleapis.com/css2?family=Noto+Sans+Mono&text=┌─┬┐└┴┘│├┼┤&display=swap"); +@import url("https://fonts.googleapis.com/css2?family=Noto+Sans+Mono&text=┌─┬┐└┴┘│├┼┤╭╮╰╯&display=swap"); @import url("https://fonts.googleapis.com/css2?family=Playpen+Sans:wght@500&display=swap"); .md-header__topic:first-child { diff --git a/docs/assets/css/jupyter.css b/docs/assets/css/jupyter.css index 6f179b6..6ef4771 100644 --- a/docs/assets/css/jupyter.css +++ b/docs/assets/css/jupyter.css @@ -11,13 +11,14 @@ right: 0; } -.jp-CodeCell > .jp-Cell-outputWrapper { +.jp-CodeCell>.jp-Cell-outputWrapper { margin-top: -10px; padding-top: 0; display: table-cell; text-align: left; } -.jp-Cell-outputWrapper > .jp-Cell-outputCollapser { + +.jp-Cell-outputWrapper>.jp-Cell-outputCollapser { margin-top: -17px; } @@ -26,6 +27,7 @@ .jupyter-wrapper table.dataframe td { text-align: left; } + .jupyter-wrapper table.dataframe { table-layout: auto; } @@ -39,8 +41,8 @@ } div.highlight pre code, -div.jp-RenderedText.jp-OutputArea-output > pre, -div.jp-RenderedText.jp-OutputArea-output.jp-OutputArea-executeResult > pre { +div.jp-RenderedText.jp-OutputArea-output>pre, +div.jp-RenderedText.jp-OutputArea-output.jp-OutputArea-executeResult>pre { font-family: "Noto Sans Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace, SFMono-Regular, Consolas, Menlo, monospace; @@ -57,3 +59,8 @@ div.admonition p { margin-top: 0.6rem; margin-bottom: 0.6rem; } + +.jupyter-wrapper .jp-RenderedText pre .ansi-bold { + font-weight: normal !important; + text-shadow: calc(-0.06ex) 0 0 currentColor, calc(0.06ex) 0 0 currentColor; +} diff --git a/docs/gen_cli_docs.py b/docs/gen_cli_docs.py new file mode 100644 index 0000000..0db3dc6 --- /dev/null +++ b/docs/gen_cli_docs.py @@ -0,0 +1,75 @@ +"""Capture the CLI help page and save it to the docs.""" + +from pathlib import Path +from typing import cast + +import mkdocs_gen_files +import typer +from rich.console import Console +from typer.rich_utils import ( + COLOR_SYSTEM, + FORCE_TERMINAL, + STYLE_METAVAR, + STYLE_METAVAR_SEPARATOR, + STYLE_NEGATIVE_OPTION, + STYLE_NEGATIVE_SWITCH, + STYLE_OPTION, + STYLE_SWITCH, + STYLE_USAGE, + Theme, + highlighter, + rich_format_help, +) + +from quackosm.cli import app + +API_DIRECTORY_PATH = Path("api") + +GLOBAL_CONSOLE = None + + +def _get_rich_console_new(stderr: bool = False) -> Console: + global GLOBAL_CONSOLE # noqa: PLW0603 + GLOBAL_CONSOLE = Console( + theme=Theme( + { + "option": STYLE_OPTION, + "switch": STYLE_SWITCH, + "negative_option": STYLE_NEGATIVE_OPTION, + "negative_switch": STYLE_NEGATIVE_SWITCH, + "metavar": STYLE_METAVAR, + "metavar_sep": STYLE_METAVAR_SEPARATOR, + "usage": STYLE_USAGE, + }, + ), + record=True, + highlighter=highlighter, + color_system=COLOR_SYSTEM, + force_terminal=FORCE_TERMINAL, + width=240, + stderr=stderr, + ) + return GLOBAL_CONSOLE + + +typer.rich_utils._get_rich_console = _get_rich_console_new + +typer_obj = app + +click_obj = typer.main.get_command(typer_obj) +ctx = typer.Context(command=click_obj, info_name="QuackOSM") +rich_format_help(obj=click_obj, ctx=ctx, markup_mode="rich") +html_text: str = cast(Console, GLOBAL_CONSOLE).export_html( + inline_styles=True, + code_format='
{code}
', +) +html_text = html_text.replace( + "font-weight: bold", + ( + "font-weight: normal;" + " text-shadow: calc(-0.06ex) 0 0 currentColor, calc(0.06ex) 0 0 currentColor;" + ), +) + +with mkdocs_gen_files.open(API_DIRECTORY_PATH / "CLI.md", "a") as fd: + print(html_text, file=fd) diff --git a/examples/command_line_interface.ipynb b/examples/command_line_interface.ipynb index ca4ab53..67ece1f 100644 --- a/examples/command_line_interface.ipynb +++ b/examples/command_line_interface.ipynb @@ -230,6 +230,13 @@ "During first execution, QuackOSM will cache three PBF files sources locally. This operation takes some time." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Geocoding" + ] + }, { "cell_type": "code", "execution_count": null, @@ -243,6 +250,13 @@ "! QuackOSM --geom-filter-geocode 'Monaco-Ville, Monaco'" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GeoJSON" + ] + }, { "cell_type": "code", "execution_count": null, @@ -256,6 +270,13 @@ "! QuackOSM --geom-filter-geojson '{\"type\":\"Feature\",\"geometry\":{\"coordinates\":[[[7.416,43.734],[7.416,43.731],[7.421,43.731],[7.421,43.734],[7.416,43.734]]],\"type\":\"Polygon\"}}'" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Geohash" + ] + }, { "cell_type": "code", "execution_count": null, @@ -269,6 +290,13 @@ "! QuackOSM --geom-filter-index-geohash spv2bcs" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### H3" + ] + }, { "cell_type": "code", "execution_count": null, @@ -282,6 +310,13 @@ "! QuackOSM --geom-filter-index-h3 893969a4037ffff" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### S2" + ] + }, { "cell_type": "code", "execution_count": null, @@ -295,6 +330,13 @@ "! QuackOSM --geom-filter-index-s2 12cdc28d" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### WKT" + ] + }, { "cell_type": "code", "execution_count": null, @@ -396,6 +438,13 @@ "- `--compact-tags` (or `--compact`): will always keep tags together as a single column." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Separated tags (`explode`)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -419,14 +468,14 @@ }, "outputs": [], "source": [ - "! QuackOSM andorra.osm.pbf --osm-tags-filter '{ \"amenity\": \"parking\", \"building\": \"office\" }' --compact --output files/andorra_filtered_compact.geoparquet" + "! ./duckdb :memory: \"FROM read_parquet('files/andorra_filtered_exploded.geoparquet')\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's see the difference in the files structure." + "### Compact tags (`compact`)" ] }, { @@ -439,7 +488,7 @@ }, "outputs": [], "source": [ - "! ./duckdb :memory: \"FROM read_parquet('files/andorra_filtered_exploded.geoparquet')\"" + "! QuackOSM andorra.osm.pbf --osm-tags-filter '{ \"amenity\": \"parking\", \"building\": \"office\" }' --compact --output files/andorra_filtered_compact.geoparquet" ] }, { @@ -509,7 +558,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/mkdocs.yml b/mkdocs.yml index 48ddc6b..900af36 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -89,6 +89,7 @@ plugins: - docs/copy_readme.py - docs/copy_examples.py - docs/gen_ref_pages.py + - docs/gen_cli_docs.py - search - mkdocstrings: custom_templates: docs/templates diff --git a/quackosm/_exceptions.py b/quackosm/_exceptions.py new file mode 100644 index 0000000..a6bab4e --- /dev/null +++ b/quackosm/_exceptions.py @@ -0,0 +1,7 @@ +class EmptyResultWarning(Warning): ... + + +class GeometryNotCoveredWarning(Warning): ... + + +class GeometryNotCoveredError(Exception): ... diff --git a/quackosm/_rich_progress.py b/quackosm/_rich_progress.py index 1cecab2..e55d996 100644 --- a/quackosm/_rich_progress.py +++ b/quackosm/_rich_progress.py @@ -1,5 +1,6 @@ # type: ignore """Wrapper over Rich progress bar.""" + import os from collections.abc import Iterable from contextlib import suppress @@ -28,10 +29,13 @@ def show_total_elapsed_time(elapsed_seconds: float) -> None: class TaskProgressSpinner: - def __init__(self, step_name: str, step_number: str, silent_mode: bool): + def __init__( + self, step_name: str, step_number: str, silent_mode: bool, skip_step_number: bool = False + ): self.step_name = step_name self.step_number = step_number self.silent_mode = silent_mode + self.skip_step_number = skip_step_number self.progress = None self.force_terminal = os.getenv("FORCE_TERMINAL_MODE", "false").lower() == "true" @@ -48,12 +52,19 @@ def __enter__(self): TimeElapsedColumn, ) - self.progress = Progress( + columns = [ SpinnerColumn(), TextColumn(f"[{self.step_number: >4}/{TOTAL_STEPS}]"), TextColumn("[progress.description]{task.description}"), TextColumn("•"), TimeElapsedColumn(), + ] + + if self.skip_step_number: + columns.pop(1) + + self.progress = Progress( + *columns, refresh_per_second=1, transient=False, console=Console( @@ -77,15 +88,17 @@ def __exit__(self, exc_type, exc_value, exc_tb): class TaskProgressBar: - def __init__(self, step_name: str, step_number: str, silent_mode: bool): + def __init__( + self, step_name: str, step_number: str, silent_mode: bool, skip_step_number: bool = False + ): self.step_name = step_name self.step_number = step_number self.silent_mode = silent_mode + self.skip_step_number = skip_step_number self.progress = None self.force_terminal = os.getenv("FORCE_TERMINAL_MODE", "false").lower() == "true" def __enter__(self): - try: # pragma: no cover if self.silent_mode: self.progress = None @@ -113,7 +126,7 @@ def render(self, task: "Task") -> Text: else: return Text(f"{1/task.speed:.2f} s/it") # noqa: FURB126 - self.progress = Progress( + columns = [ SpinnerColumn(), TextColumn(f"[{self.step_number: >4}/{TOTAL_STEPS}]"), TextColumn( @@ -128,6 +141,13 @@ def render(self, task: "Task") -> Text: TimeRemainingColumn(), TextColumn("•"), SpeedColumn(), + ] + + if self.skip_step_number: + columns.pop(1) + + self.progress = Progress( + *columns, refresh_per_second=1, transient=False, speed_estimate_period=1800, diff --git a/quackosm/cli.py b/quackosm/cli.py index 66880b0..88dc028 100644 --- a/quackosm/cli.py +++ b/quackosm/cli.py @@ -3,7 +3,6 @@ import json import logging import re -import warnings from pathlib import Path from typing import Annotated, Optional, Union, cast @@ -517,6 +516,17 @@ def main( show_default=False, ), ] = False, + allow_uncovered_geometry: Annotated[ + bool, + typer.Option( + "--allow-uncovered-geometry/", + help=( + "Suppresses an error if some geometry parts aren't covered by any OSM extract." + " Works only when PbfFileReader is asked to download OSM extracts automatically." + ), + show_default=False, + ), + ] = False, version: Annotated[ Optional[bool], typer.Option( @@ -573,45 +583,44 @@ def main( if osm_tags_filter is not None and osm_tags_filter_file is not None: raise typer.BadParameter("Provided more than one osm tags filter parameter") - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - logging.disable(logging.CRITICAL) - if pbf_file: - geoparquet_path = convert_pbf_to_gpq( - pbf_path=pbf_file, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - geometry_filter=geometry_filter_value, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - save_as_wkt=wkt_result, - silent_mode=silent_mode, - ) - else: - geoparquet_path = convert_geometry_to_gpq( - geometry_filter=geometry_filter_value, - osm_extract_source=osm_extract_source, - tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore - keep_all_tags=keep_all_tags, - explode_tags=explode_tags, - ignore_cache=ignore_cache, - working_directory=working_directory, - result_file_path=result_file_path, - osm_way_polygon_features_config=( - json.loads(Path(osm_way_polygon_features_config).read_text()) - if osm_way_polygon_features_config - else None - ), - filter_osm_ids=filter_osm_ids, # type: ignore - save_as_wkt=wkt_result, - silent_mode=silent_mode, - ) + logging.disable(logging.CRITICAL) + if pbf_file: + geoparquet_path = convert_pbf_to_gpq( + pbf_path=pbf_file, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + geometry_filter=geometry_filter_value, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + save_as_wkt=wkt_result, + silent_mode=silent_mode, + ) + else: + geoparquet_path = convert_geometry_to_gpq( + geometry_filter=geometry_filter_value, + osm_extract_source=osm_extract_source, + tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore + keep_all_tags=keep_all_tags, + explode_tags=explode_tags, + ignore_cache=ignore_cache, + working_directory=working_directory, + result_file_path=result_file_path, + osm_way_polygon_features_config=( + json.loads(Path(osm_way_polygon_features_config).read_text()) + if osm_way_polygon_features_config + else None + ), + filter_osm_ids=filter_osm_ids, # type: ignore + save_as_wkt=wkt_result, + silent_mode=silent_mode, + allow_uncovered_geometry=allow_uncovered_geometry, + ) typer.secho(geoparquet_path, fg="green") diff --git a/quackosm/functions.py b/quackosm/functions.py index 47d4500..e0203b2 100644 --- a/quackosm/functions.py +++ b/quackosm/functions.py @@ -250,6 +250,7 @@ def convert_geometry_to_gpq( osm_way_polygon_features_config: Optional[Union[OsmWayPolygonConfig, dict[str, Any]]] = None, save_as_wkt: bool = False, silent_mode: bool = False, + allow_uncovered_geometry: bool = False, ) -> Path: """ Get a GeoParquet file with OpenStreetMap features within given geometry. @@ -260,7 +261,7 @@ def convert_geometry_to_gpq( Args: geometry_filter (BaseGeometry): Geometry filter used to download matching OSM extracts. osm_extract_source (OsmExtractSource): A source for automatic downloading of - OSM extracts. Can be Geofabrik, BBBike, OSM_fr or any. Defaults to `any`. + OSM extracts. Can be Geofabrik, BBBike, OSMfr or any. Defaults to `any`. tags_filter (Union[OsmTagsFilter, GroupedOsmTagsFilter], optional): A dictionary specifying which tags to download. The keys should be OSM tags (e.g. `building`, `amenity`). @@ -297,6 +298,9 @@ def convert_geometry_to_gpq( If `True`, it will be saved as a `.parquet` file, because it won't be in the GeoParquet standard. Defaults to `False`. silent_mode (bool): Disable progress bars. Defaults to `False`. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Works only when PbfFileReader is asked to download OSM extracts + automatically. Defaults to `False`. Returns: Path: Path to the generated GeoParquet file. @@ -402,6 +406,7 @@ def convert_geometry_to_gpq( osm_way_polygon_features_config=osm_way_polygon_features_config, osm_extract_source=osm_extract_source, silent_mode=silent_mode, + allow_uncovered_geometry=allow_uncovered_geometry, ).convert_geometry_filter_to_gpq( result_file_path=result_file_path, keep_all_tags=keep_all_tags, @@ -604,6 +609,7 @@ def get_features_gdf_from_geometry( working_directory: Union[str, Path] = "files", osm_way_polygon_features_config: Optional[Union[OsmWayPolygonConfig, dict[str, Any]]] = None, silent_mode: bool = False, + allow_uncovered_geometry: bool = False, ) -> gpd.GeoDataFrame: """ Get a GeoParquet file with OpenStreetMap features within given geometry. @@ -614,7 +620,7 @@ def get_features_gdf_from_geometry( Args: geometry_filter (BaseGeometry): Geometry filter used to download matching OSM extracts. osm_extract_source (OsmExtractSource): A source for automatic downloading of - OSM extracts. Can be Geofabrik, BBBike, OSM_fr or any. Defaults to `any`. + OSM extracts. Can be Geofabrik, BBBike, OSMfr or any. Defaults to `any`. tags_filter (Union[OsmTagsFilter, GroupedOsmTagsFilter], optional): A dictionary specifying which tags to download. The keys should be OSM tags (e.g. `building`, `amenity`). @@ -645,6 +651,9 @@ def get_features_gdf_from_geometry( Modifications to this config left are left for experienced OSM users. Defaults to predefined "osm_way_polygon_features.json". silent_mode (bool): Disable progress bars. Defaults to `False`. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Works only when PbfFileReader is asked to download OSM extracts + automatically. Defaults to `False`. Returns: gpd.GeoDataFrame: GeoDataFrame with OSM features. @@ -704,6 +713,7 @@ def get_features_gdf_from_geometry( osm_way_polygon_features_config=osm_way_polygon_features_config, osm_extract_source=osm_extract_source, silent_mode=silent_mode, + allow_uncovered_geometry=allow_uncovered_geometry, ).get_features_gdf_from_geometry( keep_all_tags=keep_all_tags, explode_tags=explode_tags, diff --git a/quackosm/osm_extracts/__init__.py b/quackosm/osm_extracts/__init__.py index 13e539a..fee9974 100644 --- a/quackosm/osm_extracts/__init__.py +++ b/quackosm/osm_extracts/__init__.py @@ -6,6 +6,7 @@ """ import os +import warnings from collections.abc import Iterable from enum import Enum from functools import partial @@ -19,6 +20,7 @@ from shapely.geometry.base import BaseGeometry, BaseMultipartGeometry from tqdm.contrib.concurrent import process_map +from quackosm._exceptions import GeometryNotCoveredError, GeometryNotCoveredWarning from quackosm.osm_extracts.bbbike import _get_bbbike_index from quackosm.osm_extracts.extract import OpenStreetMapExtract from quackosm.osm_extracts.geofabrik import _get_geofabrik_index @@ -79,6 +81,7 @@ def download_extracts_pbf_files( def find_smallest_containing_extracts_total( geometry: Union[BaseGeometry, BaseMultipartGeometry], + allow_uncovered_geometry: bool = False, ) -> list[OpenStreetMapExtract]: """ Find smallest extracts from all OSM extract indexes that contains given polygon. @@ -87,6 +90,8 @@ def find_smallest_containing_extracts_total( Args: geometry (Union[BaseGeometry, BaseMultipartGeometry]): Geometry to be covered. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Defaults to `False`. Returns: List[OpenStreetMapExtract]: List of extracts name, URL to download it and boundary polygon. @@ -95,11 +100,14 @@ def find_smallest_containing_extracts_total( [_get_bbbike_index(), _get_geofabrik_index(), _get_openstreetmap_fr_index()] ) indexes.sort_values(by="area", ignore_index=True, inplace=True) - return _find_smallest_containing_extracts(geometry, indexes) + return _find_smallest_containing_extracts( + geometry, indexes, allow_uncovered_geometry=allow_uncovered_geometry + ) def find_smallest_containing_geofabrik_extracts( geometry: Union[BaseGeometry, BaseMultipartGeometry], + allow_uncovered_geometry: bool = False, ) -> list[OpenStreetMapExtract]: """ Find smallest extracts from Geofabrik that contains given geometry. @@ -108,15 +116,20 @@ def find_smallest_containing_geofabrik_extracts( Args: geometry (Union[BaseGeometry, BaseMultipartGeometry]): Geometry to be covered. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Defaults to `False`. Returns: List[OpenStreetMapExtract]: List of extracts name, URL to download it and boundary polygon. """ - return _find_smallest_containing_extracts(geometry, _get_geofabrik_index()) + return _find_smallest_containing_extracts( + geometry, _get_geofabrik_index(), allow_uncovered_geometry=allow_uncovered_geometry + ) def find_smallest_containing_openstreetmap_fr_extracts( geometry: Union[BaseGeometry, BaseMultipartGeometry], + allow_uncovered_geometry: bool = False, ) -> list[OpenStreetMapExtract]: """ Find smallest extracts from OpenStreetMap.fr that contains given polygon. @@ -125,15 +138,20 @@ def find_smallest_containing_openstreetmap_fr_extracts( Args: geometry (Union[BaseGeometry, BaseMultipartGeometry]): Geometry to be covered. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Defaults to `False`. Returns: List[OpenStreetMapExtract]: List of extracts name, URL to download it and boundary polygon. """ - return _find_smallest_containing_extracts(geometry, _get_openstreetmap_fr_index()) + return _find_smallest_containing_extracts( + geometry, _get_openstreetmap_fr_index(), allow_uncovered_geometry=allow_uncovered_geometry + ) def find_smallest_containing_bbbike_extracts( geometry: Union[BaseGeometry, BaseMultipartGeometry], + allow_uncovered_geometry: bool = False, ) -> list[OpenStreetMapExtract]: """ Find smallest extracts from BBBike that contains given polygon. @@ -142,11 +160,15 @@ def find_smallest_containing_bbbike_extracts( Args: geometry (Union[BaseGeometry, BaseMultipartGeometry]): Geometry to be covered. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Defaults to `False`. Returns: List[OpenStreetMapExtract]: List of extracts name, URL to download it and boundary polygon. """ - return _find_smallest_containing_extracts(geometry, _get_bbbike_index()) + return _find_smallest_containing_extracts( + geometry, _get_bbbike_index(), allow_uncovered_geometry=allow_uncovered_geometry + ) OSM_EXTRACT_SOURCE_MATCHING_FUNCTION = { @@ -158,11 +180,30 @@ def find_smallest_containing_bbbike_extracts( def find_smallest_containing_extract( - geometry: Union[BaseGeometry, BaseMultipartGeometry], source: Union[OsmExtractSource, str] + geometry: Union[BaseGeometry, BaseMultipartGeometry], + source: Union[OsmExtractSource, str], + allow_uncovered_geometry: bool = False, ) -> list[OpenStreetMapExtract]: + """ + Find smallest extracts from a given OSM source that contains given polygon. + + Iterates an OSM source index and finds smallest extracts that covers a given geometry. + + Args: + geometry (Union[BaseGeometry, BaseMultipartGeometry]): Geometry to be covered. + source (Union[OsmExtractSource, str]): OSM source name. Can be one of: 'any', 'Geofabrik', + 'BBBike', 'OSM_fr'. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Defaults to `False`. + + Returns: + List[OpenStreetMapExtract]: List of extracts name, URL to download it and boundary polygon. + """ try: source_enum = OsmExtractSource(source) - return OSM_EXTRACT_SOURCE_MATCHING_FUNCTION[source_enum](geometry) + return OSM_EXTRACT_SOURCE_MATCHING_FUNCTION[source_enum]( + geometry, allow_uncovered_geometry=allow_uncovered_geometry + ) except ValueError as ex: raise ValueError(f"Unknown OSM extracts source: {source}.") from ex @@ -172,6 +213,7 @@ def _find_smallest_containing_extracts( polygons_index_gdf: gpd.GeoDataFrame, num_of_multiprocessing_workers: int = -1, multiprocessing_activation_threshold: Optional[int] = None, + allow_uncovered_geometry: bool = False, ) -> list[OpenStreetMapExtract]: """ Find smallest set of extracts covering a given geometry. @@ -191,6 +233,8 @@ def _find_smallest_containing_extracts( multiprocessing_activation_threshold (int, optional): Number of gometries required to start processing on multiple processes. Activating multiprocessing for a small amount of points might not be feasible. Defaults to 100. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Defaults to `False`. Returns: List[OpenStreetMapExtract]: List of extracts covering a given geometry. @@ -216,6 +260,7 @@ def _find_smallest_containing_extracts( find_extracts_func = partial( _find_smallest_containing_extracts_for_single_geometry, polygons_index_gdf=polygons_index_gdf, + allow_uncovered_geometry=allow_uncovered_geometry, ) force_terminal = os.getenv("FORCE_TERMINAL_MODE", "false").lower() == "true" @@ -225,14 +270,16 @@ def _find_smallest_containing_extracts( desc="Finding matching extracts", max_workers=num_of_multiprocessing_workers, chunksize=ceil(total_polygons / (4 * num_of_multiprocessing_workers)), - tqdm_kwargs=dict(disable=True if force_terminal else None), + disable=True if force_terminal else None, ): unique_extracts_ids.update(extract_ids_list) else: for sub_geometry in geometries: unique_extracts_ids.update( _find_smallest_containing_extracts_for_single_geometry( - sub_geometry, polygons_index_gdf + geometry=sub_geometry, + polygons_index_gdf=polygons_index_gdf, + allow_uncovered_geometry=allow_uncovered_geometry, ) ) @@ -248,7 +295,9 @@ def _find_smallest_containing_extracts( def _find_smallest_containing_extracts_for_single_geometry( - geometry: BaseGeometry, polygons_index_gdf: gpd.GeoDataFrame + geometry: BaseGeometry, + polygons_index_gdf: gpd.GeoDataFrame, + allow_uncovered_geometry: bool = False, ) -> set[str]: """ Find smallest set of extracts covering a given singular geometry. @@ -256,6 +305,8 @@ def _find_smallest_containing_extracts_for_single_geometry( Args: geometry (BaseGeometry): Geometry to be covered. polygons_index_gdf (gpd.GeoDataFrame): Index of available extracts. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered + by any OSM extract. Defaults to `False`. Raises: RuntimeError: If provided extracts index is empty. @@ -287,7 +338,19 @@ def _find_smallest_containing_extracts_for_single_geometry( & (polygons_index_gdf.intersects(geometry_to_cover)) ] if 0 in (len(matching_rows), iterations): - raise RuntimeError("Couldn't find extracts matching given geometry.") + if not allow_uncovered_geometry: + raise GeometryNotCoveredError( + "Couldn't find extracts covering given geometry." + " If it's expected behaviour, you can suppress this error by passing" + " the `allow_uncovered_geometry=True` argument" + " or add `--allow-uncovered-geometry` flag to the CLI command." + ) + warnings.warn( + "Couldn't find extracts covering given geometry.", + GeometryNotCoveredWarning, + stacklevel=0, + ) + break smallest_extract = matching_rows.iloc[0] geometry_to_cover = geometry_to_cover.difference(smallest_extract.geometry) @@ -351,7 +414,7 @@ def _filter_extracts( desc="Filtering extracts", max_workers=num_of_multiprocessing_workers, chunksize=ceil(total_geometries / (4 * num_of_multiprocessing_workers)), - tqdm_kwargs=dict(disable=True if force_terminal else None), + disable=True if force_terminal else None, ): filtered_extracts_ids.update(extract_ids_list) else: diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py index f7bec0f..2973151 100644 --- a/quackosm/pbf_file_reader.py +++ b/quackosm/pbf_file_reader.py @@ -32,6 +32,7 @@ from shapely.geometry.base import BaseGeometry, BaseMultipartGeometry from quackosm._constants import FEATURES_INDEX, GEOMETRY_COLUMN, WGS84_CRS +from quackosm._exceptions import EmptyResultWarning from quackosm._osm_tags_filters import GroupedOsmTagsFilter, OsmTagsFilter, merge_osm_tags_filter from quackosm._osm_way_polygon_features import OsmWayPolygonConfig, parse_dict_to_config_object from quackosm._rich_progress import ( # type: ignore[attr-defined] @@ -103,6 +104,7 @@ def __init__( parquet_compression: str = "snappy", osm_extract_source: OsmExtractSource = OsmExtractSource.any, silent_mode: bool = False, + allow_uncovered_geometry: bool = False, ) -> None: """ Initialize PbfFileReader. @@ -130,12 +132,15 @@ def __init__( Check https://duckdb.org/docs/sql/statements/copy#parquet-options for more info. Defaults to "snappy". osm_extract_source (OsmExtractSource): A source for automatic downloading of - OSM extracts. Can be Geofabrik, BBBike, OSM_fr or any. Defaults to `any`. + OSM extracts. Can be Geofabrik, BBBike, OSMfr or any. Defaults to `any`. silent_mode (bool): Disable progress bars. + allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't + covered by any OSM extract. Defaults to `False`. """ self.tags_filter = tags_filter self.merged_tags_filter = merge_osm_tags_filter(tags_filter) if tags_filter else None self.geometry_filter = geometry_filter + self.allow_uncovered_geometry = allow_uncovered_geometry self.osm_extract_source = osm_extract_source self.working_directory = Path(working_directory) self.working_directory.mkdir(parents=True, exist_ok=True) @@ -302,7 +307,9 @@ def convert_geometry_filter_to_gpq( ) matching_extracts = find_smallest_containing_extract( - self.geometry_filter, self.osm_extract_source + self.geometry_filter, + self.osm_extract_source, + allow_uncovered_geometry=self.allow_uncovered_geometry, ) if len(matching_extracts) == 1: @@ -318,9 +325,6 @@ def convert_geometry_filter_to_gpq( ) else: if not result_file_path.exists() or ignore_cache: - matching_extracts = find_smallest_containing_extract( - self.geometry_filter, self.osm_extract_source - ) pbf_files = download_extracts_pbf_files(matching_extracts, self.working_directory) parsed_geoparquet_files = [] @@ -335,27 +339,42 @@ def convert_geometry_filter_to_gpq( ) parsed_geoparquet_files.append(parsed_geoparquet_file) - with tempfile.TemporaryDirectory( - dir=self.working_directory.resolve() - ) as tmp_dir_name: - try: - joined_parquet_table = self._drop_duplicated_features_in_pyarrow_table( - parsed_geoparquet_files - ) - except pa.ArrowInvalid: - tmp_dir_path = Path(tmp_dir_name) - joined_parquet_table = self._drop_duplicated_features_in_joined_table( - parsed_geoparquet_files, tmp_dir_path - ) - + if parsed_geoparquet_files: + with tempfile.TemporaryDirectory( + dir=self.working_directory.resolve() + ) as tmp_dir_name: + try: + joined_parquet_table = self._drop_duplicated_features_in_pyarrow_table( + parsed_geoparquet_files + ) + except pa.ArrowInvalid: + tmp_dir_path = Path(tmp_dir_name) + joined_parquet_table = self._drop_duplicated_features_in_joined_table( + parsed_geoparquet_files, tmp_dir_path + ) + else: + warnings.warn( + "Found 0 extracts covering the geometry. Returning empty result.", + EmptyResultWarning, + stacklevel=0, + ) if save_as_wkt: - pq.write_table(joined_parquet_table, result_file_path) + geometry_column = ga.as_wkt(gpd.GeoSeries([], crs=WGS84_CRS)) else: - io.write_geoparquet_table( - joined_parquet_table, - result_file_path, - primary_geometry_column=GEOMETRY_COLUMN, - ) + geometry_column = ga.as_wkb(gpd.GeoSeries([], crs=WGS84_CRS)) + joined_parquet_table = pa.table( + [pa.array([], type=pa.string()), geometry_column], + names=[FEATURES_INDEX, GEOMETRY_COLUMN], + ) + + if save_as_wkt: + pq.write_table(joined_parquet_table, result_file_path) + else: + io.write_geoparquet_table( + joined_parquet_table, + result_file_path, + primary_geometry_column=GEOMETRY_COLUMN, + ) return Path(result_file_path) @@ -413,20 +432,33 @@ def get_features_gdf( ) parsed_geoparquet_files.append(parsed_geoparquet_file) - with tempfile.TemporaryDirectory(dir=self.working_directory.resolve()) as tmp_dir_name: - try: - joined_parquet_table = self._drop_duplicated_features_in_pyarrow_table( - parsed_geoparquet_files - ) - except pa.ArrowInvalid: - tmp_dir_path = Path(tmp_dir_name) - joined_parquet_table = self._drop_duplicated_features_in_joined_table( - parsed_geoparquet_files, tmp_dir_path - ) + if parsed_geoparquet_files: + with tempfile.TemporaryDirectory(dir=self.working_directory.resolve()) as tmp_dir_name: + try: + joined_parquet_table = self._drop_duplicated_features_in_pyarrow_table( + parsed_geoparquet_files + ) + except pa.ArrowInvalid: + tmp_dir_path = Path(tmp_dir_name) + joined_parquet_table = self._drop_duplicated_features_in_joined_table( + parsed_geoparquet_files, tmp_dir_path + ) + gdf_parquet = gpd.GeoDataFrame( + data=joined_parquet_table.drop(GEOMETRY_COLUMN).to_pandas( + maps_as_pydicts="strict" + ), + geometry=ga.to_geopandas(joined_parquet_table.column(GEOMETRY_COLUMN)), + ).set_index(FEATURES_INDEX) + else: + warnings.warn( + "Found 0 extracts covering the geometry. Returning empty result.", + EmptyResultWarning, + stacklevel=0, + ) gdf_parquet = gpd.GeoDataFrame( - data=joined_parquet_table.drop(GEOMETRY_COLUMN).to_pandas(maps_as_pydicts="strict"), - geometry=ga.to_geopandas(joined_parquet_table.column(GEOMETRY_COLUMN)), + data={FEATURES_INDEX: []}, + geometry=gpd.GeoSeries([], crs=WGS84_CRS), ).set_index(FEATURES_INDEX) return gdf_parquet @@ -479,15 +511,19 @@ def get_features_gdf_from_geometry( def _drop_duplicated_features_in_pyarrow_table( self, parsed_geoparquet_files: list[Path] ) -> pa.Table: - parquet_tables = [ - pq.read_table(parsed_parquet_file) for parsed_parquet_file in parsed_geoparquet_files - ] - joined_parquet_table: pa.Table = pa.concat_tables(parquet_tables, promote_options="default") - if joined_parquet_table.num_rows > 0: - joined_parquet_table = drop_duplicates( - joined_parquet_table, on=["feature_id"], keep="first" + with TaskProgressSpinner("Combining results", "", self.silent_mode, skip_step_number=True): + parquet_tables = [ + pq.read_table(parsed_parquet_file) + for parsed_parquet_file in parsed_geoparquet_files + ] + joined_parquet_table: pa.Table = pa.concat_tables( + parquet_tables, promote_options="default" ) - return joined_parquet_table + if joined_parquet_table.num_rows > 0: + joined_parquet_table = drop_duplicates( + joined_parquet_table, on=["feature_id"], keep="first" + ) + return joined_parquet_table def _drop_duplicated_features_in_joined_table( self, parsed_geoparquet_files: list[Path], tmp_dir_path: Path @@ -503,47 +539,60 @@ def _drop_duplicated_features_in_joined_table( try: # Attempt 1: read all at once - output_file_name = tmp_dir_path / "joined_features_without_duplicates.parquet" - parquet_relation = connection.read_parquet( - [ - str(parsed_geoparquet_file) - for parsed_geoparquet_file in sorted_parsed_geoparquet_files - ], - union_by_name=True, - ) - query = f""" - COPY ( - {parquet_relation.sql_query()} - QUALIFY row_number() OVER (PARTITION BY feature_id) = 1 - ) TO '{output_file_name}' ( - FORMAT 'parquet', - PER_THREAD_OUTPUT true, - ROW_GROUP_SIZE 25000, - COMPRESSION '{self.parquet_compression}' + with TaskProgressSpinner( + "Combining results", "", self.silent_mode, skip_step_number=True + ): + output_file_name = tmp_dir_path / "joined_features_without_duplicates.parquet" + parquet_relation = connection.read_parquet( + [ + str(parsed_geoparquet_file) + for parsed_geoparquet_file in sorted_parsed_geoparquet_files + ], + union_by_name=True, ) - """ - self._run_query(query, run_in_separate_process=True, tmp_dir_path=tmp_dir_path) - return pq.read_table(output_file_name) - except MemoryError: - # Attempt 2: read one by one - result_parquet_files = [sorted_parsed_geoparquet_files[0]] - for idx, parsed_geoparquet_file in enumerate(sorted_parsed_geoparquet_files[1:]): - current_parquet_file_relation = connection.read_parquet(str(parsed_geoparquet_file)) - filtered_result_parquet_file = tmp_dir_path / f"sub_file_{idx}" query = f""" COPY ( - {current_parquet_file_relation.sql_query()} - ANTI JOIN read_parquet({[str(pq_file) for pq_file in result_parquet_files]}) - USING (feature_id) - ) TO '{filtered_result_parquet_file}' ( + {parquet_relation.sql_query()} + QUALIFY row_number() OVER (PARTITION BY feature_id) = 1 + ) TO '{output_file_name}' ( FORMAT 'parquet', PER_THREAD_OUTPUT true, ROW_GROUP_SIZE 25000, COMPRESSION '{self.parquet_compression}' ) """ - connection.sql(query) - result_parquet_files.extend(filtered_result_parquet_file.glob("*.parquet")) + self._run_query(query, run_in_separate_process=True, tmp_dir_path=tmp_dir_path) + return pq.read_table(output_file_name) + except MemoryError: + # Attempt 2: read one by one + result_parquet_files = [sorted_parsed_geoparquet_files[0]] + with TaskProgressBar( + "Combining results", "", self.silent_mode, skip_step_number=True + ) as bar: + for idx, parsed_geoparquet_file in bar.track( + enumerate(sorted_parsed_geoparquet_files[1:]) + ): + current_parquet_file_relation = connection.read_parquet( + str(parsed_geoparquet_file) + ) + filtered_result_parquet_file = tmp_dir_path / f"sub_file_{idx}" + result_parquet_files_strings = [ + str(pq_file) for pq_file in result_parquet_files + ] + query = f""" + COPY ( + {current_parquet_file_relation.sql_query()} + ANTI JOIN read_parquet({result_parquet_files_strings}) + USING (feature_id) + ) TO '{filtered_result_parquet_file}' ( + FORMAT 'parquet', + PER_THREAD_OUTPUT true, + ROW_GROUP_SIZE 25000, + COMPRESSION '{self.parquet_compression}' + ) + """ + connection.sql(query) + result_parquet_files.extend(filtered_result_parquet_file.glob("*.parquet")) return pq.read_table(result_parquet_files) def _parse_pbf_file( diff --git a/tests/base/test_cli.py b/tests/base/test_cli.py index de2cd14..32c9cbe 100644 --- a/tests/base/test_cli.py +++ b/tests/base/test_cli.py @@ -518,6 +518,19 @@ def test_proper_args_with_pbf( ], "files/6e3ec5872bf41c2c44698fcf71266971c552d13feea19c3714e171bcd7a2b2c8_nofilter_compact.geoparquet", ) # type: ignore +@P.case( + "Allow not covered geometry", + [ + "--geom-filter-wkt", + ( + "POLYGON ((-43.064 29.673, -43.064 29.644, -43.017 29.644," + " -43.017 29.673, -43.064 29.673))" + ), + "--allow-uncovered-geometry", + "--ignore-cache", + ], + "files/fa44926c5f128cd438ecbe06d29644849a9de323703076b8ac62ffd7a0747e50_nofilter_compact.geoparquet", +) # type: ignore def test_proper_args_without_pbf(args: list[str], expected_result: str) -> None: """Test if runs properly with options.""" result = runner.invoke(cli.app, [*args]) diff --git a/tests/base/test_osm_extracts.py b/tests/base/test_osm_extracts.py index 4f5b948..622ac73 100644 --- a/tests/base/test_osm_extracts.py +++ b/tests/base/test_osm_extracts.py @@ -7,7 +7,12 @@ from shapely import from_wkt from shapely.geometry.base import BaseGeometry -from quackosm.osm_extracts import OsmExtractSource, find_smallest_containing_extract +from quackosm._exceptions import GeometryNotCoveredError, GeometryNotCoveredWarning +from quackosm.osm_extracts import ( + OsmExtractSource, + find_smallest_containing_extract, + find_smallest_containing_extracts_total, +) ut = TestCase() @@ -29,6 +34,10 @@ "Case insensitive Geofabrik", "GEOFABRIK", ) # type: ignore +@P.case( + "OSM fr without underscore", + "osmfr", +) # type: ignore def test_proper_osm_extract_source(value: str): """Test if OsmExtractSource is parsed correctly.""" OsmExtractSource(value) @@ -132,3 +141,22 @@ def test_multiple_smallest_extracts( extracts = find_smallest_containing_extract(geometry, source) assert len(extracts) == len(expected_extract_ids) ut.assertListEqual([extract.id for extract in extracts], expected_extract_ids) + + +@pytest.mark.parametrize( + "expectation,allow_uncovered_geometry", + [ + (pytest.raises(GeometryNotCoveredError), False), + (pytest.warns(GeometryNotCoveredWarning), True), + ], +) # type: ignore +def test_uncovered_geometry_extract(expectation, allow_uncovered_geometry: bool): + """Test if raises errors as expected when geometry can't be covered.""" + with expectation: + geometry = from_wkt( + "POLYGON ((-43.064 29.673, -43.064 29.644, -43.017 29.644," + " -43.017 29.673, -43.064 29.673))" + ) + find_smallest_containing_extracts_total( + geometry=geometry, allow_uncovered_geometry=allow_uncovered_geometry + ) diff --git a/tests/base/test_pbf_file_reader.py b/tests/base/test_pbf_file_reader.py index c6e6583..ed87179 100644 --- a/tests/base/test_pbf_file_reader.py +++ b/tests/base/test_pbf_file_reader.py @@ -25,6 +25,7 @@ from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS, HEX2VEC_FILTER from quackosm._constants import FEATURES_INDEX +from quackosm._exceptions import GeometryNotCoveredError, GeometryNotCoveredWarning from quackosm._osm_tags_filters import GroupedOsmTagsFilter, OsmTagsFilter from quackosm.cli import ( GeocodeGeometryParser, @@ -211,6 +212,26 @@ def test_pbf_reader_features_ids_filtering(filter_osm_ids: list[str], expected_r assert len(features_gdf) == expected_result_length +@pytest.mark.parametrize( + "expectation,allow_uncovered_geometry", + [ + (pytest.raises(GeometryNotCoveredError), False), + (pytest.warns(GeometryNotCoveredWarning), True), + ], +) # type: ignore +def test_uncovered_geometry_extract(expectation, allow_uncovered_geometry: bool): + """Test if raises errors as expected when geometry can't be covered.""" + with expectation: + geometry = from_wkt( + "POLYGON ((-43.064 29.673, -43.064 29.644, -43.017 29.644," + " -43.017 29.673, -43.064 29.673))" + ) + features_gdf = PbfFileReader( + geometry_filter=geometry, allow_uncovered_geometry=allow_uncovered_geometry + ).get_features_gdf_from_geometry(ignore_cache=True) + assert len(features_gdf) == 0 + + @pytest.mark.parametrize( # type: ignore "filter_osm_id,osm_tags_filter,keep_all_tags,expected_tags_keys", [