From 0c099d2ea2775c3e449a5cb6cdff5017d00c2a6e Mon Sep 17 00:00:00 2001 From: Kamil Raczycki Date: Thu, 25 Apr 2024 22:21:09 +0200 Subject: [PATCH] chore: simplify tests (#86) * chore: simplify tests * chore: add changelog entry * feat: raise error if provided geometry filter has no area * chore: add more docs info --- .github/workflows/_tests.yml | 33 +--- .github/workflows/manual_tests.yml | 48 +---- CHANGELOG.md | 8 + examples/command_line_interface.ipynb | 4 +- pdm.lock | 35 +--- pyproject.toml | 1 - quackosm/_exceptions.py | 3 + quackosm/_rich_progress.py | 4 +- quackosm/osm_extracts/__init__.py | 8 +- quackosm/osm_extracts/bbbike.py | 3 +- quackosm/osm_extracts/osm_fr.py | 3 +- quackosm/pbf_file_reader.py | 34 +++- tests/base/test_pbf_file_reader.py | 264 ++++++++++---------------- tests/test_files/osmconf.ini | 132 ------------- 14 files changed, 157 insertions(+), 423 deletions(-) delete mode 100644 tests/test_files/osmconf.ini diff --git a/.github/workflows/_tests.yml b/.github/workflows/_tests.yml index 00a0554..dcd8c1b 100644 --- a/.github/workflows/_tests.yml +++ b/.github/workflows/_tests.yml @@ -24,24 +24,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install GDAL (linux) - if: matrix.os == 'ubuntu-latest' - run: | - $CONDA/bin/conda install -c conda-forge gdal - $CONDA/bin/ogr2ogr --version - - name: Install GDAL (macos) - if: matrix.os == 'macos-13' - run: | - CONDA=$HOME/miniconda - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - bash Miniconda3-latest-MacOSX-x86_64.sh -b -p $CONDA - $CONDA/bin/conda install -c conda-forge gdal - $CONDA/bin/ogr2ogr --version - - name: Install GDAL (windows) - if: matrix.os == 'windows-latest' - run: | - & $env:CONDA\Scripts\conda.exe install -c conda-forge gdal - & $env:CONDA\Library\bin\ogr2ogr.exe --version - uses: pdm-project/setup-pdm@v3 name: Setup PDM (Python ${{ matrix.python-version }}) with: @@ -64,21 +46,8 @@ jobs: key: 
tox-cache-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/pdm.lock') }} restore-keys: | tox-cache-${{ matrix.os }}-${{ matrix.python-version }}- - - name: Run tests with tox (linux) - if: matrix.os == 'ubuntu-latest' - run: | - PATH=$CONDA/bin:$PATH - pdm run tox -e python${{ matrix.python-version }} - - name: Run tests with tox (macos) - if: matrix.os == 'macos-13' - run: | - CONDA=$HOME/miniconda - PATH=$CONDA/bin:$PATH - pdm run tox -e python${{ matrix.python-version }} - - name: Run tests with tox (windows) - if: matrix.os == 'windows-latest' + - name: Run tests with tox run: | - $env:Path = "$env:CONDA\Library\bin;" + $env:Path pdm run tox -e python${{ matrix.python-version }} - name: Upload coverage to Codecov uses: Wandalen/wretry.action@master diff --git a/.github/workflows/manual_tests.yml b/.github/workflows/manual_tests.yml index 6a6acf6..c2933f4 100644 --- a/.github/workflows/manual_tests.yml +++ b/.github/workflows/manual_tests.yml @@ -26,60 +26,14 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install GDAL (linux) - if: matrix.os == 'ubuntu-latest' - run: | - $CONDA/bin/conda install -c conda-forge gdal - $CONDA/bin/ogr2ogr --version - - name: Install GDAL (macos arm) - if: matrix.os == 'macos-latest' - run: | - CONDA=$HOME/miniconda - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh - bash Miniconda3-latest-MacOSX-arm64.sh -b -p $CONDA - $CONDA/bin/conda install -c conda-forge gdal - $CONDA/bin/ogr2ogr --version - - name: Install GDAL (macos x86) - if: matrix.os == 'macos-13' - run: | - CONDA=$HOME/miniconda - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - bash Miniconda3-latest-MacOSX-x86_64.sh -b -p $CONDA - $CONDA/bin/conda install -c conda-forge gdal - $CONDA/bin/ogr2ogr --version - - name: Install GDAL (windows) - if: matrix.os == 'windows-latest' - run: | - & $env:CONDA\Scripts\conda.exe install -c conda-forge gdal - & 
$env:CONDA\Library\bin\ogr2ogr.exe --version - name: Install pdm run: pip install pdm - name: Generate lock with newest dependencies run: pdm lock --lockfile pdm.newest.lock --strategy no_cross_platform -dG:all - name: Install quackosm and tests dependencies run: pdm install --lockfile pdm.newest.lock -dG:all - - name: Run tests with pytest (linux) - if: matrix.os == 'ubuntu-latest' - run: | - PATH=$CONDA/bin:$PATH - pdm run pytest -v -s --durations=20 --doctest-modules --doctest-continue-on-failure quackosm - pdm run pytest -v -s --durations=20 tests/base - pdm run pytest -v -s --durations=20 tests/optional_imports - pdm run pytest -v -s --durations=20 tests/benchmark - - name: Run tests with pytest (macos) - if: matrix.os == 'macos-latest' || matrix.os == 'macos-13' - run: | - CONDA=$HOME/miniconda - PATH=$CONDA/bin:$PATH - pdm run pytest -v -s --durations=20 --doctest-modules --doctest-continue-on-failure quackosm - pdm run pytest -v -s --durations=20 tests/base - pdm run pytest -v -s --durations=20 tests/optional_imports - pdm run pytest -v -s --durations=20 tests/benchmark - - name: Run tests with pytest (windows) - if: matrix.os == 'windows-latest' + - name: Run tests with pytest run: | - $env:Path = "$env:CONDA\Library\bin;" + $env:Path - $env:PYTHONIOENCODING = "utf-8" pdm run pytest -v -s --durations=20 --doctest-modules --doctest-continue-on-failure quackosm pdm run pytest -v -s --durations=20 tests/base pdm run pytest -v -s --durations=20 tests/optional_imports diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fcda9b..a17715a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Simplified GDAL parity tests by precalculating result files and uploading them to additional repository + +### Fixed + +- Added exception if parts of provided geometry have no area [#85](https://github.com/kraina-ai/quackosm/issues/85) + ## [0.7.0] - 2024-04-24 
### Added diff --git a/examples/command_line_interface.ipynb b/examples/command_line_interface.ipynb index 7a12c51..bbca81c 100644 --- a/examples/command_line_interface.ipynb +++ b/examples/command_line_interface.ipynb @@ -157,7 +157,9 @@ "- Geohash spatial index\n", "- S2 spatial index\n", "\n", - "These filters can also be used to filter out geometries from provided pbf file." + "These filters can also be used to filter out geometries from provided pbf file.\n", + "\n", + "`QuackOSM` will raise an error if provided geometry has parts without area (such as Points, LineStrings or empty geometry)." ] }, { diff --git a/pdm.lock b/pdm.lock index 320a31f..c079b58 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev", "docs", "license", "lint", "test", "cli", "cli-dev"] strategy = ["cross_platform"] lock_version = "4.4.1" -content_hash = "sha256:fce5d8113bda19405fd45df165447163abb6b0ef6218424461fbc16901662cbf" +content_hash = "sha256:be0840022c3a9d3aec37919ce1be4d1cd23e262dda98b8dce2da1e3cfda85376" [[package]] name = "anyio" @@ -2730,39 +2730,6 @@ files = [ {file = "pymdown_extensions-10.7.1.tar.gz", hash = "sha256:c70e146bdd83c744ffc766b4671999796aba18842b268510a329f7f64700d584"}, ] -[[package]] -name = "pyogrio" -version = "0.7.2" -requires_python = ">=3.8" -summary = "Vectorized spatial vector file format I/O using GDAL/OGR" -dependencies = [ - "certifi", - "numpy", - "packaging", -] -files = [ - {file = "pyogrio-0.7.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba386a02c9b5934c568b40acc95c9863f92075f6990167635e51368976569c66"}, - {file = "pyogrio-0.7.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:860b04ddf23b8c253ceb3621e4b0e0dc0f293eab66cb14f799a5c9f9fe0a882c"}, - {file = "pyogrio-0.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:caaf61d473ac207f170082e602ea57c096e8dd4c4be51de58fba96f1a5944096"}, - {file = "pyogrio-0.7.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = 
"sha256:bee556ca305b7e8c68aada259d925c612131205074fb2373badafacbef610b77"}, - {file = "pyogrio-0.7.2-cp310-cp310-win_amd64.whl", hash = "sha256:7e2c856961efdc6cb3809b97b49016cbbcee17c8a1e85fc4000b5fcb3cfcb9b1"}, - {file = "pyogrio-0.7.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5654e7c33442cbd98e7a56f705e160415d7503b2420d724d4f81b8cc88360b3e"}, - {file = "pyogrio-0.7.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b9a8a4854c7af2c76683ce5666ee765b207901b362576465219d75deb6159821"}, - {file = "pyogrio-0.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a23136d1bffa9d811263807b850c6e9854201710276f09de650131e89f2486aa"}, - {file = "pyogrio-0.7.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:234b0d1d22e9680229b0618c25077a0cb2428cbbc2939b4bb9bdd8ee77e0f3e0"}, - {file = "pyogrio-0.7.2-cp311-cp311-win_amd64.whl", hash = "sha256:33ae5aafcf3a557e107a33f5b3e878750d2e467b8cc911dc4bf261c1a602b534"}, - {file = "pyogrio-0.7.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:73577fecebeecf0d06e78c1a4bddd460a4d57c6d918affab7594c0bc72f5fa14"}, - {file = "pyogrio-0.7.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f2ff58184020da39540a2f5d4a5412005a01b0c4cd03c7b8294bc670d1f3fe50"}, - {file = "pyogrio-0.7.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31112bb0b6a4a3f80ec3252d7eeb7be81045860d49fd76e297c073759450652b"}, - {file = "pyogrio-0.7.2-cp312-cp312-win_amd64.whl", hash = "sha256:1b7197c72f034ac7187da2a8d50a063a5f1256aab732b154f11f887a7652dc3d"}, - {file = "pyogrio-0.7.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cc6db2e5dc50dfe23554d10502920eafa0648c365725e552aaa523432a9bf35"}, - {file = "pyogrio-0.7.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:be46be43c4148a3ad09da38670411485ec544a51cbd6b7d004a0eca5035023fc"}, - {file = "pyogrio-0.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:3001efd5dfee36459d0cfdafbe91ed88fc5ae734353d771cdb75546ef1427735"}, - {file = "pyogrio-0.7.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:892fdab0e1c44c0125254d92928081c14f93ac553f371addc2c9a1d4bde41cad"}, - {file = "pyogrio-0.7.2-cp39-cp39-win_amd64.whl", hash = "sha256:d5fc2304aeb927564f77caaa4da9a47e2d77a8ceb1c624ea84c505140886b221"}, - {file = "pyogrio-0.7.2.tar.gz", hash = "sha256:33afb7d211c6434613f24174722347a5cb11d22a212f28c817f67c89d30d0c0d"}, -] - [[package]] name = "pyparsing" version = "3.1.2" diff --git a/pyproject.toml b/pyproject.toml index b04477a..7add98b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,6 @@ test = [ "pytest-parametrization", "pytest-xdist", "pytest-doctestplus", - "pyogrio", "srai>=0.6.2", ] # pdm add -dG docs diff --git a/quackosm/_exceptions.py b/quackosm/_exceptions.py index a6bab4e..abad8b7 100644 --- a/quackosm/_exceptions.py +++ b/quackosm/_exceptions.py @@ -5,3 +5,6 @@ class GeometryNotCoveredWarning(Warning): ... class GeometryNotCoveredError(Exception): ... + + +class InvalidGeometryFilter(Exception): ... 
diff --git a/quackosm/_rich_progress.py b/quackosm/_rich_progress.py index 15cc72f..2b198d7 100644 --- a/quackosm/_rich_progress.py +++ b/quackosm/_rich_progress.py @@ -205,7 +205,7 @@ def __init__( self.major_steps_prefix = "" if not self.verbosity_mode == "silent": - with suppress(ImportError): # pragma: no cover + with suppress(ImportError): # pragma: no cover from types import TracebackType from typing import Union @@ -281,7 +281,7 @@ def _check_live_obj(self): if self.verbosity_mode == "silent": return - with suppress(ImportError): + with suppress(ImportError): # pragma: no cover if not self.live or not self.live._started: from rich.progress import Live diff --git a/quackosm/osm_extracts/__init__.py b/quackosm/osm_extracts/__init__.py index fee9974..e0fd849 100644 --- a/quackosm/osm_extracts/__init__.py +++ b/quackosm/osm_extracts/__init__.py @@ -493,9 +493,11 @@ def _simplify_selected_extracts( extract_geometry = ( matching_extracts.loc[sorted_extracts_gdf["id"] == extract_id].iloc[0].geometry ) - other_geometries = matching_extracts.loc[ - sorted_extracts_gdf["id"] != extract_id - ].unary_union + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + other_geometries = matching_extracts.loc[ + sorted_extracts_gdf["id"] != extract_id + ].unary_union if extract_geometry.covered_by(other_geometries): extract_to_remove = extract_id simplify_again = True diff --git a/quackosm/osm_extracts/bbbike.py b/quackosm/osm_extracts/bbbike.py index 14192d7..a469df7 100644 --- a/quackosm/osm_extracts/bbbike.py +++ b/quackosm/osm_extracts/bbbike.py @@ -13,6 +13,7 @@ import requests from tqdm import tqdm +from quackosm._constants import WGS84_CRS from quackosm.osm_extracts._poly_parser import parse_polygon_file from quackosm.osm_extracts.extract import OpenStreetMapExtract @@ -46,7 +47,7 @@ def _load_bbbike_index() -> gpd.GeoDataFrame: extracts = _iterate_bbbike_index() gdf = gpd.GeoDataFrame( data=[asdict(extract) for extract in extracts], 
geometry="geometry" - ).set_crs("EPSG:4326") + ).set_crs(WGS84_CRS) gdf["area"] = gdf.geometry.area gdf.sort_values(by="area", ignore_index=True, inplace=True) diff --git a/quackosm/osm_extracts/osm_fr.py b/quackosm/osm_extracts/osm_fr.py index 7f4c9b0..ec18e59 100644 --- a/quackosm/osm_extracts/osm_fr.py +++ b/quackosm/osm_extracts/osm_fr.py @@ -14,6 +14,7 @@ import requests from tqdm import tqdm +from quackosm._constants import WGS84_CRS from quackosm.osm_extracts._poly_parser import parse_polygon_file from quackosm.osm_extracts.extract import OpenStreetMapExtract @@ -68,7 +69,7 @@ def _load_openstreetmap_fr_index() -> gpd.GeoDataFrame: pbar.update() gdf = gpd.GeoDataFrame( data=[asdict(extract) for extract in extracts], geometry="geometry" - ).set_crs("EPSG:4326") + ).set_crs(WGS84_CRS) gdf["area"] = gdf.geometry.area gdf.sort_values(by="area", ignore_index=True, inplace=True) diff --git a/quackosm/pbf_file_reader.py b/quackosm/pbf_file_reader.py index b8df63a..3b7c5f8 100644 --- a/quackosm/pbf_file_reader.py +++ b/quackosm/pbf_file_reader.py @@ -32,7 +32,7 @@ from shapely.geometry.base import BaseGeometry, BaseMultipartGeometry from quackosm._constants import FEATURES_INDEX, GEOMETRY_COLUMN, WGS84_CRS -from quackosm._exceptions import EmptyResultWarning +from quackosm._exceptions import EmptyResultWarning, InvalidGeometryFilter from quackosm._osm_tags_filters import ( GroupedOsmTagsFilter, OsmTagsFilter, @@ -145,7 +145,13 @@ def __init__( Verbose leaves all progress outputs in the stdout. Defaults to "transient". allow_uncovered_geometry (bool): Suppress an error if some geometry parts aren't covered by any OSM extract. Defaults to `False`. + + Raises: + InvalidGeometryFilter: When provided geometry filter has parts without area. 
""" + self.geometry_filter = geometry_filter + self._check_if_valid_geometry_filter() + self.tags_filter = tags_filter self.is_tags_filter_positive = ( check_if_any_osm_tags_filter_value_is_positive(self.tags_filter) @@ -154,7 +160,7 @@ def __init__( ) self.expanded_tags_filter: Optional[Union[GroupedOsmTagsFilter, OsmTagsFilter]] = None self.merged_tags_filter: Optional[Union[GroupedOsmTagsFilter, OsmTagsFilter]] = None - self.geometry_filter = geometry_filter + self.allow_uncovered_geometry = allow_uncovered_geometry self.osm_extract_source = osm_extract_source self.working_directory = Path(working_directory) @@ -600,10 +606,7 @@ def _drop_duplicated_features_in_joined_table( with self.task_progress_tracker.get_basic_spinner("Combining results"): output_file_name = tmp_dir_path / "joined_features_without_duplicates.parquet" parquet_relation = connection.read_parquet( - [ - str(parsed_geoparquet_file) - for parsed_geoparquet_file in parsed_geoparquet_files - ], + [str(parsed_geoparquet_file) for parsed_geoparquet_file in parsed_geoparquet_files], union_by_name=True, ) query = f""" @@ -838,6 +841,25 @@ def _generate_result_file_path_from_geometry( ) return Path(self.working_directory) / result_file_name + def _check_if_valid_geometry_filter(self) -> None: + if self.geometry_filter is None: + return + + if isinstance(self.geometry_filter, BaseMultipartGeometry): + geometries_to_check = self.geometry_filter.geoms + else: + geometries_to_check = [self.geometry_filter] + + if not geometries_to_check: + raise InvalidGeometryFilter("Geometry filter is empty.") + + for geometry_to_check in geometries_to_check: + if geometry_to_check.area == 0: + raise InvalidGeometryFilter( + "Detected geometry with area equal to 0." + " Geometry filter cannot contain geometries without area." 
+ ) + def _generate_geometry_hash(self) -> str: clipping_geometry_hash_part = "noclip" oriented_geometry = self._get_oriented_geometry_filter() diff --git a/tests/base/test_pbf_file_reader.py b/tests/base/test_pbf_file_reader.py index 4c0554a..9b07c2e 100644 --- a/tests/base/test_pbf_file_reader.py +++ b/tests/base/test_pbf_file_reader.py @@ -1,10 +1,7 @@ """Tests for PbfFileReader.""" -import platform -import re -import subprocess +import json import warnings -from collections.abc import Iterable from pathlib import Path from typing import Optional, Union, cast from unittest import TestCase @@ -13,13 +10,21 @@ import geopandas as gpd import pandas as pd import pyarrow as pa -import pyogrio import pytest -import six from parametrization import Parametrization as P from pytest_mock import MockerFixture from shapely import from_wkt, hausdorff_distance -from shapely.geometry import LinearRing, MultiPolygon, Polygon, box, polygon +from shapely.geometry import ( + GeometryCollection, + LinearRing, + LineString, + MultiPoint, + MultiPolygon, + Point, + Polygon, + box, + polygon, +) from shapely.geometry.base import BaseGeometry from shapely.ops import unary_union from srai.geometry import remove_interiors @@ -27,8 +32,12 @@ from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS, HEX2VEC_FILTER from quackosm import convert_geometry_to_gpq, get_features_gdf -from quackosm._constants import FEATURES_INDEX -from quackosm._exceptions import GeometryNotCoveredError, GeometryNotCoveredWarning +from quackosm._constants import FEATURES_INDEX, WGS84_CRS +from quackosm._exceptions import ( + GeometryNotCoveredError, + GeometryNotCoveredWarning, + InvalidGeometryFilter, +) from quackosm._osm_tags_filters import GroupedOsmTagsFilter, OsmTagsFilter from quackosm.cli import ( GeocodeGeometryParser, @@ -160,7 +169,6 @@ def test_combining_files_different_techniques( side_effect=pa.ArrowInvalid(), ) - if patch_methods > 1: # Leave 
_drop_duplicated_features_in_joined_table_one_by_one as backup mocker.patch( @@ -264,6 +272,61 @@ def test_uncovered_geometry_extract(expectation, allow_uncovered_geometry: bool) assert len(features_gdf) == 0 +@pytest.mark.parametrize( # type: ignore + "geometry", + [ + box( + minx=7.416486207767861, + miny=43.7310867041912, + maxx=7.421931388477276, + maxy=43.73370705597216, + ), + GeohashGeometryParser().convert("spv2bc"), # type: ignore + GeohashGeometryParser().convert("spv2bc,spv2bfr"), # type: ignore + H3GeometryParser().convert("8a3969a40ac7fff"), # type: ignore + H3GeometryParser().convert("8a3969a40ac7fff,893969a4037ffff"), # type: ignore + S2GeometryParser().convert("12cdc28bc"), # type: ignore + S2GeometryParser().convert("12cdc28bc,12cdc28f"), # type: ignore + GeocodeGeometryParser().convert("Monaco-Ville, Monaco"), # type: ignore + ], +) +def test_valid_geometries(geometry: BaseGeometry): + """Test if geometry filters are loaded properly.""" + PbfFileReader(geometry_filter=geometry) + + +@pytest.mark.parametrize( # type: ignore + "geometry", + [ + Point(10, 5), + box( + minx=7.416486207767861, + miny=43.7310867041912, + maxx=7.421931388477276, + maxy=43.73370705597216, + ).boundary, + Point(10, 5).boundary, + MultiPoint([(1, 2), (3, 4)]), + LineString([(1, 2), (3, 4)]), + GeometryCollection( + [ + box( + minx=7.416486207767861, + miny=43.7310867041912, + maxx=7.421931388477276, + maxy=43.73370705597216, + ), + Point(10, 5), + ] + ), + ], +) +def test_invalid_geometries(geometry: BaseGeometry): + """Test if invalid geometry filters raise errors.""" + with pytest.raises(InvalidGeometryFilter): + PbfFileReader(geometry_filter=geometry) + + @pytest.mark.parametrize( # type: ignore "geometry", [ @@ -292,147 +355,6 @@ def test_geometry_orienting(geometry: BaseGeometry): ut.assertAlmostEqual(iou, 1, delta=1e-4) -# Copyright (C) 2011 by Hong Minhee , -# Robert Kajic -# Copyright (C) 2020 by Salesforce.com, Inc - -# Permission is hereby granted, free of charge, 
to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -def parse_hstore_tags(tags: str) -> dict[str, Optional[str]]: - """ - Parse hstore tags to python dict. - - This function has been copied from pghstore library - https://github.com/heroku/pghstore/blob/main/src/pghstore/_native.py - since it can't be installed on Windows. 
- """ - ESCAPE_RE = re.compile(r"\\(.)") - - PAIR_RE = re.compile( - r'\s*(?:"(?P(?:[^\\"]|\\.)*)")\s*=>\s*' - r'(?:"(?P(?:[^\\"]|\\.)*)"|(?PNULL))' - r"\s*(?:(?P,)|$)", - re.IGNORECASE, - ) - - def _unescape(s: str) -> str: - return ESCAPE_RE.sub(r"\1", s) - - def _parse(string: str, encoding: str = "utf-8") -> Iterable[tuple[str, Optional[str]]]: - if isinstance(string, six.binary_type): - string = string.decode(encoding) - - string = string.strip() - offset = 0 - term_sep = None - for match in PAIR_RE.finditer(string): - if match.start() > offset: - raise ValueError("malformed hstore value: position %d" % offset) - - key = value = None - kq = match.group("kq") - if kq: - key = _unescape(kq) - - if key is None: - raise ValueError("Malformed hstore value starting at position %d" % offset) - - vq = match.group("vq") - if vq: - value = _unescape(vq) - elif match.group("vn"): - value = "" - else: - value = "" - - yield key, value - - term_sep = match.group("ts") - - offset = match.end() - - if len(string) > offset or term_sep: - raise ValueError("malformed hstore value: position %d" % offset) - - return dict(_parse(tags, encoding="utf-8")) - - -def transform_pbf_to_gpkg(extract_name: str, layer_name: str) -> Path: - """Uses GDAL ogr2ogr to transform PBF file into GPKG.""" - input_file = Path(__file__).parent.parent / "files" / f"{extract_name}.osm.pbf" - output_file = Path(__file__).parent.parent / "files" / f"{extract_name}_{layer_name}.gpkg" - config_file = Path(__file__).parent.parent / "test_files" / "osmconf.ini" - args = [ - "ogr2ogr" if platform.system() != "Windows" else "ogr2ogr.exe", - str(output_file), - str(input_file), - layer_name, - "-oo", - f"CONFIG_FILE={config_file}", - ] - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=-1) - _, err = p.communicate() - rc = p.returncode - if rc > 0: - raise RuntimeError(rc, err) - - return output_file - - -def read_features_with_pyogrio(extract_name: str) -> gpd.GeoDataFrame: - 
"""Read features from *.osm.pbf file using pyogrio.""" - gdfs = [] - for layer_name in ("points", "lines", "multilinestrings", "multipolygons", "other_relations"): - gpkg_file_path = transform_pbf_to_gpkg(extract_name, layer_name) - gdf = pyogrio.read_dataframe(gpkg_file_path) - - if layer_name == "points": - gdf[FEATURES_INDEX] = "node/" + gdf["osm_id"] - elif layer_name == "lines": - gdf[FEATURES_INDEX] = "way/" + gdf["osm_id"] - elif layer_name in ("multilinestrings", "other_relations"): - gdf[FEATURES_INDEX] = "relation/" + gdf["osm_id"] - elif layer_name == "multipolygons": - gdf[FEATURES_INDEX] = gdf.apply( - lambda row: ( - "relation/" + row["osm_id"] - if row["osm_id"] is not None - else "way/" + row["osm_way_id"] - ), - axis=1, - ) - - gdfs.append(gdf) - - final_gdf = gpd.pd.concat(gdfs) - final_gdf = final_gdf[~final_gdf["all_tags"].isnull()] - final_gdf["tags"] = final_gdf["all_tags"].apply(parse_hstore_tags) - non_relations = ~final_gdf[FEATURES_INDEX].str.startswith("relation/") - relations = final_gdf[FEATURES_INDEX].str.startswith("relation/") - matching_relations = relations & final_gdf["tags"].apply( - lambda x: x.get("type") in ("boundary", "multipolygon") - ) - final_gdf = final_gdf[non_relations | matching_relations] - final_gdf.geometry = final_gdf.geometry.make_valid() - return final_gdf[[FEATURES_INDEX, "tags", "geometry"]].set_index(FEATURES_INDEX) - - def check_if_relation_in_osm_is_valid_based_on_tags(pbf_file: str, relation_id: str) -> bool: """Check if given relation in OSM is valid.""" duckdb.load_extension("spatial") @@ -608,16 +530,24 @@ def extract_polygons_from_geometry(geometry: BaseGeometry) -> list[Union[Polygon @P.case("Monaco", "monaco") # type: ignore @P.case("Panama", "panama") # type: ignore @P.case("Seychelles", "seychelles") # type: ignore -@P.case("Sierra Leone", "sierra-leone") # type: ignore def test_gdal_parity(extract_name: str) -> None: - """Test if loaded data is similar to GDAL results.""" + """ + Test if loaded 
data is similar to GDAL results. + + Test downloads prepared pbf files and parsed geoparquet using GDAL from kraina-ai/srai-test- + files repository. + """ pbf_file_download_url = LFS_DIRECTORY_URL + f"{extract_name}-latest.osm.pbf" pbf_file_path = Path(__file__).parent.parent / "files" / f"{extract_name}.osm.pbf" download_file(pbf_file_download_url, str(pbf_file_path), force_download=True) + gpq_file_download_url = LFS_DIRECTORY_URL + f"{extract_name}-latest.geoparquet" + gpq_file_path = Path(__file__).parent.parent / "files" / f"{extract_name}.geoparquet" + download_file(gpq_file_download_url, str(gpq_file_path), force_download=True) reader = PbfFileReader() duckdb_gdf = reader.get_features_gdf([pbf_file_path], explode_tags=False, ignore_cache=True) - gdal_gdf = read_features_with_pyogrio(extract_name) + gdal_gdf = gpd.read_parquet(gpq_file_path) + gdal_gdf["tags"] = gdal_gdf["tags"].apply(json.loads) gdal_index = gdal_gdf.index duckdb_index = duckdb_gdf.index @@ -734,12 +664,14 @@ def test_gdal_parity(extract_name: str) -> None: ] = gpd.GeoSeries( invalid_geometries_df.loc[ invalid_geometries_df["geometry_both_closed_or_not"], "duckdb_geometry" - ] + ], + crs=WGS84_CRS, ).geom_equals_exact( gpd.GeoSeries( invalid_geometries_df.loc[ invalid_geometries_df["geometry_both_closed_or_not"], "gdal_geometry" - ] + ], + crs=WGS84_CRS, ), tolerance=tolerance, ) @@ -758,12 +690,14 @@ def test_gdal_parity(extract_name: str) -> None: ] = gpd.GeoSeries( invalid_geometries_df.loc[ invalid_geometries_df["geometry_both_closed_or_not"], "duckdb_geometry" - ] + ], + crs=WGS84_CRS, ).geom_equals( gpd.GeoSeries( invalid_geometries_df.loc[ invalid_geometries_df["geometry_both_closed_or_not"], "gdal_geometry" - ] + ], + crs=WGS84_CRS, ) ) invalid_geometries_df = invalid_geometries_df.loc[ @@ -788,12 +722,14 @@ def test_gdal_parity(extract_name: str) -> None: ) invalid_geometries_df.loc[matching_polygon_geometries_mask, "geometry_intersection_area"] = ( gpd.GeoSeries( - 
invalid_geometries_df.loc[matching_polygon_geometries_mask, "duckdb_geometry"] + invalid_geometries_df.loc[matching_polygon_geometries_mask, "duckdb_geometry"], + crs=WGS84_CRS, ) .intersection( gpd.GeoSeries( - invalid_geometries_df.loc[matching_polygon_geometries_mask, "gdal_geometry"] - ) + invalid_geometries_df.loc[matching_polygon_geometries_mask, "gdal_geometry"], + crs=WGS84_CRS, + ), ) .area ) @@ -804,10 +740,12 @@ def test_gdal_parity(extract_name: str) -> None: matching_polygon_geometries_mask, "geometry_intersection_area" ] / ( gpd.GeoSeries( - invalid_geometries_df.loc[matching_polygon_geometries_mask, "duckdb_geometry"] + invalid_geometries_df.loc[matching_polygon_geometries_mask, "duckdb_geometry"], + crs=WGS84_CRS, ).area + gpd.GeoSeries( - invalid_geometries_df.loc[matching_polygon_geometries_mask, "gdal_geometry"] + invalid_geometries_df.loc[matching_polygon_geometries_mask, "gdal_geometry"], + crs=WGS84_CRS, ).area - invalid_geometries_df.loc[matching_polygon_geometries_mask, "geometry_intersection_area"] ) diff --git a/tests/test_files/osmconf.ini b/tests/test_files/osmconf.ini deleted file mode 100644 index 227433b..0000000 --- a/tests/test_files/osmconf.ini +++ /dev/null @@ -1,132 +0,0 @@ -# -# Configuration file for OSM import -# - -# put here the name of keys, or key=value, for ways that are assumed to be polygons if they are closed -# see http://wiki.openstreetmap.org/wiki/Map_Features -closed_ways_are_polygons=aeroway,amenity,boundary,building,building:part,craft,geological,historic,landuse,leisure,military,natural,office,place,shop,sport,tourism,highway=platform,public_transport=platform - -# Uncomment to avoid laundering of keys ( ':' turned into '_' ) -#attribute_name_laundering=no - -# Some tags, set on ways and when building multipolygons, multilinestrings or other_relations, -# are normally filtered out early, independent of the 'ignore' configuration below. -# Uncomment to disable early filtering. 
The 'ignore' lines below remain active. -#report_all_tags=yes - -# uncomment to report all nodes, including the ones without any (significant) tag -#report_all_nodes=yes - -# uncomment to report all ways, including the ones without any (significant) tag -#report_all_ways=yes - -# uncomment to specify the the format for the all_tags/other_tags field should be JSON -# instead of the default HSTORE formatting. -# Valid values for tags_format are "hstore" and "json" -tags_format=hstore - -[points] -# common attributes -osm_id=yes -osm_version=no -osm_timestamp=no -osm_uid=no -osm_user=no -osm_changeset=no - -# keys to report as OGR fields -attributes=name,barrier,highway,ref,address,is_in,place,man_made -# keys that, alone, are not significant enough to report a node as a OGR point -unsignificant=created_by,converted_by,source,time,ele,attribution -# keys that should NOT be reported in the "other_tags" field -ignore=created_by,converted_by,source,time,ele,note,todo,openGeoDB:,fixme,FIXME -# uncomment to avoid creation of "other_tags" field -other_tags=no -# uncomment to create "all_tags" field. "all_tags" and "other_tags" are exclusive -all_tags=yes - -[lines] -# common attributes -osm_id=yes -osm_version=no -osm_timestamp=no -osm_uid=no -osm_user=no -osm_changeset=no - -# keys to report as OGR fields -attributes=name,highway,waterway,aerialway,barrier,man_made,railway - -# type of attribute 'foo' can be changed with something like -#foo_type=Integer/Real/String/DateTime - -# keys that should NOT be reported in the "other_tags" field -ignore=created_by,converted_by,source,time,ele,note,todo,openGeoDB:,fixme,FIXME -# uncomment to avoid creation of "other_tags" field -other_tags=no -# uncomment to create "all_tags" field. 
"all_tags" and "other_tags" are exclusive -all_tags=yes - -#computed_attributes must appear before the keywords _type and _sql -computed_attributes=z_order -z_order_type=Integer -# Formula based on https://github.com/openstreetmap/osm2pgsql/blob/master/style.lua#L13 -# [foo] is substituted by value of tag foo. When substitution is not wished, the [ character can be escaped with \[ in literals -# Note for GDAL developers: if we change the below formula, make sure to edit ogrosmlayer.cpp since it has a hardcoded optimization for this very precise formula -z_order_sql="SELECT (CASE [highway] WHEN 'minor' THEN 3 WHEN 'road' THEN 3 WHEN 'unclassified' THEN 3 WHEN 'residential' THEN 3 WHEN 'tertiary_link' THEN 4 WHEN 'tertiary' THEN 4 WHEN 'secondary_link' THEN 6 WHEN 'secondary' THEN 6 WHEN 'primary_link' THEN 7 WHEN 'primary' THEN 7 WHEN 'trunk_link' THEN 8 WHEN 'trunk' THEN 8 WHEN 'motorway_link' THEN 9 WHEN 'motorway' THEN 9 ELSE 0 END) + (CASE WHEN [bridge] IN ('yes', 'true', '1') THEN 10 ELSE 0 END) + (CASE WHEN [tunnel] IN ('yes', 'true', '1') THEN -10 ELSE 0 END) + (CASE WHEN [railway] IS NOT NULL THEN 5 ELSE 0 END) + (CASE WHEN [layer] IS NOT NULL THEN 10 * CAST([layer] AS INTEGER) ELSE 0 END)" - -[multipolygons] -# common attributes -# note: for multipolygons, osm_id=yes instantiates a osm_id field for the id of relations -# and a osm_way_id field for the id of closed ways. Both fields are exclusively set. -osm_id=yes -osm_version=no -osm_timestamp=no -osm_uid=no -osm_user=no -osm_changeset=no - -# keys to report as OGR fields -attributes=name,type,aeroway,amenity,admin_level,barrier,boundary,building,craft,geological,historic,land_area,landuse,leisure,man_made,military,natural,office,place,shop,sport,tourism -# keys that should NOT be reported in the "other_tags" field -ignore=area,created_by,converted_by,source,time,ele,note,todo,openGeoDB:,fixme,FIXME -# uncomment to avoid creation of "other_tags" field -other_tags=no -# uncomment to create "all_tags" field. 
"all_tags" and "other_tags" are exclusive -all_tags=yes - -[multilinestrings] -# common attributes -osm_id=yes -osm_version=no -osm_timestamp=no -osm_uid=no -osm_user=no -osm_changeset=no - -# keys to report as OGR fields -attributes=name,type -# keys that should NOT be reported in the "other_tags" field -ignore=area,created_by,converted_by,source,time,ele,note,todo,openGeoDB:,fixme,FIXME -# uncomment to avoid creation of "other_tags" field -other_tags=no -# uncomment to create "all_tags" field. "all_tags" and "other_tags" are exclusive -all_tags=yes - -[other_relations] -# common attributes -osm_id=yes -osm_version=no -osm_timestamp=no -osm_uid=no -osm_user=no -osm_changeset=no - -# keys to report as OGR fields -attributes=name,type -# keys that should NOT be reported in the "other_tags" field -ignore=area,created_by,converted_by,source,time,ele,note,todo,openGeoDB:,fixme,FIXME -# uncomment to avoid creation of "other_tags" field -other_tags=no -# uncomment to create "all_tags" field. "all_tags" and "other_tags" are exclusive -all_tags=yes