
Commit

feat: remove redundancy of geoparquet files (#36)
* feat: remove redundancy of geoparquet files

* chore: add changelog entry

* fix: change result file naming

* fix: add requests dependency

* chore: update readme

* chore: add changelog entry
RaczeQ authored Jan 31, 2024
1 parent edfad6c commit bd3f6f5
Showing 5 changed files with 64 additions and 28 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]
 
+### Changed
+
+- Removed additional redundancy of GeoParquet result files when only one extract covers whole area [#35](https://github.com/kraina-ai/quackosm/issues/35)
+
+### Fixed
+
+- Added missing `requests` dependency
+
 ## [0.4.0] - 2024-01-31
 
 ### Added
25 changes: 18 additions & 7 deletions README.md
@@ -41,15 +41,26 @@ QuackOSM supports **Python >= 3.9**
 ### Dependencies
 
 Required:
-- duckdb (>=0.9.2)
-- pyarrow (>=13.0.0)
-- geoarrow-pyarrow (>=0.1.1)
-- geopandas
-- shapely (>=2.0)
-- typeguard
+- duckdb (>=0.9.2) - For all DuckDB operations on PBF files
+- pyarrow (>=13.0.0) - For parquet files wrangling
+- pyarrow-ops - For easy removal of duplicated features in parquet files
+- geoarrow-pyarrow (>=0.1.1) - For GeoParquet IO operations
+- geopandas - For returning GeoDataFrames and reading Geo files
+- shapely (>=2.0) - For parsing WKT and GeoJSON strings and fixing geometries
+- typeguard - For internal validation of types
+- psutil - For automatic scaling of parameters based on available resources
+- pooch - For downloading `*.osm.pbf` files
+- tqdm - For showing progress bars
+- requests - For iterating OSM PBF files services
+- beautifulsoup4 - For parsing HTML files and scraping required information
 
 Optional:
-- typer[all] (click, colorama, rich, shellingham)
+- typer[all] (click, colorama, rich, shellingham) - For CLI
+- osmnx - For geocoding of strings in CLI
+- h3 - For reading H3 strings in CLI
+- h3ronpy - For transforming H3 indexes into geometries
+- s2 - For transforming S2 indexes into geometries
+- python-geohash - For transforming GeoHash indexes into geometries
 
 ## Usage
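As a quick aside on the dependency list above, here is a small, hypothetical sanity-check script (it is not part of the README or the repository). The import names are assumptions wherever the distribution name differs from the module name (beautifulsoup4 -> bs4, geoarrow-pyarrow -> geoarrow.pyarrow, pyarrow-ops -> pyarrow_ops).

```python
# Hypothetical helper: check that the required QuackOSM dependencies listed above
# can be imported in the current environment.
import importlib

# Import names assumed from the distribution names in the README (see note above).
REQUIRED_MODULES = [
    "duckdb",
    "pyarrow",
    "pyarrow_ops",
    "geoarrow.pyarrow",
    "geopandas",
    "shapely",
    "typeguard",
    "psutil",
    "pooch",
    "tqdm",
    "requests",
    "bs4",
]

missing = []
for module_name in REQUIRED_MODULES:
    try:
        importlib.import_module(module_name)
    except ImportError:
        missing.append(module_name)

if missing:
    print("Missing required dependencies:", ", ".join(missing))
else:
    print("All required QuackOSM dependencies are available.")
```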
2 changes: 1 addition & 1 deletion pdm.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
"tqdm",
"beautifulsoup4",
"pyarrow-ops",
"requests",
]
requires-python = ">=3.9"
readme = "README.md"
56 changes: 36 additions & 20 deletions quackosm/pbf_file_reader.py
@@ -270,29 +270,45 @@ def convert_geometry_filter_to_gpq(
                 explode_tags=explode_tags,
             )
         )
-        if not result_file_path.exists() or ignore_cache:
-            matching_extracts = find_smallest_containing_extract(
-                self.geometry_filter, self.osm_extract_source
-            )
-            pbf_files = download_extracts_pbf_files(matching_extracts, self.working_directory)
-
-            parsed_geoparquet_files = []
-            for file_path in pbf_files:
-                parsed_geoparquet_file = self.convert_pbf_to_gpq(
-                    file_path,
-                    keep_all_tags=keep_all_tags,
-                    explode_tags=explode_tags,
-                    ignore_cache=ignore_cache,
-                    filter_osm_ids=filter_osm_ids,
-                )
-                parsed_geoparquet_files.append(parsed_geoparquet_file)
-
-            joined_parquet_table = self._drop_duplicates_features_in_pyarrow_table(
-                parsed_geoparquet_files
-            )
-            io.write_geoparquet_table(  # type: ignore
-                joined_parquet_table, result_file_path, primary_geometry_column=GEOMETRY_COLUMN
-            )
+        matching_extracts = find_smallest_containing_extract(
+            self.geometry_filter, self.osm_extract_source
+        )
+
+        if len(matching_extracts) == 1:
+            pbf_files = download_extracts_pbf_files(matching_extracts, self.working_directory)
+            return self.convert_pbf_to_gpq(
+                pbf_files[0],
+                result_file_path=result_file_path,
+                keep_all_tags=keep_all_tags,
+                explode_tags=explode_tags,
+                ignore_cache=ignore_cache,
+                filter_osm_ids=filter_osm_ids,
+            )
+        else:
+            if not result_file_path.exists() or ignore_cache:
+                matching_extracts = find_smallest_containing_extract(
+                    self.geometry_filter, self.osm_extract_source
+                )
+                pbf_files = download_extracts_pbf_files(matching_extracts, self.working_directory)
+
+                parsed_geoparquet_files = []
+                for file_path in pbf_files:
+                    parsed_geoparquet_file = self.convert_pbf_to_gpq(
+                        file_path,
+                        keep_all_tags=keep_all_tags,
+                        explode_tags=explode_tags,
+                        ignore_cache=ignore_cache,
+                        filter_osm_ids=filter_osm_ids,
+                    )
+                    parsed_geoparquet_files.append(parsed_geoparquet_file)
+
+                joined_parquet_table = self._drop_duplicates_features_in_pyarrow_table(
+                    parsed_geoparquet_files
+                )
+                io.write_geoparquet_table(  # type: ignore
+                    joined_parquet_table, result_file_path, primary_geometry_column=GEOMETRY_COLUMN
+                )
 
         return Path(result_file_path)

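To illustrate the user-facing effect of this change, below is a minimal usage sketch (not taken from the repository). It assumes that `PbfFileReader` accepts a Shapely `geometry_filter` in its constructor and that the `convert_geometry_filter_to_gpq` arguments shown in the diff all have defaults; check the signatures in `quackosm/pbf_file_reader.py` before relying on them.

```python
# Minimal sketch, assuming the constructor argument and defaults described above.
# After this commit, when a single OSM extract covers the whole requested area,
# the GeoParquet file produced by convert_pbf_to_gpq is returned directly instead
# of being rewritten into a second, joined GeoParquet file; multiple extracts are
# still parsed separately, deduplicated, and written to a joined result file.
from shapely.geometry import box

from quackosm.pbf_file_reader import PbfFileReader

# Hypothetical area of interest: a small bounding box (lon/lat order).
area_of_interest = box(14.40, 50.05, 14.45, 50.10)

reader = PbfFileReader(geometry_filter=area_of_interest)
result_path = reader.convert_geometry_filter_to_gpq()

print(f"GeoParquet result written to: {result_path}")
```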

