Skip to content

Commit

Permalink
Add initial implementation of exactextract
Browse files Browse the repository at this point in the history
  • Loading branch information
tm-jc-nacpil committed Jul 1, 2024
1 parent 1cfb0d1 commit 806a769
Show file tree
Hide file tree
Showing 4 changed files with 656 additions and 63 deletions.
Binary file added data/ph_s5p_AER_AI_340_380.tiff
Binary file not shown.
7 changes: 6 additions & 1 deletion geowrangler/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,15 @@
'geowrangler/raster_to_dataframe.py'),
'geowrangler.raster_to_dataframe.read_bands': ( 'raster_to_dataframe.html#read_bands',
'geowrangler/raster_to_dataframe.py')},
'geowrangler.raster_zonal_stats': { 'geowrangler.raster_zonal_stats.create_raster_zonal_stats': ( 'raster_zonal_stats.html#create_raster_zonal_stats',
'geowrangler.raster_zonal_stats': { 'geowrangler.raster_zonal_stats._validate_aggs': ( 'raster_zonal_stats.html#_validate_aggs',
'geowrangler/raster_zonal_stats.py'),
'geowrangler.raster_zonal_stats.create_exactextract_zonal_stats': ( 'raster_zonal_stats.html#create_exactextract_zonal_stats',
'geowrangler/raster_zonal_stats.py'),
'geowrangler.raster_zonal_stats.create_raster_zonal_stats': ( 'raster_zonal_stats.html#create_raster_zonal_stats',
'geowrangler/raster_zonal_stats.py')},
'geowrangler.spatialjoin_highest_intersection': { 'geowrangler.spatialjoin_highest_intersection.get_highest_intersection': ( 'spatialjoin_highest_intersection.html#get_highest_intersection',
'geowrangler/spatialjoin_highest_intersection.py')},
'geowrangler.test_module': {},
'geowrangler.tile_clustering': { 'geowrangler.tile_clustering.TileClustering': ( 'tile_clustering.html#tileclustering',
'geowrangler/tile_clustering.py'),
'geowrangler.tile_clustering.TileClustering.__init__': ( 'tile_clustering.html#tileclustering.__init__',
Expand Down
159 changes: 157 additions & 2 deletions geowrangler/raster_zonal_stats.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../notebooks/03_raster_zonal_stats.ipynb.

# %% auto 0
__all__ = ['create_raster_zonal_stats']
__all__ = ['create_raster_zonal_stats', 'create_exactextract_zonal_stats']

# %% ../notebooks/03_raster_zonal_stats.ipynb 8
from pathlib import Path
from typing import Any, Dict, Union
from typing import Any, Dict, Union, List
import warnings
import json

import geopandas as gpd
import pandas as pd
import rasterio
import rasterstats as rs
from exactextract import exact_extract
from exactextract.raster import RasterioRasterSource

from .vector_zonal_stats import _expand_aggs, _fillnas, _fix_agg

Expand Down Expand Up @@ -98,3 +102,154 @@ def create_raster_zonal_stats(
aoi = aoi.merge(results, how="left", left_index=True, right_index=True)

return aoi

# %% ../notebooks/03_raster_zonal_stats.ipynb 29
def _validate_aggs(aggregations, band_count):
"Validate aggregations based on band count, dropping invalid entries"
aggregations_validated = []

# Iterate over the list of dictionaries
for agg in aggregations:
# Check if the 'band' value is less than or equal to band_count
if agg["band"] > band_count:
warnings.warn(
f"Band number in {json.dumps(agg)} is invalid. Aggregation will be skipped."
)
continue
aggregations_validated.append(agg)

return aggregations_validated

# %% ../notebooks/03_raster_zonal_stats.ipynb 30
def create_exactextract_zonal_stats(
aoi: Union[ # The area of interest geodataframe, or path to the vector file
str, gpd.GeoDataFrame
],
data: Union[str, Path], # The path to the raster data file
aggregations: List[Dict[str, Any]], # List of agg specs
extra_args: dict = dict(
strategy="feature-sequential", max_cells_in_memory=30000000
),
) -> gpd.GeoDataFrame:
"""
Computes zonal statistics from raster data sources using vector areas of interest (AOI).
This function is a wrapper over the `exact_extract` method from the `exactextract` Python package,
designed for compatibility with other geowrangler modules. It takes a list of agg specs,
with each agg spec applied to a specific band.
Parameters
----------
aoi : GeoDataframe
A GeoDataframe specifying geometries.
data : str
Path to the raster file.
aggregations: list[Dict]: list of aggregation specs, which are dictionaries which specify the ff.
- band (int): band number of the input raster to aggregate
- func (list[str]): list of operations to perform (i.e. "mean", "median", "count"). For a full list of
possible exactextract functions, see: https://github.com/isciences/exactextract/tree/master?tab=readme-ov-file#supported-statistics
- output (str or list[str], option): output columns for operation.
If None (default), the output column name will be given as band_<band number>_<func> for all functions.
If str, the output column name will be given as <output>_<func> for all functions.
If list(str), the list items will be used as the column names, in order of the passed funcs.
- nodata_val (int or float, optional): nodata value to use in calculation.
extra_args : dict
Examples
--------
>>> create_exactextract_zonal_stats(
aoi=aoi_gdf,
data="path/to/raster.tif",
aggregations=[
dict(band=1, func=["mean", "sum"], nodata_val=-9999) # default - band_1_mean, band_1_sum
dict(band=1, func=["mean", "sum"], output=["pop_mean", "pop_sum"]),
dict(band=2, func=["mean", "sum"]) # default - band_2_mean, band_2_sum
dict(band=2, func=["mean", "sum"], output="prefix") # prefix_mean, prefix_sum
],
)
"""
# TODO: add extra args handling

if isinstance(aoi, str) or isinstance(aoi, Path):
aoi = gpd.read_file(aoi)

# Read metadata to determine raster properties
with rasterio.open(data) as dst:
data_meta = dst.meta
data_count = data_meta["count"]
data_crs = data_meta["crs"]

if aoi.crs != data_crs:
warnings.warn(
f"The CRS of the AOI ({aoi.crs}) and the raster data ({data_crs}) do not match!"
)

# Validate passed aggregations
aggregations = _validate_aggs(aggregations, data_count)

all_operations = set()
for agg in aggregations:
all_operations.update(agg["func"])

# TODO: figure out NODATA handling
# # If no data is set, add funcs with default_value into the list of operations
# if agg.get('nodata_val') is not None:
# nodata_val = str(agg.get('nodata_val'))
# operations_default_val = [f"{func}(default_value={nodata_val})" for func in agg.get('func')]
# all_operations.update(operations_default_val)

all_operations = sorted(all_operations)

# If input is single band, the raster will be given as band_1_<func> for all funcs
if data_count == 1:
results = exact_extract(
RasterioRasterSource(data),
aoi,
all_operations,
output="pandas",
include_geom=False,
)
results = results.rename(
columns={col: f"band_1_{col}" for col in results.columns}
)
else:
results = exact_extract(
data,
aoi,
all_operations,
output="pandas",
include_geom=False,
)

# Create new columns as specified by agg specs
# Each renamed column will be a copy of its corresponding result column
out_cols = []
for agg in aggregations:
result_cols = [f"band_{agg['band']}_{func}" for func in agg["func"]]

if agg.get("output") is None:
agg_out_cols = result_cols

if isinstance(agg.get("output"), str):
agg_out_cols = [f"{agg.get('output')}_{func}" for func in agg["func"]]

if isinstance(agg.get("output"), list) and all(
isinstance(item, str) for item in agg.get("output")
):
assert len(agg.get("output")) == len(
agg.get("func")
), f"Output list only has {len(agg.get('output'))}, expected {len(agg.get('func'))}"
agg_out_cols = agg.get("output")

# Add output columns to result
for agg_out_col, result_col in zip(agg_out_cols, result_cols):
results[agg_out_col] = results[result_col]
out_cols.extend(agg_out_cols)

# Return only the columns specified
output_results = results.loc[:, out_cols]

return aoi.merge(output_results, how="left", left_index=True, right_index=True)
553 changes: 493 additions & 60 deletions notebooks/03_raster_zonal_stats.ipynb

Large diffs are not rendered by default.

0 comments on commit 806a769

Please sign in to comment.