Skip to content

Commit

Permalink
add new statistics methods and band names in ImageData object (#427)
Browse files Browse the repository at this point in the history
* add new statistics methods and band names in ImageData object

* update base classes and tests

* MultiBandReader.statistics should use self.preview as COGReader

* update tests
  • Loading branch information
vincentsarago authored Oct 12, 2021
1 parent 2028c42 commit cc50276
Show file tree
Hide file tree
Showing 9 changed files with 744 additions and 134 deletions.
186 changes: 164 additions & 22 deletions rio_tiler/io/base.py

Large diffs are not rendered by default.

86 changes: 81 additions & 5 deletions rio_tiler/io/cogeo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@
from ..constants import WEB_MERCATOR_TMS, WGS84_CRS, BBox, Indexes, NoData
from ..errors import ExpressionMixingWarning, NoOverviewWarning, TileOutsideBounds
from ..expression import apply_expression, parse_expression
from ..models import ImageData, ImageStatistics, Info
from ..utils import create_cutline, has_alpha_band, has_mask_band
from ..models import BandStatistics, ImageData, ImageStatistics, Info
from ..utils import (
create_cutline,
get_array_statistics,
get_bands_names,
has_alpha_band,
has_mask_band,
)
from .base import BaseReader


Expand Down Expand Up @@ -248,6 +254,11 @@ def stats(
rio_tiler.models.ImageStatistics: bands statistics.
"""
warnings.warn(
"`stats` method will be removed and replaced by `statistics` in rio-tiler v3.0.0",
DeprecationWarning,
)

kwargs = {**self._kwargs, **kwargs}

hist_options = hist_options or {}
Expand All @@ -262,6 +273,48 @@ def stats(
)
return {b: ImageStatistics(**s) for b, s in stats.items()}

def statistics(
self,
categorical: bool = False,
categories: Optional[List[float]] = None,
percentiles: List[int] = [2, 98],
hist_options: Optional[Dict] = None,
max_size: int = 1024,
**kwargs: Any,
) -> Dict[str, BandStatistics]:
"""Return bands statistics from a dataset.
Args:
categorical (bool): treat input data as categorical data. Defaults to False.
categories (list of numbers, optional): list of caterogies to return value for.
percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`.
hist_options (dict, optional): Options to forward to numpy.histogram function.
max_size (int, optional): Limit the size of the longest dimension of the dataset read, respecting bounds X/Y aspect ratio. Defaults to 1024.
kwargs (optional): Options to forward to `self.preview`.
Returns:
Dict[str, rio_tiler.models.BandStatistics]: bands statistics.
"""
kwargs = {**self._kwargs, **kwargs}

data = self.preview(max_size=max_size, **kwargs)

hist_options = hist_options or {}

stats = get_array_statistics(
data.as_masked(),
categorical=categorical,
categories=categories,
percentiles=percentiles,
**hist_options,
)

return {
f"{data.band_names[ix]}": BandStatistics(**stats[ix])
for ix in range(len(stats))
}

def tile(
self,
tile_x: int,
Expand Down Expand Up @@ -383,7 +436,16 @@ def part(
if bounds_crs and bounds_crs != dst_crs:
bbox = transform_bounds(bounds_crs, dst_crs, *bbox, densify_pts=21)

return ImageData(data, mask, bounds=bbox, crs=dst_crs, assets=[self.filepath],)
return ImageData(
data,
mask,
bounds=bbox,
crs=dst_crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)

def preview(
self,
Expand Down Expand Up @@ -437,7 +499,14 @@ def preview(
data = apply_expression(blocks, bands, data)

return ImageData(
data, mask, bounds=self.bounds, crs=self.crs, assets=[self.filepath],
data,
mask,
bounds=self.bounds,
crs=self.crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)

def point(
Expand Down Expand Up @@ -579,7 +648,14 @@ def read(
data = apply_expression(blocks, bands, data)

return ImageData(
data, mask, bounds=self.bounds, crs=self.crs, assets=[self.filepath],
data,
mask,
bounds=self.bounds,
crs=self.crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)


Expand Down
48 changes: 46 additions & 2 deletions rio_tiler/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""rio-tiler models."""

import itertools
import warnings
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
Expand Down Expand Up @@ -82,6 +83,30 @@ class ImageStatistics(RioTilerBaseModel):
valid_percent: float


class BandStatistics(RioTilerBaseModel):
"""Image statistics"""

min: float
max: float
mean: float
count: float
sum: float
std: float
median: float
majority: float
minority: float
unique: float
histogram: List[List[NumType]]
valid_percent: float
masked_pixels: float
valid_pixels: float

class Config:
"""Config for model."""

extra = "allow" # We allow extra values for `percentiles_{}`


class Metadata(Info):
"""Dataset metadata and statistics."""

Expand Down Expand Up @@ -113,6 +138,7 @@ class ImageData:
bounds: Optional[BoundingBox] = attr.ib(default=None, converter=to_coordsbbox)
crs: Optional[CRS] = attr.ib(default=None)
metadata: Optional[Dict] = attr.ib(factory=dict)
band_names: Optional[List[str]] = attr.ib()

@data.validator
def _validate_data(self, attribute, value):
Expand All @@ -122,6 +148,10 @@ def _validate_data(self, attribute, value):
"ImageData data has to be an array in form of (count, height, width)"
)

@band_names.default
def _default_names(self):
return [f"{ix + 1}" for ix in range(self.count)]

@mask.default
def _default_mask(self):
return numpy.zeros((self.height, self.width), dtype="uint8") + 255
Expand All @@ -141,15 +171,29 @@ def create_from_list(cls, data: Sequence["ImageData"]):
"""
arr = numpy.concatenate([img.data for img in data])
mask = numpy.all([img.mask for img in data], axis=0).astype(numpy.uint8) * 255
assets = [img.assets[0] for img in data if img.assets]
assets = list(
dict.fromkeys(
itertools.chain.from_iterable(
[img.assets for img in data if img.assets]
)
)
)

bounds_values = [img.bounds for img in data if img.bounds]
bounds = bounds_values[0] if bounds_values else None

crs_values = [img.crs for img in data if img.crs]
crs = crs_values[0] if crs_values else None

return cls(arr, mask, assets=assets, crs=crs, bounds=bounds)
band_names = list(
itertools.chain.from_iterable(
[img.band_names for img in data if img.band_names]
)
)

return cls(
arr, mask, assets=assets, crs=crs, bounds=bounds, band_names=band_names
)

def as_masked(self) -> numpy.ma.MaskedArray:
"""return a numpy masked array."""
Expand Down
126 changes: 125 additions & 1 deletion rio_tiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
from io import BytesIO
from typing import Any, Dict, Generator, Optional, Sequence, Tuple, Union
from typing import Any, Dict, Generator, List, Optional, Sequence, Tuple, Union

import numpy
from affine import Affine
Expand Down Expand Up @@ -88,6 +88,130 @@ def _stats(
)


def get_bands_names(
indexes: Optional[Sequence[int]] = None,
expression: Optional[str] = None,
count: Optional[int] = None,
) -> List[str]:
"""Define bands names based on expression, indexes or band count."""
if expression:
return expression.split(",")

elif indexes:
return [str(idx) for idx in indexes]

elif count:
return [str(idx + 1) for idx in range(count)]

else:
raise ValueError(
"one of expression or indexes or count must be passed to define band names."
)


def get_array_statistics(
data: numpy.ma.array,
categorical: bool = False,
categories: Optional[List[float]] = None,
percentiles: List[int] = [2, 98],
**kwargs: Any,
) -> List[Dict[Any, Any]]:
"""Calculate per band array statistics.
Args:
data (numpy.ma.ndarray): input masked array data to get the statistics from.
categorical (bool): treat input data as categorical data. Defaults to False.
categories (list of numbers, optional): list of caterogies to return value for.
percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`.
kwargs (optional): options to forward to `numpy.histogram` function (only applies for non-categorical data).
Returns:
list of dict
Examples:
>>> data = numpy.ma.zeros((1, 256, 256))
>>> get_array_statistics(data)
[
{
'min': 0.0,
'max': 0.0,
'mean': 0.0,
'count': 65536.0,
'sum': 0.0,
'std': 0.0,
'median': 0.0,
'majority': 0.0,
'minority': 0.0,
'unique': 1.0,
'percentile_2': 0.0,
'percentile_98': 0.0,
'histogram': [
[0, 0, 0, 0, 0, 65536, 0, 0, 0, 0],
[-0.5, -0.4, -0.3, -0.19999999999999996, -0.09999999999999998, 0.0, 0.10000000000000009, 0.20000000000000007, 0.30000000000000004, 0.4, 0.5]
],
'valid_pixels': 65536.0,
'masked_pixels': 0.0,
'valid_percent': 100.0
}
]
"""
if len(data.shape) < 3:
data = numpy.expand_dims(data, axis=0)

output: List[Dict[Any, Any]] = []
percentiles_names = [f"percentile_{int(p)}" for p in percentiles]

for b in range(data.shape[0]):
keys, counts = numpy.unique(data[b].compressed(), return_counts=True)

valid_pixels = float(numpy.ma.count(data[b]))
masked_pixels = float(numpy.ma.count_masked(data[b]))
valid_percent = round((valid_pixels / data[b].size) * 100, 2)
info_px = {
"valid_pixels": valid_pixels,
"masked_pixels": masked_pixels,
"valid_percent": valid_percent,
}

if categorical:
out_dict = dict(zip(keys.tolist(), counts.tolist()))
h_keys = (
numpy.array(categories).astype(keys.dtype) if categories else keys
).tolist()
histogram = [
[out_dict[x] for x in h_keys],
h_keys,
]
else:
h_counts, h_keys = numpy.histogram(data[b][~data[b].mask], **kwargs)
histogram = [h_counts.tolist(), h_keys.tolist()]

percentiles_values = numpy.percentile(
data[b].compressed(), percentiles
).tolist()

output.append(
{
"min": float(data[b].min()),
"max": float(data[b].max()),
"mean": float(data[b].mean()),
"count": float(data[b].count()),
"sum": float(data[b].sum()),
"std": float(data[b].std()),
"median": float(numpy.ma.median(data[b])),
"majority": float(keys[counts.tolist().index(counts.max())].tolist()),
"minority": float(keys[counts.tolist().index(counts.min())].tolist()),
"unique": float(counts.size),
**dict(zip(percentiles_names, percentiles_values)),
"histogram": histogram,
**info_px,
}
)

return output


# https://github.com/OSGeo/gdal/blob/b1c9c12ad373e40b955162b45d704070d4ebf7b0/gdal/frmts/ingr/IngrTypes.cpp#L191
def _div_round_up(a: int, b: int) -> int:
return (a // b) if (a % b) == 0 else (a // b) + 1
Expand Down
Loading

0 comments on commit cc50276

Please sign in to comment.