Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add new statistics methods and band names in ImageData object #427

Merged
merged 5 commits into from
Oct 12, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 171 additions & 22 deletions rio_tiler/io/base.py

Large diffs are not rendered by default.

74 changes: 71 additions & 3 deletions rio_tiler/io/cogeo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@
from ..constants import WEB_MERCATOR_TMS, WGS84_CRS, BBox, Indexes, NoData
from ..errors import ExpressionMixingWarning, NoOverviewWarning, TileOutsideBounds
from ..expression import apply_expression, parse_expression
from ..models import ImageData, ImageStatistics, Info
from ..utils import create_cutline, has_alpha_band, has_mask_band
from ..models import BandStatistics, ImageData, ImageStatistics, Info
from ..utils import (
create_cutline,
get_array_statistics,
get_bands_names,
has_alpha_band,
has_mask_band,
)
from .base import BaseReader


Expand Down Expand Up @@ -236,6 +242,11 @@ def stats(
rio_tiler.models.ImageStatistics: bands statistics.

"""
warnings.warn(
"`stats` method will be removed and replaced by `statistics` in rio-tiler v3.0.0",
DeprecationWarning,
)

kwargs = {**self._kwargs, **kwargs}

hist_options = hist_options or {}
Expand All @@ -250,6 +261,48 @@ def stats(
)
return {b: ImageStatistics(**s) for b, s in stats.items()}

def statistics(
self,
categorical: bool = False,
categories: Optional[List[float]] = None,
percentiles: List[int] = [2, 98],
hist_options: Optional[Dict] = None,
max_size: int = 1024,
**kwargs: Any,
) -> Dict[str, BandStatistics]:
"""Return bands statistics from a dataset.

Args:
categorical (bool): treat input data as categorical data. Defaults to False.
categories (list of numbers, optional): list of caterogies to return value for.
percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`.
hist_options (dict, optional): Options to forward to numpy.histogram function.
max_size (int, optional): Limit the size of the longest dimension of the dataset read, respecting bounds X/Y aspect ratio. Defaults to 1024.
kwargs (optional): Options to forward to `self.preview`.

Returns:
Dict[str, rio_tiler.models.BandStatistics]: bands statistics.

"""
kwargs = {**self._kwargs, **kwargs}

data = self.preview(max_size=max_size, **kwargs)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the statistics method will only get stats for the full dataset (using preview) but we can set max_size to control the resolution the user wants


hist_options = hist_options or {}

stats = get_array_statistics(
data.as_masked(),
categorical=categorical,
categories=categories,
percentiles=percentiles,
**hist_options,
)

return {
f"{data.band_names[ix]}": BandStatistics(**stats[ix])
Copy link
Member Author

@vincentsarago vincentsarago Sep 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we use band_names to keep trace of the bands we return

e.g.

with COGReader("cog.tif") as cog:
    print(cog.statistics(expression="b1*2,b1"))

>> {
    'b1*2': BandStatistics(...),
    'b1': BandStatistics(...),
}

for ix in range(len(stats))
}

def tile(
self,
tile_x: int,
Expand Down Expand Up @@ -371,7 +424,16 @@ def part(
if bounds_crs and bounds_crs != dst_crs:
bbox = transform_bounds(bounds_crs, dst_crs, *bbox, densify_pts=21)

return ImageData(data, mask, bounds=bbox, crs=dst_crs, assets=[self.filepath],)
return ImageData(
data,
mask,
bounds=bbox,
crs=dst_crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)

def preview(
self,
Expand Down Expand Up @@ -430,6 +492,9 @@ def preview(
bounds=self.dataset.bounds,
crs=self.dataset.crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)

def point(
Expand Down Expand Up @@ -572,6 +637,9 @@ def read(
bounds=self.dataset.bounds,
crs=self.dataset.crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)


Expand Down
48 changes: 46 additions & 2 deletions rio_tiler/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""rio-tiler models."""

import itertools
import warnings
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
Expand Down Expand Up @@ -83,6 +84,30 @@ class ImageStatistics(RioTilerBaseModel):
valid_percent: float


class BandStatistics(RioTilerBaseModel):
"""Image statistics"""

min: float
max: float
mean: float
count: float
sum: float
std: float
median: float
majority: float
minority: float
unique: float
histogram: List[List[NumType]]
valid_percent: float
masked_pixels: float
valid_pixels: float

class Config:
"""Config for model."""

extra = "allow" # We allow extra values for `percentiles_{}`


class Metadata(Info):
"""Dataset metadata and statistics."""

Expand Down Expand Up @@ -114,6 +139,7 @@ class ImageData:
bounds: Optional[BoundingBox] = attr.ib(default=None, converter=to_coordsbbox)
crs: Optional[CRS] = attr.ib(default=None)
metadata: Optional[Dict] = attr.ib(factory=dict)
band_names: Optional[List[str]] = attr.ib()

@data.validator
def _validate_data(self, attribute, value):
Expand All @@ -123,6 +149,10 @@ def _validate_data(self, attribute, value):
"ImageData data has to be an array in form of (count, height, width)"
)

@band_names.default
def _default_names(self):
return [f"{ix + 1}" for ix in range(self.count)]

@mask.default
def _default_mask(self):
return numpy.zeros((self.height, self.width), dtype="uint8") + 255
Expand All @@ -142,15 +172,29 @@ def create_from_list(cls, data: Sequence["ImageData"]):
"""
arr = numpy.concatenate([img.data for img in data])
mask = numpy.all([img.mask for img in data], axis=0).astype(numpy.uint8) * 255
assets = [img.assets[0] for img in data if img.assets]
assets = list(
dict.fromkeys(
itertools.chain.from_iterable(
[img.assets for img in data if img.assets]
)
)
)

bounds_values = [img.bounds for img in data if img.bounds]
bounds = bounds_values[0] if bounds_values else None

crs_values = [img.crs for img in data if img.crs]
crs = crs_values[0] if crs_values else None

return cls(arr, mask, assets=assets, crs=crs, bounds=bounds)
band_names = list(
itertools.chain.from_iterable(
[img.band_names for img in data if img.band_names]
)
)

return cls(
arr, mask, assets=assets, crs=crs, bounds=bounds, band_names=band_names
)

def as_masked(self) -> numpy.ma.MaskedArray:
"""return a numpy masked array."""
Expand Down
126 changes: 125 additions & 1 deletion rio_tiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
from io import BytesIO
from typing import Any, Dict, Generator, Optional, Sequence, Tuple, Union
from typing import Any, Dict, Generator, List, Optional, Sequence, Tuple, Union

import numpy
from affine import Affine
Expand Down Expand Up @@ -88,6 +88,130 @@ def _stats(
)


def get_bands_names(
indexes: Optional[Sequence[int]] = None,
expression: Optional[str] = None,
count: Optional[int] = None,
) -> List[str]:
"""Define bands names based on expression, indexes or band count."""
if expression:
return expression.split(",")

elif indexes:
return [str(idx) for idx in indexes]

elif count:
return [str(idx + 1) for idx in range(count)]

else:
raise ValueError(
"one of expression or indexes or count must be passed to define band names."
)


def get_array_statistics(
data: numpy.ma.array,
categorical: bool = False,
categories: Optional[List[float]] = None,
percentiles: List[int] = [2, 98],
**kwargs: Any,
) -> List[Dict[Any, Any]]:
"""Calculate per band array statistics.

Args:
data (numpy.ma.ndarray): input masked array data to get the statistics from.
categorical (bool): treat input data as categorical data. Defaults to False.
categories (list of numbers, optional): list of caterogies to return value for.
percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`.
kwargs (optional): options to forward to `numpy.histogram` function (only applies for non-categorical data).

Returns:
list of dict

Examples:
>>> data = numpy.ma.zeros((1, 256, 256))
>>> get_array_statistics(data)
[
{
'min': 0.0,
'max': 0.0,
'mean': 0.0,
'count': 65536.0,
'sum': 0.0,
'std': 0.0,
'median': 0.0,
'majority': 0.0,
'minority': 0.0,
'unique': 1.0,
'percentile_2': 0.0,
'percentile_98': 0.0,
'histogram': [
[0, 0, 0, 0, 0, 65536, 0, 0, 0, 0],
[-0.5, -0.4, -0.3, -0.19999999999999996, -0.09999999999999998, 0.0, 0.10000000000000009, 0.20000000000000007, 0.30000000000000004, 0.4, 0.5]
],
'valid_pixels': 65536.0,
'masked_pixels': 0.0,
'valid_percent': 100.0
}
]

"""
if len(data.shape) < 3:
data = numpy.expand_dims(data, axis=0)

output: List[Dict[Any, Any]] = []
percentiles_names = [f"percentile_{int(p)}" for p in percentiles]

for b in range(data.shape[0]):
keys, counts = numpy.unique(data[b].compressed(), return_counts=True)

valid_pixels = float(numpy.ma.count(data[b]))
masked_pixels = float(numpy.ma.count_masked(data[b]))
valid_percent = round((valid_pixels / data[b].size) * 100, 2)
info_px = {
"valid_pixels": valid_pixels,
"masked_pixels": masked_pixels,
"valid_percent": valid_percent,
}

if categorical:
out_dict = dict(zip(keys.tolist(), counts.tolist()))
h_keys = (
numpy.array(categories).astype(keys.dtype) if categories else keys
).tolist()
histogram = [
[out_dict[x] for x in h_keys],
h_keys,
]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if the data is set as categorical the histogram is then set to be the count of each values.

else:
h_counts, h_keys = numpy.histogram(data[b][~data[b].mask], **kwargs)
histogram = [h_counts.tolist(), h_keys.tolist()]

percentiles_values = numpy.percentile(
data[b].compressed(), percentiles
).tolist()

output.append(
{
"min": float(data[b].min()),
"max": float(data[b].max()),
"mean": float(data[b].mean()),
"count": float(data[b].count()),
"sum": float(data[b].sum()),
"std": float(data[b].std()),
"median": float(numpy.ma.median(data[b])),
"majority": float(keys[counts.tolist().index(counts.max())].tolist()),
"minority": float(keys[counts.tolist().index(counts.min())].tolist()),
"unique": float(counts.size),
**dict(zip(percentiles_names, percentiles_values)),
"histogram": histogram,
**info_px,
}
)

return output


# https://github.com/OSGeo/gdal/blob/b1c9c12ad373e40b955162b45d704070d4ebf7b0/gdal/frmts/ingr/IngrTypes.cpp#L191
def _div_round_up(a: int, b: int) -> int:
return (a // b) if (a % b) == 0 else (a // b) + 1
Expand Down
Loading