Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add new statistics methods and band names in ImageData object #427

Merged
merged 5 commits into from
Oct 12, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 69 additions & 3 deletions rio_tiler/io/cogeo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@
from ..constants import WEB_MERCATOR_TMS, WGS84_CRS, BBox, Indexes, NoData
from ..errors import ExpressionMixingWarning, NoOverviewWarning, TileOutsideBounds
from ..expression import apply_expression, parse_expression
from ..models import ImageData, ImageStatistics, Info
from ..utils import create_cutline, has_alpha_band, has_mask_band
from ..models import BandStatistics, ImageData, ImageStatistics, Info
from ..utils import (
create_cutline,
get_array_statistics,
get_bands_names,
has_alpha_band,
has_mask_band,
)
from .base import BaseReader


Expand Down Expand Up @@ -236,6 +242,11 @@ def stats(
rio_tiler.models.ImageStatistics: bands statistics.

"""
warnings.warn(
"`stats` method will be removed and replaced by `statistics` in rio-tiler v3.0.0",
DeprecationWarning,
)

kwargs = {**self._kwargs, **kwargs}

hist_options = hist_options or {}
Expand All @@ -250,6 +261,46 @@ def stats(
)
return {b: ImageStatistics(**s) for b, s in stats.items()}

def statistics(
self,
categorical: bool = False,
categories: Optional[List[float]] = None,
percentiles: List[int] = [2, 98],
hist_options: Optional[Dict] = None,
max_size: int = 1024,
**kwargs: Any,
) -> Dict[str, BandStatistics]:
"""Return bands statistics from a COG.

Args:


hist_options (dict, optional): Options to forward to numpy.histogram function.
kwargs (optional): Options to forward to `self.preview`.

Returns:
Dict[str, rio_tiler.models.BandStatistics]: bands statistics.

"""
kwargs = {**self._kwargs, **kwargs}

data = self.preview(max_size=max_size, **kwargs)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the statistics method will only get stats for the full dataset (using preview) but we can set max_size to control the resolution the user wants


hist_options = hist_options or {}

stats = get_array_statistics(
data.as_masked(),
categorical=categorical,
categories=categories,
percentiles=percentiles,
**hist_options,
)

return {
f"{data.band_names[ix]}": BandStatistics(**stats[ix])
Copy link
Member Author

@vincentsarago vincentsarago Sep 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we use band_names to keep trace of the bands we return

e.g.

with COGReader("cog.tif") as cog:
    print(cog.statistics(expression="b1*2,b1"))

>> {
    'b1*2': BandStatistics(...),
    'b1': BandStatistics(...),
}

for ix in range(len(stats))
}

def tile(
self,
tile_x: int,
Expand Down Expand Up @@ -371,7 +422,16 @@ def part(
if bounds_crs and bounds_crs != dst_crs:
bbox = transform_bounds(bounds_crs, dst_crs, *bbox, densify_pts=21)

return ImageData(data, mask, bounds=bbox, crs=dst_crs, assets=[self.filepath],)
return ImageData(
data,
mask,
bounds=bbox,
crs=dst_crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)

def preview(
self,
Expand Down Expand Up @@ -430,6 +490,9 @@ def preview(
bounds=self.dataset.bounds,
crs=self.dataset.crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)

def point(
Expand Down Expand Up @@ -572,6 +635,9 @@ def read(
bounds=self.dataset.bounds,
crs=self.dataset.crs,
assets=[self.filepath],
band_names=get_bands_names(
indexes=indexes, expression=expression, count=data.shape[0]
),
)


Expand Down
29 changes: 29 additions & 0 deletions rio_tiler/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,30 @@ class ImageStatistics(RioTilerBaseModel):
valid_percent: float


class BandStatistics(RioTilerBaseModel):
"""Image statistics"""

min: float
max: float
mean: float
count: float
sum: float
std: float
median: float
majority: float
minority: float
unique: float
histogram: List[List[NumType]]
valid_percent: float
masked_pixels: float
valid_pixels: float

class Config:
"""Config for model."""

extra = "allow"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we allow extra for percentile_{} values



class Metadata(Info):
"""Dataset metadata and statistics."""

Expand Down Expand Up @@ -114,6 +138,7 @@ class ImageData:
bounds: Optional[BoundingBox] = attr.ib(default=None, converter=to_coordsbbox)
crs: Optional[CRS] = attr.ib(default=None)
metadata: Optional[Dict] = attr.ib(factory=dict)
band_names: Optional[List[str]] = attr.ib()

@data.validator
def _validate_data(self, attribute, value):
Expand All @@ -123,6 +148,10 @@ def _validate_data(self, attribute, value):
"ImageData data has to be an array in form of (count, height, width)"
)

@band_names.default
def _default_names(self):
return [f"{ix + 1}" for ix in range(self.count)]

@mask.default
def _default_mask(self):
return numpy.zeros((self.height, self.width), dtype="uint8") + 255
Expand Down
112 changes: 111 additions & 1 deletion rio_tiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
from io import BytesIO
from typing import Any, Dict, Generator, Optional, Sequence, Tuple, Union
from typing import Any, Dict, Generator, List, Optional, Sequence, Tuple, Union

import numpy
from affine import Affine
Expand Down Expand Up @@ -88,6 +88,116 @@ def _stats(
)


def get_bands_names(
indexes: Optional[Sequence[int]] = None,
expression: Optional[str] = None,
count: Optional[int] = None,
) -> List[str]:
"""Define bands names based on expression, indexes or band count."""
if expression:
return expression.split(",")

elif indexes:
return [str(idx) for idx in indexes]

elif count:
return [str(idx + 1) for idx in range(count)]

else:
raise ValueError(
"one of expression or indexes or count must be passed to define band names."
)


def get_array_statistics(
data: numpy.ma.array,
categorical: bool = False,
categories: Optional[List[float]] = None,
percentiles: List[int] = [2, 98],
**kwargs: Any,
) -> List[Dict[Any, Any]]:
"""Calculate array statistics.

Args:
data (numpy.ndarray): Input array data to get the stats from.
categorical (bool): treat input data as categorical data. Defaults to False.
categories (list of numbers, optional)
vincentsarago marked this conversation as resolved.
Show resolved Hide resolved
percentiles (list of numbers, optional): List of percentile values to calculate. Defaults to `[2, 98]`.
kwargs (optional): Options to forward to numpy.histogram function (only applies for non-categorical data).

Returns:
list of dict

Examples:
>>> {
'percentiles': [38, 147],
'min': 20,
'max': 180,
'std': 28.123562304138662,
'histogram': [
[1625, 219241, 28344, 15808, 12325, 10687, 8535, 7348, 4656, 1208],
[20.0, 36.0, 52.0, 68.0, 84.0, 100.0, 116.0, 132.0, 148.0, 164.0, 180.0]
],
'valid_percent': 0.5
}
vincentsarago marked this conversation as resolved.
Show resolved Hide resolved

"""
if len(data.shape) < 3:
data = numpy.expand_dims(data, axis=0)

output: List[Dict[Any, Any]] = []
percentiles_names = [f"percentile_{int(p)}" for p in percentiles]

for b in range(data.shape[0]):
keys, counts = numpy.unique(data[b].compressed(), return_counts=True)

valid_pixels = float(numpy.ma.count(data[b]))
masked_pixels = float(numpy.ma.count_masked(data[b]))
valid_percent = round((valid_pixels / data[b].size) * 100, 2)
info_px = {
"valid_pixels": valid_pixels,
"masked_pixels": masked_pixels,
"valid_percent": valid_percent,
}

if categorical:
out_dict = dict(zip(keys.tolist(), counts.tolist()))
h_keys = (
numpy.array(categories).astype(keys.dtype) if categories else keys
).tolist()
histogram = [
[out_dict[x] for x in h_keys],
h_keys,
]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if the data is set as categorical the histogram is then set to be the count of each values.

else:
h_counts, h_keys = numpy.histogram(data[b][~data[b].mask], **kwargs)
histogram = [h_counts.tolist(), h_keys.tolist()]

percentiles_values = numpy.percentile(
data[b].compressed(), percentiles
).tolist()

output.append(
{
"min": float(data[b].min()),
"max": float(data[b].max()),
"mean": float(data[b].mean()),
"count": float(data[b].count()),
"sum": float(data[b].sum()),
"std": float(data[b].std()),
"median": float(numpy.ma.median(data[b])),
"majority": float(keys[counts.tolist().index(counts.max())].tolist()),
"minority": float(keys[counts.tolist().index(counts.min())].tolist()),
"unique": float(counts.size),
**dict(zip(percentiles_names, percentiles_values)),
"histogram": histogram,
**info_px,
}
)

return output


# https://github.com/OSGeo/gdal/blob/b1c9c12ad373e40b955162b45d704070d4ebf7b0/gdal/frmts/ingr/IngrTypes.cpp#L191
def _div_round_up(a: int, b: int) -> int:
return (a // b) if (a % b) == 0 else (a // b) + 1
Expand Down