add new statistics methods and band names in ImageData object (#427)

* add new statistics methods and band names in ImageData object * update base classes and tests * MultiBandReader.statistics should use self.preview as COGReader * update tests
cogeotiff · Oct 12, 2021 · cc50276 · cc50276
1 parent 2028c42
commit cc50276
Show file tree

Hide file tree

Showing 9 changed files with 744 additions and 134 deletions.
diff --git a/rio_tiler/io/base.py b/rio_tiler/io/base.py
diff --git a/rio_tiler/io/cogeo.py b/rio_tiler/io/cogeo.py
@@ -20,8 +20,14 @@
 from ..constants import WEB_MERCATOR_TMS, WGS84_CRS, BBox, Indexes, NoData
 from ..errors import ExpressionMixingWarning, NoOverviewWarning, TileOutsideBounds
 from ..expression import apply_expression, parse_expression
-from ..models import ImageData, ImageStatistics, Info
-from ..utils import create_cutline, has_alpha_band, has_mask_band
+from ..models import BandStatistics, ImageData, ImageStatistics, Info
+from ..utils import (
+    create_cutline,
+    get_array_statistics,
+    get_bands_names,
+    has_alpha_band,
+    has_mask_band,
+)
 from .base import BaseReader
 
 
@@ -248,6 +254,11 @@ def stats(
             rio_tiler.models.ImageStatistics: bands statistics.
 
         """
+        warnings.warn(
+            "`stats` method will be removed and replaced by `statistics` in rio-tiler v3.0.0",
+            DeprecationWarning,
+        )
+
         kwargs = {**self._kwargs, **kwargs}
 
         hist_options = hist_options or {}
@@ -262,6 +273,48 @@ def stats(
         )
         return {b: ImageStatistics(**s) for b, s in stats.items()}
 
+    def statistics(
+        self,
+        categorical: bool = False,
+        categories: Optional[List[float]] = None,
+        percentiles: List[int] = [2, 98],
+        hist_options: Optional[Dict] = None,
+        max_size: int = 1024,
+        **kwargs: Any,
+    ) -> Dict[str, BandStatistics]:
+        """Return bands statistics from a dataset.
+
+        Args:
+            categorical (bool): treat input data as categorical data. Defaults to False.
+            categories (list of numbers, optional): list of caterogies to return value for.
+            percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`.
+            hist_options (dict, optional): Options to forward to numpy.histogram function.
+            max_size (int, optional): Limit the size of the longest dimension of the dataset read, respecting bounds X/Y aspect ratio. Defaults to 1024.
+            kwargs (optional): Options to forward to `self.preview`.
+
+        Returns:
+            Dict[str, rio_tiler.models.BandStatistics]: bands statistics.
+
+        """
+        kwargs = {**self._kwargs, **kwargs}
+
+        data = self.preview(max_size=max_size, **kwargs)
+
+        hist_options = hist_options or {}
+
+        stats = get_array_statistics(
+            data.as_masked(),
+            categorical=categorical,
+            categories=categories,
+            percentiles=percentiles,
+            **hist_options,
+        )
+
+        return {
+            f"{data.band_names[ix]}": BandStatistics(**stats[ix])
+            for ix in range(len(stats))
+        }
+
     def tile(
         self,
         tile_x: int,
@@ -383,7 +436,16 @@ def part(
         if bounds_crs and bounds_crs != dst_crs:
             bbox = transform_bounds(bounds_crs, dst_crs, *bbox, densify_pts=21)
 
-        return ImageData(data, mask, bounds=bbox, crs=dst_crs, assets=[self.filepath],)
+        return ImageData(
+            data,
+            mask,
+            bounds=bbox,
+            crs=dst_crs,
+            assets=[self.filepath],
+            band_names=get_bands_names(
+                indexes=indexes, expression=expression, count=data.shape[0]
+            ),
+        )
 
     def preview(
         self,
@@ -437,7 +499,14 @@ def preview(
             data = apply_expression(blocks, bands, data)
 
         return ImageData(
-            data, mask, bounds=self.bounds, crs=self.crs, assets=[self.filepath],
+            data,
+            mask,
+            bounds=self.bounds,
+            crs=self.crs,
+            assets=[self.filepath],
+            band_names=get_bands_names(
+                indexes=indexes, expression=expression, count=data.shape[0]
+            ),
         )
 
     def point(
@@ -579,7 +648,14 @@ def read(
             data = apply_expression(blocks, bands, data)
 
         return ImageData(
-            data, mask, bounds=self.bounds, crs=self.crs, assets=[self.filepath],
+            data,
+            mask,
+            bounds=self.bounds,
+            crs=self.crs,
+            assets=[self.filepath],
+            band_names=get_bands_names(
+                indexes=indexes, expression=expression, count=data.shape[0]
+            ),
         )
 
 

diff --git a/rio_tiler/models.py b/rio_tiler/models.py
@@ -1,5 +1,6 @@
 """rio-tiler models."""
 
+import itertools
 import warnings
 from enum import Enum
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
@@ -82,6 +83,30 @@ class ImageStatistics(RioTilerBaseModel):
     valid_percent: float
 
 
+class BandStatistics(RioTilerBaseModel):
+    """Image statistics"""
+
+    min: float
+    max: float
+    mean: float
+    count: float
+    sum: float
+    std: float
+    median: float
+    majority: float
+    minority: float
+    unique: float
+    histogram: List[List[NumType]]
+    valid_percent: float
+    masked_pixels: float
+    valid_pixels: float
+
+    class Config:
+        """Config for model."""
+
+        extra = "allow"  # We allow extra values for `percentiles_{}`
+
+
 class Metadata(Info):
     """Dataset metadata and statistics."""
 
@@ -113,6 +138,7 @@ class ImageData:
     bounds: Optional[BoundingBox] = attr.ib(default=None, converter=to_coordsbbox)
     crs: Optional[CRS] = attr.ib(default=None)
     metadata: Optional[Dict] = attr.ib(factory=dict)
+    band_names: Optional[List[str]] = attr.ib()
 
     @data.validator
     def _validate_data(self, attribute, value):
@@ -122,6 +148,10 @@ def _validate_data(self, attribute, value):
                 "ImageData data has to be an array in form of (count, height, width)"
             )
 
+    @band_names.default
+    def _default_names(self):
+        return [f"{ix + 1}" for ix in range(self.count)]
+
     @mask.default
     def _default_mask(self):
         return numpy.zeros((self.height, self.width), dtype="uint8") + 255
@@ -141,15 +171,29 @@ def create_from_list(cls, data: Sequence["ImageData"]):
         """
         arr = numpy.concatenate([img.data for img in data])
         mask = numpy.all([img.mask for img in data], axis=0).astype(numpy.uint8) * 255
-        assets = [img.assets[0] for img in data if img.assets]
+        assets = list(
+            dict.fromkeys(
+                itertools.chain.from_iterable(
+                    [img.assets for img in data if img.assets]
+                )
+            )
+        )
 
         bounds_values = [img.bounds for img in data if img.bounds]
         bounds = bounds_values[0] if bounds_values else None
 
         crs_values = [img.crs for img in data if img.crs]
         crs = crs_values[0] if crs_values else None
 
-        return cls(arr, mask, assets=assets, crs=crs, bounds=bounds)
+        band_names = list(
+            itertools.chain.from_iterable(
+                [img.band_names for img in data if img.band_names]
+            )
+        )
+
+        return cls(
+            arr, mask, assets=assets, crs=crs, bounds=bounds, band_names=band_names
+        )
 
     def as_masked(self) -> numpy.ma.MaskedArray:
         """return a numpy masked array."""

diff --git a/rio_tiler/utils.py b/rio_tiler/utils.py
@@ -2,7 +2,7 @@
 
 import os
 from io import BytesIO
-from typing import Any, Dict, Generator, Optional, Sequence, Tuple, Union
+from typing import Any, Dict, Generator, List, Optional, Sequence, Tuple, Union
 
 import numpy
 from affine import Affine
@@ -88,6 +88,130 @@ def _stats(
     )
 
 
+def get_bands_names(
+    indexes: Optional[Sequence[int]] = None,
+    expression: Optional[str] = None,
+    count: Optional[int] = None,
+) -> List[str]:
+    """Define bands names based on expression, indexes or band count."""
+    if expression:
+        return expression.split(",")
+
+    elif indexes:
+        return [str(idx) for idx in indexes]
+
+    elif count:
+        return [str(idx + 1) for idx in range(count)]
+
+    else:
+        raise ValueError(
+            "one of expression or indexes or count must be passed to define band names."
+        )
+
+
+def get_array_statistics(
+    data: numpy.ma.array,
+    categorical: bool = False,
+    categories: Optional[List[float]] = None,
+    percentiles: List[int] = [2, 98],
+    **kwargs: Any,
+) -> List[Dict[Any, Any]]:
+    """Calculate per band array statistics.
+
+    Args:
+        data (numpy.ma.ndarray): input masked array data to get the statistics from.
+        categorical (bool): treat input data as categorical data. Defaults to False.
+        categories (list of numbers, optional): list of caterogies to return value for.
+        percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`.
+        kwargs (optional): options to forward to `numpy.histogram` function (only applies for non-categorical data).
+
+    Returns:
+        list of dict
+
+    Examples:
+        >>> data = numpy.ma.zeros((1, 256, 256))
+        >>> get_array_statistics(data)
+        [
+            {
+                'min': 0.0,
+                'max': 0.0,
+                'mean': 0.0,
+                'count': 65536.0,
+                'sum': 0.0,
+                'std': 0.0,
+                'median': 0.0,
+                'majority': 0.0,
+                'minority': 0.0,
+                'unique': 1.0,
+                'percentile_2': 0.0,
+                'percentile_98': 0.0,
+                'histogram': [
+                    [0, 0, 0, 0, 0, 65536, 0, 0, 0, 0],
+                    [-0.5, -0.4, -0.3, -0.19999999999999996, -0.09999999999999998, 0.0, 0.10000000000000009, 0.20000000000000007, 0.30000000000000004, 0.4, 0.5]
+                ],
+                'valid_pixels': 65536.0,
+                'masked_pixels': 0.0,
+                'valid_percent': 100.0
+            }
+        ]
+
+    """
+    if len(data.shape) < 3:
+        data = numpy.expand_dims(data, axis=0)
+
+    output: List[Dict[Any, Any]] = []
+    percentiles_names = [f"percentile_{int(p)}" for p in percentiles]
+
+    for b in range(data.shape[0]):
+        keys, counts = numpy.unique(data[b].compressed(), return_counts=True)
+
+        valid_pixels = float(numpy.ma.count(data[b]))
+        masked_pixels = float(numpy.ma.count_masked(data[b]))
+        valid_percent = round((valid_pixels / data[b].size) * 100, 2)
+        info_px = {
+            "valid_pixels": valid_pixels,
+            "masked_pixels": masked_pixels,
+            "valid_percent": valid_percent,
+        }
+
+        if categorical:
+            out_dict = dict(zip(keys.tolist(), counts.tolist()))
+            h_keys = (
+                numpy.array(categories).astype(keys.dtype) if categories else keys
+            ).tolist()
+            histogram = [
+                [out_dict[x] for x in h_keys],
+                h_keys,
+            ]
+        else:
+            h_counts, h_keys = numpy.histogram(data[b][~data[b].mask], **kwargs)
+            histogram = [h_counts.tolist(), h_keys.tolist()]
+
+        percentiles_values = numpy.percentile(
+            data[b].compressed(), percentiles
+        ).tolist()
+
+        output.append(
+            {
+                "min": float(data[b].min()),
+                "max": float(data[b].max()),
+                "mean": float(data[b].mean()),
+                "count": float(data[b].count()),
+                "sum": float(data[b].sum()),
+                "std": float(data[b].std()),
+                "median": float(numpy.ma.median(data[b])),
+                "majority": float(keys[counts.tolist().index(counts.max())].tolist()),
+                "minority": float(keys[counts.tolist().index(counts.min())].tolist()),
+                "unique": float(counts.size),
+                **dict(zip(percentiles_names, percentiles_values)),
+                "histogram": histogram,
+                **info_px,
+            }
+        )
+
+    return output
+
+
 # https://github.com/OSGeo/gdal/blob/b1c9c12ad373e40b955162b45d704070d4ebf7b0/gdal/frmts/ingr/IngrTypes.cpp#L191
 def _div_round_up(a: int, b: int) -> int:
     return (a // b) if (a % b) == 0 else (a // b) + 1