diff --git a/docs/api/samplers.rst b/docs/api/samplers.rst index a21c451db3d..8934773e076 100644 --- a/docs/api/samplers.rst +++ b/docs/api/samplers.rst @@ -76,6 +76,12 @@ Batch Geo Sampler .. autoclass:: BatchGeoSampler +Utilities +--------- + +.. autofunction:: get_random_bounding_box +.. autofunction:: tile_to_chips + Units ----- diff --git a/tests/samplers/test_single.py b/tests/samplers/test_single.py index 0528dfe78c4..229e81dcb2e 100644 --- a/tests/samplers/test_single.py +++ b/tests/samplers/test_single.py @@ -17,6 +17,7 @@ PreChippedGeoSampler, RandomGeoSampler, Units, + tile_to_chips, ) @@ -182,8 +183,7 @@ def test_iter(self, sampler: GridGeoSampler) -> None: ) def test_len(self, sampler: GridGeoSampler) -> None: - rows = math.ceil((100 - sampler.size[0]) / sampler.stride[0]) + 1 - cols = math.ceil((100 - sampler.size[1]) / sampler.stride[1]) + 1 + rows, cols = tile_to_chips(sampler.roi, sampler.size, sampler.stride) length = rows * cols * 2 # two items in dataset assert len(sampler) == length diff --git a/tests/samplers/test_utils.py b/tests/samplers/test_utils.py new file mode 100644 index 00000000000..9905f99a3ed --- /dev/null +++ b/tests/samplers/test_utils.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
@pytest.mark.parametrize(
    "size,stride,expected",
    [
        # size == bounds
        (10, 1, 1),
        (10, None, 1),
        # stride < size
        (8, 1, 3),
        (6, 2, 3),
        (4, 3, 3),
        ((8, 6), (1, 2), (3, 3)),
        ((6, 4), (2, 3), (3, 3)),
        # stride == size
        (3, 3, 4),
        (3, None, 4),
        # stride > size
        (2.5, 3, 4),
    ],
)
def test_tile_to_chips(
    size: MAYBE_TUPLE, stride: Optional[MAYBE_TUPLE], expected: MAYBE_TUPLE
) -> None:
    """Check the row/column counts ``tile_to_chips`` reports for one tile.

    The tile spans 0..10 in x and 20..30 in y. Scalar ``size``, ``stride`` and
    ``expected`` values are expanded with ``_to_tuple`` before use; a ``stride``
    of ``None`` is forwarded untouched so the function's default is exercised.
    """
    tile = BoundingBox(0, 10, 20, 30, 40, 50)
    chip_size = _to_tuple(size)
    chip_stride = None if stride is None else _to_tuple(stride)
    wanted = _to_tuple(expected)

    counts = tile_to_chips(tile, chip_size, chip_stride)

    # counts is (rows, cols); compare each against its expected value.
    assert math.isclose(counts[0], wanted[0])
    assert math.isclose(counts[1], wanted[1])
https://github.com/pytorch/pytorch/issues/60979 # https://github.com/pytorch/pytorch/pull/61045 @@ -62,7 +62,8 @@ class RandomBatchGeoSampler(BatchGeoSampler): """Samples batches of elements from a region of interest randomly. This is particularly useful during training when you want to maximize the size of - the dataset and return as many random :term:`chips ` as possible. + the dataset and return as many random :term:`chips ` as possible. Note that + randomly sampled chips may overlap. """ def __init__( @@ -70,7 +71,7 @@ def __init__( dataset: GeoDataset, size: Union[Tuple[float, float], float], batch_size: int, - length: int, + length: Optional[int] = None, roi: Optional[BoundingBox] = None, units: Units = Units.PIXELS, ) -> None: @@ -88,12 +89,18 @@ def __init__( size: dimensions of each :term:`patch` batch_size: number of samples per batch length: number of samples per epoch + (defaults to approximately the maximal number of non-overlapping + :term:`chips ` of size ``size`` that could be sampled from + the dataset) roi: region of interest to sample from (minx, maxx, miny, maxy, mint, maxt) (defaults to the bounds of ``dataset.index``) units: defines if ``size`` is in pixel or CRS units .. versionchanged:: 0.3 Added ``units`` parameter, changed default to pixel units + + .. 
versionchanged:: 0.4 + ``length`` parameter is now optional, a reasonable default will be used """ super().__init__(dataset, roi) self.size = _to_tuple(size) @@ -102,7 +109,7 @@ def __init__( self.size = (self.size[0] * self.res, self.size[1] * self.res) self.batch_size = batch_size - self.length = length + self.length = 0 self.hits = [] areas = [] for hit in self.index.intersection(tuple(self.roi), objects=True): @@ -111,8 +118,15 @@ def __init__( bounds.maxx - bounds.minx >= self.size[1] and bounds.maxy - bounds.miny >= self.size[0] ): + if bounds.area > 0: + rows, cols = tile_to_chips(bounds, self.size) + self.length += rows * cols + else: + self.length += 1 self.hits.append(hit) areas.append(bounds.area) + if length is not None: + self.length = length # torch.multinomial requires float probabilities > 0 self.areas = torch.tensor(areas, dtype=torch.float) diff --git a/torchgeo/samplers/single.py b/torchgeo/samplers/single.py index e063d9ecbd4..44a8e1da5b8 100644 --- a/torchgeo/samplers/single.py +++ b/torchgeo/samplers/single.py @@ -4,7 +4,6 @@ """TorchGeo samplers.""" import abc -import math from typing import Callable, Iterable, Iterator, Optional, Tuple, Union import torch @@ -13,7 +12,7 @@ from ..datasets import BoundingBox, GeoDataset from .constants import Units -from .utils import _to_tuple, get_random_bounding_box +from .utils import _to_tuple, get_random_bounding_box, tile_to_chips # https://github.com/pytorch/pytorch/issues/60979 # https://github.com/pytorch/pytorch/pull/61045 @@ -63,7 +62,8 @@ class RandomGeoSampler(GeoSampler): """Samples elements from a region of interest randomly. This is particularly useful during training when you want to maximize the size of - the dataset and return as many random :term:`chips ` as possible. + the dataset and return as many random :term:`chips ` as possible. Note that + randomly sampled chips may overlap. This sampler is not recommended for use with tile-based datasets. 
Use :class:`RandomBatchGeoSampler` instead. @@ -73,7 +73,7 @@ def __init__( self, dataset: GeoDataset, size: Union[Tuple[float, float], float], - length: int, + length: Optional[int], roi: Optional[BoundingBox] = None, units: Units = Units.PIXELS, ) -> None: @@ -90,12 +90,18 @@ def __init__( dataset: dataset to index from size: dimensions of each :term:`patch` length: number of random samples to draw per epoch + (defaults to approximately the maximal number of non-overlapping + :term:`chips ` of size ``size`` that could be sampled from + the dataset) roi: region of interest to sample from (minx, maxx, miny, maxy, mint, maxt) (defaults to the bounds of ``dataset.index``) units: defines if ``size`` is in pixel or CRS units .. versionchanged:: 0.3 Added ``units`` parameter, changed default to pixel units + + .. versionchanged:: 0.4 + ``length`` parameter is now optional, a reasonable default will be used """ super().__init__(dataset, roi) self.size = _to_tuple(size) @@ -103,7 +109,7 @@ def __init__( if units == Units.PIXELS: self.size = (self.size[0] * self.res, self.size[1] * self.res) - self.length = length + self.length = 0 self.hits = [] areas = [] for hit in self.index.intersection(tuple(self.roi), objects=True): @@ -112,8 +118,15 @@ def __init__( bounds.maxx - bounds.minx >= self.size[1] and bounds.maxy - bounds.miny >= self.size[0] ): + if bounds.area > 0: + rows, cols = tile_to_chips(bounds, self.size) + self.length += rows * cols + else: + self.length += 1 self.hits.append(hit) areas.append(bounds.area) + if length is not None: + self.length = length # torch.multinomial requires float probabilities > 0 self.areas = torch.tensor(areas, dtype=torch.float) @@ -147,7 +160,7 @@ def __len__(self) -> int: class GridGeoSampler(GeoSampler): - r"""Samples elements in a grid-like fashion. + """Samples elements in a grid-like fashion. This is particularly useful during evaluation when you want to make predictions for an entire region of interest. 
You want to minimize the amount of redundant @@ -162,18 +175,6 @@ class GridGeoSampler(GeoSampler): Note that the stride of the final set of chips in each row/column may be adjusted so that the entire :term:`tile` is sampled without exceeding the bounds of the dataset. - - Let :math:`i` be the size of the input tile. Let :math:`k` be the requested size of - the output patch. Let :math:`s` be the requested stride. Let :math:`o` be the number - of output rows/columns sampled from each tile. :math:`o` can then be computed as: - - .. math:: - - o = \left\lceil \frac{i - k}{s} \right\rceil + 1 - - This is almost identical to relationship 5 in - https://doi.org/10.48550/arXiv.1603.07285. However, we use ceiling instead of floor - because we want to include the final remaining chip. """ def __init__( @@ -224,15 +225,7 @@ def __init__( self.length = 0 for hit in self.hits: bounds = BoundingBox(*hit.bounds) - - rows = ( - math.ceil((bounds.maxy - bounds.miny - self.size[0]) / self.stride[0]) - + 1 - ) - cols = ( - math.ceil((bounds.maxx - bounds.minx - self.size[1]) / self.stride[1]) - + 1 - ) + rows, cols = tile_to_chips(bounds, self.size, self.stride) self.length += rows * cols def __iter__(self) -> Iterator[BoundingBox]: @@ -244,16 +237,7 @@ def __iter__(self) -> Iterator[BoundingBox]: # For each tile... 
def tile_to_chips(
    bounds: "BoundingBox",
    size: Tuple[float, float],
    stride: Optional[Tuple[float, float]] = None,
) -> Tuple[int, int]:
    r"""Compute number of :term:`chips <chip>` that can be sampled from a :term:`tile`.

    Let :math:`i` be the size of the input tile. Let :math:`k` be the requested size of
    the output patch. Let :math:`s` be the requested stride. Let :math:`o` be the number
    of output chips sampled from each tile. :math:`o` can then be computed as:

    .. math::

       o = \left\lceil \frac{i - k}{s} \right\rceil + 1

    This is almost identical to relationship 5 in
    https://doi.org/10.48550/arXiv.1603.07285. However, we use ceiling instead of floor
    because we want to include the final remaining chip in each row/column when bounds
    is not an integer multiple of stride.

    The formula is applied independently to the y extent (rows, using ``size[0]`` and
    ``stride[0]``) and to the x extent (columns, using ``size[1]`` and ``stride[1]``).
    Each count is clamped at zero, so a tile that is much smaller than the requested
    patch never produces a negative count.

    Args:
        bounds: bounding box of tile
        size: size of output patch, in the same units as ``bounds``
        stride: stride with which to sample (defaults to ``size``)

    Returns:
        the number of rows/columns that can be sampled

    Raises:
        AssertionError: if either component of ``stride`` is not positive

    .. versionadded:: 0.4
    """
    if stride is None:
        stride = size

    # Raised explicitly instead of via ``assert`` so the check survives
    # ``python -O``; the exception type is kept for backward compatibility.
    if not (stride[0] > 0 and stride[1] > 0):
        raise AssertionError(f"stride must be positive, got {stride}")

    height = bounds.maxy - bounds.miny
    width = bounds.maxx - bounds.minx

    # When the patch exceeds the tile by a stride or more, the formula drops
    # below zero. Clamp it: two negative counts would otherwise multiply into
    # a positive number of chips in callers that compute ``rows * cols``.
    rows = max(0, math.ceil((height - size[0]) / stride[0]) + 1)
    cols = max(0, math.ceil((width - size[1]) / stride[1]) + 1)

    return rows, cols