Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Random GeoSamplers: add default length #755

Merged
merged 8 commits into from
Oct 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/api/samplers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ Batch Geo Sampler

.. autoclass:: BatchGeoSampler

Utilities
---------

.. autofunction:: get_random_bounding_box
.. autofunction:: tile_to_chips

Units
-----

Expand Down
4 changes: 2 additions & 2 deletions tests/samplers/test_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
PreChippedGeoSampler,
RandomGeoSampler,
Units,
tile_to_chips,
)


Expand Down Expand Up @@ -182,8 +183,7 @@ def test_iter(self, sampler: GridGeoSampler) -> None:
)

def test_len(self, sampler: GridGeoSampler) -> None:
rows = math.ceil((100 - sampler.size[0]) / sampler.stride[0]) + 1
cols = math.ceil((100 - sampler.size[1]) / sampler.stride[1]) + 1
rows, cols = tile_to_chips(sampler.roi, sampler.size, sampler.stride)
length = rows * cols * 2 # two items in dataset
assert len(sampler) == length

Expand Down
45 changes: 45 additions & 0 deletions tests/samplers/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import math
from typing import Optional, Tuple, Union

import pytest

from torchgeo.datasets import BoundingBox
from torchgeo.samplers import tile_to_chips
from torchgeo.samplers.utils import _to_tuple

MAYBE_TUPLE = Union[float, Tuple[float, float]]


@pytest.mark.parametrize(
    "size,stride,expected",
    [
        # size == bounds
        (10, 1, 1),
        (10, None, 1),
        # stride < size
        (8, 1, 3),
        (6, 2, 3),
        (4, 3, 3),
        ((8, 6), (1, 2), (3, 3)),
        ((6, 4), (2, 3), (3, 3)),
        # stride == size
        (3, 3, 4),
        (3, None, 4),
        # stride > size
        (2.5, 3, 4),
    ],
)
def test_tile_to_chips(
    size: MAYBE_TUPLE, stride: Optional[MAYBE_TUPLE], expected: MAYBE_TUPLE
) -> None:
    """Check the chip grid computed by ``tile_to_chips`` for one fixed tile.

    Scalars in the parametrization are expanded with ``_to_tuple`` before use.
    A ``stride`` of ``None`` is forwarded untouched so that the function's own
    default is exercised.

    Args:
        size: requested chip size, scalar or pair
        stride: requested stride, scalar or pair, or ``None`` for the default
        expected: expected number of rows/columns, scalar or pair
    """
    # BoundingBox(minx, maxx, miny, maxy, mint, maxt): 10 units wide and tall.
    tile = BoundingBox(0, 10, 20, 30, 40, 50)

    chip_size = _to_tuple(size)
    chip_stride = None if stride is None else _to_tuple(stride)
    want_rows, want_cols = _to_tuple(expected)

    got_rows, got_cols = tile_to_chips(tile, chip_size, chip_stride)

    assert math.isclose(got_rows, want_rows)
    assert math.isclose(got_cols, want_cols)
4 changes: 4 additions & 0 deletions torchgeo/samplers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .batch import BatchGeoSampler, RandomBatchGeoSampler
from .constants import Units
from .single import GeoSampler, GridGeoSampler, PreChippedGeoSampler, RandomGeoSampler
from .utils import get_random_bounding_box, tile_to_chips

__all__ = (
# Samplers
Expand All @@ -17,6 +18,9 @@
# Base classes
"GeoSampler",
"BatchGeoSampler",
# Utilities
"get_random_bounding_box",
"tile_to_chips",
# Constants
"Units",
)
Expand Down
22 changes: 18 additions & 4 deletions torchgeo/samplers/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from ..datasets import BoundingBox, GeoDataset
from .constants import Units
from .utils import _to_tuple, get_random_bounding_box
from .utils import _to_tuple, get_random_bounding_box, tile_to_chips

# https://github.com/pytorch/pytorch/issues/60979
# https://github.com/pytorch/pytorch/pull/61045
Expand Down Expand Up @@ -62,15 +62,16 @@ class RandomBatchGeoSampler(BatchGeoSampler):
"""Samples batches of elements from a region of interest randomly.

This is particularly useful during training when you want to maximize the size of
the dataset and return as many random :term:`chips <chip>` as possible.
the dataset and return as many random :term:`chips <chip>` as possible. Note that
randomly sampled chips may overlap.
"""

def __init__(
self,
dataset: GeoDataset,
size: Union[Tuple[float, float], float],
batch_size: int,
length: int,
length: Optional[int] = None,
adamjstewart marked this conversation as resolved.
Show resolved Hide resolved
roi: Optional[BoundingBox] = None,
units: Units = Units.PIXELS,
) -> None:
Expand All @@ -88,12 +89,18 @@ def __init__(
size: dimensions of each :term:`patch`
batch_size: number of samples per batch
length: number of samples per epoch
(defaults to approximately the maximal number of non-overlapping
:term:`chips <chip>` of size ``size`` that could be sampled from
the dataset)
roi: region of interest to sample from (minx, maxx, miny, maxy, mint, maxt)
(defaults to the bounds of ``dataset.index``)
units: defines if ``size`` is in pixel or CRS units

.. versionchanged:: 0.3
Added ``units`` parameter, changed default to pixel units

.. versionchanged:: 0.4
``length`` parameter is now optional, a reasonable default will be used
"""
super().__init__(dataset, roi)
self.size = _to_tuple(size)
Expand All @@ -102,7 +109,7 @@ def __init__(
self.size = (self.size[0] * self.res, self.size[1] * self.res)

self.batch_size = batch_size
self.length = length
self.length = 0
self.hits = []
areas = []
for hit in self.index.intersection(tuple(self.roi), objects=True):
Expand All @@ -111,8 +118,15 @@ def __init__(
bounds.maxx - bounds.minx >= self.size[1]
and bounds.maxy - bounds.miny >= self.size[0]
):
if bounds.area > 0:
rows, cols = tile_to_chips(bounds, self.size)
self.length += rows * cols
else:
self.length += 1
adamjstewart marked this conversation as resolved.
Show resolved Hide resolved
self.hits.append(hit)
areas.append(bounds.area)
if length is not None:
self.length = length

# torch.multinomial requires float probabilities > 0
self.areas = torch.tensor(areas, dtype=torch.float)
Expand Down
58 changes: 21 additions & 37 deletions torchgeo/samplers/single.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"""TorchGeo samplers."""

import abc
import math
from typing import Callable, Iterable, Iterator, Optional, Tuple, Union

import torch
Expand All @@ -13,7 +12,7 @@

from ..datasets import BoundingBox, GeoDataset
from .constants import Units
from .utils import _to_tuple, get_random_bounding_box
from .utils import _to_tuple, get_random_bounding_box, tile_to_chips

# https://github.com/pytorch/pytorch/issues/60979
# https://github.com/pytorch/pytorch/pull/61045
Expand Down Expand Up @@ -63,7 +62,8 @@ class RandomGeoSampler(GeoSampler):
"""Samples elements from a region of interest randomly.

This is particularly useful during training when you want to maximize the size of
the dataset and return as many random :term:`chips <chip>` as possible.
the dataset and return as many random :term:`chips <chip>` as possible. Note that
randomly sampled chips may overlap.

This sampler is not recommended for use with tile-based datasets. Use
:class:`RandomBatchGeoSampler` instead.
Expand All @@ -73,7 +73,7 @@ def __init__(
self,
dataset: GeoDataset,
size: Union[Tuple[float, float], float],
length: int,
length: Optional[int],
roi: Optional[BoundingBox] = None,
units: Units = Units.PIXELS,
) -> None:
Expand All @@ -90,20 +90,26 @@ def __init__(
dataset: dataset to index from
size: dimensions of each :term:`patch`
length: number of random samples to draw per epoch
(defaults to approximately the maximal number of non-overlapping
:term:`chips <chip>` of size ``size`` that could be sampled from
the dataset)
roi: region of interest to sample from (minx, maxx, miny, maxy, mint, maxt)
(defaults to the bounds of ``dataset.index``)
units: defines if ``size`` is in pixel or CRS units

.. versionchanged:: 0.3
Added ``units`` parameter, changed default to pixel units

.. versionchanged:: 0.4
``length`` parameter is now optional, a reasonable default will be used
"""
super().__init__(dataset, roi)
self.size = _to_tuple(size)

if units == Units.PIXELS:
self.size = (self.size[0] * self.res, self.size[1] * self.res)

self.length = length
self.length = 0
self.hits = []
areas = []
for hit in self.index.intersection(tuple(self.roi), objects=True):
Expand All @@ -112,8 +118,15 @@ def __init__(
bounds.maxx - bounds.minx >= self.size[1]
and bounds.maxy - bounds.miny >= self.size[0]
):
if bounds.area > 0:
rows, cols = tile_to_chips(bounds, self.size)
self.length += rows * cols
else:
self.length += 1
self.hits.append(hit)
areas.append(bounds.area)
if length is not None:
self.length = length

# torch.multinomial requires float probabilities > 0
self.areas = torch.tensor(areas, dtype=torch.float)
Expand Down Expand Up @@ -147,7 +160,7 @@ def __len__(self) -> int:


class GridGeoSampler(GeoSampler):
r"""Samples elements in a grid-like fashion.
"""Samples elements in a grid-like fashion.

This is particularly useful during evaluation when you want to make predictions for
an entire region of interest. You want to minimize the amount of redundant
Expand All @@ -162,18 +175,6 @@ class GridGeoSampler(GeoSampler):

Note that the stride of the final set of chips in each row/column may be adjusted so
that the entire :term:`tile` is sampled without exceeding the bounds of the dataset.

Let :math:`i` be the size of the input tile. Let :math:`k` be the requested size of
the output patch. Let :math:`s` be the requested stride. Let :math:`o` be the number
of output rows/columns sampled from each tile. :math:`o` can then be computed as:

.. math::

o = \left\lceil \frac{i - k}{s} \right\rceil + 1

This is almost identical to relationship 5 in
https://doi.org/10.48550/arXiv.1603.07285. However, we use ceiling instead of floor
because we want to include the final remaining chip.
"""

def __init__(
Expand Down Expand Up @@ -224,15 +225,7 @@ def __init__(
self.length = 0
for hit in self.hits:
bounds = BoundingBox(*hit.bounds)

rows = (
math.ceil((bounds.maxy - bounds.miny - self.size[0]) / self.stride[0])
+ 1
)
cols = (
math.ceil((bounds.maxx - bounds.minx - self.size[1]) / self.stride[1])
+ 1
)
rows, cols = tile_to_chips(bounds, self.size, self.stride)
self.length += rows * cols

def __iter__(self) -> Iterator[BoundingBox]:
Expand All @@ -244,16 +237,7 @@ def __iter__(self) -> Iterator[BoundingBox]:
# For each tile...
for hit in self.hits:
bounds = BoundingBox(*hit.bounds)

rows = (
math.ceil((bounds.maxy - bounds.miny - self.size[0]) / self.stride[0])
+ 1
)
cols = (
math.ceil((bounds.maxx - bounds.minx - self.size[1]) / self.stride[1])
+ 1
)

rows, cols = tile_to_chips(bounds, self.size, self.stride)
mint = bounds.mint
maxt = bounds.maxt

Expand Down
45 changes: 44 additions & 1 deletion torchgeo/samplers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

"""Common sampler utilities."""

from typing import Tuple, Union
import math
from typing import Optional, Tuple, Union

import torch

Expand Down Expand Up @@ -66,3 +67,45 @@ def get_random_bounding_box(

query = BoundingBox(minx, maxx, miny, maxy, mint, maxt)
return query


def tile_to_chips(
    bounds: BoundingBox,
    size: Tuple[float, float],
    stride: Optional[Tuple[float, float]] = None,
) -> Tuple[int, int]:
    r"""Compute number of :term:`chips <chip>` that can be sampled from a :term:`tile`.

    Let :math:`i` be the size of the input tile. Let :math:`k` be the requested size of
    the output patch. Let :math:`s` be the requested stride. Let :math:`o` be the number
    of output chips sampled from each tile. :math:`o` can then be computed as:

    .. math::

       o = \left\lceil \frac{i - k}{s} \right\rceil + 1

    This is almost identical to relationship 5 in
    https://doi.org/10.48550/arXiv.1603.07285. However, we use ceiling instead of floor
    because we want to include the final remaining chip in each row/column when
    :math:`i - k` is not an integer multiple of :math:`s`.

    The formula is applied independently per axis: rows use the y extent of
    ``bounds`` with ``size[0]``/``stride[0]``, columns use the x extent with
    ``size[1]``/``stride[1]``.

    Args:
        bounds: bounding box of tile
        size: size of output patch
        stride: stride with which to sample (defaults to ``size``)

    Returns:
        the number of rows/columns that can be sampled

    Raises:
        AssertionError: if either component of ``stride`` is not strictly positive

    .. versionadded:: 0.4
    """
    if stride is None:
        stride = size

    # Explicit checks instead of ``assert`` statements: asserts are removed under
    # ``python -O``, which would turn a zero stride into a ZeroDivisionError and
    # let a negative stride produce meaningless counts. AssertionError is kept
    # so callers observe the same exception type as before. ``not x > 0`` (rather
    # than ``x <= 0``) also rejects NaN, exactly like the original assertion.
    if not stride[0] > 0:
        raise AssertionError(f"stride[0] must be positive, got {stride[0]}")
    if not stride[1] > 0:
        raise AssertionError(f"stride[1] must be positive, got {stride[1]}")

    height = bounds.maxy - bounds.miny
    width = bounds.maxx - bounds.minx

    rows = math.ceil((height - size[0]) / stride[0]) + 1
    cols = math.ceil((width - size[1]) / stride[1]) + 1

    return rows, cols