Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding helper for Multiple AOI Extraction, along with other features and nice-to-haves #23

Merged
merged 27 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
654da67
WIP of helper for multiple AOI extraction code
nickrsan Aug 25, 2023
2b09833
Allow EEDLImage class to use a separate task registry than the main one.
nickrsan Aug 29, 2023
61e1446
Accept drive_root_folder as a param on the class itself
nickrsan Aug 29, 2023
7c32213
Fix small error in drive_root_folder path checking
nickrsan Aug 29, 2023
8457820
Adding additional clipping option so we can do a strict clip in addit…
nickrsan Aug 29, 2023
767184d
Fixing mixed indentation
nickrsan Aug 29, 2023
7abaa1d
WIP code that makes helper export behave. Also added mosaic_and_zonal…
nickrsan Aug 29, 2023
f6af5f8
GroupedCollectionExtractor ready for testing
nickrsan Aug 30, 2023
e54cd9c
Fixes related to helpers, requires strict_clip (see comments)
nickrsan Aug 30, 2023
230d2aa
linting and typing fixes and updates
nickrsan Aug 31, 2023
d541f31
Version bump
nickrsan Aug 31, 2023
e6e37d6
Adding typing_extensions to requirements
nickrsan Aug 31, 2023
c48e57c
Correctly install typing-extensions
nickrsan Aug 31, 2023
af37c9e
Missed adding typing-extensions to setup.cfg
nickrsan Aug 31, 2023
7e7094f
Adding typing-extensions to environment.yml
nickrsan Aug 31, 2023
6c2b909
Adding ability for zonal stats code to inject constant values to use …
nickrsan Aug 31, 2023
ef8916f
zonal constant injection works properly now
nickrsan Aug 31, 2023
3afae45
Changes to propagate settings through export helpers, google cloud fixes
nickrsan Aug 31, 2023
da47a76
Linter fixes
nickrsan Aug 31, 2023
c52b62f
fix for nodata value in zonal output filename for centroid extraction
nickrsan Aug 31, 2023
420d1d3
Error handling and resuming
nickrsan Sep 1, 2023
88fd522
Merge pull request #24 from water3d/multi-aoi-extract-error-handling
nickrsan Sep 1, 2023
babb6a0
Fix Google Cloud not displaying after 1k bucket items. Fix logging
nickrsan Sep 1, 2023
44cc0f7
Merge script that handles merging all CSVs in a folder. Full merging …
nickrsan Sep 1, 2023
d6e7037
lint/type fixes
nickrsan Sep 1, 2023
cc50eb7
Additional merge code (not cross-platform yet)
nickrsan Sep 3, 2023
d8e93e0
Fix small linting issue.
AdamJCrawford Sep 5, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/linter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ jobs:

- name: mypy check
run: |
pip install mypy types-requests pandas-stubs
mypy -p eedl
pip install mypy types-requests pandas-stubs typing_extensions
mypy -p eedl --enable-incomplete-feature=Unpack
2 changes: 1 addition & 1 deletion eedl/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2023.08.22"
__version__ = "2023.08.31"
43 changes: 43 additions & 0 deletions eedl/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os
from pathlib import Path
from typing import Union, Dict

import fiona


def _get_fiona_args(polygon_path: Union[str, Path]) -> Dict[str, Union[str, Path]]:
    """
    Build the arguments needed to open ``polygon_path``, guessing whether it points at a
    layer inside a container dataset. This is a heuristic rather than real format detection,
    but it works in many situations.

    The path is treated as ``<container>/<layer>`` when its parent portion ends with ``.gdb``
    or ``.gpkg`` and its final component contains no ``.`` character.

    :param polygon_path: File location of polygons.
    :type polygon_path: Union[str, Path]
    :return: ``{'fp': <container>, 'layer': <layer name>}`` when the heuristic matches, otherwise
        ``{'fp': polygon_path}`` with the path exactly as it was passed in.
    :rtype: Dict[str, Union[str, Path]]
    """
    container, leaf = os.path.split(polygon_path)

    # a final component with no extension, sitting under a .gdb/.gpkg parent, is assumed to be a layer name
    is_layer_reference = container.endswith((".gdb", ".gpkg")) and "." not in leaf
    if not is_layer_reference:
        return {'fp': polygon_path}

    return {'fp': container, 'layer': leaf}


def safe_fiona_open(features_path: Union[str, Path], **extra_kwargs) -> fiona.Collection:
    """
    Open a dataset with fiona, including geodatabase-style paths where the container has to be
    opened and the layer named separately.

    The caller is responsible for closing the returned collection (e.g. a try/finally block
    with a call to features.close() in the finally block should immediately follow calling
    this function).

    :param features_path: A Path object or string path to open with fiona
    :param extra_kwargs: Keyword arguments to directly pass through to fiona. Helpful when trying
        to filter features, etc. These take precedence over any detected argument of the same name.
    :return: The value returned by ``fiona.open`` for the resolved path and arguments
    """
    open_args = _get_fiona_args(features_path)
    main_file_path = open_args.pop('fp')  # the path is passed positionally, everything else by keyword

    open_args.update(extra_kwargs)  # caller-supplied values override detected ones (e.g. 'layer')

    return fiona.open(main_file_path, **open_args)
7 changes: 5 additions & 2 deletions eedl/google_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ def get_public_export_urls(bucket_name: str, prefix: str = "") -> List[str]:

base_url = "https://storage.googleapis.com/"
request_url = f"{base_url}{bucket_name}/"
search_url = f"{request_url}?prefix={prefix}" # need to include the prefix here or else we get failures after having more than 1k items

# get the content of the bucket (it needs to be public
listing = requests.get(request_url).text
# get the content of the bucket (it needs to be public)
listing = requests.get(search_url).text

# comes back as an XML listing - don't need to parse the XML, just need the values of the Key elements
pattern = re.compile("<Key>(.*?)</Key>")
Expand All @@ -51,6 +52,8 @@ def download_public_export(bucket_name: str, output_folder: Union[str, Path], pr
# get the urls of items in the bucket with the specified prefix
urls = get_public_export_urls(bucket_name, prefix)

os.makedirs(output_folder, exist_ok=True)

for url in urls:
filename = url.split("/")[-1] # get the filename
output_path = Path(output_folder) / filename # construct the output path
Expand Down
214 changes: 214 additions & 0 deletions eedl/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import os
import itertools
import datetime

from .core import safe_fiona_open
from .image import EEDLImage, TaskRegistry

import ee
from ee import ImageCollection


class GroupedCollectionExtractor():
    """
    Exports the images of an Earth Engine image collection separately for each area of
    interest (AOI) feature in a spatial data file, then hands each AOI's images to the task
    registry's ``mosaic_and_zonal`` callback.

    All configuration lives in instance attributes. Set them as keyword arguments to the
    constructor or assign them before calling :meth:`extract`.
    """

    def __init__(self, **kwargs):
        """
        Set every configuration attribute to its default, then apply overrides.

        :param kwargs: Attribute names mapped to the values they should take. Every key is applied
            with ``setattr`` and is not validated, so a misspelled key creates a new, unused attribute.
        """
        self.keep_image_objects = False  # whether or not to store the EEDLImage objects on this class so they can be accessed when it's done. Off by default so large exports don't hold them all in RAM
        self.all_images = []  # all the exported images are saved here (only when keep_image_objects is True). They can then be operated on once the extractor is complete
        self.skip_existing = True  # a feature allowing it to resume from crashes. If the mosaic image exists, it skips doing any processing on the rest of it
        self.on_error = "log"  # passed to the task registry as on_failure when waiting for images

        self.collection = None
        self.collection_band = None
        self.time_start = None
        self.time_end = None
        self.mosaic_by_date = True
        self.areas_of_interest_path = None  # the path to a spatial data file readable by Fiona/GEOS that has features defining AOIs to extract individually

        self.strict_clip = True  # may be necessary for some things to behave, so keeping this as a default to True. People can disable if they know what they're doing (may be faster)
        self.export_type = "drive"
        self.drive_root_folder = None
        self.cloud_bucket = None
        self.download_folder = None  # local folder name after downloading for processing
        self.export_folder = None  # drive/cloud export folder name
        self.filename_description = "alfalfa_et_ensemble"  # prefix for exported filenames and for the skip_existing check. The default is the value that used to be hard-coded

        self.zonal_run = True
        self.zonal_areas_of_interest_attr = None  # what is the attribute on each of the AOI polygons that tells us what items to use in the zonal extraction
        self.zonal_features_path = None  # what polygons to use as inputs for zonal stats
        self.zonal_features_area_of_interest_attr = None  # what field in the zonal features has the value that should match zonal_areas_of_interest_attr?
        self.zonal_features_preserve_fields = None  # what fields to preserve, as a tuple - typically an ID and anything else you want
        self.zonal_stats_to_calc = ()  # what statistics to output by zonal feature
        self.zonal_use_points = False
        self.zonal_inject_date: bool = False
        self.zonal_inject_group_id: bool = False
        self.zonal_nodata_value: int = 0

        self.merge_sqlite = True  # should we merge all outputs to a single SQLite database
        self.merge_grouped_csv = True  # should we merge CSV by grouped item
        self.merge_final_csv = False  # should we merge all output tables

        self._all_outputs = list()  # for storing the paths to all output csvs

        self.max_fiona_features_load = 1000  # intended threshold for keeping fiona features in memory as a list instead of using itertools.tee. Not consulted by this class at present: it always uses tee

        for name, value in kwargs.items():
            setattr(self, name, value)

    def _single_item_extract(self, image, task_registry, zonal_features, aoi_attr, ee_geom, image_date, aoi_download_folder):
        """
        Configure an EEDLImage for one image of one AOI and start its export, unless
        ``skip_existing`` is set and the mosaic for it is reported as already existing.

        :param image: The Earth Engine image to export
        :param task_registry: The TaskRegistry the new EEDLImage is created with
        :param zonal_features: The features assigned to the EEDLImage as its zonal polygons
        :param aoi_attr: The AOI's grouping value. Used in the filename suffix and, when
            ``zonal_inject_group_id`` is set, injected as the ``group_id`` constant
        :param ee_geom: The Earth Engine geometry passed to the export as ``clip``
        :param image_date: Date string for the image. Used in the filename suffix and, when
            ``zonal_inject_date`` is set, injected as the ``date`` constant
        :param aoi_download_folder: Local folder for this AOI, used when checking for an existing mosaic
        :return: None
        """
        filename_description = self.filename_description
        export_image = EEDLImage(
            task_registry=task_registry,
            drive_root_folder=self.drive_root_folder,
            cloud_bucket=self.cloud_bucket,
            filename_description=filename_description
        )
        export_image.zonal_polygons = zonal_features
        export_image.zonal_use_points = self.zonal_use_points
        export_image.zonal_keep_fields = self.zonal_features_preserve_fields
        export_image.zonal_stats_to_calc = self.zonal_stats_to_calc
        export_image.zonal_nodata_value = self.zonal_nodata_value
        export_image.date_string = image_date

        zonal_inject_constants = {}
        if self.zonal_inject_date:
            zonal_inject_constants["date"] = image_date
        if self.zonal_inject_group_id:
            zonal_inject_constants["group_id"] = aoi_attr

        export_image.zonal_inject_constants = zonal_inject_constants

        filename_suffix = f"{aoi_attr}_{image_date}"
        if self.skip_existing and export_image.check_mosaic_exists(aoi_download_folder, self.export_folder, f"{filename_description}_{filename_suffix}"):
            print(f"Image {filename_suffix} exists and skip_existing=True. Skipping")
            return

        export_image.export(
            image,
            export_type=self.export_type,
            filename_suffix=filename_suffix,
            clip=ee_geom,
            strict_clip=self.strict_clip,
            folder=self.export_folder,  # the folder to export to in Google Drive
        )

    def extract(self):
        """
        Run the extraction for every feature in ``areas_of_interest_path``.

        The collection is built and filtered once, then each AOI feature is processed in turn.
        The AOI file is always closed, even if processing an AOI raises.

        :return: None
        """
        collection = self._get_and_filter_collection()

        self._all_outputs = list()
        features = safe_fiona_open(self.areas_of_interest_path)
        try:
            # the counter only advances after an AOI finishes without raising, so enumerate matches it exactly
            for num_complete, feature in enumerate(features):
                print(f"Number of complete AOIs: {num_complete}")
                self._extract_area_of_interest(feature, collection)
        finally:
            features.close()

    def _extract_area_of_interest(self, feature, collection):
        """
        Export every image of ``collection`` that intersects one AOI feature, then wait for the
        exports and run the ``mosaic_and_zonal`` callback on them.

        :param feature: One feature read from the AOI file
        :param collection: The already-filtered image collection to restrict to this AOI
        :return: None
        """
        task_registry = TaskRegistry()  # a fresh registry per AOI, so waiting and post-processing happen AOI by AOI

        # WARNING: THIS DOESN'T CHECK CRS, and only the first element of the geometry's coordinates is used
        ee_geom = ee.Geometry.Polygon(feature['geometry']['coordinates'][0])
        aoi_collection = collection.filterBounds(ee_geom)

        # get some variables defined for use in extracting the zonal stats
        aoi_attr = feature.properties[self.zonal_areas_of_interest_attr]  # this is the value we'll search for in the zonal features
        aoi_label = str(aoi_attr)  # attribute values aren't guaranteed to be strings (e.g. integer IDs), but paths and queries need text
        escaped_label = aoi_label.replace("'", "''")  # double any single quotes so the value can't terminate the SQL string literal early
        zonal_features_query = f"{self.zonal_features_area_of_interest_attr} = '{escaped_label}'"
        aoi_download_folder = os.path.join(self.download_folder, aoi_label)

        fiona_zonal_features = safe_fiona_open(self.zonal_features_path)
        try:
            zonal_features_filtered = fiona_zonal_features.filter(where=zonal_features_query)

            image_list = aoi_collection.toList(aoi_collection.size()).getInfo()
            timestamps = [im['properties']['system:time_start'] for im in image_list]

            # The filtered features are a one-shot iterator but every image needs its own pass over
            # them, so split it into one independent iterator per image.
            per_image_features = itertools.tee(zonal_features_filtered, len(timestamps))

            for timestamp_ms, zonal_features in zip(timestamps, per_image_features):
                image = aoi_collection.filter(ee.Filter.eq("system:time_start", timestamp_ms)).first()  # get the image from the collection again based on its timestamp
                timestamp_in_seconds = int(timestamp_ms) // 1000  # integer floor division: no float round trip and no string slicing
                date_string = datetime.datetime.fromtimestamp(timestamp_in_seconds, tz=datetime.timezone.utc).strftime("%Y-%m-%d")

                self._single_item_extract(image, task_registry, zonal_features, aoi_attr, ee_geom, date_string, aoi_download_folder)

            task_registry.setup_log(os.path.join(self.download_folder, "eedl_processing_error_log.txt"))
            task_registry.wait_for_images(aoi_download_folder, sleep_time=15, callback="mosaic_and_zonal", try_again_disk_full=False, on_failure=self.on_error)

            if self.keep_image_objects:
                self.all_images.extend(task_registry.images)
        finally:
            # the zonal features must stay open until the wait above returns, because each
            # EEDLImage was handed an iterator over them rather than a copy
            fiona_zonal_features.close()

    def _get_and_filter_collection(self):
        """
        Build the image collection named by ``self.collection`` and apply the configured filters.

        Applied in order, each only when its setting is truthy: a date filter (used when either
        ``time_start`` or ``time_end`` is set, and always given both values), band selection,
        and per-date mosaicking.

        :return: The filtered image collection
        """
        collection = ImageCollection(self.collection)

        if self.time_start or self.time_end:
            collection = collection.filterDate(self.time_start, self.time_end)

        if self.collection_band:
            collection = collection.select(self.collection_band)

        if self.mosaic_by_date:  # if we're supposed to take the images in the collection and merge them so that all images on one date are a single image
            collection = mosaic_by_date(collection)

        return collection


def mosaic_by_date(image_collection):
    """
    Build a collection holding one mosaicked image per distinct date in ``image_collection``.

    Adapted to Python from code found via https://gis.stackexchange.com/a/343453/1955

    Each output image is the mosaic of the input images that fall within that date's one-day
    window. Its ``system:time_start`` is set to the date in milliseconds, its ``system:id`` and
    ``system:index`` are set to the ``YYYY-MM-dd`` string, and it is renamed to that same string.

    :param image_collection: An image collection
    :return: ee.ImageCollection
    """

    def _date_label(im):
        # list elements have to be cast back to ee.Image before date() is available
        return ee.Image(im).date().format("YYYY-MM-dd")

    def _make_mosaicked_image(d):
        day = ee.Date(d)
        next_day = day.advance(1, "day")

        mosaic = image_collection.filterDate(day, next_day).mosaic()

        tagged = mosaic.set(
            "system:time_start", day.millis(),
            "system:id", day.format("YYYY-MM-dd"),
            "system:index", day.format("YYYY-MM-dd"),
        )
        return tagged.rename(day.format("YYYY-MM-dd"))

    all_images = image_collection.toList(image_collection.size())
    unique_dates = all_images.map(_date_label).distinct()

    return ee.ImageCollection(unique_dates.map(_make_mosaicked_image))
Loading