Skip to content

Commit

Permalink
Code modification and created readme
Browse files Browse the repository at this point in the history
  • Loading branch information
saanikaaa committed Oct 17, 2023
1 parent e3efabc commit 87dba5a
Show file tree
Hide file tree
Showing 23 changed files with 81,951 additions and 0 deletions.
Empty file added scripts/noaa/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions scripts/noaa/gpcc_spi/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# GPCC SPI Import

## Overview
These scripts cover two imports:
1. NOAA_GPCC_StandardardizedPrecipitationIndex
2. NOAA_GPCC_StandardardizedPrecipitationIndex_AggPlace


## Scripts
```
download.py
```
The above script downloads the input files (with the .nc extension) required for processing and stores them in a tmp directory (/tmp/gpcc_spi/).


```
preprocess_gpcc_spi.py
```
Reads the downloaded files from the tmp directory and creates output csv files in an 'out' folder inside the same tmp directory (/tmp/gpcc_spi/out/).
This script generates the output for the 'NOAA_GPCC_StandardardizedPrecipitationIndex' import.
The tmcf for this can be taken from 'scripts/noaa/gpcc_spi/gpcc_spi.tmcf'


```
gpcc_spi_aggregation.py
```
Reads the files generated by the 'preprocess_gpcc_spi.py' script from the tmp directory, aggregates the data by place, and creates output csv files in an 'agg' folder inside the same tmp directory (/tmp/gpcc_spi/agg/).
This script generates the output for the 'NOAA_GPCC_StandardardizedPrecipitationIndex_AggPlace' import.
The tmcf for this can be taken from 'scripts/noaa/gpcc_spi/gpcc_spi_aggregation.tmcf'
Empty file.
168 changes: 168 additions & 0 deletions scripts/noaa/gpcc_spi/create_place_to_grid_area_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes a list of places into %area of place within 1 degree grids.
Input:
csv with a single column that represents the places where geojson exists in dc.
Output:
A single json file that maps
place id -> list of ({"grid": grid key , "ratio": %area within grid})
"""

from shapely import geometry
import datacommons as dc
import concurrent.futures
from typing import List, Optional
import json
import csv

from absl import flags
from absl import app

FLAGS = flags.FLAGS

flags.DEFINE_string('gpcc_spi_places_with_csv',
'geojson_data/places_with_geojson.csv',
'Path to places with geojson.')
flags.DEFINE_string('gpcc_spi_place_grid_ratio_output_dir',
'geojson_data/place_to_one_degree_grid_mapping.json',
'Path where the result is written to.')

LAT_MIN = -90
LAT_MAX = 90
LNG_MIN = -180
LNG_MAX = 180


def construct_one_degree_grid_polygons() -> dict:
    """Returns a dict mapping grid key "<lat>^<lng>" to its 1x1 degree box.

    Covers the whole globe: lat in [-90, 90), lng in [-180, 180). Each box
    spans from (lng, lat) to (lng + 1, lat + 1).

    Note: the previous docstring and annotation claimed a list was returned;
    the function has always built and returned a dict keyed by grid key.
    """
    grids = dict()
    for lat in range(LAT_MIN, LAT_MAX):
        for lng in range(LNG_MIN, LNG_MAX):
            key = f"{lat}^{lng}"
            grids[key] = geometry.box(lng, lat, lng + 1, lat + 1)
    return grids


def get_place_by_type(parent_places, places_types: List[str]) -> List[str]:
    """Return the place ids of all places contained in a set of parent places.

    Args:
        parent_places: dcids of the parent places to enumerate.
        places_types: place types (e.g. 'County') to fetch within each parent.

    Returns:
        Flat list of contained place dcids across all parents and types.
    """
    all_types_of_places = []
    for place_type in places_types:
        parent_place_to_places = dc.get_places_in(parent_places, place_type)
        for places in parent_place_to_places.values():
            # `places` is a list of dcid strings. Extend with the list itself;
            # the previous code called extend() on each *string*, which
            # appended its individual characters instead of the place ids.
            all_types_of_places.extend(places)
    return all_types_of_places


def places_to_geo_jsons(places: List[str]):
    """Fetch and parse the geoJsonCoordinates shape for each place.

    Places without a geoJsonCoordinates value are silently skipped.
    """
    prop_values = dc.get_property_values(places, 'geoJsonCoordinates')
    return {
        place: geometry.shape(json.loads(values[0]))
        for place, values in prop_values.items()
        if values
    }


def fit_shape_in_grids(shape, grids):
    """Computes, per grid cell, the fraction of `shape`'s area inside it.

    Args:
        shape: polygon-like object supporting covers/intersects/intersection
            and exposing an `area` attribute.
        grids: dict of grid key -> box polygon.

    Returns:
        List of {'grid': key, 'ratio': fraction of shape's area in that grid}.
    """
    ratios = []
    for grid_key, cell in grids.items():
        if cell.covers(shape):
            # The whole place fits inside this single cell; no other cell can
            # overlap it, so we can stop immediately.
            return [{'grid': grid_key, 'ratio': 1}]
        if shape.covers(cell):
            # The entire cell lies within the place.
            ratios.append({'grid': grid_key, 'ratio': cell.area / shape.area})
        elif shape.intersects(cell):
            # Partial overlap: attribute the intersected fraction to this cell.
            overlap = shape.intersection(cell)
            ratios.append({'grid': grid_key, 'ratio': overlap.area / shape.area})
    return ratios


def read_input(places_with_csv: str):
    """Reads the 'id' column of the input csv into a list of place dcids."""
    with open(places_with_csv, newline='') as csvfile:
        return [row['id'] for row in csv.DictReader(csvfile)]


def get_geojsons(places_with_geojson: List[str]):
    """Batch call DC api to get geojsons (up to 100 places per request)."""
    BATCH_SIZE = 100
    geojsons = dict()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # One request per slice of up to BATCH_SIZE places (the final slice
        # may be shorter).
        futures = [
            executor.submit(places_to_geo_jsons,
                            places_with_geojson[start:start + BATCH_SIZE])
            for start in range(0, len(places_with_geojson), BATCH_SIZE)
        ]
        for done in concurrent.futures.as_completed(futures):
            geojsons.update(done.result())
    return geojsons


def create_place_to_grid_mapping(places_with_geojson: List[str],
                                 output: Optional[str] = None,
                                 write_results: bool = True):
    """Fit all places in grids.

    Args:
        places_with_geojson: place dcids that have geojson in DC.
        output: path the json result is written to (used when write_results).
        write_results: when True, write the mapping to `output` and return
            None; otherwise return the mapping.

    Returns:
        None when write_results is True, else a dict of
        place id -> list of {'grid': grid key, 'ratio': fraction in grid}.
    """
    geojsons = get_geojsons(places_with_geojson)
    grids = construct_one_degree_grid_polygons()

    place_to_grid_ratios = {
        place: fit_shape_in_grids(shape, grids)
        for place, shape in geojsons.items()
    }

    if not write_results:
        return place_to_grid_ratios

    with open(output, 'w') as out_file:
        json.dump(place_to_grid_ratios, out_file)


def main(_):
    """Reads the input place list and writes the place-to-grid mapping."""
    place_ids = read_input(FLAGS.gpcc_spi_places_with_csv)
    create_place_to_grid_mapping(place_ids,
                                 FLAGS.gpcc_spi_place_grid_ratio_output_dir)


if __name__ == "__main__":
    app.run(main)
83 changes: 83 additions & 0 deletions scripts/noaa/gpcc_spi/create_place_to_grid_area_mapping_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Unit tests for create_place_to_grid_area_mapping.py
Usage: python -m unittest discover -v -s ../ -p "create_place_to_grid_area_mapping_test.py"
'''
import unittest
from .create_place_to_grid_area_mapping import create_place_to_grid_mapping


class PlaceToGridMappingTest(unittest.TestCase):
    """Golden-value test for create_place_to_grid_mapping.

    NOTE(review): this test fetches geojson from the live Data Commons API,
    so it requires network access, and the expected ratios will drift if the
    geojson data for these places is ever updated upstream.
    """

    def test_create_place_to_grid_mapping(self):
        # Expected mapping: place dcid -> list of 1-degree grid cells with the
        # fraction of the place's area falling in each cell.
        want = {
            # San Bernardino County
            # maps to many grids.
            'geoId/06071': [{
                'grid': '33^-118',
                'ratio': 0.0033239325139923634
            }, {
                'grid': '34^-118',
                'ratio': 0.1276123022731056
            }, {
                'grid': '34^-117',
                'ratio': 0.18870566451400175
            }, {
                'grid': '34^-116',
                'ratio': 0.18532960437637513
            }, {
                'grid': '34^-115',
                'ratio': 0.10690315005766632
            }, {
                'grid': '35^-118',
                'ratio': 0.09802330851363907
            }, {
                'grid': '35^-117',
                'ratio': 0.1547896428427359
            }, {
                'grid': '35^-116',
                'ratio': 0.12463595784741413
            }, {
                'grid': '35^-115',
                'ratio': 0.010676437061070328
            }],
            # San Francisco County
            # maps to two grids (one is an island).
            'geoId/06075': [{
                'grid': '37^-124',
                'ratio': 0.006601930938213778
            }, {
                'grid': '37^-123',
                'ratio': 0.9933980690617857
            }],
            # Arlington County
            # Fully contained in a grid.
            'geoId/51013': [{
                'grid': '38^-78',
                'ratio': 1
            }]
        }
        places_with_geojson = [
            'geoId/06071',  # San Bernardino County
            'geoId/06075',  # San Francisco County
            'geoId/51013'  # Arlington County
        ]
        # write_results=False exercises the in-memory return path (no file
        # output is produced by the test).
        got = create_place_to_grid_mapping(places_with_geojson,
                                           write_results=False)
        self.assertDictEqual(got, want)


if __name__ == '__main__':
unittest.main()
89 changes: 89 additions & 0 deletions scripts/noaa/gpcc_spi/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""Downloads GRCC Global Standarized Precipitation Index (SPI) data.
Please see the data readme for release schedule.
https://www.ncei.noaa.gov/pub/data/nidis/gpcc/nidis_gpcc_readme_c20220520.txt
"""

from absl import app
from absl import flags
import concurrent.futures
import logging
import urllib
import urllib.request
import shutil
from typing import List
import os

FLAGS = flags.FLAGS

# Directory the downloaded .nc files are written into.
flags.DEFINE_string('download_dir', '/tmp/gpcc_spi', 'Output directory.')

# NOTE: backslash line-continuations inside the original help strings embedded
# stray whitespace into the user-facing text; implicit string concatenation
# keeps the text clean.
flags.DEFINE_list(
    'periods', None,
    ('Comma separated list of periods. '
     'A period is the duration in months for a particular SPI value. '
     'Possible values are 1, 2, 3, 6, 9, 12, 24, 36, 48, 60, and 72; '
     'defaults to all.'))
DEFAULT_PERIODS = ['1', '3', '6', '9', '36', '72']

flags.DEFINE_string('distribution', 'pearson',
                    ('Distribution function used to compute the SPI. '
                     'One of (pearson, gamma). Defaults to pearson.'))
DEFAULT_DISTRIBUTION = 'pearson'


def download_one(url, path: str):
    """Download a single spi nc file from `url` into the local file `path`."""
    logging.info('Starting to download %s to %s', url, path)
    with urllib.request.urlopen(url) as source, open(path, 'wb') as dest:
        shutil.copyfileobj(source, dest)
    logging.info('Finished downloading: %s', path)


def download_all(download_dir: str, distribution: str, periods: List[str]):
    """Download spi nc files for all periods.

    Args:
        download_dir: directory the .nc files are written into (created if
            missing).
        distribution: distribution function used to compute the SPI
            ('pearson' or 'gamma'); falls back to DEFAULT_DISTRIBUTION when
            falsy (e.g. None from an unset flag).
        periods: period lengths in months, as strings; falls back to
            DEFAULT_PERIODS when falsy.

    Raises:
        Exception: if any individual download failed.
    """
    os.makedirs(download_dir, exist_ok=True)

    # Fall back to defaults only when the caller did not supply values.
    # (Previously the arguments were unconditionally overwritten, so the
    # --periods and --distribution flags were silently ignored.)
    periods = periods or DEFAULT_PERIODS
    distribution = distribution or DEFAULT_DISTRIBUTION

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for period in periods:
            # The server zero-pads single-digit periods: '01' instead of '1'.
            p = f'{int(period):02d}'
            # NOTE(review): the directory component is 'spi-pearson' even when
            # a different distribution is requested — confirm gamma files live
            # in the same directory on the server.
            url = ('https://www.ncei.noaa.gov/pub/data/nidis/gpcc/'
                   f'spi-pearson/gpcc-spi-{distribution}-{p}.nc')
            dest = os.path.join(download_dir, f'gpcc_spi_{distribution}_{p}.nc')
            futures.append(executor.submit(download_one, url, dest))

        for future in concurrent.futures.as_completed(futures):
            if future.exception():
                raise Exception(
                    'Part of the download failed. Please re-run or fix the '
                    'script.')


def main(_):
    """Download all nc files."""
    # Pass by keyword: the signature is (download_dir, distribution, periods),
    # and the previous positional call swapped periods and distribution.
    download_all(FLAGS.download_dir,
                 distribution=FLAGS.distribution,
                 periods=FLAGS.periods)


if __name__ == "__main__":
    app.run(main)
Loading

0 comments on commit 87dba5a

Please sign in to comment.