Skip to content

Commit

Permalink
Code modification and created readme
Browse files Browse the repository at this point in the history
  • Loading branch information
saanikaaa committed Oct 17, 2023
1 parent e3efabc commit 87dba5a
Show file tree
Hide file tree
Showing 23 changed files with 81,951 additions and 0 deletions.
Empty file added scripts/noaa/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions scripts/noaa/gpcc_spi/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# GPCC SPI Import

## Overview
These scripts cover two imports:
1. NOAA_GPCC_StandardardizedPrecipitationIndex
2. NOAA_GPCC_StandardardizedPrecipitationIndex_AggPlace


## Scripts
```
download.py
```
The above script downloads the input files (with the .nc extension) required for processing and stores them in a tmp directory (/tmp/gpcc_spi/).


```
preprocess_gpcc_spi.py
```
Reads the downloaded files from the tmp directory and creates output csv files in an 'out' folder inside the same tmp directory (/tmp/gpcc_spi/out/).
This script generates the output for the 'NOAA_GPCC_StandardardizedPrecipitationIndex' import.
The tmcf for this can be taken from 'scripts/noaa/gpcc_spi/gpcc_spi.tmcf'


```
gpcc_spi_aggregation.py
```
Reads the files generated by the 'preprocess_gpcc_spi.py' script from the tmp directory, aggregates the data by place, and creates output csv files in an 'agg' folder inside the same tmp directory (/tmp/gpcc_spi/agg/).
This script generates the output for the 'NOAA_GPCC_StandardardizedPrecipitationIndex_AggPlace' import.
The tmcf for this can be taken from 'scripts/noaa/gpcc_spi/gpcc_spi_aggregation.tmcf'
Empty file.
168 changes: 168 additions & 0 deletions scripts/noaa/gpcc_spi/create_place_to_grid_area_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes a list of places into %area of place within 1 degree grids.
Input:
csv with a single column that represents the places where geojson exists in dc.
Output:
A single json file that maps
place id -> list of ({"grid": grid key , "ratio": %area within grid})
"""

from shapely import geometry
import datacommons as dc
import concurrent.futures
from typing import List, Optional
import json
import csv

from absl import flags
from absl import app

FLAGS = flags.FLAGS

flags.DEFINE_string('gpcc_spi_places_with_csv',
'geojson_data/places_with_geojson.csv',
'Path to places with geojson.')
flags.DEFINE_string('gpcc_spi_place_grid_ratio_output_dir',
'geojson_data/place_to_one_degree_grid_mapping.json',
'Path where the result is written to.')

LAT_MIN = -90
LAT_MAX = 90
LNG_MIN = -180
LNG_MAX = 180


def construct_one_degree_grid_polygons() -> dict:
    """Returns a dict mapping grid key "<lat>^<lng>" to its 1x1 degree box.

    Covers the whole globe: lat in [-90, 90), lng in [-180, 180). Each box
    spans from (lng, lat) to (lng + 1, lat + 1).

    Note: the previous docstring and annotation claimed a list was returned;
    the function has always built and returned a dict keyed by grid key.
    """
    grids = dict()
    for lat in range(LAT_MIN, LAT_MAX):
        for lng in range(LNG_MIN, LNG_MAX):
            key = f"{lat}^{lng}"
            grids[key] = geometry.box(lng, lat, lng + 1, lat + 1)
    return grids


def get_place_by_type(parent_places, places_types: List[str]) -> List[str]:
    """Return the place ids of all places contained in a set of parent places.

    Args:
        parent_places: dcids of the parent places to enumerate.
        places_types: place types (e.g. 'County') to fetch within each parent.

    Returns:
        Flat list of contained place dcids across all parents and types.
    """
    all_types_of_places = []
    for place_type in places_types:
        parent_place_to_places = dc.get_places_in(parent_places, place_type)
        for places in parent_place_to_places.values():
            # `places` is a list of dcid strings. Extend with the list itself;
            # the previous code called extend() on each *string*, which
            # appended its individual characters instead of the place ids.
            all_types_of_places.extend(places)
    return all_types_of_places


def places_to_geo_jsons(places: List[str]):
    """Fetch and parse the geoJsonCoordinates shape for each place.

    Places without a geoJsonCoordinates value are silently skipped.
    """
    prop_values = dc.get_property_values(places, 'geoJsonCoordinates')
    return {
        place: geometry.shape(json.loads(values[0]))
        for place, values in prop_values.items()
        if values
    }


def fit_shape_in_grids(shape, grids):
    """Computes, per grid cell, the fraction of `shape`'s area inside it.

    Args:
        shape: polygon-like object supporting covers/intersects/intersection
            and exposing an `area` attribute.
        grids: dict of grid key -> box polygon.

    Returns:
        List of {'grid': key, 'ratio': fraction of shape's area in that grid}.
    """
    ratios = []
    for grid_key, cell in grids.items():
        if cell.covers(shape):
            # The whole place fits inside this single cell; no other cell can
            # overlap it, so we can stop immediately.
            return [{'grid': grid_key, 'ratio': 1}]
        if shape.covers(cell):
            # The entire cell lies within the place.
            ratios.append({'grid': grid_key, 'ratio': cell.area / shape.area})
        elif shape.intersects(cell):
            # Partial overlap: attribute the intersected fraction to this cell.
            overlap = shape.intersection(cell)
            ratios.append({'grid': grid_key, 'ratio': overlap.area / shape.area})
    return ratios


def read_input(places_with_csv: str):
    """Reads the 'id' column of the input csv into a list of place dcids."""
    with open(places_with_csv, newline='') as csvfile:
        return [row['id'] for row in csv.DictReader(csvfile)]


def get_geojsons(places_with_geojson: List[str]):
    """Batch call DC api to get geojsons (up to 100 places per request)."""
    BATCH_SIZE = 100
    geojsons = dict()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # One request per slice of up to BATCH_SIZE places (the final slice
        # may be shorter).
        futures = [
            executor.submit(places_to_geo_jsons,
                            places_with_geojson[start:start + BATCH_SIZE])
            for start in range(0, len(places_with_geojson), BATCH_SIZE)
        ]
        for done in concurrent.futures.as_completed(futures):
            geojsons.update(done.result())
    return geojsons


def create_place_to_grid_mapping(places_with_geojson: List[str],
                                 output: Optional[str] = None,
                                 write_results: bool = True):
    """Fit all places in grids.

    Args:
        places_with_geojson: place dcids that have geojson in DC.
        output: path the json result is written to (used when write_results).
        write_results: when True, write the mapping to `output` and return
            None; otherwise return the mapping.

    Returns:
        None when write_results is True, else a dict of
        place id -> list of {'grid': grid key, 'ratio': fraction in grid}.
    """
    geojsons = get_geojsons(places_with_geojson)
    grids = construct_one_degree_grid_polygons()

    place_to_grid_ratios = {
        place: fit_shape_in_grids(shape, grids)
        for place, shape in geojsons.items()
    }

    if not write_results:
        return place_to_grid_ratios

    with open(output, 'w') as out_file:
        json.dump(place_to_grid_ratios, out_file)


def main(_):
    """Reads the input place list and writes the place-to-grid mapping."""
    place_ids = read_input(FLAGS.gpcc_spi_places_with_csv)
    create_place_to_grid_mapping(place_ids,
                                 FLAGS.gpcc_spi_place_grid_ratio_output_dir)


if __name__ == "__main__":
    app.run(main)
83 changes: 83 additions & 0 deletions scripts/noaa/gpcc_spi/create_place_to_grid_area_mapping_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Unit tests for create_place_to_grid_area_mapping.py
Usage: python -m unittest discover -v -s ../ -p "create_place_to_grid_area_mapping_test.py"
'''
import unittest
from .create_place_to_grid_area_mapping import create_place_to_grid_mapping


class PlaceToGridMappingTest(unittest.TestCase):
    """Golden-value test for create_place_to_grid_mapping.

    NOTE(review): this test fetches geojson from the live Data Commons API,
    so it requires network access, and the expected ratios will drift if the
    geojson data for these places is ever updated upstream.
    """

    def test_create_place_to_grid_mapping(self):
        # Expected mapping: place dcid -> list of 1-degree grid cells with the
        # fraction of the place's area falling in each cell.
        want = {
            # San Bernardino County
            # maps to many grids.
            'geoId/06071': [{
                'grid': '33^-118',
                'ratio': 0.0033239325139923634
            }, {
                'grid': '34^-118',
                'ratio': 0.1276123022731056
            }, {
                'grid': '34^-117',
                'ratio': 0.18870566451400175
            }, {
                'grid': '34^-116',
                'ratio': 0.18532960437637513
            }, {
                'grid': '34^-115',
                'ratio': 0.10690315005766632
            }, {
                'grid': '35^-118',
                'ratio': 0.09802330851363907
            }, {
                'grid': '35^-117',
                'ratio': 0.1547896428427359
            }, {
                'grid': '35^-116',
                'ratio': 0.12463595784741413
            }, {
                'grid': '35^-115',
                'ratio': 0.010676437061070328
            }],
            # San Francisco County
            # maps to two grids (one is an island).
            'geoId/06075': [{
                'grid': '37^-124',
                'ratio': 0.006601930938213778
            }, {
                'grid': '37^-123',
                'ratio': 0.9933980690617857
            }],
            # Arlington County
            # Fully contained in a grid.
            'geoId/51013': [{
                'grid': '38^-78',
                'ratio': 1
            }]
        }
        places_with_geojson = [
            'geoId/06071',  # San Bernardino County
            'geoId/06075',  # San Francisco County
            'geoId/51013'  # Arlington County
        ]
        # write_results=False exercises the in-memory return path (no file
        # output is produced by the test).
        got = create_place_to_grid_mapping(places_with_geojson,
                                           write_results=False)
        self.assertDictEqual(got, want)


if __name__ == '__main__':
unittest.main()
89 changes: 89 additions & 0 deletions scripts/noaa/gpcc_spi/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""Downloads GRCC Global Standarized Precipitation Index (SPI) data.
Please see the data readme for release schedule.
https://www.ncei.noaa.gov/pub/data/nidis/gpcc/nidis_gpcc_readme_c20220520.txt
"""

from absl import app
from absl import flags
import concurrent.futures
import logging
import urllib
import urllib.request
import shutil
from typing import List
import os

FLAGS = flags.FLAGS

# Directory the downloaded .nc files are written into.
flags.DEFINE_string('download_dir', '/tmp/gpcc_spi', 'Output directory.')

# NOTE: backslash line-continuations inside the original help strings embedded
# stray whitespace into the user-facing text; implicit string concatenation
# keeps the text clean.
flags.DEFINE_list(
    'periods', None,
    ('Comma separated list of periods. '
     'A period is the duration in months for a particular SPI value. '
     'Possible values are 1, 2, 3, 6, 9, 12, 24, 36, 48, 60, and 72; '
     'defaults to all.'))
DEFAULT_PERIODS = ['1', '3', '6', '9', '36', '72']

flags.DEFINE_string('distribution', 'pearson',
                    ('Distribution function used to compute the SPI. '
                     'One of (pearson, gamma). Defaults to pearson.'))
DEFAULT_DISTRIBUTION = 'pearson'


def download_one(url, path: str):
    """Download a single spi nc file from `url` into the local file `path`."""
    logging.info('Starting to download %s to %s', url, path)
    with urllib.request.urlopen(url) as source, open(path, 'wb') as dest:
        shutil.copyfileobj(source, dest)
    logging.info('Finished downloading: %s', path)


def download_all(download_dir: str, distribution: str, periods: List[str]):
    """Download spi nc files for all periods.

    Args:
        download_dir: directory the .nc files are written into (created if
            missing).
        distribution: distribution function used to compute the SPI
            ('pearson' or 'gamma'); falls back to DEFAULT_DISTRIBUTION when
            falsy (e.g. None from an unset flag).
        periods: period lengths in months, as strings; falls back to
            DEFAULT_PERIODS when falsy.

    Raises:
        Exception: if any individual download failed.
    """
    os.makedirs(download_dir, exist_ok=True)

    # Fall back to defaults only when the caller did not supply values.
    # (Previously the arguments were unconditionally overwritten, so the
    # --periods and --distribution flags were silently ignored.)
    periods = periods or DEFAULT_PERIODS
    distribution = distribution or DEFAULT_DISTRIBUTION

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for period in periods:
            # The server zero-pads single-digit periods: '01' instead of '1'.
            p = f'{int(period):02d}'
            # NOTE(review): the directory component is 'spi-pearson' even when
            # a different distribution is requested — confirm gamma files live
            # in the same directory on the server.
            url = ('https://www.ncei.noaa.gov/pub/data/nidis/gpcc/'
                   f'spi-pearson/gpcc-spi-{distribution}-{p}.nc')
            dest = os.path.join(download_dir, f'gpcc_spi_{distribution}_{p}.nc')
            futures.append(executor.submit(download_one, url, dest))

        for future in concurrent.futures.as_completed(futures):
            if future.exception():
                raise Exception(
                    'Part of the download failed. Please re-run or fix the '
                    'script.')


def main(_):
    """Download all nc files."""
    # Pass by keyword: the signature is (download_dir, distribution, periods),
    # and the previous positional call swapped periods and distribution.
    download_all(FLAGS.download_dir,
                 distribution=FLAGS.distribution,
                 periods=FLAGS.periods)


if __name__ == "__main__":
    app.run(main)
Loading

0 comments on commit 87dba5a

Please sign in to comment.