Merge branch 'datacommonsorg:master' into us_bea_harish
HarishC727 authored Nov 21, 2023
2 parents 6354261 + 1a2fd81 commit 667d6a6
Showing 17 changed files with 1,375 additions and 10 deletions.
35 changes: 31 additions & 4 deletions scripts/un/sdg/README.md
@@ -1,9 +1,24 @@
# UN Stats Sustainable Development Goals

This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats.
This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. Please ensure the submodules stay up to date.

## One-time Setup

To generate place mappings:
Initialize submodules:
```
git submodule update --init --remote sdg-dataset
git submodule update --init --remote sssom-mappings
```

## Data Refresh

Update submodules:
```
git submodule update --remote sdg-dataset
git submodule update --remote sssom-mappings
```

Generate place mappings:
```
python3 geography.py
```
@@ -15,7 +30,7 @@ Produces:

Note that `place_mappings.csv` is required before running the `process.py` script.

To process data and generate artifacts:
Process data and generate artifacts:
```
python3 process.py
```
@@ -29,6 +44,7 @@ Produces:
* unit.mcf
* csv/ folder:
* [CODE].csv

(Note that these folders are not included in the repository but can be regenerated by running the script.)

When refreshing the data, the `geography`, `schema`, and `csv` folders might all get updated and will need to be resubmitted to g3. The corresponding TMCF file is `sdg.tmcf`.
@@ -39,4 +55,15 @@ python3 -m unittest discover -v -s ../ -p "*_test.py"
```

Notes:
* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers.
* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers.

## SDMX

For reference, we provide an earlier version of the import scripts, which used the UN API (based on SDMX), in the `sdmx/` folder. Please note that these scripts may have errors and do not use the most up-to-date schema format, so they should only be used as an illustration of the SDMX -> MCF mapping and **should not actually be run**.

As a quick overview:
* `preprocess.py` downloads all the raw input CSVs to an `input/` folder and writes all dimensions and attributes to a `preprocessed/` folder.
* `cities.py` reads the input CSVs and matches cities with dcids.
* `process.py` reads the input CSVs and concepts and generates a cleaned CSV and schema.
* `util.py` has various shared util functions and constants.
* `m49.csv` has country code mappings.
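
The scripts above share the `m49.csv` country-code mappings. As a rough illustration of how such a mapping might be loaded, here is a minimal sketch; the `load_m49_mappings` helper and the `m49`/`dcid` column names are assumptions for illustration, not the actual header of the file:

```
import csv


def load_m49_mappings(path='m49.csv'):
    '''Returns a dict of M49 country code -> dcid.'''
    mappings = {}
    with open(path, encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # 'm49' and 'dcid' are assumed column names; check the actual
            # header in sdmx/m49.csv before relying on them.
            mappings[row['m49']] = row['dcid']
    return mappings
```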
12 changes: 6 additions & 6 deletions scripts/un/sdg/geography.py
@@ -97,7 +97,7 @@ def get_sdg2type(file):
        Map of SDG code -> SDG type.
    '''
    sdg2type = {}
    with open(file) as f:
    with open(file, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            sdg2type[row['GEOGRAPHY_CODE']] = row['GEOGRAPHY_TYPE']
@@ -140,7 +140,7 @@ def get_un2dc_curated(file):
        Map of UN code -> curated Node.
    '''
    un2dc_curated = {}
    with open(file) as f:
    with open(file, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:

@@ -204,8 +204,8 @@ def write_un_places(input_geos, output, sdg2type, un2sdg, un2dc_curated):
    '''
    un2dc_generated = {}
    new_subjects = []
    with open(input_geos) as f_in:
        with open(output, 'w') as f_out:
    with open(input_geos, encoding='utf-8') as f_in:
        with open(output, 'w', encoding='utf-8') as f_out:
            reader = csv.DictReader(f_in)
            for row in reader:
                subject = row['subject_id']
@@ -288,7 +288,7 @@ def write_un_containment(output, containment, new_subjects):
        new_subjects: List of Nodes for new places.
    '''
    with open(output, 'w') as f:
    with open(output, 'w', encoding='utf-8') as f:
        for s in sorted(containment):
            c = ''
            for o in containment[s]:
@@ -323,7 +323,7 @@ def write_place_mappings(output, sdg2un, un2dc_curated, un2dc_generated):
        un2dc_curated: Map of UN code -> curated Node.
        un2dc_generated: Map of UN code -> generated Node.
    '''
    with open(output, 'w') as f:
    with open(output, 'w', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['sdg', 'dcid'])
        writer.writeheader()
        for code in sorted(sdg2un):
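
The diff above applies one pattern throughout `geography.py`: every `open()` call gains an explicit `encoding='utf-8'` argument, so that place names containing non-ASCII characters are read and written correctly regardless of the platform's default encoding. A minimal sketch of the pattern, with hypothetical file names:

```
import csv

# Pass the encoding explicitly on both reads and writes; the platform
# default (e.g. cp1252 on Windows) can corrupt non-ASCII place names.
with open('places_in.csv', encoding='utf-8') as f_in, \
        open('places_out.csv', 'w', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=['sdg', 'dcid'])
    writer.writeheader()
    for row in csv.DictReader(f_in):
        writer.writerow({'sdg': row['sdg'], 'dcid': row['dcid']})
```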
94 changes: 94 additions & 0 deletions scripts/un/sdg/sdmx/cities.py
@@ -0,0 +1,94 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''This script does not use the most up-to-date schema format.
It should only be used as an illustration of the SDMX -> MCF mapping.
Do not actually run!
Finds dcids for cities in input files.
Produces:
* preprocessed/cities.csv: dcid for each city name
Note: For cities where the find entities API did not return a dcid,
we tried manually searching for the dcid and filled these into the file.
There are a few city names that are still missing - these are left blank.
**This script ideally shouldn't need to be run again.**
Usage: python3 cities.py <API_KEY>
'''
import csv
import os
import sys

import requests

# Number of entities per find-entities API request.
BATCH = 1


def get_cities(json, api_key):
    '''Applies find entities API for given json.
    Args:
        json: Input json.
        api_key: API key.
    Returns:
        API response.
    '''
    return requests.post('https://api.datacommons.org/v1/bulk/find/entities',
                         headers={
                             'X-API-Key': api_key
                         },
                         json=json).json()


def write_cities(file, cities, api_key):
    '''Writes city names and dcids to file.
    Args:
        file: Output file path.
        cities: List of city names to process.
        api_key: API key.
    '''
    with open(file, 'w', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'dcid'])
        writer.writeheader()
        for i in range(0, len(cities), BATCH):
            json = {
                'entities': [{
                    'description': city
                } for city in cities[i:i + BATCH]]
            }
            response = get_cities(json, api_key)
            print(response)
            try:
                for entity in response['entities']:
                    dcid = entity['dcids'][0] if 'dcids' in entity else ''
                    writer.writerow({
                        'name': entity['description'],
                        'dcid': dcid
                    })
            except KeyError:
                # No 'entities' key in the response: record the city with an
                # empty dcid so it can be filled in manually later.
                writer.writerow({'name': cities[i], 'dcid': ''})


if __name__ == '__main__':
    cities = set()
    for file in sorted(os.listdir('input')):
        with open(os.path.join('input', file), encoding='utf-8') as f:
            reader = csv.DictReader(f)
            if '[Cities]' in reader.fieldnames:
                for row in reader:
                    # Build descriptions like 'City Name, Country Name'.
                    cities.add(row['[Cities]'].replace('_', ' ').title() +
                               ', ' + row['GeoAreaName'])
    cities = sorted(cities)

    write_cities('preprocessed/cities2.csv', cities, sys.argv[1])
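
For reference, a minimal sketch of how the `get_cities` helper above could be exercised on a single city; the description and API key are placeholders, and the commented response shape follows the handling in `write_cities`:

```
# Hypothetical one-off call; requires a valid Data Commons API key.
response = get_cities({'entities': [{'description': 'Paris, France'}]},
                      'YOUR_API_KEY')
# Expected shape, per the handling in write_cities:
# {'entities': [{'description': 'Paris, France', 'dcids': ['...']}]}
```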
