From 98c0cf10662213edc2699748688a5bae70c0697f Mon Sep 17 00:00:00 2001 From: Natalie Diaz Date: Fri, 29 Sep 2023 20:01:30 +0000 Subject: [PATCH] more tests --- scripts/un/sdg/geography.py | 2 +- scripts/un/sdg/geography/place_mappings.csv | 2 +- scripts/un/sdg/process.py | 35 ++++++++++++++++----- scripts/un/sdg/process_test.py | 27 +++++++++++++--- scripts/un/sdg/util.py | 19 ----------- 5 files changed, 52 insertions(+), 33 deletions(-) diff --git a/scripts/un/sdg/geography.py b/scripts/un/sdg/geography.py index e59fd38ecb..258be2b7ba 100644 --- a/scripts/un/sdg/geography.py +++ b/scripts/un/sdg/geography.py @@ -289,7 +289,7 @@ def write_place_mappings(output, sdg2un, un2dc, un2dc2): with open(output, 'w') as f: writer = csv.DictWriter(f, fieldnames=['sdg', 'dcid']) writer.writeheader() - for code in sdg2un: + for code in sorted(sdg2un): un = sdg2un[code] if un in un2dc: dcid = un2dc[un][0] diff --git a/scripts/un/sdg/geography/place_mappings.csv b/scripts/un/sdg/geography/place_mappings.csv index e3a8d4bbd9..25fb9a8b1a 100644 --- a/scripts/un/sdg/geography/place_mappings.csv +++ b/scripts/un/sdg/geography/place_mappings.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38e88b8180eb477d801693f46a70d2f5d16497850a488be0365100a0c013cbe3 +oid sha256:148d9b119025d471f8bb20b8e1f42ebe4a5de5ed000e8e21e8662b3d70eec241 size 251752 diff --git a/scripts/un/sdg/process.py b/scripts/un/sdg/process.py index 34aed2501d..05b9139280 100644 --- a/scripts/un/sdg/process.py +++ b/scripts/un/sdg/process.py @@ -27,6 +27,7 @@ Usage: python3 process.py ''' import collections +import csv import math import os import pandas as pd @@ -39,18 +40,35 @@ from un.sdg import util -def get_geography(code): +def get_place_mappings(file): + '''Produces map of SDG code -> dcid: + + Args: + file: Input file path. + + Returns: + Map of SDG code -> dcid: + ''' + place_mappings = {} + with open(file) as f: + reader = csv.DictReader(f) + for row in reader: + place_mappings[str(row['sdg'])] = str(row['dcid']) + return place_mappings + + +def get_geography(code, place_mappings): '''Returns dcid of geography. Args: code: Geography code. - type: Geography type. + place_mappings: Map of SDG code -> dcid. Returns: Geography dcid. ''' - if str(code) in util.PLACE_MAPPINGS: - return 'dcs:' + util.PLACE_MAPPINGS[str(code)] + if str(code) in place_mappings: + return 'dcs:' + place_mappings[str(code)] return '' @@ -126,7 +144,7 @@ def fix_encoding(s): return s.encode('utf8').decode('utf8') -def process(input_dir, schema_dir, csv_dir): +def process(input_dir, schema_dir, csv_dir, place_mappings): '''Generates mcf, csv/tmcf artifacts. Produces: @@ -144,6 +162,7 @@ def process(input_dir, schema_dir, csv_dir): input_dir: Path to input xlsx files. schema_dir: Path to output schema files. csv_dir: Path to output csv files. + place_mappings: Map of SDG code -> dcid. ''' with open(os.path.join(schema_dir, 'series.mcf'), 'w') as f_series: with open(os.path.join(schema_dir, 'sdg.textproto'), 'w') as f_vertical: @@ -232,7 +251,8 @@ def process(input_dir, schema_dir, csv_dir): # Format places. df['GEOGRAPHY_CODE'] = df.apply( - lambda x: get_geography(x['GEOGRAPHY_CODE']), axis=1) + lambda x: get_geography(x['GEOGRAPHY_CODE'], place_mappings), + axis=1) df = df[df['GEOGRAPHY_CODE'] != ''] if df.empty: continue @@ -398,4 +418,5 @@ def process(input_dir, schema_dir, csv_dir): if os.path.exists('csv'): shutil.rmtree('csv') os.makedirs('csv') - process('sdg-dataset/output', 'schema', 'csv') + place_mappings = get_place_mappings('geography/place_mappings.csv') + process('sdg-dataset/output', 'schema', 'csv', place_mappings) diff --git a/scripts/un/sdg/process_test.py b/scripts/un/sdg/process_test.py index 86e9934154..946294b8e1 100644 --- a/scripts/un/sdg/process_test.py +++ b/scripts/un/sdg/process_test.py @@ -28,6 +28,21 @@ module_dir_ = os.path.dirname(__file__) +PLACE_MAPPINGS = { + '1': 'Earth', + '2': 'africa', + '4': 'country/AFG', + '5': 'southamerica', + '8': 'country/ALB', + '9': 'oceania', + '11': 'WesternAfrica', + '12': 'country/DZA', + '13': 'CentralAmerica', + '14': 'EasternAfrica', + '840': 'country/USA', + 'AF_MAZAR_E_SHARIF': 'wikidataId/Q130469' +} + def assert_equal_dir(self, result_dir, expected_dir): for root, _, files in os.walk(result_dir): @@ -40,10 +55,12 @@ def assert_equal_dir(self, result_dir, expected_dir): class ProcessTest(unittest.TestCase): def test_get_geography(self): - self.assertEqual(process.get_geography(840), 'dcs:country/USA') - self.assertEqual(process.get_geography('AF_MAZAR_E_SHARIF'), - 'dcs:wikidataId/Q130469') - self.assertEqual(process.get_geography(1), 'dcs:Earth') + self.assertEqual(process.get_geography(840, PLACE_MAPPINGS), + 'dcs:country/USA') + self.assertEqual( + process.get_geography('AF_MAZAR_E_SHARIF', PLACE_MAPPINGS), + 'dcs:wikidataId/Q130469') + self.assertEqual(process.get_geography(1, PLACE_MAPPINGS), 'dcs:Earth') def test_get_measurement_method(self): d = {'NATURE': ['E'], 'OBS_STATUS': ['A'], 'REPORTING_TYPE': ['G']} @@ -74,7 +91,7 @@ def test_process(self): with tempfile.TemporaryDirectory() as tmp_csv: process.process( os.path.join(module_dir_, 'testdata/test_input'), - tmp_schema, tmp_csv) + tmp_schema, tmp_csv, PLACE_MAPPINGS) assert_equal_dir( self, tmp_schema, os.path.join(module_dir_, 'testdata/test_schema')) diff --git a/scripts/un/sdg/util.py b/scripts/un/sdg/util.py index 59841e1018..c8e4486ccb 100644 --- a/scripts/un/sdg/util.py +++ b/scripts/un/sdg/util.py @@ -17,7 +17,6 @@ #import os import re #import sys - ''' sys.path.append( os.path.dirname(os.path.dirname(os.path.dirname( @@ -179,24 +178,6 @@ '48 to 59 months': '4 to 5 years old' } -# Map of SDG code -> dcid. -def get_place_mappings(file): - '''Produces map of SDG code -> dcid: - - Args: - file: Input file path. - - Returns: - Map of SDG code -> dcid: - ''' - place_mappings = {} - with open(file) as f: - reader = csv.DictReader(f) - for row in reader: - place_mappings[str(row['sdg'])] = str(row['dcid']) - return place_mappings -PLACE_MAPPINGS = get_place_mappings('geography/place_mappings.csv') - def format_description(s): '''Formats input with curated style.