From b990b2820854d52affdd951458ef57c4b8130168 Mon Sep 17 00:00:00 2001
From: natalie <77713883+n-h-diaz@users.noreply.github.com>
Date: Wed, 8 Nov 2023 08:33:29 -0800
Subject: [PATCH] add old version of sdg scripts (#921)

* add old version of sdg scripts

* fix

* fix

* fix
---
 scripts/un/sdg/README.md          |  16 +-
 scripts/un/sdg/sdmx/cities.py     |  94 ++++++++++
 scripts/un/sdg/sdmx/m49.tsv       | 250 +++++++++++++++++++++++++
 scripts/un/sdg/sdmx/preprocess.py |  99 ++++++++++
 scripts/un/sdg/sdmx/process.py    | 298 ++++++++++++++++++++++++++++++
 scripts/un/sdg/sdmx/util.py       | 197 ++++++++++++++++++++
 6 files changed, 952 insertions(+), 2 deletions(-)
 create mode 100644 scripts/un/sdg/sdmx/cities.py
 create mode 100644 scripts/un/sdg/sdmx/m49.tsv
 create mode 100644 scripts/un/sdg/sdmx/preprocess.py
 create mode 100644 scripts/un/sdg/sdmx/process.py
 create mode 100644 scripts/un/sdg/sdmx/util.py

diff --git a/scripts/un/sdg/README.md b/scripts/un/sdg/README.md
index 747f6b9712..f829b4dc84 100644
--- a/scripts/un/sdg/README.md
+++ b/scripts/un/sdg/README.md
@@ -1,6 +1,6 @@
 # UN Stats Sustainable Development Goals
 
-This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats.
+This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. Please ensure the submodules stay up to date.
 
 To generate place mappings:
 
@@ -29,6 +29,7 @@ Produces:
 * unit.mcf
 * csv/ folder:
     * [CODE].csv
+    (Note that these folders are not included in the repository but can be regenerated by running the script.)
 
 When refreshing the data, the `geography`, `schema`, and `csv` folders might all get updated and will need to be resubmitted to g3.
 The corresponding TMCF file is `sdg.tmcf`.
@@ -39,4 +40,15 @@ python3 -m unittest discover -v -s ../ -p "*_test.py"
 ```
 
 Notes:
-* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers.
\ No newline at end of file
+* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by the UN as potentially containing outliers.
+
+### SDMX
+
+For reference, we provide an earlier version of the import scripts, which used the UN API (SDMX-based), in the `sdmx/` folder. Please note that these scripts may have errors and do not use the most up-to-date schema format, so they should only be used as an illustration of the SDMX -> MCF mapping and **should not actually be run**.
+
+As a quick overview:
+* `preprocess.py` downloads all the raw input CSVs to an `input/` folder and writes all dimensions and attributes to a `preprocessed/` folder.
+* `cities.py` reads the input CSVs and matches cities with dcids.
+* `process.py` reads the input CSVs and concepts and generates a cleaned CSV and schema.
+* `util.py` has shared utility functions and constants.
+* `m49.tsv` has country code mappings.
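For orientation, here is a minimal sketch of the UN Stats SDG API calls that `preprocess.py` relies on. The endpoints, parameters, and headers are the ones used in the script further down in this patch; the snippet itself is illustrative only and, like the scripts, is not meant to be run as part of the import.

```python
import requests

API_PREFIX = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/'

# List all available series (each entry has at least a 'code' and a 'description').
series = requests.get(f'{API_PREFIX}List?allreleases=false').json()
print(series[0]['code'], '->', series[0]['description'])

# Download the raw CSV for a single series code (the same POST preprocess.py makes).
code = series[0]['code']
csv_text = requests.post(f'{API_PREFIX}DataCSV',
                         data={'seriesCodes': code},
                         headers={
                             'Content-Type': 'application/x-www-form-urlencoded',
                             'Accept': 'application/octet-stream'
                         }).text.rstrip('\x00')
print(csv_text.splitlines()[0])  # header row of the series CSV
```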
diff --git a/scripts/un/sdg/sdmx/cities.py b/scripts/un/sdg/sdmx/cities.py new file mode 100644 index 0000000000..3c5e0c9f1a --- /dev/null +++ b/scripts/un/sdg/sdmx/cities.py @@ -0,0 +1,94 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Finds dcids for cities in input files. + +Produces: +* preprocessed/cities.csv: dcid for each city name + +Note: For cities where the find entities API did not return a dcid, +we tried manually searching for the dcid and filled these into the file. +There are a few city names that are still missing - these are left blank. +**This script ideally shouldn't need to be run again.** +Usage: python3 cities.py +''' +import csv +import requests +import os +import sys + +BATCH = 1 + + +def get_cities(json, api_key): + '''Applies find entities API for given json. + Args: + json: Input json. + api_key: API key. + Returns: + API response. + ''' + return requests.post('https://api.datacommons.org/v1/bulk/find/entities', + headers={ + 'X-API-Key': api_key + }, + json=json).json() + + +def write_cities(file, cities, api_key): + '''Writes city dcids and names to file. + Args: + file: Output file path. + cities: List of city dcids to process. + api_key: API key. 
+ ''' + with open(file, 'w') as f: + writer = csv.DictWriter(f, fieldnames=['name', 'dcid']) + writer.writeheader() + city_list = list(cities.keys()) + for i in range(0, len(city_list), BATCH): + json = { + 'entities': [{ + 'description': city + } for city in city_list[i:i + BATCH]] + } + response = get_cities(json, api_key) + print(response) + try: + for entity in response['entities']: + dcid = entity['dcids'][0] if 'dcids' in entity else '' + writer.writerow({ + 'name': cities[entity['description']], + 'dcid': dcid + }) + except KeyError: + writer.writerow({'name': cities[city_list[i]], 'dcid': ''}) + + +if __name__ == '__main__': + cities = set() + for file in sorted(os.listdir('input')): + code = file.removesuffix('.csv') + with open('input/' + file) as f: + reader = csv.DictReader(f) + if '[Cities]' in reader.fieldnames: + for row in reader: + cities.add(row['[Cities]'].replace('_', ' ').title() + + ', ' + row['GeoAreaName']) + cities = sorted(cities) + + write_cities('preprocessed/cities2.csv', cities, sys.argv[1]) diff --git a/scripts/un/sdg/sdmx/m49.tsv b/scripts/un/sdg/sdmx/m49.tsv new file mode 100644 index 0000000000..a90189880a --- /dev/null +++ b/scripts/un/sdg/sdmx/m49.tsv @@ -0,0 +1,250 @@ +Country or Area M49 code ISO-alpha3 code +Afghanistan 004 AFG +Åland Islands 248 ALA +Albania 008 ALB +Algeria 012 DZA +American Samoa 016 ASM +Andorra 020 AND +Angola 024 AGO +Anguilla 660 AIA +Antarctica 010 ATA +Antigua and Barbuda 028 ATG +Argentina 032 ARG +Armenia 051 ARM +Aruba 533 ABW +Australia 036 AUS +Austria 040 AUT +Azerbaijan 031 AZE +Bahamas 044 BHS +Bahrain 048 BHR +Bangladesh 050 BGD +Barbados 052 BRB +Belarus 112 BLR +Belgium 056 BEL +Belize 084 BLZ +Benin 204 BEN +Bermuda 060 BMU +Bhutan 064 BTN +Bolivia (Plurinational State of) 068 BOL +Bonaire, Sint Eustatius and Saba 535 BES +Bosnia and Herzegovina 070 BIH +Botswana 072 BWA +Bouvet Island 074 BVT +Brazil 076 BRA +British Indian Ocean Territory 086 IOT +British Virgin Islands 092 VGB +Brunei Darussalam 096 BRN +Bulgaria 100 BGR +Burkina Faso 854 BFA +Burundi 108 BDI +Cabo Verde 132 CPV +Cambodia 116 KHM +Cameroon 120 CMR +Canada 124 CAN +Cayman Islands 136 CYM +Central African Republic 140 CAF +Chad 148 TCD +Chile 152 CHL +China 156 CHN +China, Hong Kong Special Administrative Region 344 HKG +China, Macao Special Administrative Region 446 MAC +Christmas Island 162 CXR +Cocos (Keeling) Islands 166 CCK +Colombia 170 COL +Comoros 174 COM +Congo 178 COG +Cook Islands 184 COK +Costa Rica 188 CRI +Côte d’Ivoire 384 CIV +Croatia 191 HRV +Cuba 192 CUB +Curaçao 531 CUW +Cyprus 196 CYP +Czechia 203 CZE +Democratic People's Republic of Korea 408 PRK +Democratic Republic of the Congo 180 COD +Denmark 208 DNK +Djibouti 262 DJI +Dominica 212 DMA +Dominican Republic 214 DOM +Ecuador 218 ECU +Egypt 818 EGY +El Salvador 222 SLV +Equatorial Guinea 226 GNQ +Eritrea 232 ERI +Estonia 233 EST +Eswatini 748 SWZ +Ethiopia 231 ETH +Falkland Islands (Malvinas) 238 FLK +Faroe Islands 234 FRO +Fiji 242 FJI +Finland 246 FIN +France 250 FRA +French Guiana 254 GUF +French Polynesia 258 PYF +French Southern Territories 260 ATF +Gabon 266 GAB +Gambia 270 GMB +Georgia 268 GEO +Germany 276 DEU +Ghana 288 GHA +Gibraltar 292 GIB +Greece 300 GRC +Greenland 304 GRL +Grenada 308 GRD +Guadeloupe 312 GLP +Guam 316 GUM +Guatemala 320 GTM +Guernsey 831 GGY +Guinea 324 GIN +Guinea-Bissau 624 GNB +Guyana 328 GUY +Haiti 332 HTI +Heard Island and McDonald Islands 334 HMD +Holy See 336 VAT +Honduras 340 HND +Hungary 348 HUN +Iceland 352 ISL +India 356 IND +Indonesia 
360 IDN +Iran (Islamic Republic of) 364 IRN +Iraq 368 IRQ +Ireland 372 IRL +Isle of Man 833 IMN +Israel 376 ISR +Italy 380 ITA +Jamaica 388 JAM +Japan 392 JPN +Jersey 832 JEY +Jordan 400 JOR +Kazakhstan 398 KAZ +Kenya 404 KEN +Kiribati 296 KIR +Kuwait 414 KWT +Kyrgyzstan 417 KGZ +Lao People's Democratic Republic 418 LAO +Latvia 428 LVA +Lebanon 422 LBN +Lesotho 426 LSO +Liberia 430 LBR +Libya 434 LBY +Liechtenstein 438 LIE +Lithuania 440 LTU +Luxembourg 442 LUX +Madagascar 450 MDG +Malawi 454 MWI +Malaysia 458 MYS +Maldives 462 MDV +Mali 466 MLI +Malta 470 MLT +Marshall Islands 584 MHL +Martinique 474 MTQ +Mauritania 478 MRT +Mauritius 480 MUS +Mayotte 175 MYT +Mexico 484 MEX +Micronesia (Federated States of) 583 FSM +Monaco 492 MCO +Mongolia 496 MNG +Montenegro 499 MNE +Montserrat 500 MSR +Morocco 504 MAR +Mozambique 508 MOZ +Myanmar 104 MMR +Namibia 516 NAM +Nauru 520 NRU +Nepal 524 NPL +Netherlands 528 NLD +New Caledonia 540 NCL +New Zealand 554 NZL +Nicaragua 558 NIC +Niger 562 NER +Nigeria 566 NGA +Niue 570 NIU +Norfolk Island 574 NFK +North Macedonia 807 MKD +Northern Mariana Islands 580 MNP +Norway 578 NOR +Oman 512 OMN +Pakistan 586 PAK +Palau 585 PLW +Panama 591 PAN +Papua New Guinea 598 PNG +Paraguay 600 PRY +Peru 604 PER +Philippines 608 PHL +Pitcairn 612 PCN +Poland 616 POL +Portugal 620 PRT +Puerto Rico 630 PRI +Qatar 634 QAT +Republic of Korea 410 KOR +Republic of Moldova 498 MDA +Réunion 638 REU +Romania 642 ROU +Russian Federation 643 RUS +Rwanda 646 RWA +Saint Barthélemy 652 BLM +Saint Helena 654 SHN +Saint Kitts and Nevis 659 KNA +Saint Lucia 662 LCA +Saint Martin (French Part) 663 MAF +Saint Pierre and Miquelon 666 SPM +Saint Vincent and the Grenadines 670 VCT +Samoa 882 WSM +San Marino 674 SMR +Sao Tome and Principe 678 STP +Sark 680 +Saudi Arabia 682 SAU +Senegal 686 SEN +Serbia 688 SRB +Seychelles 690 SYC +Sierra Leone 694 SLE +Singapore 702 SGP +Sint Maarten (Dutch part) 534 SXM +Slovakia 703 SVK +Slovenia 705 SVN +Solomon Islands 090 SLB +Somalia 706 SOM +South Africa 710 ZAF +South Georgia and the South Sandwich Islands 239 SGS +South Sudan 728 SSD +Spain 724 ESP +Sri Lanka 144 LKA +State of Palestine 275 PSE +Sudan 729 SDN +Suriname 740 SUR +Svalbard and Jan Mayen Islands 744 SJM +Sweden 752 SWE +Switzerland 756 CHE +Syrian Arab Republic 760 SYR +Tajikistan 762 TJK +Thailand 764 THA +Timor-Leste 626 TLS +Togo 768 TGO +Tokelau 772 TKL +Tonga 776 TON +Trinidad and Tobago 780 TTO +Tunisia 788 TUN +Türkiye 792 TUR +Turkmenistan 795 TKM +Turks and Caicos Islands 796 TCA +Tuvalu 798 TUV +Uganda 800 UGA +Ukraine 804 UKR +United Arab Emirates 784 ARE +United Kingdom of Great Britain and Northern Ireland 826 GBR +United Republic of Tanzania 834 TZA +United States Minor Outlying Islands 581 UMI +United States of America 840 USA +United States Virgin Islands 850 VIR +Uruguay 858 URY +Uzbekistan 860 UZB +Vanuatu 548 VUT +Venezuela (Bolivarian Republic of) 862 VEN +Viet Nam 704 VNM +Wallis and Futuna Islands 876 WLF +Western Sahara 732 ESH +Yemen 887 YEM +Zambia 894 ZMB +Zimbabwe 716 ZWE diff --git a/scripts/un/sdg/sdmx/preprocess.py b/scripts/un/sdg/sdmx/preprocess.py new file mode 100644 index 0000000000..ff4f3067bf --- /dev/null +++ b/scripts/un/sdg/sdmx/preprocess.py @@ -0,0 +1,99 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
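Stepping back to the `m49.tsv` table above: `process.py` (later in this patch) loads it into an M49 -> ISO-alpha3 map, keeping only rows that have an ISO code (i.e. countries). A minimal sketch of that lookup, assuming it is run from the `sdmx/` folder:

```python
import csv

# Build the M49 -> ISO-alpha3 map the same way process.py does (countries only).
with open('m49.tsv') as f:
    places = {
        int(row['M49 code']): row['ISO-alpha3 code']
        for row in csv.DictReader(f, delimiter='\t')
        if row['ISO-alpha3 code']
    }

print(places[840])  # USA
print(places[356])  # IND
print(4 in places)  # True: 'Afghanistan' has M49 code 004
```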
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Downloads data from UN Stats API to be used in further processing. + +Produces: +* input/ directory containing csv files for each series +* preprocessed/attributes.csv: metadata about attributes +* preprocessed/dimensions.csv: metadata about dimensions +* output/series.mcf: MCF for each series +Note: Downloading all the data is very slow and prone to crashes. +This script ideally shouldn't need to be run again. +Usage: python3 preprocess.py +''' +import csv +import os +import requests + +from util import * + +API_PREFIX = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/' +HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Accept': 'application/octet-stream' +} + + +def add_concepts(code, concept, concept_set): + '''Adds concepts from given series code to concept_set. + Args: + code: Series code. + concept: Type of concept ('Attributes' | 'Dimensions'). + concept_set: Current set of concepts. + ''' + response = requests.get(f'{API_PREFIX}{code}/{concept}').json() + for entry in response: + for c in entry['codes']: + concept_set.add( + (entry['id'], c['code'], c['description'], c['sdmx'])) + + +def write_concepts(file, concept_set): + '''Writes concepts from concept_set to file. + Args: + path: File path to write to. + concept_set: Current set of concepts. + ''' + with open(file, 'w') as f: + writer = csv.writer(f) + for row in sorted(concept_set): + writer.writerow(list(row)) + + +if __name__ == '__main__': + if not os.path.exists('input'): + os.makedirs('input') + if not os.path.exists('preprocessed'): + os.makedirs('preprocessed') + if not os.path.exists('output'): + os.makedirs('output') + + series = requests.get(f'{API_PREFIX}List?allreleases=false').json() + codes = {s['code']: s['description'] for s in series} + + attributes = set() + dimensions = set() + with open('output/series.mcf', 'w') as f_series: + for code in sorted(codes): + print(code) + data = {'seriesCodes': code} + text = requests.post(f'{API_PREFIX}DataCSV', + data=data, + headers=HEADERS).text.rstrip('\x00') + with open(f'input/{code}.csv', 'w') as f_code: + f_code.write(text) + add_concepts(code, 'Attributes', attributes) + add_concepts(code, 'Dimensions', dimensions) + f_series.write( + SERIES_TEMPLATE.format_map({ + 'dcid': 'SDG_' + code, + 'description': format_description(codes[code]) + })) + + write_concepts('preprocessed/attributes.csv', attributes) + write_concepts('preprocessed/dimensions.csv', dimensions) diff --git a/scripts/un/sdg/sdmx/process.py b/scripts/un/sdg/sdmx/process.py new file mode 100644 index 0000000000..5004ccc44a --- /dev/null +++ b/scripts/un/sdg/sdmx/process.py @@ -0,0 +1,298 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
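Stepping back to `preprocess.py` above: this is roughly how it renders one entry of `output/series.mcf`, using `SERIES_TEMPLATE` and `format_description` from `util.py`. The series code and description here are illustrative, and the snippet assumes it is run from the `sdmx/` folder so that `util.py` is importable.

```python
from util import SERIES_TEMPLATE, format_description

code = 'SI_POV_DAY1'  # illustrative series code
description = 'Proportion of population below international poverty line (%)'

print(SERIES_TEMPLATE.format_map({
    'dcid': 'SDG_' + code,
    'description': format_description(description),
}))
# Expected output:
#
# Node: dcid:SDG_SI_POV_DAY1
# name: "Proportion of population below international poverty line"
# typeOf: dcs:SDG_Series
```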
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Produces CSV/TMCF + schema for UN Stats data. + +Produces: +* output/output.csv: cleaned CSV +* output/measurement_method.csv: measurement methods +* output/schema.mcf: properties and classes +* output/sv.mcf: statistical variables +* output/unit.mcf: units +Usage: python3 preprocess.py +''' +import collections +import csv +import os +import sys + +from util import * + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) + +module_dir_ = os.path.dirname(__file__) + +# Create map of M49 -> ISO-alpha3 for countries. +with open(os.path.join(module_dir_, 'm49.tsv')) as f: + PLACES = {} + reader = csv.DictReader(f, delimiter='\t') + for row in reader: + if not row['ISO-alpha3 code']: # Only countries for now. + continue + PLACES[int(row['M49 code'])] = row['ISO-alpha3 code'] + +# Create map of name -> dcid for supported cities. +with open(os.path.join(module_dir_, 'preprocessed/cities.csv')) as f: + reader = csv.DictReader(f) + CITIES = {row['name']: row['dcid'] for row in reader} + + +def write_templates(file, templates): + '''Write templates to file. + Args: + file: Input file path. + templates: Template strings. + ''' + with open(file, 'w') as f: + for template in sorted(templates): + f.write(template) + + +def add_concepts(file, concepts): + '''Adds concepts from file. + Args: + file: Input file path. + concepts: Dictionary of concepts: concept -> code -> (name, formatted code). + ''' + with open(file) as f: + reader = csv.reader(f) + for row in reader: + + # Skip totals (as indicated by SDMX). + if row[3] == '_T': + continue + concepts[row[0]][row[1]] = (row[2], make_value(row[1])) + + +def get_observation_about(country_code, country_name, city): + '''Returns dcid for place. + Args: + country_code: M49 for country. + country_name: Name of country. + city: Name of city. + Returns: + Dcid of place if found, else empty string. + ''' + if city: + formatted_city = city.replace('_', ' ').title() + ', ' + country_name + if formatted_city in CITIES and CITIES[formatted_city]: + return 'dcs:' + CITIES[formatted_city] + else: + return '' + if country_code in PLACES: + return 'dcs:country/' + PLACES[country_code] + else: + return '' + + +def get_variable_measured(row, properties, concepts): + '''Returns templated string for variable_measured. + Args: + row: Input csv dict row. + properties: List of properties for row. + concepts: Dictionary of concepts. + Returns: + Templated string. 
+ ''' + value_ids = [] + value_descriptions = [] + cprops = '' + for i in properties: + field = i[1:-1] + if not row[i] or field not in concepts or row[i] not in concepts[field]: + continue + value_ids.append(concepts[field][row[i]][1]) + value_descriptions.append(concepts[field][row[i]][0]) + enum = make_property(field) + if field in MAPPED_CONCEPTS: + prop = MAPPED_CONCEPTS[field] + else: + prop = 'sdg_' + enum[0].lower() + enum[1:] + val = enum + 'Enum_' + value_ids[-1] + cprops += f'\n{prop}: dcs:SDG_{val}' + sv = 'sdg/' + '_'.join([row['SeriesCode']] + value_ids) + pvs = ', '.join(value_descriptions) + description = format_description(row['SeriesDescription']) + if pvs: + description += ': ' + pvs + template = SV_TEMPLATE.format_map({ + 'dcid': sv, + 'popType': 'SDG_' + row['SeriesCode'], + 'name': '"' + description + '"', + 'cprops': cprops + }) + return template + + +def get_measurement_method(row, concepts): + '''Returns templated string for measurement_method. + Args: + row: Input csv dict row. + concepts: Dictionary of concepts. + Returns: + Templated string. + ''' + mmethod = '' + description = [] + for concept in [ + '[Nature]', '[Observation Status]', '[Report Ordinal]', + '[Reporting Type]' + ]: + field = concept[1:-1] + if concept in row: + mmethod += '_' + row[concept] + if field in concepts and row[concept] in concepts[field]: + description.append(concepts[field][row[concept]][0]) + if not mmethod: + return '' + mmethod = 'SDG' + mmethod + description = 'SDG Measurement Method: ' + ', '.join( + description) if description else '' + template = MMETHOD_TEMPLATE.format_map({ + 'dcid': mmethod, + 'description': description + }) + return template + + +def get_unit(row): + '''Returns templated string for unit. + Args: + row: Input csv dict row. + Returns: + Templated string. + ''' + if not '[Units]' in row: + return '' + unit = row['[Units]'].replace('^', '') + template = UNIT_TEMPLATE.format_map({ + 'dcid': unit, + 'name': format_unit_name(unit) + }) + return template + + +def write_schema(file, concepts): + '''Writes schema from concepts to file. + Args: + file: Input file path. + concepts: Dictionary of concepts. + ''' + with open(file, 'w') as f: + for concept in sorted(concepts): + if concept in SKIPPED_CONCEPTS: + continue + prop = make_property(concept) + enum = prop + 'Enum' + if concept not in MAPPED_CONCEPTS: + f.write( + PROPERTY_TEMPLATE.format_map({ + 'dcid': prop[0].lower() + prop[1:], + 'name': concept, + 'enum': enum + })) + f.write(ENUM_TEMPLATE.format_map({'enum': enum})) + for k in sorted(concepts[concept]): + v = concepts[concept][k] + f.write( + VALUE_TEMPLATE.format_map({ + 'dcid': v[1], + 'enum': enum, + 'name': v[0][0].upper() + v[0][1:], + })) + + +def process_input_file(file, writer, concepts, svs, measurement_methods, units): + '''Processes one input file and write csv rows. + Args: + file: Input file path. + writer: Csv DictWriter object. + concepts: Dictionary of concepts. + svs: Set of statistical variables. + measurement_methods: Set of measurement methods. + units: Set of units. 
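To make the SDMX -> MCF mapping concrete, here is an illustrative trace of how `process.py` derives the place and statistical-variable identifiers for a single row. The series code, dimension codes, and M49 code are placeholders chosen for the example.

```python
# get_observation_about(): M49 code 356 maps to 'IND' in m49.tsv, so the
# observation is attached to the Data Commons country node.
observation_about = 'dcs:country/' + 'IND'

# get_variable_measured(): the SV dcid joins the series code with the
# make_value()-formatted codes of the non-skipped dimensions/attributes,
# processed in sorted column order (here [Age] before [Sex]).
series_code = 'SE_ADT_LITRATE'          # hypothetical series code
value_ids = ['15-24', 'F']              # hypothetical [Age] and [Sex] codes
sv_dcid = 'sdg/' + '_'.join([series_code] + value_ids)

print(observation_about)  # dcs:country/IND
print(sv_dcid)            # sdg/SE_ADT_LITRATE_15-24_F
```

The SV node written to `output/sv.mcf` additionally carries these codes as constraint properties, using existing properties such as `age` and `gender` when they appear in `MAPPED_CONCEPTS` and provisional `sdg_`-prefixed properties otherwise.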
+ ''' + print(f'Starting {file}') + with open(file) as f_in: + reader = csv.DictReader(f_in) + properties = sorted([ + field for field in reader.fieldnames + if field[0] == '[' and field[1:-1] not in SKIPPED_CONCEPTS + ]) + try: + for row in reader: + if not int(row['GeoAreaCode']) in PLACES: + continue + if not is_float(row['Value']) or row['Value'] == 'NaN' or row[ + 'Value'] == 'Nan': + continue + observation_about = get_observation_about( + int(row['GeoAreaCode']), row['GeoAreaName'], + row['[Cities]'] if '[Cities]' in reader.fieldnames else '') + if not observation_about: + continue + sv = get_variable_measured(row, properties, concepts) + svs.add(sv) + measurement_method = get_measurement_method(row, concepts) + if measurement_method: + measurement_methods.add(measurement_method) + unit = get_unit(row) + if unit: + units.add(unit) + writer.writerow({ + 'variable_measured': + 'dcid:' + get_dcid(sv), + 'observation_about': + observation_about, + 'observation_date': + row['TimePeriod'], + 'value': + row['Value'], + 'measurement_method': + 'dcs:' + get_dcid(measurement_method) + if measurement_method else '', + 'unit': + 'dcs:' + get_dcid(unit) if unit else '', + 'scaling_factor': + row['[UnitMultiplier]'] + if '[UnitMultiplier]' in reader.fieldnames else '', + }) + except: + print(f'Finished processing {file}') + + +if __name__ == '__main__': + concepts = collections.defaultdict(dict) + add_concepts('preprocessed/attributes.csv', concepts) + add_concepts('preprocessed/dimensions.csv', concepts) + write_schema('output/schema.mcf', concepts) + + svs = set() + measurement_methods = set() + units = set() + with open('output/output.csv', 'w') as f: + writer = csv.DictWriter(f, fieldnames=FIELDNAMES) + writer.writeheader() + for file in sorted(os.listdir('input')): + process_input_file(os.path.join('input', file), writer, concepts, + svs, measurement_methods, units) + + write_templates('output/measurement_method.mcf', measurement_methods) + write_templates('output/sv.mcf', svs) + write_templates('output/unit.mcf', units) diff --git a/scripts/un/sdg/sdmx/util.py b/scripts/un/sdg/sdmx/util.py new file mode 100644 index 0000000000..061dc14974 --- /dev/null +++ b/scripts/un/sdg/sdmx/util.py @@ -0,0 +1,197 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Shared util functions and constants. 
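For orientation, a single row of `output/output.csv` written by `process_input_file` is keyed by the `FIELDNAMES` list defined just below in `util.py`; with the illustrative identifiers from the earlier trace, such a row might look like the following (every value is hypothetical):

```python
# Hypothetical cleaned-CSV row, matching util.FIELDNAMES.
row = {
    'variable_measured': 'dcid:sdg/SE_ADT_LITRATE_15-24_F',
    'observation_about': 'dcs:country/IND',
    'observation_date': '2020',           # from TimePeriod
    'value': '91.7',                      # from Value
    'measurement_method': 'dcs:SDG_C_G',  # hypothetical Nature/Reporting Type codes
    'unit': 'dcs:PERCENT',                # from [Units], with '^' stripped
    'scaling_factor': '',                 # from [UnitMultiplier], when present
}
```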
+''' +import re + +FIELDNAMES = [ + 'variable_measured', 'observation_about', 'observation_date', 'value', + 'measurement_method', 'unit', 'scaling_factor' +] + +DCID_PREFIX = 'Node: dcid:' +TOTAL = '_T' + +SERIES_TEMPLATE = ''' +Node: dcid:{dcid} +name: "{description}" +typeOf: dcs:SDG_Series +''' +PROPERTY_TEMPLATE = ''' +Node: dcid:sdg_{dcid} +typeOf: schema:Property +domainIncludes: dcs:Thing +rangeIncludes: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +ENUM_TEMPLATE = ''' +Node: dcid:SDG_{enum} +typeOf: schema:Class +subClassOf: schema:Enumeration +name: "{enum}" +isProvisional: dcs:True +''' +VALUE_TEMPLATE = ''' +Node: dcid:SDG_{enum}_{dcid} +typeOf: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +SV_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: {name} +populationType: dcs:{popType} +statType: dcs:measuredValue{cprops} +''' +MMETHOD_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:SDG_MeasurementMethodEnum +name: "{dcid}" +description: "{description}" +''' +UNIT_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:UnitOfMeasure +name: "{name}" +description: "SDG Unit: {dcid}" +''' + +# Select concepts will be modeled differently. +SKIPPED_CONCEPTS = { + 'Cities', 'Freq', 'Nature', 'Observation Status', 'Report Ordinal', + 'Reporting Type', 'UnitMultiplier', 'Units' +} + +# Use existing properties when they exist. +# TODO: Also map enums to existing nodes. +MAPPED_CONCEPTS = { + 'Age': 'age', + 'Cause of death': 'causeOfDeath', + 'Disability status': 'disabilityStatus', + 'Education level': 'educationalAttainment', + 'Sex': 'gender', + 'AGE': 'age', + 'CAUSE_OF_DEATH': 'causeOfDeath', + 'DISABILITY_STATUS': 'disabilityStatus', + 'EDUCATION_LEVEL': 'educationalAttainment', + 'SEX': 'gender' +} + +FORMATTED_UNITS = { + 'INDEX': 'idx', + 'NUM_M': '#m', + 'NUMBER': '#', + 'PERCENT': '%', + 'PH': 'pH', + 'TONNES': 't', + 'TONNES_M': 'Metric Tonnes' +} + + +def format_description(s): + '''Formats input with curated style. + Args: + s: Input string. + Returns: + Curated string. + ''' + # Remove <=2 levels of (). + formatted = re.sub('\((?:[^)(]|\([^)(]*\))*\)', '', s) + # Remove <=2 levels of []. + formatted = re.sub('\[(?:[^)(]|\[[^)(]*\])*\]', '', formatted) + # Remove attributes indicated with 'by'. + formatted = formatted.split(', by')[0] + # Remove references indicated by 'million USD'. + formatted = formatted.split(', million USD')[0] + # Remove extra spaces + formatted = formatted.replace(' , ', ', ').replace(' ', ' ').strip() + # Remove trailing commas + if formatted[-1] == ',': + formatted = formatted[:-1] + # Replace 100,000 with 100K + formatted = formatted.replace('100,000', '100K') + # Make ascii + return formatted.replace('Â', + '').replace('’', '\'').replace('₂', '2').replace( + '\xa0', ' ').replace('−', '-') + + +def is_float(element): + '''Checks if value can be interpreted as float. + Args: + element: Input. + Returns: + Whether the value can be cast as a float. + ''' + if element is None: + return False + try: + float(element) + return True + except ValueError: + return False + + +def make_property(s): + '''Formats property string. + Args: + s: Input string. + Returns: + Formatted string. + ''' + return s.title().replace(' ', '').replace('-', + '').replace('_', + '').replace('/', '') + + +def make_value(s): + '''Formats value string. + Args: + s: Input string. + Returns: + Formatted string. 
+    '''
+    return s.replace('<=', 'LEQ').replace('<', 'LT').replace('+', 'GEQ').replace(
+        ' ', '').replace('_', '')
+
+
+def format_unit_name(dcid):
+    '''Formats unit name string.
+    Args:
+        dcid: Input dcid.
+    Returns:
+        Formatted string.
+    '''
+    if dcid in FORMATTED_UNITS:
+        return FORMATTED_UNITS[dcid]
+    return dcid.lower().replace('_', ' ').replace('1000000', '1M').replace(
+        '100000', '100K').replace('10000', '10K')
+
+
+def get_dcid(template):
+    '''Gets dcid from template.
+    Args:
+        template: Input templated string.
+    Returns:
+        Dcid.
+    '''
+    return template.split(DCID_PREFIX)[1].split('\n')[0]
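To illustrate the helpers in `util.py`, a few worked examples with illustrative inputs (assumes the snippet is run from the `sdmx/` folder so that `util.py` is importable; the expected results follow from the string manipulations above):

```python
from util import (SERIES_TEMPLATE, format_description, format_unit_name,
                  get_dcid, make_property, make_value)

# Description cleanup: parenthesized qualifiers are stripped.
print(format_description('Maternal mortality ratio (per 100,000 live births)'))
# -> Maternal mortality ratio

# Property / value formatting used when building the schema and SV dcids.
print(make_property('Education level'))  # -> EducationLevel
print(make_value('65+'))                 # -> 65GEQ

# Unit names not in FORMATTED_UNITS are lower-cased and abbreviated.
print(format_unit_name('PER_100000_LIVE_BIRTHS'))  # -> per 100K live births

# get_dcid() recovers the dcid from any rendered MCF template.
mcf = SERIES_TEMPLATE.format_map({
    'dcid': 'SDG_SH_STA_MORT',
    'description': 'Maternal mortality ratio'
})
print(get_dcid(mcf))  # -> SDG_SH_STA_MORT
```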