From b990b2820854d52affdd951458ef57c4b8130168 Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Wed, 8 Nov 2023 08:33:29 -0800 Subject: [PATCH 1/4] add old version of sdg scripts (#921) * add old version of sdg scripts * fix * fix * fix --- scripts/un/sdg/README.md | 16 +- scripts/un/sdg/sdmx/cities.py | 94 ++++++++++ scripts/un/sdg/sdmx/m49.tsv | 250 +++++++++++++++++++++++++ scripts/un/sdg/sdmx/preprocess.py | 99 ++++++++++ scripts/un/sdg/sdmx/process.py | 298 ++++++++++++++++++++++++++++++ scripts/un/sdg/sdmx/util.py | 197 ++++++++++++++++++++ 6 files changed, 952 insertions(+), 2 deletions(-) create mode 100644 scripts/un/sdg/sdmx/cities.py create mode 100644 scripts/un/sdg/sdmx/m49.tsv create mode 100644 scripts/un/sdg/sdmx/preprocess.py create mode 100644 scripts/un/sdg/sdmx/process.py create mode 100644 scripts/un/sdg/sdmx/util.py diff --git a/scripts/un/sdg/README.md b/scripts/un/sdg/README.md index 747f6b9712..f829b4dc84 100644 --- a/scripts/un/sdg/README.md +++ b/scripts/un/sdg/README.md @@ -1,6 +1,6 @@ # UN Stats Sustainable Development Goals -This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. +This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. Please ensure the submodules stay up to date. To generate place mappings: @@ -29,6 +29,7 @@ Produces: * unit.mcf * csv/ folder: * [CODE].csv + (Note that these folders are not included in the repository but can be regenerated by running the script.) 
When refreshing the data, the `geography`, `schema`, and `csv` folders might all get updated and will need to be resubmitted to g3. The corresponding TMCF file is `sdg.tmcf`. @@ -39,4 +40,15 @@ python3 -m unittest discover -v -s ../ -p "*_test.py" ``` Notes: -* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers. \ No newline at end of file +* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers. + +### SDMX + +As reference, we provide an earlier version of the import scripts that utilized the UN API (which uses SDMX) in the `sdmx/` folder. Please note that these scripts may have errors and do not use the most up-to-date schema format, so should only be used as an illustration of the SDMX -> MCF mapping and **should not actually be run**. + +As a quick overview: +* `preprocess.py` downloads all the raw input CSVs to an `input/` folder as well as adds all dimensions and attributes to a `preprocessed/` folder. +* `cities.py` reads the input CSVs and matches cities with dcids. +* `process.py` reads the input CSVs and concepts and generates a cleaned CSV and schema. +* `util.py` has various shared util functions and constants. +* `m49.tsv` has country code mappings. diff --git a/scripts/un/sdg/sdmx/cities.py b/scripts/un/sdg/sdmx/cities.py new file mode 100644 index 0000000000..3c5e0c9f1a --- /dev/null +++ b/scripts/un/sdg/sdmx/cities.py @@ -0,0 +1,94 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Finds dcids for cities in input files. + +Produces: +* preprocessed/cities.csv: dcid for each city name + +Note: For cities where the find entities API did not return a dcid, +we tried manually searching for the dcid and filled these into the file. +There are a few city names that are still missing - these are left blank. +**This script ideally shouldn't need to be run again.** +Usage: python3 cities.py <api_key> +''' +import csv +import requests +import os +import sys + +BATCH = 1 + + +def get_cities(json, api_key): + '''Applies find entities API for given json. + Args: + json: Input json. + api_key: API key. + Returns: + API response. + ''' + return requests.post('https://api.datacommons.org/v1/bulk/find/entities', + headers={ + 'X-API-Key': api_key + }, + json=json).json() + + +def write_cities(file, cities, api_key): + '''Writes city dcids and names to file. + Args: + file: Output file path. + cities: List of city dcids to process. + api_key: API key. 
+ ''' + with open(file, 'w') as f: + writer = csv.DictWriter(f, fieldnames=['name', 'dcid']) + writer.writeheader() + city_list = list(cities.keys()) + for i in range(0, len(city_list), BATCH): + json = { + 'entities': [{ + 'description': city + } for city in city_list[i:i + BATCH]] + } + response = get_cities(json, api_key) + print(response) + try: + for entity in response['entities']: + dcid = entity['dcids'][0] if 'dcids' in entity else '' + writer.writerow({ + 'name': cities[entity['description']], + 'dcid': dcid + }) + except KeyError: + writer.writerow({'name': cities[city_list[i]], 'dcid': ''}) + + +if __name__ == '__main__': + cities = set() + for file in sorted(os.listdir('input')): + code = file.removesuffix('.csv') + with open('input/' + file) as f: + reader = csv.DictReader(f) + if '[Cities]' in reader.fieldnames: + for row in reader: + cities.add(row['[Cities]'].replace('_', ' ').title() + + ', ' + row['GeoAreaName']) + cities = sorted(cities) + + write_cities('preprocessed/cities2.csv', cities, sys.argv[1]) diff --git a/scripts/un/sdg/sdmx/m49.tsv b/scripts/un/sdg/sdmx/m49.tsv new file mode 100644 index 0000000000..a90189880a --- /dev/null +++ b/scripts/un/sdg/sdmx/m49.tsv @@ -0,0 +1,250 @@ +Country or Area M49 code ISO-alpha3 code +Afghanistan 004 AFG +Åland Islands 248 ALA +Albania 008 ALB +Algeria 012 DZA +American Samoa 016 ASM +Andorra 020 AND +Angola 024 AGO +Anguilla 660 AIA +Antarctica 010 ATA +Antigua and Barbuda 028 ATG +Argentina 032 ARG +Armenia 051 ARM +Aruba 533 ABW +Australia 036 AUS +Austria 040 AUT +Azerbaijan 031 AZE +Bahamas 044 BHS +Bahrain 048 BHR +Bangladesh 050 BGD +Barbados 052 BRB +Belarus 112 BLR +Belgium 056 BEL +Belize 084 BLZ +Benin 204 BEN +Bermuda 060 BMU +Bhutan 064 BTN +Bolivia (Plurinational State of) 068 BOL +Bonaire, Sint Eustatius and Saba 535 BES +Bosnia and Herzegovina 070 BIH +Botswana 072 BWA +Bouvet Island 074 BVT +Brazil 076 BRA +British Indian Ocean Territory 086 IOT +British Virgin Islands 092 VGB +Brunei 
Darussalam 096 BRN +Bulgaria 100 BGR +Burkina Faso 854 BFA +Burundi 108 BDI +Cabo Verde 132 CPV +Cambodia 116 KHM +Cameroon 120 CMR +Canada 124 CAN +Cayman Islands 136 CYM +Central African Republic 140 CAF +Chad 148 TCD +Chile 152 CHL +China 156 CHN +China, Hong Kong Special Administrative Region 344 HKG +China, Macao Special Administrative Region 446 MAC +Christmas Island 162 CXR +Cocos (Keeling) Islands 166 CCK +Colombia 170 COL +Comoros 174 COM +Congo 178 COG +Cook Islands 184 COK +Costa Rica 188 CRI +Côte d’Ivoire 384 CIV +Croatia 191 HRV +Cuba 192 CUB +Curaçao 531 CUW +Cyprus 196 CYP +Czechia 203 CZE +Democratic People's Republic of Korea 408 PRK +Democratic Republic of the Congo 180 COD +Denmark 208 DNK +Djibouti 262 DJI +Dominica 212 DMA +Dominican Republic 214 DOM +Ecuador 218 ECU +Egypt 818 EGY +El Salvador 222 SLV +Equatorial Guinea 226 GNQ +Eritrea 232 ERI +Estonia 233 EST +Eswatini 748 SWZ +Ethiopia 231 ETH +Falkland Islands (Malvinas) 238 FLK +Faroe Islands 234 FRO +Fiji 242 FJI +Finland 246 FIN +France 250 FRA +French Guiana 254 GUF +French Polynesia 258 PYF +French Southern Territories 260 ATF +Gabon 266 GAB +Gambia 270 GMB +Georgia 268 GEO +Germany 276 DEU +Ghana 288 GHA +Gibraltar 292 GIB +Greece 300 GRC +Greenland 304 GRL +Grenada 308 GRD +Guadeloupe 312 GLP +Guam 316 GUM +Guatemala 320 GTM +Guernsey 831 GGY +Guinea 324 GIN +Guinea-Bissau 624 GNB +Guyana 328 GUY +Haiti 332 HTI +Heard Island and McDonald Islands 334 HMD +Holy See 336 VAT +Honduras 340 HND +Hungary 348 HUN +Iceland 352 ISL +India 356 IND +Indonesia 360 IDN +Iran (Islamic Republic of) 364 IRN +Iraq 368 IRQ +Ireland 372 IRL +Isle of Man 833 IMN +Israel 376 ISR +Italy 380 ITA +Jamaica 388 JAM +Japan 392 JPN +Jersey 832 JEY +Jordan 400 JOR +Kazakhstan 398 KAZ +Kenya 404 KEN +Kiribati 296 KIR +Kuwait 414 KWT +Kyrgyzstan 417 KGZ +Lao People's Democratic Republic 418 LAO +Latvia 428 LVA +Lebanon 422 LBN +Lesotho 426 LSO +Liberia 430 LBR +Libya 434 LBY +Liechtenstein 438 LIE +Lithuania 440 
LTU +Luxembourg 442 LUX +Madagascar 450 MDG +Malawi 454 MWI +Malaysia 458 MYS +Maldives 462 MDV +Mali 466 MLI +Malta 470 MLT +Marshall Islands 584 MHL +Martinique 474 MTQ +Mauritania 478 MRT +Mauritius 480 MUS +Mayotte 175 MYT +Mexico 484 MEX +Micronesia (Federated States of) 583 FSM +Monaco 492 MCO +Mongolia 496 MNG +Montenegro 499 MNE +Montserrat 500 MSR +Morocco 504 MAR +Mozambique 508 MOZ +Myanmar 104 MMR +Namibia 516 NAM +Nauru 520 NRU +Nepal 524 NPL +Netherlands 528 NLD +New Caledonia 540 NCL +New Zealand 554 NZL +Nicaragua 558 NIC +Niger 562 NER +Nigeria 566 NGA +Niue 570 NIU +Norfolk Island 574 NFK +North Macedonia 807 MKD +Northern Mariana Islands 580 MNP +Norway 578 NOR +Oman 512 OMN +Pakistan 586 PAK +Palau 585 PLW +Panama 591 PAN +Papua New Guinea 598 PNG +Paraguay 600 PRY +Peru 604 PER +Philippines 608 PHL +Pitcairn 612 PCN +Poland 616 POL +Portugal 620 PRT +Puerto Rico 630 PRI +Qatar 634 QAT +Republic of Korea 410 KOR +Republic of Moldova 498 MDA +Réunion 638 REU +Romania 642 ROU +Russian Federation 643 RUS +Rwanda 646 RWA +Saint Barthélemy 652 BLM +Saint Helena 654 SHN +Saint Kitts and Nevis 659 KNA +Saint Lucia 662 LCA +Saint Martin (French Part) 663 MAF +Saint Pierre and Miquelon 666 SPM +Saint Vincent and the Grenadines 670 VCT +Samoa 882 WSM +San Marino 674 SMR +Sao Tome and Principe 678 STP +Sark 680 +Saudi Arabia 682 SAU +Senegal 686 SEN +Serbia 688 SRB +Seychelles 690 SYC +Sierra Leone 694 SLE +Singapore 702 SGP +Sint Maarten (Dutch part) 534 SXM +Slovakia 703 SVK +Slovenia 705 SVN +Solomon Islands 090 SLB +Somalia 706 SOM +South Africa 710 ZAF +South Georgia and the South Sandwich Islands 239 SGS +South Sudan 728 SSD +Spain 724 ESP +Sri Lanka 144 LKA +State of Palestine 275 PSE +Sudan 729 SDN +Suriname 740 SUR +Svalbard and Jan Mayen Islands 744 SJM +Sweden 752 SWE +Switzerland 756 CHE +Syrian Arab Republic 760 SYR +Tajikistan 762 TJK +Thailand 764 THA +Timor-Leste 626 TLS +Togo 768 TGO +Tokelau 772 TKL +Tonga 776 TON +Trinidad and Tobago 780 
TTO +Tunisia 788 TUN +Türkiye 792 TUR +Turkmenistan 795 TKM +Turks and Caicos Islands 796 TCA +Tuvalu 798 TUV +Uganda 800 UGA +Ukraine 804 UKR +United Arab Emirates 784 ARE +United Kingdom of Great Britain and Northern Ireland 826 GBR +United Republic of Tanzania 834 TZA +United States Minor Outlying Islands 581 UMI +United States of America 840 USA +United States Virgin Islands 850 VIR +Uruguay 858 URY +Uzbekistan 860 UZB +Vanuatu 548 VUT +Venezuela (Bolivarian Republic of) 862 VEN +Viet Nam 704 VNM +Wallis and Futuna Islands 876 WLF +Western Sahara 732 ESH +Yemen 887 YEM +Zambia 894 ZMB +Zimbabwe 716 ZWE diff --git a/scripts/un/sdg/sdmx/preprocess.py b/scripts/un/sdg/sdmx/preprocess.py new file mode 100644 index 0000000000..ff4f3067bf --- /dev/null +++ b/scripts/un/sdg/sdmx/preprocess.py @@ -0,0 +1,99 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Downloads data from UN Stats API to be used in further processing. + +Produces: +* input/ directory containing csv files for each series +* preprocessed/attributes.csv: metadata about attributes +* preprocessed/dimensions.csv: metadata about dimensions +* output/series.mcf: MCF for each series +Note: Downloading all the data is very slow and prone to crashes. +This script ideally shouldn't need to be run again. 
+Usage: python3 preprocess.py +''' +import csv +import os +import requests + +from util import * + +API_PREFIX = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/' +HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Accept': 'application/octet-stream' +} + + +def add_concepts(code, concept, concept_set): + '''Adds concepts from given series code to concept_set. + Args: + code: Series code. + concept: Type of concept ('Attributes' | 'Dimensions'). + concept_set: Current set of concepts. + ''' + response = requests.get(f'{API_PREFIX}{code}/{concept}').json() + for entry in response: + for c in entry['codes']: + concept_set.add( + (entry['id'], c['code'], c['description'], c['sdmx'])) + + +def write_concepts(file, concept_set): + '''Writes concepts from concept_set to file. + Args: + path: File path to write to. + concept_set: Current set of concepts. + ''' + with open(file, 'w') as f: + writer = csv.writer(f) + for row in sorted(concept_set): + writer.writerow(list(row)) + + +if __name__ == '__main__': + if not os.path.exists('input'): + os.makedirs('input') + if not os.path.exists('preprocessed'): + os.makedirs('preprocessed') + if not os.path.exists('output'): + os.makedirs('output') + + series = requests.get(f'{API_PREFIX}List?allreleases=false').json() + codes = {s['code']: s['description'] for s in series} + + attributes = set() + dimensions = set() + with open('output/series.mcf', 'w') as f_series: + for code in sorted(codes): + print(code) + data = {'seriesCodes': code} + text = requests.post(f'{API_PREFIX}DataCSV', + data=data, + headers=HEADERS).text.rstrip('\x00') + with open(f'input/{code}.csv', 'w') as f_code: + f_code.write(text) + add_concepts(code, 'Attributes', attributes) + add_concepts(code, 'Dimensions', dimensions) + f_series.write( + SERIES_TEMPLATE.format_map({ + 'dcid': 'SDG_' + code, + 'description': format_description(codes[code]) + })) + + write_concepts('preprocessed/attributes.csv', attributes) + 
write_concepts('preprocessed/dimensions.csv', dimensions) diff --git a/scripts/un/sdg/sdmx/process.py b/scripts/un/sdg/sdmx/process.py new file mode 100644 index 0000000000..5004ccc44a --- /dev/null +++ b/scripts/un/sdg/sdmx/process.py @@ -0,0 +1,298 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Produces CSV/TMCF + schema for UN Stats data. + +Produces: +* output/output.csv: cleaned CSV +* output/measurement_method.mcf: measurement methods +* output/schema.mcf: properties and classes +* output/sv.mcf: statistical variables +* output/unit.mcf: units +Usage: python3 process.py +''' +import collections +import csv +import os +import sys + +from util import * + +sys.path.append( +    os.path.dirname(os.path.dirname(os.path.dirname( +        os.path.abspath(__file__))))) + +module_dir_ = os.path.dirname(__file__) + +# Create map of M49 -> ISO-alpha3 for countries. +with open(os.path.join(module_dir_, 'm49.tsv')) as f: +    PLACES = {} +    reader = csv.DictReader(f, delimiter='\t') +    for row in reader: +        if not row['ISO-alpha3 code']:  # Only countries for now. +            continue +        PLACES[int(row['M49 code'])] = row['ISO-alpha3 code'] + +# Create map of name -> dcid for supported cities. 
+with open(os.path.join(module_dir_, 'preprocessed/cities.csv')) as f: + reader = csv.DictReader(f) + CITIES = {row['name']: row['dcid'] for row in reader} + + +def write_templates(file, templates): + '''Write templates to file. + Args: + file: Input file path. + templates: Template strings. + ''' + with open(file, 'w') as f: + for template in sorted(templates): + f.write(template) + + +def add_concepts(file, concepts): + '''Adds concepts from file. + Args: + file: Input file path. + concepts: Dictionary of concepts: concept -> code -> (name, formatted code). + ''' + with open(file) as f: + reader = csv.reader(f) + for row in reader: + + # Skip totals (as indicated by SDMX). + if row[3] == '_T': + continue + concepts[row[0]][row[1]] = (row[2], make_value(row[1])) + + +def get_observation_about(country_code, country_name, city): + '''Returns dcid for place. + Args: + country_code: M49 for country. + country_name: Name of country. + city: Name of city. + Returns: + Dcid of place if found, else empty string. + ''' + if city: + formatted_city = city.replace('_', ' ').title() + ', ' + country_name + if formatted_city in CITIES and CITIES[formatted_city]: + return 'dcs:' + CITIES[formatted_city] + else: + return '' + if country_code in PLACES: + return 'dcs:country/' + PLACES[country_code] + else: + return '' + + +def get_variable_measured(row, properties, concepts): + '''Returns templated string for variable_measured. + Args: + row: Input csv dict row. + properties: List of properties for row. + concepts: Dictionary of concepts. + Returns: + Templated string. 
+ ''' + value_ids = [] + value_descriptions = [] + cprops = '' + for i in properties: + field = i[1:-1] + if not row[i] or field not in concepts or row[i] not in concepts[field]: + continue + value_ids.append(concepts[field][row[i]][1]) + value_descriptions.append(concepts[field][row[i]][0]) + enum = make_property(field) + if field in MAPPED_CONCEPTS: + prop = MAPPED_CONCEPTS[field] + else: + prop = 'sdg_' + enum[0].lower() + enum[1:] + val = enum + 'Enum_' + value_ids[-1] + cprops += f'\n{prop}: dcs:SDG_{val}' + sv = 'sdg/' + '_'.join([row['SeriesCode']] + value_ids) + pvs = ', '.join(value_descriptions) + description = format_description(row['SeriesDescription']) + if pvs: + description += ': ' + pvs + template = SV_TEMPLATE.format_map({ + 'dcid': sv, + 'popType': 'SDG_' + row['SeriesCode'], + 'name': '"' + description + '"', + 'cprops': cprops + }) + return template + + +def get_measurement_method(row, concepts): + '''Returns templated string for measurement_method. + Args: + row: Input csv dict row. + concepts: Dictionary of concepts. + Returns: + Templated string. + ''' + mmethod = '' + description = [] + for concept in [ + '[Nature]', '[Observation Status]', '[Report Ordinal]', + '[Reporting Type]' + ]: + field = concept[1:-1] + if concept in row: + mmethod += '_' + row[concept] + if field in concepts and row[concept] in concepts[field]: + description.append(concepts[field][row[concept]][0]) + if not mmethod: + return '' + mmethod = 'SDG' + mmethod + description = 'SDG Measurement Method: ' + ', '.join( + description) if description else '' + template = MMETHOD_TEMPLATE.format_map({ + 'dcid': mmethod, + 'description': description + }) + return template + + +def get_unit(row): + '''Returns templated string for unit. + Args: + row: Input csv dict row. + Returns: + Templated string. 
+ ''' + if not '[Units]' in row: + return '' + unit = row['[Units]'].replace('^', '') + template = UNIT_TEMPLATE.format_map({ + 'dcid': unit, + 'name': format_unit_name(unit) + }) + return template + + +def write_schema(file, concepts): + '''Writes schema from concepts to file. + Args: + file: Input file path. + concepts: Dictionary of concepts. + ''' + with open(file, 'w') as f: + for concept in sorted(concepts): + if concept in SKIPPED_CONCEPTS: + continue + prop = make_property(concept) + enum = prop + 'Enum' + if concept not in MAPPED_CONCEPTS: + f.write( + PROPERTY_TEMPLATE.format_map({ + 'dcid': prop[0].lower() + prop[1:], + 'name': concept, + 'enum': enum + })) + f.write(ENUM_TEMPLATE.format_map({'enum': enum})) + for k in sorted(concepts[concept]): + v = concepts[concept][k] + f.write( + VALUE_TEMPLATE.format_map({ + 'dcid': v[1], + 'enum': enum, + 'name': v[0][0].upper() + v[0][1:], + })) + + +def process_input_file(file, writer, concepts, svs, measurement_methods, units): + '''Processes one input file and write csv rows. + Args: + file: Input file path. + writer: Csv DictWriter object. + concepts: Dictionary of concepts. + svs: Set of statistical variables. + measurement_methods: Set of measurement methods. + units: Set of units. 
+ ''' + print(f'Starting {file}') + with open(file) as f_in: + reader = csv.DictReader(f_in) + properties = sorted([ + field for field in reader.fieldnames + if field[0] == '[' and field[1:-1] not in SKIPPED_CONCEPTS + ]) + try: + for row in reader: + if not int(row['GeoAreaCode']) in PLACES: + continue + if not is_float(row['Value']) or row['Value'] == 'NaN' or row[ + 'Value'] == 'Nan': + continue + observation_about = get_observation_about( + int(row['GeoAreaCode']), row['GeoAreaName'], + row['[Cities]'] if '[Cities]' in reader.fieldnames else '') + if not observation_about: + continue + sv = get_variable_measured(row, properties, concepts) + svs.add(sv) + measurement_method = get_measurement_method(row, concepts) + if measurement_method: + measurement_methods.add(measurement_method) + unit = get_unit(row) + if unit: + units.add(unit) + writer.writerow({ + 'variable_measured': + 'dcid:' + get_dcid(sv), + 'observation_about': + observation_about, + 'observation_date': + row['TimePeriod'], + 'value': + row['Value'], + 'measurement_method': + 'dcs:' + get_dcid(measurement_method) + if measurement_method else '', + 'unit': + 'dcs:' + get_dcid(unit) if unit else '', + 'scaling_factor': + row['[UnitMultiplier]'] + if '[UnitMultiplier]' in reader.fieldnames else '', + }) + except: + print(f'Finished processing {file}') + + +if __name__ == '__main__': + concepts = collections.defaultdict(dict) + add_concepts('preprocessed/attributes.csv', concepts) + add_concepts('preprocessed/dimensions.csv', concepts) + write_schema('output/schema.mcf', concepts) + + svs = set() + measurement_methods = set() + units = set() + with open('output/output.csv', 'w') as f: + writer = csv.DictWriter(f, fieldnames=FIELDNAMES) + writer.writeheader() + for file in sorted(os.listdir('input')): + process_input_file(os.path.join('input', file), writer, concepts, + svs, measurement_methods, units) + + write_templates('output/measurement_method.mcf', measurement_methods) + 
write_templates('output/sv.mcf', svs) + write_templates('output/unit.mcf', units) diff --git a/scripts/un/sdg/sdmx/util.py b/scripts/un/sdg/sdmx/util.py new file mode 100644 index 0000000000..061dc14974 --- /dev/null +++ b/scripts/un/sdg/sdmx/util.py @@ -0,0 +1,197 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Shared util functions and constants. 
+''' +import re + +FIELDNAMES = [ + 'variable_measured', 'observation_about', 'observation_date', 'value', + 'measurement_method', 'unit', 'scaling_factor' +] + +DCID_PREFIX = 'Node: dcid:' +TOTAL = '_T' + +SERIES_TEMPLATE = ''' +Node: dcid:{dcid} +name: "{description}" +typeOf: dcs:SDG_Series +''' +PROPERTY_TEMPLATE = ''' +Node: dcid:sdg_{dcid} +typeOf: schema:Property +domainIncludes: dcs:Thing +rangeIncludes: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +ENUM_TEMPLATE = ''' +Node: dcid:SDG_{enum} +typeOf: schema:Class +subClassOf: schema:Enumeration +name: "{enum}" +isProvisional: dcs:True +''' +VALUE_TEMPLATE = ''' +Node: dcid:SDG_{enum}_{dcid} +typeOf: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +SV_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: {name} +populationType: dcs:{popType} +statType: dcs:measuredValue{cprops} +''' +MMETHOD_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:SDG_MeasurementMethodEnum +name: "{dcid}" +description: "{description}" +''' +UNIT_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:UnitOfMeasure +name: "{name}" +description: "SDG Unit: {dcid}" +''' + +# Select concepts will be modeled differently. +SKIPPED_CONCEPTS = { + 'Cities', 'Freq', 'Nature', 'Observation Status', 'Report Ordinal', + 'Reporting Type', 'UnitMultiplier', 'Units' +} + +# Use existing properties when they exist. +# TODO: Also map enums to existing nodes. 
+MAPPED_CONCEPTS = { + 'Age': 'age', + 'Cause of death': 'causeOfDeath', + 'Disability status': 'disabilityStatus', + 'Education level': 'educationalAttainment', + 'Sex': 'gender', + 'AGE': 'age', + 'CAUSE_OF_DEATH': 'causeOfDeath', + 'DISABILITY_STATUS': 'disabilityStatus', + 'EDUCATION_LEVEL': 'educationalAttainment', + 'SEX': 'gender' +} + +FORMATTED_UNITS = { + 'INDEX': 'idx', + 'NUM_M': '#m', + 'NUMBER': '#', + 'PERCENT': '%', + 'PH': 'pH', + 'TONNES': 't', + 'TONNES_M': 'Metric Tonnes' +} + + +def format_description(s): + '''Formats input with curated style. + Args: + s: Input string. + Returns: + Curated string. + ''' + # Remove <=2 levels of (). + formatted = re.sub('\((?:[^)(]|\([^)(]*\))*\)', '', s) + # Remove <=2 levels of []. + formatted = re.sub('\[(?:[^)(]|\[[^)(]*\])*\]', '', formatted) + # Remove attributes indicated with 'by'. + formatted = formatted.split(', by')[0] + # Remove references indicated by 'million USD'. + formatted = formatted.split(', million USD')[0] + # Remove extra spaces + formatted = formatted.replace(' , ', ', ').replace(' ', ' ').strip() + # Remove trailing commas + if formatted[-1] == ',': + formatted = formatted[:-1] + # Replace 100,000 with 100K + formatted = formatted.replace('100,000', '100K') + # Make ascii + return formatted.replace('Â', + '').replace('’', '\'').replace('₂', '2').replace( + '\xa0', ' ').replace('−', '-') + + +def is_float(element): + '''Checks if value can be interpreted as float. + Args: + element: Input. + Returns: + Whether the value can be cast as a float. + ''' + if element is None: + return False + try: + float(element) + return True + except ValueError: + return False + + +def make_property(s): + '''Formats property string. + Args: + s: Input string. + Returns: + Formatted string. + ''' + return s.title().replace(' ', '').replace('-', + '').replace('_', + '').replace('/', '') + + +def make_value(s): + '''Formats value string. + Args: + s: Input string. + Returns: + Formatted string. 
+ ''' + return s.replace('<=', 'LEQ').replace('<', + 'LT').replace('+', 'GEQ').replace( + ' ', '').replace('_', '') + + +def format_unit_name(dcid): + '''Formats unit name stirng. + Args: + dcid: Input dcid. + Retuns: + Formatted string. + ''' + if dcid in FORMATTED_UNITS: + return FORMATTED_UNITS[dcid] + return dcid.lower().replace('_', ' ').replace('1000000', '1M').replace( + '100000', '100K').replace('10000', '10k') + + +def get_dcid(template): + '''Gets dcid from template. + Args: + template: Input templated string. + Returns: + Dcid. + ''' + return template.split(DCID_PREFIX)[1].split('\n')[0] From a251463aa00f195d432cee08aa5b950c89f8761b Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:10:30 -0800 Subject: [PATCH 2/4] update sdg readme (#922) * add old version of sdg scripts * fix * fix * fix * address comments * fix * update sdg readme --- scripts/un/sdg/README.md | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/un/sdg/README.md b/scripts/un/sdg/README.md index f829b4dc84..d2d4f24722 100644 --- a/scripts/un/sdg/README.md +++ b/scripts/un/sdg/README.md @@ -2,8 +2,23 @@ This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. Please ensure the submodules stay up to date. 
+## One-time Setup -To generate place mappings: +Initialize submodules: +``` +git submodule update --init --remote sdg-dataset +git submodule update --init --remote sssom-mappings +``` + +## Data Refresh + +Update submodules: +``` +git submodule update --remote sdg-dataset +git submodule update --remote sssom-mappings +``` + +Generate place mappings: ``` python3 geography.py ``` @@ -15,7 +30,7 @@ Produces: Note that the `place_mappings.csv` is required before running the `process.py` script. -To process data and generate artifacts: +Process data and generate artifacts: ``` python3 process.py ``` @@ -42,7 +57,7 @@ python3 -m unittest discover -v -s ../ -p "*_test.py" Notes: * We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers. -### SDMX +## SDMX As reference, we provide an earlier version of the import scripts that utilized the UN API (which uses SDMX) in the `sdmx/` folder. Please note that these scripts may have errors and do not use the most up-to-date schema format, so should only be used as an illustration of the SDMX -> MCF mapping and **should not actually be run**. From f34efbf27c9b0090ec9d4e69ceafa717a7032c84 Mon Sep 17 00:00:00 2001 From: Luis Gonzalez Date: Thu, 9 Nov 2023 12:28:00 -0500 Subject: [PATCH 3/4] fix utf encoding on geography.py (#923) --- scripts/un/sdg/geography.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/un/sdg/geography.py b/scripts/un/sdg/geography.py index ce551bc0e9..e8d6706724 100644 --- a/scripts/un/sdg/geography.py +++ b/scripts/un/sdg/geography.py @@ -97,7 +97,7 @@ def get_sdg2type(file): Map of SDG code -> SDG type. ''' sdg2type = {} - with open(file) as f: + with open(file, encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: sdg2type[row['GEOGRAPHY_CODE']] = row['GEOGRAPHY_TYPE'] @@ -140,7 +140,7 @@ def get_un2dc_curated(file): Map of UN code -> curated Node. 
''' un2dc_curated = {} - with open(file) as f: + with open(file, encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: @@ -204,8 +204,8 @@ def write_un_places(input_geos, output, sdg2type, un2sdg, un2dc_curated): ''' un2dc_generated = {} new_subjects = [] - with open(input_geos) as f_in: - with open(output, 'w') as f_out: + with open(input_geos, encoding='utf-8') as f_in: + with open(output, 'w', encoding='utf-8') as f_out: reader = csv.DictReader(f_in) for row in reader: subject = row['subject_id'] @@ -288,7 +288,7 @@ def write_un_containment(output, containment, new_subjects): new_subjects: List of Nodes for new places. ''' - with open(output, 'w') as f: + with open(output, 'w', encoding='utf-8') as f: for s in sorted(containment): c = '' for o in containment[s]: @@ -323,7 +323,7 @@ def write_place_mappings(output, sdg2un, un2dc_curated, un2dc_generated): un2dc_curated: Map of UN code -> curated Node. un2dc_generated: Map of UN code -> generated Node. ''' - with open(output, 'w') as f: + with open(output, 'w', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=['sdg', 'dcid']) writer.writeheader() for code in sorted(sdg2un): From 1a2fd8185aa15615adbf63af255f31fc1da01d07 Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:55:06 -0800 Subject: [PATCH 4/4] add scripts for HUD_IncomeLimits import (#924) * add scripts for HUD_IncomeLimits import * fix * fix * comments * fix * fix --- scripts/us_hud/__init__.py | 0 scripts/us_hud/income/README.md | 18 ++ scripts/us_hud/income/__init__.py | 0 scripts/us_hud/income/match_bq.csv | 189 ++++++++++++++++++ scripts/us_hud/income/process.py | 132 ++++++++++++ scripts/us_hud/income/process_test.py | 55 +++++ scripts/us_hud/income/testdata/__init__.py | 0 .../income/testdata/expected_output_2006.csv | 2 + .../us_hud/income/testdata/output_2006.csv | 2 + .../income/testdata/test_input_2006.csv | 2 + 10 files changed, 400 insertions(+) create mode 
100644 scripts/us_hud/__init__.py create mode 100644 scripts/us_hud/income/README.md create mode 100644 scripts/us_hud/income/__init__.py create mode 100644 scripts/us_hud/income/match_bq.csv create mode 100644 scripts/us_hud/income/process.py create mode 100644 scripts/us_hud/income/process_test.py create mode 100644 scripts/us_hud/income/testdata/__init__.py create mode 100644 scripts/us_hud/income/testdata/expected_output_2006.csv create mode 100644 scripts/us_hud/income/testdata/output_2006.csv create mode 100644 scripts/us_hud/income/testdata/test_input_2006.csv diff --git a/scripts/us_hud/__init__.py b/scripts/us_hud/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/README.md b/scripts/us_hud/income/README.md new file mode 100644 index 0000000000..57522f078c --- /dev/null +++ b/scripts/us_hud/income/README.md @@ -0,0 +1,18 @@ +# Income Limits + +This import includes median income for households of different sizes for the 80th and 150th (computed) percentiles from the [HUD Income Limits dataset](https://www.huduser.gov/portal/datasets/il.html). + +To generate artifacts: + +``` +python3 process.py +``` + +This will produce a folder `csv/` with cleaned CSVs `output_[YEAR].csv`. + +The `match_bq.csv` file contains places that have additional dcids that we would like to generate stats for. 
+ +To run unit tests: +``` +python3 -m unittest discover -v -s ../ -p "*_test.py" +``` diff --git a/scripts/us_hud/income/__init__.py b/scripts/us_hud/income/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/match_bq.csv b/scripts/us_hud/income/match_bq.csv new file mode 100644 index 0000000000..20e0286b24 --- /dev/null +++ b/scripts/us_hud/income/match_bq.csv @@ -0,0 +1,189 @@ +fips,city +geoId/02110,geoId/0236400 +geoId/02220,geoId/0270540 +geoId/02275,geoId/0286380 +geoId/0900108070,geoId/0908000 +geoId/0900118500,geoId/0918430 +geoId/0900156060,geoId/0955990 +geoId/0900168170,geoId/0968100 +geoId/0900173070,geoId/0973000 +geoId/0900174190,geoId/0974260 +geoId/0900308490,geoId/0908420 +geoId/0900322630,geoId/0922700 +geoId/0900337070,geoId/0937000 +geoId/0900350440,geoId/0950370 +geoId/0900382590,geoId/0982660 +geoId/0900576570,geoId/0976500 +geoId/0900747360,geoId/0947290 +geoId/0900901220,geoId/0901150 +geoId/0900919550,geoId/0919480 +geoId/0900946520,geoId/0946450 +geoId/0900947535,geoId/0947515 +geoId/0900949950,geoId/0949880 +geoId/0900952070,geoId/0952000 +geoId/0900980070,geoId/0980000 +geoId/0900982870,geoId/0982800 +geoId/0901152350,geoId/0952280 +geoId/0901156270,geoId/0956200 +geoId/2300102060,geoId/2302060 +geoId/2300138740,geoId/2338740 +geoId/2300310565,geoId/2310565 +geoId/2300360825,geoId/2360825 +geoId/2300560545,geoId/2360545 +geoId/2300571990,geoId/2371990 +geoId/2300582105,geoId/2382105 +geoId/2300923200,geoId/2323200 +geoId/2301102100,geoId/2302100 +geoId/2301127085,geoId/2327085 +geoId/2301130550,geoId/2330550 +geoId/2301180740,geoId/2380740 +geoId/2301363590,geoId/2363590 +geoId/2301902795,geoId/2302795 +geoId/2301906925,geoId/2306925 +geoId/2301955225,geoId/2355225 +geoId/2302303355,geoId/2303355 +geoId/2302703950,geoId/2303950 +geoId/2302909585,geoId/2309585 +geoId/2302921730,geoId/2321730 +geoId/2303104860,geoId/2304860 +geoId/2303164675,geoId/2364675 +geoId/2303165725,geoId/2365725 
+geoId/24510,geoId/2404000 +geoId/2500346225,geoId/2546225 +geoId/2500353960,geoId/2553960 +geoId/2500502690,geoId/2502690 +geoId/2500523000,geoId/2523000 +geoId/2500545000,geoId/2545000 +geoId/2500562430,geoId/2562465 +geoId/2500569170,geoId/2569170 +geoId/2500905595,geoId/2505595 +geoId/2500916250,geoId/2516285 +geoId/2500926150,geoId/2526150 +geoId/2500929405,geoId/2529405 +geoId/2500934550,geoId/2534550 +geoId/2500937490,geoId/2537490 +geoId/2500938400,geoId/2538435 +geoId/2500943580,geoId/2543615 +geoId/2500945245,geoId/2545245 +geoId/2500952490,geoId/2552490 +geoId/2500959105,geoId/2559105 +geoId/2500960015,geoId/2560050 +geoId/2500968645,geoId/2568680 +geoId/2501313660,geoId/2513660 +geoId/2501330840,geoId/2530840 +geoId/2501336300,geoId/2536335 +geoId/2501352144,geoId/2552144 +geoId/2501367000,geoId/2567000 +geoId/2501376030,geoId/2576030 +geoId/2501546330,geoId/2546330 +geoId/2501701605,geoId/2501640 +geoId/2501705070,geoId/2505105 +geoId/2501709840,geoId/2509875 +geoId/2501711000,geoId/2511000 +geoId/2501721990,geoId/2521990 +geoId/2501724960,geoId/2524960 +geoId/2501735215,geoId/2535250 +geoId/2501737000,geoId/2537000 +geoId/2501737875,geoId/2537875 +geoId/2501738715,geoId/2538715 +geoId/2501739625,geoId/2539660 +geoId/2501739835,geoId/2539835 +geoId/2501740115,geoId/2540115 +geoId/2501745560,geoId/2545560 +geoId/2501756130,geoId/2556165 +geoId/2501762535,geoId/2562535 +geoId/2501767665,geoId/2567700 +geoId/2501772215,geoId/2572250 +geoId/2501772600,geoId/2572600 +geoId/2501780510,geoId/2580545 +geoId/2501781035,geoId/2581035 +geoId/2502109175,geoId/2509210 +geoId/2502130455,geoId/2530420 +geoId/2502141690,geoId/2541725 +geoId/2502144105,geoId/2544140 +geoId/2502150250,geoId/2550285 +geoId/2502155745,geoId/2555745 +geoId/2502155955,geoId/2555990 +geoId/2502174175,geoId/2574210 +geoId/2502178972,geoId/2578972 +geoId/2502300170,geoId/2500135 +geoId/2502309000,geoId/2509000 +geoId/2502331645,geoId/2531680 +geoId/2502507000,geoId/2507000 
+geoId/2502513205,geoId/2513205 +geoId/2502556585,geoId/2556585 +geoId/2502581005,geoId/2581005 +geoId/2502723875,geoId/2523875 +geoId/2502725485,geoId/2525485 +geoId/2502735075,geoId/2535075 +geoId/2502763345,geoId/2563345 +geoId/2502782000,geoId/2582000 +geoId/29510,geoId/2965000 +geoId/32510,geoId/3209700 +geoId/3300140180,geoId/3340180 +geoId/3300539300,geoId/3339300 +geoId/3300705140,geoId/3305140 +geoId/3300941300,geoId/3341300 +geoId/3301145140,geoId/3345140 +geoId/3301150260,geoId/3350260 +geoId/3301314200,geoId/3314200 +geoId/3301327380,geoId/3327380 +geoId/3301562900,geoId/3362900 +geoId/3301718820,geoId/3318820 +geoId/3301765140,geoId/3365140 +geoId/3301769940,geoId/3369940 +geoId/3301912900,geoId/3312900 +geoId/4400374300,geoId/4474300 +geoId/4400549960,geoId/4449960 +geoId/4400714140,geoId/4414140 +geoId/4400719180,geoId/4419180 +geoId/4400722960,geoId/4422960 +geoId/4400754640,geoId/4454640 +geoId/4400759000,geoId/4459000 +geoId/4400780780,geoId/4480780 +geoId/5000174650,geoId/5074650 +geoId/5000710675,geoId/5010675 +geoId/5000766175,geoId/5066175 +geoId/5000785150,geoId/5085150 +geoId/5001161675,geoId/5061675 +geoId/5001948850,geoId/5048850 +geoId/5002161225,geoId/5061225 +geoId/5002303175,geoId/5003175 +geoId/5002346000,geoId/5046000 +geoId/51510,geoId/5101000 +geoId/51520,geoId/5109816 +geoId/51530,geoId/5111032 +geoId/51550,geoId/5116000 +geoId/51570,geoId/5118448 +geoId/51580,geoId/5119728 +geoId/51590,geoId/5121344 +geoId/51595,geoId/5125808 +geoId/51600,geoId/5126496 +geoId/51610,geoId/5127200 +geoId/51620,geoId/5129600 +geoId/51630,geoId/5129744 +geoId/51640,geoId/5130208 +geoId/51650,geoId/5135000 +geoId/51660,geoId/5135624 +geoId/51670,geoId/5138424 +geoId/51678,geoId/5145512 +geoId/51680,geoId/5147672 +geoId/51683,geoId/5148952 +geoId/51685,geoId/5148968 +geoId/51690,geoId/5149784 +geoId/51700,geoId/5156000 +geoId/51710,geoId/5157000 +geoId/51720,geoId/5157688 +geoId/51730,geoId/5161832 +geoId/51735,geoId/5163768 +geoId/51740,geoId/5164000 
+geoId/51750,geoId/5165392 +geoId/51760,geoId/5167000 +geoId/51770,geoId/5168000 +geoId/51775,geoId/5170000 +geoId/51790,geoId/5175216 +geoId/51800,geoId/5176432 +geoId/51810,geoId/5182000 +geoId/51820,geoId/5183680 +geoId/51830,geoId/5186160 +geoId/51840,geoId/5186720 diff --git a/scripts/us_hud/income/process.py b/scripts/us_hud/income/process.py new file mode 100644 index 0000000000..fb9fc767b9 --- /dev/null +++ b/scripts/us_hud/income/process.py @@ -0,0 +1,132 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Generates cleaned CSVs for HUD Income Limits data. + +Produces: +* csv/output_[YEAR].csv + +Usage: +python3 process.py +''' +import csv +import datetime +import os +import pandas as pd +from absl import app +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_string('income_output_dir', 'csv', 'Path to write cleaned CSVs.') + +URL_PREFIX = 'https://www.huduser.gov/portal/datasets/il/il' + + +def get_url(year): + '''Return xls url for year. + + Args: + year: Input year. + + Returns: + xls url for given year. 
+ ''' + if year < 2006: + return '' + suffix = str(year)[-2:] + if year >= 2016: + return f'{URL_PREFIX}{suffix}/Section8-FY{suffix}.xlsx' + elif year == 2015: + return f'{URL_PREFIX}15/Section8_Rev.xlsx' + elif year == 2014: + return f'{URL_PREFIX}14/Poverty.xls' + elif year == 2011: + return f'{URL_PREFIX}11/Section8_v3.xls' + elif year >= 2009: + return f'{URL_PREFIX}{suffix}/Section8.xls' + elif year == 2008: + return f'{URL_PREFIX}08/Section8_FY08.xls' + elif year == 2007: + return f'{URL_PREFIX}07/Section8-rev.xls' + elif year == 2006: + return f'{URL_PREFIX}06/Section8FY2006.xls' + else: + return '' + + +def compute_150(df, person): + '''Compute 150th percentile income in-place. + + Args: + df: Input dataframe (will be modified). + person: Number of people in household. + ''' + df[f'l150_{person}'] = df.apply( + lambda x: round(x[f'l80_{person}'] / 80 * 150), axis=1) + + +def process(year, matches, output_dir): + '''Generate cleaned CSV. + + Args: + year: Input year. + matches: Map of fips dcid -> city dcid. + output_dir: Directory to write cleaned CSV. + ''' + url = get_url(year) + try: + df = pd.read_excel(url) + except: + print(f'No file found for {url}.') + return + if 'fips2010' in df: + df = df.rename(columns={'fips2010': 'fips'}) + + # Filter to 80th percentile income stats for each household size. + df = df.loc[:, [ + 'fips', 'l80_1', 'l80_2', 'l80_3', 'l80_4', 'l80_5', 'l80_6', 'l80_7', + 'l80_8' + ]] + + df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10), + axis=1) + df['fips'] = df.apply(lambda x: x['fips'][:-5] + if x['fips'][-5:] == '99999' else x['fips'], + axis=1) + for i in range(1, 9): + compute_150(df, i) + df['year'] = [year for i in range(len(df))] + + # Add stats for matching dcids. 
+ df_match = df.copy().loc[df['fips'].isin(matches)] + if not df_match.empty: + df_match['fips'] = df_match.apply(lambda x: matches[x['fips']], axis=1) + df = pd.concat([df, df_match]) + + df.to_csv(os.path.join(output_dir, f'output_{year}.csv'), index=False) + + +def main(argv): + with open('match_bq.csv') as f: + reader = csv.DictReader(f) + matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader} + if not os.path.exists(FLAGS.income_output_dir): + os.makedirs(FLAGS.income_output_dir) + today = datetime.date.today() + for year in range(2006, today.year): + print(year) + process(year, matches, FLAGS.income_output_dir) + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/us_hud/income/process_test.py b/scripts/us_hud/income/process_test.py new file mode 100644 index 0000000000..6a2e68f13d --- /dev/null +++ b/scripts/us_hud/income/process_test.py @@ -0,0 +1,55 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Tests for process.py. 
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Tests for process.py.

Usage: python3 -m unittest discover -v -s ../ -p "process_test.py"
'''
import os
import pandas as pd
import sys
import unittest
from unittest.mock import patch

# Make the us_hud package importable when tests are run from scripts/.
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))))
from us_hud.income import process

module_dir_ = os.path.dirname(__file__)

TEST_DIR = os.path.join(module_dir_, 'testdata')


class ProcessTest(unittest.TestCase):

    def test_get_url(self):
        # Modern years use the Section8-FY[YY].xlsx naming; years before
        # 2006 have no published data and yield ''.
        self.assertEqual(
            process.get_url(2022),
            'https://www.huduser.gov/portal/datasets/il/il22/Section8-FY22.xlsx'
        )
        self.assertEqual(process.get_url(1997), '')

    def test_compute_150(self):
        # 150% limits are derived from the 80% limit: round(l80 / 80 * 150).
        # Values match the first row of expected_output_2006.csv.
        df = pd.DataFrame({'l80_1': [31300, 35750]})
        process.compute_150(df, 1)
        self.assertEqual(list(df['l150_1']), [58688, 67031])

    @patch('pandas.read_excel')
    def test_process(self, mock_df):
        # Replace the remote download with the local test fixture.
        mock_df.return_value = pd.DataFrame(
            pd.read_csv(os.path.join(TEST_DIR, 'test_input_2006.csv')))
        matches = {'dcs:geoId/02110': 'dcs:geoId/0236400'}
        process.process(2006, matches, TEST_DIR)
        with open(os.path.join(TEST_DIR, 'output_2006.csv'),
                  encoding='utf-8') as result:
            with open(os.path.join(TEST_DIR, 'expected_output_2006.csv'),
                      encoding='utf-8') as expected:
                self.assertEqual(result.read(), expected.read())
b/scripts/us_hud/income/testdata/output_2006.csv @@ -0,0 +1,2 @@ +fips,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,l150_1,l150_2,l150_3,l150_4,l150_5,l150_6,l150_7,l150_8,year +dcs:geoId/01001,31300,35750,40250,44700,48300,51850,55450,59000,58688,67031,75469,83812,90562,97219,103969,110625,2006 diff --git a/scripts/us_hud/income/testdata/test_input_2006.csv b/scripts/us_hud/income/testdata/test_input_2006.csv new file mode 100644 index 0000000000..5eb7f16ff1 --- /dev/null +++ b/scripts/us_hud/income/testdata/test_input_2006.csv @@ -0,0 +1,2 @@ +State_Alpha,fips,State,County_Town_Name,County,Metro_Area_Name,CBSASub,County_Name,median1999,median2006,State_Name,l50_1,l50_2,l50_3,l50_4,l50_5,l50_6,l50_7,l50_8,msa,l30_1,l30_2,l30_3,l30_4,l30_5,l30_6,l30_7,l30_8,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,metro +AL,100199999,1,Autauga County,1,"Montgomery, AL MSA",METRO33860M33860,Autauga County,45182,55900,Alabama,19550,22350,25150,27950,30200,32400,34650,36900,5240,11750,13400,15100,16750,18100,19450,20750,22100,31300,35750,40250,44700,48300,51850,55450,59000,1