From b990b2820854d52affdd951458ef57c4b8130168 Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Wed, 8 Nov 2023 08:33:29 -0800 Subject: [PATCH 1/4] add old version of sdg scripts (#921) * add old version of sdg scripts * fix * fix * fix --- scripts/un/sdg/README.md | 16 +- scripts/un/sdg/sdmx/cities.py | 94 ++++++++++ scripts/un/sdg/sdmx/m49.tsv | 250 +++++++++++++++++++++++++ scripts/un/sdg/sdmx/preprocess.py | 99 ++++++++++ scripts/un/sdg/sdmx/process.py | 298 ++++++++++++++++++++++++++++++ scripts/un/sdg/sdmx/util.py | 197 ++++++++++++++++++++ 6 files changed, 952 insertions(+), 2 deletions(-) create mode 100644 scripts/un/sdg/sdmx/cities.py create mode 100644 scripts/un/sdg/sdmx/m49.tsv create mode 100644 scripts/un/sdg/sdmx/preprocess.py create mode 100644 scripts/un/sdg/sdmx/process.py create mode 100644 scripts/un/sdg/sdmx/util.py diff --git a/scripts/un/sdg/README.md b/scripts/un/sdg/README.md index 747f6b9712..f829b4dc84 100644 --- a/scripts/un/sdg/README.md +++ b/scripts/un/sdg/README.md @@ -1,6 +1,6 @@ # UN Stats Sustainable Development Goals -This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. +This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. Please ensure the submodules stay up to date. To generate place mappings: @@ -29,6 +29,7 @@ Produces: * unit.mcf * csv/ folder: * [CODE].csv + (Note that these folders are not included in the repository but can be regenerated by running the script.) 
When refreshing the data, the `geography`, `schema`, and `csv` folders might all get updated and will need to be resubmitted to g3. The corresponding TMCF file is `sdg.tmcf`. @@ -39,4 +40,15 @@ python3 -m unittest discover -v -s ../ -p "*_test.py" ``` Notes: -* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers. \ No newline at end of file +* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers. + +### SDMX + +As reference, we provide an earlier version of the import scripts that utilized the UN API (which uses SDMX) in the `sdmx/` folder. Please note that these scripts may have errors and do not use the most up-to-date schema format, so should only be used as an illustration of the SDMX -> MCF mapping and **should not actually be run**. + +As a quick overview: +* `preprocess.py` downloads all the raw input CSVs to an `input/` folder as well as adds all dimensions and attributes to a `preprocessed/` folder. +* `cities.py` reads the input CSVs and matches cities with dcids. +* `process.py` reads the input CSVs and concepts and generates a cleaned CSV and schema. +* `util.py` has various shared util functions and constants. +* `m49.tsv` has country code mappings. diff --git a/scripts/un/sdg/sdmx/cities.py b/scripts/un/sdg/sdmx/cities.py new file mode 100644 index 0000000000..3c5e0c9f1a --- /dev/null +++ b/scripts/un/sdg/sdmx/cities.py @@ -0,0 +1,94 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Finds dcids for cities in input files. + +Produces: +* preprocessed/cities.csv: dcid for each city name + +Note: For cities where the find entities API did not return a dcid, +we tried manually searching for the dcid and filled these into the file. +There are a few city names that are still missing - these are left blank. +**This script ideally shouldn't need to be run again.** +Usage: python3 cities.py <api_key> +''' +import csv +import requests +import os +import sys + +BATCH = 1 + + +def get_cities(json, api_key): + '''Applies find entities API for given json. + Args: + json: Input json. + api_key: API key. + Returns: + API response. + ''' + return requests.post('https://api.datacommons.org/v1/bulk/find/entities', + headers={ + 'X-API-Key': api_key + }, + json=json).json() + + +def write_cities(file, cities, api_key): + '''Writes city dcids and names to file. + Args: + file: Output file path. + cities: List of city dcids to process. + api_key: API key. 
+ ''' + with open(file, 'w') as f: + writer = csv.DictWriter(f, fieldnames=['name', 'dcid']) + writer.writeheader() + city_list = list(cities.keys()) + for i in range(0, len(city_list), BATCH): + json = { + 'entities': [{ + 'description': city + } for city in city_list[i:i + BATCH]] + } + response = get_cities(json, api_key) + print(response) + try: + for entity in response['entities']: + dcid = entity['dcids'][0] if 'dcids' in entity else '' + writer.writerow({ + 'name': cities[entity['description']], + 'dcid': dcid + }) + except KeyError: + writer.writerow({'name': cities[city_list[i]], 'dcid': ''}) + + +if __name__ == '__main__': + cities = set() + for file in sorted(os.listdir('input')): + code = file.removesuffix('.csv') + with open('input/' + file) as f: + reader = csv.DictReader(f) + if '[Cities]' in reader.fieldnames: + for row in reader: + cities.add(row['[Cities]'].replace('_', ' ').title() + + ', ' + row['GeoAreaName']) + cities = sorted(cities) + + write_cities('preprocessed/cities2.csv', cities, sys.argv[1]) diff --git a/scripts/un/sdg/sdmx/m49.tsv b/scripts/un/sdg/sdmx/m49.tsv new file mode 100644 index 0000000000..a90189880a --- /dev/null +++ b/scripts/un/sdg/sdmx/m49.tsv @@ -0,0 +1,250 @@ +Country or Area M49 code ISO-alpha3 code +Afghanistan 004 AFG +Åland Islands 248 ALA +Albania 008 ALB +Algeria 012 DZA +American Samoa 016 ASM +Andorra 020 AND +Angola 024 AGO +Anguilla 660 AIA +Antarctica 010 ATA +Antigua and Barbuda 028 ATG +Argentina 032 ARG +Armenia 051 ARM +Aruba 533 ABW +Australia 036 AUS +Austria 040 AUT +Azerbaijan 031 AZE +Bahamas 044 BHS +Bahrain 048 BHR +Bangladesh 050 BGD +Barbados 052 BRB +Belarus 112 BLR +Belgium 056 BEL +Belize 084 BLZ +Benin 204 BEN +Bermuda 060 BMU +Bhutan 064 BTN +Bolivia (Plurinational State of) 068 BOL +Bonaire, Sint Eustatius and Saba 535 BES +Bosnia and Herzegovina 070 BIH +Botswana 072 BWA +Bouvet Island 074 BVT +Brazil 076 BRA +British Indian Ocean Territory 086 IOT +British Virgin Islands 092 VGB +Brunei 
Darussalam 096 BRN +Bulgaria 100 BGR +Burkina Faso 854 BFA +Burundi 108 BDI +Cabo Verde 132 CPV +Cambodia 116 KHM +Cameroon 120 CMR +Canada 124 CAN +Cayman Islands 136 CYM +Central African Republic 140 CAF +Chad 148 TCD +Chile 152 CHL +China 156 CHN +China, Hong Kong Special Administrative Region 344 HKG +China, Macao Special Administrative Region 446 MAC +Christmas Island 162 CXR +Cocos (Keeling) Islands 166 CCK +Colombia 170 COL +Comoros 174 COM +Congo 178 COG +Cook Islands 184 COK +Costa Rica 188 CRI +Côte d’Ivoire 384 CIV +Croatia 191 HRV +Cuba 192 CUB +Curaçao 531 CUW +Cyprus 196 CYP +Czechia 203 CZE +Democratic People's Republic of Korea 408 PRK +Democratic Republic of the Congo 180 COD +Denmark 208 DNK +Djibouti 262 DJI +Dominica 212 DMA +Dominican Republic 214 DOM +Ecuador 218 ECU +Egypt 818 EGY +El Salvador 222 SLV +Equatorial Guinea 226 GNQ +Eritrea 232 ERI +Estonia 233 EST +Eswatini 748 SWZ +Ethiopia 231 ETH +Falkland Islands (Malvinas) 238 FLK +Faroe Islands 234 FRO +Fiji 242 FJI +Finland 246 FIN +France 250 FRA +French Guiana 254 GUF +French Polynesia 258 PYF +French Southern Territories 260 ATF +Gabon 266 GAB +Gambia 270 GMB +Georgia 268 GEO +Germany 276 DEU +Ghana 288 GHA +Gibraltar 292 GIB +Greece 300 GRC +Greenland 304 GRL +Grenada 308 GRD +Guadeloupe 312 GLP +Guam 316 GUM +Guatemala 320 GTM +Guernsey 831 GGY +Guinea 324 GIN +Guinea-Bissau 624 GNB +Guyana 328 GUY +Haiti 332 HTI +Heard Island and McDonald Islands 334 HMD +Holy See 336 VAT +Honduras 340 HND +Hungary 348 HUN +Iceland 352 ISL +India 356 IND +Indonesia 360 IDN +Iran (Islamic Republic of) 364 IRN +Iraq 368 IRQ +Ireland 372 IRL +Isle of Man 833 IMN +Israel 376 ISR +Italy 380 ITA +Jamaica 388 JAM +Japan 392 JPN +Jersey 832 JEY +Jordan 400 JOR +Kazakhstan 398 KAZ +Kenya 404 KEN +Kiribati 296 KIR +Kuwait 414 KWT +Kyrgyzstan 417 KGZ +Lao People's Democratic Republic 418 LAO +Latvia 428 LVA +Lebanon 422 LBN +Lesotho 426 LSO +Liberia 430 LBR +Libya 434 LBY +Liechtenstein 438 LIE +Lithuania 440 
LTU +Luxembourg 442 LUX +Madagascar 450 MDG +Malawi 454 MWI +Malaysia 458 MYS +Maldives 462 MDV +Mali 466 MLI +Malta 470 MLT +Marshall Islands 584 MHL +Martinique 474 MTQ +Mauritania 478 MRT +Mauritius 480 MUS +Mayotte 175 MYT +Mexico 484 MEX +Micronesia (Federated States of) 583 FSM +Monaco 492 MCO +Mongolia 496 MNG +Montenegro 499 MNE +Montserrat 500 MSR +Morocco 504 MAR +Mozambique 508 MOZ +Myanmar 104 MMR +Namibia 516 NAM +Nauru 520 NRU +Nepal 524 NPL +Netherlands 528 NLD +New Caledonia 540 NCL +New Zealand 554 NZL +Nicaragua 558 NIC +Niger 562 NER +Nigeria 566 NGA +Niue 570 NIU +Norfolk Island 574 NFK +North Macedonia 807 MKD +Northern Mariana Islands 580 MNP +Norway 578 NOR +Oman 512 OMN +Pakistan 586 PAK +Palau 585 PLW +Panama 591 PAN +Papua New Guinea 598 PNG +Paraguay 600 PRY +Peru 604 PER +Philippines 608 PHL +Pitcairn 612 PCN +Poland 616 POL +Portugal 620 PRT +Puerto Rico 630 PRI +Qatar 634 QAT +Republic of Korea 410 KOR +Republic of Moldova 498 MDA +Réunion 638 REU +Romania 642 ROU +Russian Federation 643 RUS +Rwanda 646 RWA +Saint Barthélemy 652 BLM +Saint Helena 654 SHN +Saint Kitts and Nevis 659 KNA +Saint Lucia 662 LCA +Saint Martin (French Part) 663 MAF +Saint Pierre and Miquelon 666 SPM +Saint Vincent and the Grenadines 670 VCT +Samoa 882 WSM +San Marino 674 SMR +Sao Tome and Principe 678 STP +Sark 680 +Saudi Arabia 682 SAU +Senegal 686 SEN +Serbia 688 SRB +Seychelles 690 SYC +Sierra Leone 694 SLE +Singapore 702 SGP +Sint Maarten (Dutch part) 534 SXM +Slovakia 703 SVK +Slovenia 705 SVN +Solomon Islands 090 SLB +Somalia 706 SOM +South Africa 710 ZAF +South Georgia and the South Sandwich Islands 239 SGS +South Sudan 728 SSD +Spain 724 ESP +Sri Lanka 144 LKA +State of Palestine 275 PSE +Sudan 729 SDN +Suriname 740 SUR +Svalbard and Jan Mayen Islands 744 SJM +Sweden 752 SWE +Switzerland 756 CHE +Syrian Arab Republic 760 SYR +Tajikistan 762 TJK +Thailand 764 THA +Timor-Leste 626 TLS +Togo 768 TGO +Tokelau 772 TKL +Tonga 776 TON +Trinidad and Tobago 780 
TTO +Tunisia 788 TUN +Türkiye 792 TUR +Turkmenistan 795 TKM +Turks and Caicos Islands 796 TCA +Tuvalu 798 TUV +Uganda 800 UGA +Ukraine 804 UKR +United Arab Emirates 784 ARE +United Kingdom of Great Britain and Northern Ireland 826 GBR +United Republic of Tanzania 834 TZA +United States Minor Outlying Islands 581 UMI +United States of America 840 USA +United States Virgin Islands 850 VIR +Uruguay 858 URY +Uzbekistan 860 UZB +Vanuatu 548 VUT +Venezuela (Bolivarian Republic of) 862 VEN +Viet Nam 704 VNM +Wallis and Futuna Islands 876 WLF +Western Sahara 732 ESH +Yemen 887 YEM +Zambia 894 ZMB +Zimbabwe 716 ZWE diff --git a/scripts/un/sdg/sdmx/preprocess.py b/scripts/un/sdg/sdmx/preprocess.py new file mode 100644 index 0000000000..ff4f3067bf --- /dev/null +++ b/scripts/un/sdg/sdmx/preprocess.py @@ -0,0 +1,99 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Downloads data from UN Stats API to be used in further processing. + +Produces: +* input/ directory containing csv files for each series +* preprocessed/attributes.csv: metadata about attributes +* preprocessed/dimensions.csv: metadata about dimensions +* output/series.mcf: MCF for each series +Note: Downloading all the data is very slow and prone to crashes. +This script ideally shouldn't need to be run again. 
+Usage: python3 preprocess.py +''' +import csv +import os +import requests + +from util import * + +API_PREFIX = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/' +HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Accept': 'application/octet-stream' +} + + +def add_concepts(code, concept, concept_set): + '''Adds concepts from given series code to concept_set. + Args: + code: Series code. + concept: Type of concept ('Attributes' | 'Dimensions'). + concept_set: Current set of concepts. + ''' + response = requests.get(f'{API_PREFIX}{code}/{concept}').json() + for entry in response: + for c in entry['codes']: + concept_set.add( + (entry['id'], c['code'], c['description'], c['sdmx'])) + + +def write_concepts(file, concept_set): + '''Writes concepts from concept_set to file. + Args: + path: File path to write to. + concept_set: Current set of concepts. + ''' + with open(file, 'w') as f: + writer = csv.writer(f) + for row in sorted(concept_set): + writer.writerow(list(row)) + + +if __name__ == '__main__': + if not os.path.exists('input'): + os.makedirs('input') + if not os.path.exists('preprocessed'): + os.makedirs('preprocessed') + if not os.path.exists('output'): + os.makedirs('output') + + series = requests.get(f'{API_PREFIX}List?allreleases=false').json() + codes = {s['code']: s['description'] for s in series} + + attributes = set() + dimensions = set() + with open('output/series.mcf', 'w') as f_series: + for code in sorted(codes): + print(code) + data = {'seriesCodes': code} + text = requests.post(f'{API_PREFIX}DataCSV', + data=data, + headers=HEADERS).text.rstrip('\x00') + with open(f'input/{code}.csv', 'w') as f_code: + f_code.write(text) + add_concepts(code, 'Attributes', attributes) + add_concepts(code, 'Dimensions', dimensions) + f_series.write( + SERIES_TEMPLATE.format_map({ + 'dcid': 'SDG_' + code, + 'description': format_description(codes[code]) + })) + + write_concepts('preprocessed/attributes.csv', attributes) + 
write_concepts('preprocessed/dimensions.csv', dimensions) diff --git a/scripts/un/sdg/sdmx/process.py b/scripts/un/sdg/sdmx/process.py new file mode 100644 index 0000000000..5004ccc44a --- /dev/null +++ b/scripts/un/sdg/sdmx/process.py @@ -0,0 +1,298 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Produces CSV/TMCF + schema for UN Stats data. + +Produces: +* output/output.csv: cleaned CSV +* output/measurement_method.mcf: measurement methods +* output/schema.mcf: properties and classes +* output/sv.mcf: statistical variables +* output/unit.mcf: units +Usage: python3 process.py +''' +import collections +import csv +import os +import sys + +from util import * + +sys.path.append( +    os.path.dirname(os.path.dirname(os.path.dirname( +        os.path.abspath(__file__))))) + +module_dir_ = os.path.dirname(__file__) + +# Create map of M49 -> ISO-alpha3 for countries. +with open(os.path.join(module_dir_, 'm49.tsv')) as f: +    PLACES = {} +    reader = csv.DictReader(f, delimiter='\t') +    for row in reader: +        if not row['ISO-alpha3 code']:  # Only countries for now. +            continue +        PLACES[int(row['M49 code'])] = row['ISO-alpha3 code'] + +# Create map of name -> dcid for supported cities. 
+with open(os.path.join(module_dir_, 'preprocessed/cities.csv')) as f: + reader = csv.DictReader(f) + CITIES = {row['name']: row['dcid'] for row in reader} + + +def write_templates(file, templates): + '''Write templates to file. + Args: + file: Input file path. + templates: Template strings. + ''' + with open(file, 'w') as f: + for template in sorted(templates): + f.write(template) + + +def add_concepts(file, concepts): + '''Adds concepts from file. + Args: + file: Input file path. + concepts: Dictionary of concepts: concept -> code -> (name, formatted code). + ''' + with open(file) as f: + reader = csv.reader(f) + for row in reader: + + # Skip totals (as indicated by SDMX). + if row[3] == '_T': + continue + concepts[row[0]][row[1]] = (row[2], make_value(row[1])) + + +def get_observation_about(country_code, country_name, city): + '''Returns dcid for place. + Args: + country_code: M49 for country. + country_name: Name of country. + city: Name of city. + Returns: + Dcid of place if found, else empty string. + ''' + if city: + formatted_city = city.replace('_', ' ').title() + ', ' + country_name + if formatted_city in CITIES and CITIES[formatted_city]: + return 'dcs:' + CITIES[formatted_city] + else: + return '' + if country_code in PLACES: + return 'dcs:country/' + PLACES[country_code] + else: + return '' + + +def get_variable_measured(row, properties, concepts): + '''Returns templated string for variable_measured. + Args: + row: Input csv dict row. + properties: List of properties for row. + concepts: Dictionary of concepts. + Returns: + Templated string. 
+ ''' + value_ids = [] + value_descriptions = [] + cprops = '' + for i in properties: + field = i[1:-1] + if not row[i] or field not in concepts or row[i] not in concepts[field]: + continue + value_ids.append(concepts[field][row[i]][1]) + value_descriptions.append(concepts[field][row[i]][0]) + enum = make_property(field) + if field in MAPPED_CONCEPTS: + prop = MAPPED_CONCEPTS[field] + else: + prop = 'sdg_' + enum[0].lower() + enum[1:] + val = enum + 'Enum_' + value_ids[-1] + cprops += f'\n{prop}: dcs:SDG_{val}' + sv = 'sdg/' + '_'.join([row['SeriesCode']] + value_ids) + pvs = ', '.join(value_descriptions) + description = format_description(row['SeriesDescription']) + if pvs: + description += ': ' + pvs + template = SV_TEMPLATE.format_map({ + 'dcid': sv, + 'popType': 'SDG_' + row['SeriesCode'], + 'name': '"' + description + '"', + 'cprops': cprops + }) + return template + + +def get_measurement_method(row, concepts): + '''Returns templated string for measurement_method. + Args: + row: Input csv dict row. + concepts: Dictionary of concepts. + Returns: + Templated string. + ''' + mmethod = '' + description = [] + for concept in [ + '[Nature]', '[Observation Status]', '[Report Ordinal]', + '[Reporting Type]' + ]: + field = concept[1:-1] + if concept in row: + mmethod += '_' + row[concept] + if field in concepts and row[concept] in concepts[field]: + description.append(concepts[field][row[concept]][0]) + if not mmethod: + return '' + mmethod = 'SDG' + mmethod + description = 'SDG Measurement Method: ' + ', '.join( + description) if description else '' + template = MMETHOD_TEMPLATE.format_map({ + 'dcid': mmethod, + 'description': description + }) + return template + + +def get_unit(row): + '''Returns templated string for unit. + Args: + row: Input csv dict row. + Returns: + Templated string. 
+ ''' + if not '[Units]' in row: + return '' + unit = row['[Units]'].replace('^', '') + template = UNIT_TEMPLATE.format_map({ + 'dcid': unit, + 'name': format_unit_name(unit) + }) + return template + + +def write_schema(file, concepts): + '''Writes schema from concepts to file. + Args: + file: Input file path. + concepts: Dictionary of concepts. + ''' + with open(file, 'w') as f: + for concept in sorted(concepts): + if concept in SKIPPED_CONCEPTS: + continue + prop = make_property(concept) + enum = prop + 'Enum' + if concept not in MAPPED_CONCEPTS: + f.write( + PROPERTY_TEMPLATE.format_map({ + 'dcid': prop[0].lower() + prop[1:], + 'name': concept, + 'enum': enum + })) + f.write(ENUM_TEMPLATE.format_map({'enum': enum})) + for k in sorted(concepts[concept]): + v = concepts[concept][k] + f.write( + VALUE_TEMPLATE.format_map({ + 'dcid': v[1], + 'enum': enum, + 'name': v[0][0].upper() + v[0][1:], + })) + + +def process_input_file(file, writer, concepts, svs, measurement_methods, units): + '''Processes one input file and write csv rows. + Args: + file: Input file path. + writer: Csv DictWriter object. + concepts: Dictionary of concepts. + svs: Set of statistical variables. + measurement_methods: Set of measurement methods. + units: Set of units. 
+ ''' + print(f'Starting {file}') + with open(file) as f_in: + reader = csv.DictReader(f_in) + properties = sorted([ + field for field in reader.fieldnames + if field[0] == '[' and field[1:-1] not in SKIPPED_CONCEPTS + ]) + try: + for row in reader: + if not int(row['GeoAreaCode']) in PLACES: + continue + if not is_float(row['Value']) or row['Value'] == 'NaN' or row[ + 'Value'] == 'Nan': + continue + observation_about = get_observation_about( + int(row['GeoAreaCode']), row['GeoAreaName'], + row['[Cities]'] if '[Cities]' in reader.fieldnames else '') + if not observation_about: + continue + sv = get_variable_measured(row, properties, concepts) + svs.add(sv) + measurement_method = get_measurement_method(row, concepts) + if measurement_method: + measurement_methods.add(measurement_method) + unit = get_unit(row) + if unit: + units.add(unit) + writer.writerow({ + 'variable_measured': + 'dcid:' + get_dcid(sv), + 'observation_about': + observation_about, + 'observation_date': + row['TimePeriod'], + 'value': + row['Value'], + 'measurement_method': + 'dcs:' + get_dcid(measurement_method) + if measurement_method else '', + 'unit': + 'dcs:' + get_dcid(unit) if unit else '', + 'scaling_factor': + row['[UnitMultiplier]'] + if '[UnitMultiplier]' in reader.fieldnames else '', + }) + except: + print(f'Finished processing {file}') + + +if __name__ == '__main__': + concepts = collections.defaultdict(dict) + add_concepts('preprocessed/attributes.csv', concepts) + add_concepts('preprocessed/dimensions.csv', concepts) + write_schema('output/schema.mcf', concepts) + + svs = set() + measurement_methods = set() + units = set() + with open('output/output.csv', 'w') as f: + writer = csv.DictWriter(f, fieldnames=FIELDNAMES) + writer.writeheader() + for file in sorted(os.listdir('input')): + process_input_file(os.path.join('input', file), writer, concepts, + svs, measurement_methods, units) + + write_templates('output/measurement_method.mcf', measurement_methods) + 
write_templates('output/sv.mcf', svs) + write_templates('output/unit.mcf', units) diff --git a/scripts/un/sdg/sdmx/util.py b/scripts/un/sdg/sdmx/util.py new file mode 100644 index 0000000000..061dc14974 --- /dev/null +++ b/scripts/un/sdg/sdmx/util.py @@ -0,0 +1,197 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Shared util functions and constants. 
+''' +import re + +FIELDNAMES = [ + 'variable_measured', 'observation_about', 'observation_date', 'value', + 'measurement_method', 'unit', 'scaling_factor' +] + +DCID_PREFIX = 'Node: dcid:' +TOTAL = '_T' + +SERIES_TEMPLATE = ''' +Node: dcid:{dcid} +name: "{description}" +typeOf: dcs:SDG_Series +''' +PROPERTY_TEMPLATE = ''' +Node: dcid:sdg_{dcid} +typeOf: schema:Property +domainIncludes: dcs:Thing +rangeIncludes: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +ENUM_TEMPLATE = ''' +Node: dcid:SDG_{enum} +typeOf: schema:Class +subClassOf: schema:Enumeration +name: "{enum}" +isProvisional: dcs:True +''' +VALUE_TEMPLATE = ''' +Node: dcid:SDG_{enum}_{dcid} +typeOf: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +SV_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: {name} +populationType: dcs:{popType} +statType: dcs:measuredValue{cprops} +''' +MMETHOD_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:SDG_MeasurementMethodEnum +name: "{dcid}" +description: "{description}" +''' +UNIT_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:UnitOfMeasure +name: "{name}" +description: "SDG Unit: {dcid}" +''' + +# Select concepts will be modeled differently. +SKIPPED_CONCEPTS = { + 'Cities', 'Freq', 'Nature', 'Observation Status', 'Report Ordinal', + 'Reporting Type', 'UnitMultiplier', 'Units' +} + +# Use existing properties when they exist. +# TODO: Also map enums to existing nodes. 
+MAPPED_CONCEPTS = { + 'Age': 'age', + 'Cause of death': 'causeOfDeath', + 'Disability status': 'disabilityStatus', + 'Education level': 'educationalAttainment', + 'Sex': 'gender', + 'AGE': 'age', + 'CAUSE_OF_DEATH': 'causeOfDeath', + 'DISABILITY_STATUS': 'disabilityStatus', + 'EDUCATION_LEVEL': 'educationalAttainment', + 'SEX': 'gender' +} + +FORMATTED_UNITS = { + 'INDEX': 'idx', + 'NUM_M': '#m', + 'NUMBER': '#', + 'PERCENT': '%', + 'PH': 'pH', + 'TONNES': 't', + 'TONNES_M': 'Metric Tonnes' +} + + +def format_description(s): + '''Formats input with curated style. + Args: + s: Input string. + Returns: + Curated string. + ''' + # Remove <=2 levels of (). + formatted = re.sub('\((?:[^)(]|\([^)(]*\))*\)', '', s) + # Remove <=2 levels of []. + formatted = re.sub('\[(?:[^)(]|\[[^)(]*\])*\]', '', formatted) + # Remove attributes indicated with 'by'. + formatted = formatted.split(', by')[0] + # Remove references indicated by 'million USD'. + formatted = formatted.split(', million USD')[0] + # Remove extra spaces + formatted = formatted.replace(' , ', ', ').replace(' ', ' ').strip() + # Remove trailing commas + if formatted[-1] == ',': + formatted = formatted[:-1] + # Replace 100,000 with 100K + formatted = formatted.replace('100,000', '100K') + # Make ascii + return formatted.replace('Â', + '').replace('’', '\'').replace('₂', '2').replace( + '\xa0', ' ').replace('−', '-') + + +def is_float(element): + '''Checks if value can be interpreted as float. + Args: + element: Input. + Returns: + Whether the value can be cast as a float. + ''' + if element is None: + return False + try: + float(element) + return True + except ValueError: + return False + + +def make_property(s): + '''Formats property string. + Args: + s: Input string. + Returns: + Formatted string. + ''' + return s.title().replace(' ', '').replace('-', + '').replace('_', + '').replace('/', '') + + +def make_value(s): + '''Formats value string. + Args: + s: Input string. + Returns: + Formatted string. 
+ ''' + return s.replace('<=', 'LEQ').replace('<', + 'LT').replace('+', 'GEQ').replace( + ' ', '').replace('_', '') + + +def format_unit_name(dcid): + '''Formats unit name stirng. + Args: + dcid: Input dcid. + Retuns: + Formatted string. + ''' + if dcid in FORMATTED_UNITS: + return FORMATTED_UNITS[dcid] + return dcid.lower().replace('_', ' ').replace('1000000', '1M').replace( + '100000', '100K').replace('10000', '10k') + + +def get_dcid(template): + '''Gets dcid from template. + Args: + template: Input templated string. + Returns: + Dcid. + ''' + return template.split(DCID_PREFIX)[1].split('\n')[0] From a251463aa00f195d432cee08aa5b950c89f8761b Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:10:30 -0800 Subject: [PATCH 2/4] update sdg readme (#922) * add old version of sdg scripts * fix * fix * fix * address comments * fix * update sdg readme --- scripts/un/sdg/README.md | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/un/sdg/README.md b/scripts/un/sdg/README.md index f829b4dc84..d2d4f24722 100644 --- a/scripts/un/sdg/README.md +++ b/scripts/un/sdg/README.md @@ -2,8 +2,23 @@ This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. Please ensure the submodules stay up to date. 
+## One-time Setup -To generate place mappings: +Initialize submodules: +``` +git submodule update --init --remote sdg-dataset +git submodule update --init --remote sssom-mappings +``` + +## Data Refresh + +Update submodules: +``` +git submodule update --remote sdg-dataset +git submodule update --remote sssom-mappings +``` + +Generate place mappings: ``` python3 geography.py ``` @@ -15,7 +30,7 @@ Produces: Note that the `place_mappings.csv` is required before running the `process.py` script. -To process data and generate artifacts: +Process data and generate artifacts: ``` python3 process.py ``` @@ -42,7 +57,7 @@ python3 -m unittest discover -v -s ../ -p "*_test.py" Notes: * We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers. -### SDMX +## SDMX As reference, we provide an earlier version of the import scripts that utilized the UN API (which uses SDMX) in the `sdmx/` folder. Please note that these scripts may have errors and do not use the most up-to-date schema format, so should only be used as an illustration of the SDMX -> MCF mapping and **should not actually be run**. From f34efbf27c9b0090ec9d4e69ceafa717a7032c84 Mon Sep 17 00:00:00 2001 From: Luis Gonzalez Date: Thu, 9 Nov 2023 12:28:00 -0500 Subject: [PATCH 3/4] fix utf encoding on geography.py (#923) --- scripts/un/sdg/geography.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/un/sdg/geography.py b/scripts/un/sdg/geography.py index ce551bc0e9..e8d6706724 100644 --- a/scripts/un/sdg/geography.py +++ b/scripts/un/sdg/geography.py @@ -97,7 +97,7 @@ def get_sdg2type(file): Map of SDG code -> SDG type. ''' sdg2type = {} - with open(file) as f: + with open(file, encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: sdg2type[row['GEOGRAPHY_CODE']] = row['GEOGRAPHY_TYPE'] @@ -140,7 +140,7 @@ def get_un2dc_curated(file): Map of UN code -> curated Node. 
''' un2dc_curated = {} - with open(file) as f: + with open(file, encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: @@ -204,8 +204,8 @@ def write_un_places(input_geos, output, sdg2type, un2sdg, un2dc_curated): ''' un2dc_generated = {} new_subjects = [] - with open(input_geos) as f_in: - with open(output, 'w') as f_out: + with open(input_geos, encoding='utf-8') as f_in: + with open(output, 'w', encoding='utf-8') as f_out: reader = csv.DictReader(f_in) for row in reader: subject = row['subject_id'] @@ -288,7 +288,7 @@ def write_un_containment(output, containment, new_subjects): new_subjects: List of Nodes for new places. ''' - with open(output, 'w') as f: + with open(output, 'w', encoding='utf-8') as f: for s in sorted(containment): c = '' for o in containment[s]: @@ -323,7 +323,7 @@ def write_place_mappings(output, sdg2un, un2dc_curated, un2dc_generated): un2dc_curated: Map of UN code -> curated Node. un2dc_generated: Map of UN code -> generated Node. ''' - with open(output, 'w') as f: + with open(output, 'w', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=['sdg', 'dcid']) writer.writeheader() for code in sorted(sdg2un): From 1a2fd8185aa15615adbf63af255f31fc1da01d07 Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:55:06 -0800 Subject: [PATCH 4/4] add scripts for HUD_IncomeLimits import (#924) * add scripts for HUD_IncomeLimits import * fix * fix * comments * fix * fix --- scripts/us_hud/__init__.py | 0 scripts/us_hud/income/README.md | 18 ++ scripts/us_hud/income/__init__.py | 0 scripts/us_hud/income/match_bq.csv | 189 ++++++++++++++++++ scripts/us_hud/income/process.py | 132 ++++++++++++ scripts/us_hud/income/process_test.py | 55 +++++ scripts/us_hud/income/testdata/__init__.py | 0 .../income/testdata/expected_output_2006.csv | 2 + .../us_hud/income/testdata/output_2006.csv | 2 + .../income/testdata/test_input_2006.csv | 2 + 10 files changed, 400 insertions(+) create mode 
100644 scripts/us_hud/__init__.py create mode 100644 scripts/us_hud/income/README.md create mode 100644 scripts/us_hud/income/__init__.py create mode 100644 scripts/us_hud/income/match_bq.csv create mode 100644 scripts/us_hud/income/process.py create mode 100644 scripts/us_hud/income/process_test.py create mode 100644 scripts/us_hud/income/testdata/__init__.py create mode 100644 scripts/us_hud/income/testdata/expected_output_2006.csv create mode 100644 scripts/us_hud/income/testdata/output_2006.csv create mode 100644 scripts/us_hud/income/testdata/test_input_2006.csv diff --git a/scripts/us_hud/__init__.py b/scripts/us_hud/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/README.md b/scripts/us_hud/income/README.md new file mode 100644 index 0000000000..57522f078c --- /dev/null +++ b/scripts/us_hud/income/README.md @@ -0,0 +1,18 @@ +# Income Limits + +This import includes median income for households of different sizes for the 80th and 150th (computed) percentiles from the [HUD Income Limits dataset](https://www.huduser.gov/portal/datasets/il.html). + +To generate artifacts: + +``` +python3 process.py +``` + +This will produce a folder `csv/` with cleaned CSVs `output_[YEAR].csv`. + +The `match_bq.csv` file contains places that have additional dcids that we would like to generate stats for. 
+ +To run unit tests: +``` +python3 -m unittest discover -v -s ../ -p "*_test.py" +``` diff --git a/scripts/us_hud/income/__init__.py b/scripts/us_hud/income/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/match_bq.csv b/scripts/us_hud/income/match_bq.csv new file mode 100644 index 0000000000..20e0286b24 --- /dev/null +++ b/scripts/us_hud/income/match_bq.csv @@ -0,0 +1,189 @@ +fips,city +geoId/02110,geoId/0236400 +geoId/02220,geoId/0270540 +geoId/02275,geoId/0286380 +geoId/0900108070,geoId/0908000 +geoId/0900118500,geoId/0918430 +geoId/0900156060,geoId/0955990 +geoId/0900168170,geoId/0968100 +geoId/0900173070,geoId/0973000 +geoId/0900174190,geoId/0974260 +geoId/0900308490,geoId/0908420 +geoId/0900322630,geoId/0922700 +geoId/0900337070,geoId/0937000 +geoId/0900350440,geoId/0950370 +geoId/0900382590,geoId/0982660 +geoId/0900576570,geoId/0976500 +geoId/0900747360,geoId/0947290 +geoId/0900901220,geoId/0901150 +geoId/0900919550,geoId/0919480 +geoId/0900946520,geoId/0946450 +geoId/0900947535,geoId/0947515 +geoId/0900949950,geoId/0949880 +geoId/0900952070,geoId/0952000 +geoId/0900980070,geoId/0980000 +geoId/0900982870,geoId/0982800 +geoId/0901152350,geoId/0952280 +geoId/0901156270,geoId/0956200 +geoId/2300102060,geoId/2302060 +geoId/2300138740,geoId/2338740 +geoId/2300310565,geoId/2310565 +geoId/2300360825,geoId/2360825 +geoId/2300560545,geoId/2360545 +geoId/2300571990,geoId/2371990 +geoId/2300582105,geoId/2382105 +geoId/2300923200,geoId/2323200 +geoId/2301102100,geoId/2302100 +geoId/2301127085,geoId/2327085 +geoId/2301130550,geoId/2330550 +geoId/2301180740,geoId/2380740 +geoId/2301363590,geoId/2363590 +geoId/2301902795,geoId/2302795 +geoId/2301906925,geoId/2306925 +geoId/2301955225,geoId/2355225 +geoId/2302303355,geoId/2303355 +geoId/2302703950,geoId/2303950 +geoId/2302909585,geoId/2309585 +geoId/2302921730,geoId/2321730 +geoId/2303104860,geoId/2304860 +geoId/2303164675,geoId/2364675 +geoId/2303165725,geoId/2365725 
+geoId/24510,geoId/2404000 +geoId/2500346225,geoId/2546225 +geoId/2500353960,geoId/2553960 +geoId/2500502690,geoId/2502690 +geoId/2500523000,geoId/2523000 +geoId/2500545000,geoId/2545000 +geoId/2500562430,geoId/2562465 +geoId/2500569170,geoId/2569170 +geoId/2500905595,geoId/2505595 +geoId/2500916250,geoId/2516285 +geoId/2500926150,geoId/2526150 +geoId/2500929405,geoId/2529405 +geoId/2500934550,geoId/2534550 +geoId/2500937490,geoId/2537490 +geoId/2500938400,geoId/2538435 +geoId/2500943580,geoId/2543615 +geoId/2500945245,geoId/2545245 +geoId/2500952490,geoId/2552490 +geoId/2500959105,geoId/2559105 +geoId/2500960015,geoId/2560050 +geoId/2500968645,geoId/2568680 +geoId/2501313660,geoId/2513660 +geoId/2501330840,geoId/2530840 +geoId/2501336300,geoId/2536335 +geoId/2501352144,geoId/2552144 +geoId/2501367000,geoId/2567000 +geoId/2501376030,geoId/2576030 +geoId/2501546330,geoId/2546330 +geoId/2501701605,geoId/2501640 +geoId/2501705070,geoId/2505105 +geoId/2501709840,geoId/2509875 +geoId/2501711000,geoId/2511000 +geoId/2501721990,geoId/2521990 +geoId/2501724960,geoId/2524960 +geoId/2501735215,geoId/2535250 +geoId/2501737000,geoId/2537000 +geoId/2501737875,geoId/2537875 +geoId/2501738715,geoId/2538715 +geoId/2501739625,geoId/2539660 +geoId/2501739835,geoId/2539835 +geoId/2501740115,geoId/2540115 +geoId/2501745560,geoId/2545560 +geoId/2501756130,geoId/2556165 +geoId/2501762535,geoId/2562535 +geoId/2501767665,geoId/2567700 +geoId/2501772215,geoId/2572250 +geoId/2501772600,geoId/2572600 +geoId/2501780510,geoId/2580545 +geoId/2501781035,geoId/2581035 +geoId/2502109175,geoId/2509210 +geoId/2502130455,geoId/2530420 +geoId/2502141690,geoId/2541725 +geoId/2502144105,geoId/2544140 +geoId/2502150250,geoId/2550285 +geoId/2502155745,geoId/2555745 +geoId/2502155955,geoId/2555990 +geoId/2502174175,geoId/2574210 +geoId/2502178972,geoId/2578972 +geoId/2502300170,geoId/2500135 +geoId/2502309000,geoId/2509000 +geoId/2502331645,geoId/2531680 +geoId/2502507000,geoId/2507000 
+geoId/2502513205,geoId/2513205 +geoId/2502556585,geoId/2556585 +geoId/2502581005,geoId/2581005 +geoId/2502723875,geoId/2523875 +geoId/2502725485,geoId/2525485 +geoId/2502735075,geoId/2535075 +geoId/2502763345,geoId/2563345 +geoId/2502782000,geoId/2582000 +geoId/29510,geoId/2965000 +geoId/32510,geoId/3209700 +geoId/3300140180,geoId/3340180 +geoId/3300539300,geoId/3339300 +geoId/3300705140,geoId/3305140 +geoId/3300941300,geoId/3341300 +geoId/3301145140,geoId/3345140 +geoId/3301150260,geoId/3350260 +geoId/3301314200,geoId/3314200 +geoId/3301327380,geoId/3327380 +geoId/3301562900,geoId/3362900 +geoId/3301718820,geoId/3318820 +geoId/3301765140,geoId/3365140 +geoId/3301769940,geoId/3369940 +geoId/3301912900,geoId/3312900 +geoId/4400374300,geoId/4474300 +geoId/4400549960,geoId/4449960 +geoId/4400714140,geoId/4414140 +geoId/4400719180,geoId/4419180 +geoId/4400722960,geoId/4422960 +geoId/4400754640,geoId/4454640 +geoId/4400759000,geoId/4459000 +geoId/4400780780,geoId/4480780 +geoId/5000174650,geoId/5074650 +geoId/5000710675,geoId/5010675 +geoId/5000766175,geoId/5066175 +geoId/5000785150,geoId/5085150 +geoId/5001161675,geoId/5061675 +geoId/5001948850,geoId/5048850 +geoId/5002161225,geoId/5061225 +geoId/5002303175,geoId/5003175 +geoId/5002346000,geoId/5046000 +geoId/51510,geoId/5101000 +geoId/51520,geoId/5109816 +geoId/51530,geoId/5111032 +geoId/51550,geoId/5116000 +geoId/51570,geoId/5118448 +geoId/51580,geoId/5119728 +geoId/51590,geoId/5121344 +geoId/51595,geoId/5125808 +geoId/51600,geoId/5126496 +geoId/51610,geoId/5127200 +geoId/51620,geoId/5129600 +geoId/51630,geoId/5129744 +geoId/51640,geoId/5130208 +geoId/51650,geoId/5135000 +geoId/51660,geoId/5135624 +geoId/51670,geoId/5138424 +geoId/51678,geoId/5145512 +geoId/51680,geoId/5147672 +geoId/51683,geoId/5148952 +geoId/51685,geoId/5148968 +geoId/51690,geoId/5149784 +geoId/51700,geoId/5156000 +geoId/51710,geoId/5157000 +geoId/51720,geoId/5157688 +geoId/51730,geoId/5161832 +geoId/51735,geoId/5163768 +geoId/51740,geoId/5164000 
+geoId/51750,geoId/5165392 +geoId/51760,geoId/5167000 +geoId/51770,geoId/5168000 +geoId/51775,geoId/5170000 +geoId/51790,geoId/5175216 +geoId/51800,geoId/5176432 +geoId/51810,geoId/5182000 +geoId/51820,geoId/5183680 +geoId/51830,geoId/5186160 +geoId/51840,geoId/5186720 diff --git a/scripts/us_hud/income/process.py b/scripts/us_hud/income/process.py new file mode 100644 index 0000000000..fb9fc767b9 --- /dev/null +++ b/scripts/us_hud/income/process.py @@ -0,0 +1,132 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Generates cleaned CSVs for HUD Income Limits data. + +Produces: +* csv/output_[YEAR].csv + +Usage: +python3 process.py +''' +import csv +import datetime +import os +import pandas as pd +from absl import app +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_string('income_output_dir', 'csv', 'Path to write cleaned CSVs.') + +URL_PREFIX = 'https://www.huduser.gov/portal/datasets/il/il' + + +def get_url(year): + '''Return xls url for year. + + Args: + year: Input year. + + Returns: + xls url for given year. 
+ ''' + if year < 2006: + return '' + suffix = str(year)[-2:] + if year >= 2016: + return f'{URL_PREFIX}{suffix}/Section8-FY{suffix}.xlsx' + elif year == 2015: + return f'{URL_PREFIX}15/Section8_Rev.xlsx' + elif year == 2014: + return f'{URL_PREFIX}14/Poverty.xls' + elif year == 2011: + return f'{URL_PREFIX}11/Section8_v3.xls' + elif year >= 2009: + return f'{URL_PREFIX}{suffix}/Section8.xls' + elif year == 2008: + return f'{URL_PREFIX}08/Section8_FY08.xls' + elif year == 2007: + return f'{URL_PREFIX}07/Section8-rev.xls' + elif year == 2006: + return f'{URL_PREFIX}06/Section8FY2006.xls' + else: + return '' + + +def compute_150(df, person): + '''Compute 150th percentile income in-place. + + Args: + df: Input dataframe (will be modified). + person: Number of people in household. + ''' + df[f'l150_{person}'] = df.apply( + lambda x: round(x[f'l80_{person}'] / 80 * 150), axis=1) + + +def process(year, matches, output_dir): + '''Generate cleaned CSV. + + Args: + year: Input year. + matches: Map of fips dcid -> city dcid. + output_dir: Directory to write cleaned CSV. + ''' + url = get_url(year) + try: + df = pd.read_excel(url) + except: + print(f'No file found for {url}.') + return + if 'fips2010' in df: + df = df.rename(columns={'fips2010': 'fips'}) + + # Filter to 80th percentile income stats for each household size. + df = df.loc[:, [ + 'fips', 'l80_1', 'l80_2', 'l80_3', 'l80_4', 'l80_5', 'l80_6', 'l80_7', + 'l80_8' + ]] + + df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10), + axis=1) + df['fips'] = df.apply(lambda x: x['fips'][:-5] + if x['fips'][-5:] == '99999' else x['fips'], + axis=1) + for i in range(1, 9): + compute_150(df, i) + df['year'] = [year for i in range(len(df))] + + # Add stats for matching dcids. 
+ df_match = df.copy().loc[df['fips'].isin(matches)] + if not df_match.empty: + df_match['fips'] = df_match.apply(lambda x: matches[x['fips']], axis=1) + df = pd.concat([df, df_match]) + + df.to_csv(os.path.join(output_dir, f'output_{year}.csv'), index=False) + + +def main(argv): + with open('match_bq.csv') as f: + reader = csv.DictReader(f) + matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader} + if not os.path.exists(FLAGS.income_output_dir): + os.makedirs(FLAGS.income_output_dir) + today = datetime.date.today() + for year in range(2006, today.year): + print(year) + process(year, matches, FLAGS.income_output_dir) + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/us_hud/income/process_test.py b/scripts/us_hud/income/process_test.py new file mode 100644 index 0000000000..6a2e68f13d --- /dev/null +++ b/scripts/us_hud/income/process_test.py @@ -0,0 +1,55 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Tests for process.py. 
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Tests for process.py.

Usage: python3 -m unittest discover -v -s ../ -p "process_test.py"
'''
import os
import pandas as pd
import sys
import unittest
from unittest.mock import patch

# Make the us_hud package importable when tests are run from scripts/.
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))))
from us_hud.income import process

module_dir_ = os.path.dirname(__file__)

TEST_DIR = os.path.join(module_dir_, 'testdata')


class ProcessTest(unittest.TestCase):

    def test_get_url(self):
        # Modern years use the Section8-FY[YY].xlsx naming; years before
        # 2006 have no published data and yield ''.
        self.assertEqual(
            process.get_url(2022),
            'https://www.huduser.gov/portal/datasets/il/il22/Section8-FY22.xlsx'
        )
        self.assertEqual(process.get_url(1997), '')

    def test_compute_150(self):
        # 150% limits are derived from the 80% limit: round(l80 / 80 * 150).
        # Values match the first row of expected_output_2006.csv.
        df = pd.DataFrame({'l80_1': [31300, 35750]})
        process.compute_150(df, 1)
        self.assertEqual(list(df['l150_1']), [58688, 67031])

    @patch('pandas.read_excel')
    def test_process(self, mock_df):
        # Replace the remote download with the local test fixture.
        mock_df.return_value = pd.DataFrame(
            pd.read_csv(os.path.join(TEST_DIR, 'test_input_2006.csv')))
        matches = {'dcs:geoId/02110': 'dcs:geoId/0236400'}
        process.process(2006, matches, TEST_DIR)
        with open(os.path.join(TEST_DIR, 'output_2006.csv'),
                  encoding='utf-8') as result:
            with open(os.path.join(TEST_DIR, 'expected_output_2006.csv'),
                      encoding='utf-8') as expected:
                self.assertEqual(result.read(), expected.read())
b/scripts/us_hud/income/testdata/output_2006.csv @@ -0,0 +1,2 @@ +fips,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,l150_1,l150_2,l150_3,l150_4,l150_5,l150_6,l150_7,l150_8,year +dcs:geoId/01001,31300,35750,40250,44700,48300,51850,55450,59000,58688,67031,75469,83812,90562,97219,103969,110625,2006 diff --git a/scripts/us_hud/income/testdata/test_input_2006.csv b/scripts/us_hud/income/testdata/test_input_2006.csv new file mode 100644 index 0000000000..5eb7f16ff1 --- /dev/null +++ b/scripts/us_hud/income/testdata/test_input_2006.csv @@ -0,0 +1,2 @@ +State_Alpha,fips,State,County_Town_Name,County,Metro_Area_Name,CBSASub,County_Name,median1999,median2006,State_Name,l50_1,l50_2,l50_3,l50_4,l50_5,l50_6,l50_7,l50_8,msa,l30_1,l30_2,l30_3,l30_4,l30_5,l30_6,l30_7,l30_8,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,metro +AL,100199999,1,Autauga County,1,"Montgomery, AL MSA",METRO33860M33860,Autauga County,45182,55900,Alabama,19550,22350,25150,27950,30200,32400,34650,36900,5240,11750,13400,15100,16750,18100,19450,20750,22100,31300,35750,40250,44700,48300,51850,55450,59000,1