From b990b2820854d52affdd951458ef57c4b8130168 Mon Sep 17 00:00:00 2001
From: natalie <77713883+n-h-diaz@users.noreply.github.com>
Date: Wed, 8 Nov 2023 08:33:29 -0800
Subject: [PATCH] add old version of sdg scripts (#921)

* add old version of sdg scripts

* fix

* fix

* fix
---
 scripts/un/sdg/README.md          |  16 +-
 scripts/un/sdg/sdmx/cities.py     |  94 ++++++++++
 scripts/un/sdg/sdmx/m49.tsv       | 250 +++++++++++++++++++++++++
 scripts/un/sdg/sdmx/preprocess.py |  99 ++++++++++
 scripts/un/sdg/sdmx/process.py    | 298 ++++++++++++++++++++++++++++++
 scripts/un/sdg/sdmx/util.py       | 197 ++++++++++++++++++++
 6 files changed, 952 insertions(+), 2 deletions(-)
 create mode 100644 scripts/un/sdg/sdmx/cities.py
 create mode 100644 scripts/un/sdg/sdmx/m49.tsv
 create mode 100644 scripts/un/sdg/sdmx/preprocess.py
 create mode 100644 scripts/un/sdg/sdmx/process.py
 create mode 100644 scripts/un/sdg/sdmx/util.py

diff --git a/scripts/un/sdg/README.md b/scripts/un/sdg/README.md
index 747f6b9712..f829b4dc84 100644
--- a/scripts/un/sdg/README.md
+++ b/scripts/un/sdg/README.md
@@ -1,6 +1,6 @@
 # UN Stats Sustainable Development Goals
 
-This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats.
+This import includes data from the [UN SDG Global Database](https://unstats.un.org/sdgs/dataportal). Data is read from the submodule `sdg-dataset` which is managed by UN Stats. Geography mappings are read from the submodule `sssom-mappings` which is also managed by UN Stats. Please ensure the submodules stay up to date.
 
 To generate place mappings:
 
@@ -29,6 +29,7 @@ Produces:
 * unit.mcf
 * csv/ folder:
     * [CODE].csv
+    (Note that these folders are not included in the repository but can be regenerated by running the script.)
 
 When refreshing the data, the `geography`, `schema`, and `csv` folders might all get updated and will need to be resubmitted to g3.
 The corresponding TMCF file is `sdg.tmcf`.
@@ -39,4 +40,15 @@ python3 -m unittest discover -v -s ../ -p "*_test.py"
 ```
 
 Notes:
-* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by UN as potentially containing outliers.
\ No newline at end of file
+* We currently drop certain series and variables (refer to `util.py` for the list) which have been identified by the UN as potentially containing outliers.
+
+### SDMX
+
+For reference, we provide an earlier version of the import scripts, which used the UN API (SDMX-based), in the `sdmx/` folder. Please note that these scripts may have errors and do not use the most up-to-date schema format, so they should only be used as an illustration of the SDMX -> MCF mapping and **should not actually be run**.
+
+As a quick overview:
+* `preprocess.py` downloads all the raw input CSVs to an `input/` folder and writes all dimensions and attributes to a `preprocessed/` folder.
+* `cities.py` reads the input CSVs and matches cities with dcids.
+* `process.py` reads the input CSVs and concepts and generates a cleaned CSV and schema.
+* `util.py` has shared utility functions and constants.
+* `m49.tsv` has country code mappings.
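For orientation, here is a minimal sketch of the UN Stats SDG API calls that `preprocess.py` relies on. The endpoints, parameters, and headers are the ones used in the script further down in this patch; the snippet itself is illustrative only and, like the scripts, is not meant to be run as part of the import.

```python
import requests

API_PREFIX = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/'

# List all available series (each entry has at least a 'code' and a 'description').
series = requests.get(f'{API_PREFIX}List?allreleases=false').json()
print(series[0]['code'], '->', series[0]['description'])

# Download the raw CSV for a single series code (the same POST preprocess.py makes).
code = series[0]['code']
csv_text = requests.post(f'{API_PREFIX}DataCSV',
                         data={'seriesCodes': code},
                         headers={
                             'Content-Type': 'application/x-www-form-urlencoded',
                             'Accept': 'application/octet-stream'
                         }).text.rstrip('\x00')
print(csv_text.splitlines()[0])  # header row of the series CSV
```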
diff --git a/scripts/un/sdg/sdmx/cities.py b/scripts/un/sdg/sdmx/cities.py new file mode 100644 index 0000000000..3c5e0c9f1a --- /dev/null +++ b/scripts/un/sdg/sdmx/cities.py @@ -0,0 +1,94 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Finds dcids for cities in input files. + +Produces: +* preprocessed/cities.csv: dcid for each city name + +Note: For cities where the find entities API did not return a dcid, +we tried manually searching for the dcid and filled these into the file. +There are a few city names that are still missing - these are left blank. +**This script ideally shouldn't need to be run again.** +Usage: python3 cities.py +''' +import csv +import requests +import os +import sys + +BATCH = 1 + + +def get_cities(json, api_key): + '''Applies find entities API for given json. + Args: + json: Input json. + api_key: API key. + Returns: + API response. + ''' + return requests.post('https://api.datacommons.org/v1/bulk/find/entities', + headers={ + 'X-API-Key': api_key + }, + json=json).json() + + +def write_cities(file, cities, api_key): + '''Writes city dcids and names to file. + Args: + file: Output file path. + cities: List of city dcids to process. + api_key: API key. 
+ ''' + with open(file, 'w') as f: + writer = csv.DictWriter(f, fieldnames=['name', 'dcid']) + writer.writeheader() + city_list = list(cities.keys()) + for i in range(0, len(city_list), BATCH): + json = { + 'entities': [{ + 'description': city + } for city in city_list[i:i + BATCH]] + } + response = get_cities(json, api_key) + print(response) + try: + for entity in response['entities']: + dcid = entity['dcids'][0] if 'dcids' in entity else '' + writer.writerow({ + 'name': cities[entity['description']], + 'dcid': dcid + }) + except KeyError: + writer.writerow({'name': cities[city_list[i]], 'dcid': ''}) + + +if __name__ == '__main__': + cities = set() + for file in sorted(os.listdir('input')): + code = file.removesuffix('.csv') + with open('input/' + file) as f: + reader = csv.DictReader(f) + if '[Cities]' in reader.fieldnames: + for row in reader: + cities.add(row['[Cities]'].replace('_', ' ').title() + + ', ' + row['GeoAreaName']) + cities = sorted(cities) + + write_cities('preprocessed/cities2.csv', cities, sys.argv[1]) diff --git a/scripts/un/sdg/sdmx/m49.tsv b/scripts/un/sdg/sdmx/m49.tsv new file mode 100644 index 0000000000..a90189880a --- /dev/null +++ b/scripts/un/sdg/sdmx/m49.tsv @@ -0,0 +1,250 @@ +Country or Area M49 code ISO-alpha3 code +Afghanistan 004 AFG +Åland Islands 248 ALA +Albania 008 ALB +Algeria 012 DZA +American Samoa 016 ASM +Andorra 020 AND +Angola 024 AGO +Anguilla 660 AIA +Antarctica 010 ATA +Antigua and Barbuda 028 ATG +Argentina 032 ARG +Armenia 051 ARM +Aruba 533 ABW +Australia 036 AUS +Austria 040 AUT +Azerbaijan 031 AZE +Bahamas 044 BHS +Bahrain 048 BHR +Bangladesh 050 BGD +Barbados 052 BRB +Belarus 112 BLR +Belgium 056 BEL +Belize 084 BLZ +Benin 204 BEN +Bermuda 060 BMU +Bhutan 064 BTN +Bolivia (Plurinational State of) 068 BOL +Bonaire, Sint Eustatius and Saba 535 BES +Bosnia and Herzegovina 070 BIH +Botswana 072 BWA +Bouvet Island 074 BVT +Brazil 076 BRA +British Indian Ocean Territory 086 IOT +British Virgin Islands 092 VGB +Brunei Darussalam 096 BRN +Bulgaria 100 BGR +Burkina Faso 854 BFA +Burundi 108 BDI +Cabo Verde 132 CPV +Cambodia 116 KHM +Cameroon 120 CMR +Canada 124 CAN +Cayman Islands 136 CYM +Central African Republic 140 CAF +Chad 148 TCD +Chile 152 CHL +China 156 CHN +China, Hong Kong Special Administrative Region 344 HKG +China, Macao Special Administrative Region 446 MAC +Christmas Island 162 CXR +Cocos (Keeling) Islands 166 CCK +Colombia 170 COL +Comoros 174 COM +Congo 178 COG +Cook Islands 184 COK +Costa Rica 188 CRI +Côte d’Ivoire 384 CIV +Croatia 191 HRV +Cuba 192 CUB +Curaçao 531 CUW +Cyprus 196 CYP +Czechia 203 CZE +Democratic People's Republic of Korea 408 PRK +Democratic Republic of the Congo 180 COD +Denmark 208 DNK +Djibouti 262 DJI +Dominica 212 DMA +Dominican Republic 214 DOM +Ecuador 218 ECU +Egypt 818 EGY +El Salvador 222 SLV +Equatorial Guinea 226 GNQ +Eritrea 232 ERI +Estonia 233 EST +Eswatini 748 SWZ +Ethiopia 231 ETH +Falkland Islands (Malvinas) 238 FLK +Faroe Islands 234 FRO +Fiji 242 FJI +Finland 246 FIN +France 250 FRA +French Guiana 254 GUF +French Polynesia 258 PYF +French Southern Territories 260 ATF +Gabon 266 GAB +Gambia 270 GMB +Georgia 268 GEO +Germany 276 DEU +Ghana 288 GHA +Gibraltar 292 GIB +Greece 300 GRC +Greenland 304 GRL +Grenada 308 GRD +Guadeloupe 312 GLP +Guam 316 GUM +Guatemala 320 GTM +Guernsey 831 GGY +Guinea 324 GIN +Guinea-Bissau 624 GNB +Guyana 328 GUY +Haiti 332 HTI +Heard Island and McDonald Islands 334 HMD +Holy See 336 VAT +Honduras 340 HND +Hungary 348 HUN +Iceland 352 ISL +India 356 IND +Indonesia 
360 IDN +Iran (Islamic Republic of) 364 IRN +Iraq 368 IRQ +Ireland 372 IRL +Isle of Man 833 IMN +Israel 376 ISR +Italy 380 ITA +Jamaica 388 JAM +Japan 392 JPN +Jersey 832 JEY +Jordan 400 JOR +Kazakhstan 398 KAZ +Kenya 404 KEN +Kiribati 296 KIR +Kuwait 414 KWT +Kyrgyzstan 417 KGZ +Lao People's Democratic Republic 418 LAO +Latvia 428 LVA +Lebanon 422 LBN +Lesotho 426 LSO +Liberia 430 LBR +Libya 434 LBY +Liechtenstein 438 LIE +Lithuania 440 LTU +Luxembourg 442 LUX +Madagascar 450 MDG +Malawi 454 MWI +Malaysia 458 MYS +Maldives 462 MDV +Mali 466 MLI +Malta 470 MLT +Marshall Islands 584 MHL +Martinique 474 MTQ +Mauritania 478 MRT +Mauritius 480 MUS +Mayotte 175 MYT +Mexico 484 MEX +Micronesia (Federated States of) 583 FSM +Monaco 492 MCO +Mongolia 496 MNG +Montenegro 499 MNE +Montserrat 500 MSR +Morocco 504 MAR +Mozambique 508 MOZ +Myanmar 104 MMR +Namibia 516 NAM +Nauru 520 NRU +Nepal 524 NPL +Netherlands 528 NLD +New Caledonia 540 NCL +New Zealand 554 NZL +Nicaragua 558 NIC +Niger 562 NER +Nigeria 566 NGA +Niue 570 NIU +Norfolk Island 574 NFK +North Macedonia 807 MKD +Northern Mariana Islands 580 MNP +Norway 578 NOR +Oman 512 OMN +Pakistan 586 PAK +Palau 585 PLW +Panama 591 PAN +Papua New Guinea 598 PNG +Paraguay 600 PRY +Peru 604 PER +Philippines 608 PHL +Pitcairn 612 PCN +Poland 616 POL +Portugal 620 PRT +Puerto Rico 630 PRI +Qatar 634 QAT +Republic of Korea 410 KOR +Republic of Moldova 498 MDA +Réunion 638 REU +Romania 642 ROU +Russian Federation 643 RUS +Rwanda 646 RWA +Saint Barthélemy 652 BLM +Saint Helena 654 SHN +Saint Kitts and Nevis 659 KNA +Saint Lucia 662 LCA +Saint Martin (French Part) 663 MAF +Saint Pierre and Miquelon 666 SPM +Saint Vincent and the Grenadines 670 VCT +Samoa 882 WSM +San Marino 674 SMR +Sao Tome and Principe 678 STP +Sark 680 +Saudi Arabia 682 SAU +Senegal 686 SEN +Serbia 688 SRB +Seychelles 690 SYC +Sierra Leone 694 SLE +Singapore 702 SGP +Sint Maarten (Dutch part) 534 SXM +Slovakia 703 SVK +Slovenia 705 SVN +Solomon Islands 090 SLB +Somalia 706 SOM +South Africa 710 ZAF +South Georgia and the South Sandwich Islands 239 SGS +South Sudan 728 SSD +Spain 724 ESP +Sri Lanka 144 LKA +State of Palestine 275 PSE +Sudan 729 SDN +Suriname 740 SUR +Svalbard and Jan Mayen Islands 744 SJM +Sweden 752 SWE +Switzerland 756 CHE +Syrian Arab Republic 760 SYR +Tajikistan 762 TJK +Thailand 764 THA +Timor-Leste 626 TLS +Togo 768 TGO +Tokelau 772 TKL +Tonga 776 TON +Trinidad and Tobago 780 TTO +Tunisia 788 TUN +Türkiye 792 TUR +Turkmenistan 795 TKM +Turks and Caicos Islands 796 TCA +Tuvalu 798 TUV +Uganda 800 UGA +Ukraine 804 UKR +United Arab Emirates 784 ARE +United Kingdom of Great Britain and Northern Ireland 826 GBR +United Republic of Tanzania 834 TZA +United States Minor Outlying Islands 581 UMI +United States of America 840 USA +United States Virgin Islands 850 VIR +Uruguay 858 URY +Uzbekistan 860 UZB +Vanuatu 548 VUT +Venezuela (Bolivarian Republic of) 862 VEN +Viet Nam 704 VNM +Wallis and Futuna Islands 876 WLF +Western Sahara 732 ESH +Yemen 887 YEM +Zambia 894 ZMB +Zimbabwe 716 ZWE diff --git a/scripts/un/sdg/sdmx/preprocess.py b/scripts/un/sdg/sdmx/preprocess.py new file mode 100644 index 0000000000..ff4f3067bf --- /dev/null +++ b/scripts/un/sdg/sdmx/preprocess.py @@ -0,0 +1,99 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
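Stepping back to the `m49.tsv` table above: `process.py` (later in this patch) loads it into an M49 -> ISO-alpha3 map, keeping only rows that have an ISO code (i.e. countries). A minimal sketch of that lookup, assuming it is run from the `sdmx/` folder:

```python
import csv

# Build the M49 -> ISO-alpha3 map the same way process.py does (countries only).
with open('m49.tsv') as f:
    places = {
        int(row['M49 code']): row['ISO-alpha3 code']
        for row in csv.DictReader(f, delimiter='\t')
        if row['ISO-alpha3 code']
    }

print(places[840])  # USA
print(places[356])  # IND
print(4 in places)  # True: 'Afghanistan' has M49 code 004
```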
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Downloads data from UN Stats API to be used in further processing. + +Produces: +* input/ directory containing csv files for each series +* preprocessed/attributes.csv: metadata about attributes +* preprocessed/dimensions.csv: metadata about dimensions +* output/series.mcf: MCF for each series +Note: Downloading all the data is very slow and prone to crashes. +This script ideally shouldn't need to be run again. +Usage: python3 preprocess.py +''' +import csv +import os +import requests + +from util import * + +API_PREFIX = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/' +HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Accept': 'application/octet-stream' +} + + +def add_concepts(code, concept, concept_set): + '''Adds concepts from given series code to concept_set. + Args: + code: Series code. + concept: Type of concept ('Attributes' | 'Dimensions'). + concept_set: Current set of concepts. + ''' + response = requests.get(f'{API_PREFIX}{code}/{concept}').json() + for entry in response: + for c in entry['codes']: + concept_set.add( + (entry['id'], c['code'], c['description'], c['sdmx'])) + + +def write_concepts(file, concept_set): + '''Writes concepts from concept_set to file. + Args: + path: File path to write to. + concept_set: Current set of concepts. + ''' + with open(file, 'w') as f: + writer = csv.writer(f) + for row in sorted(concept_set): + writer.writerow(list(row)) + + +if __name__ == '__main__': + if not os.path.exists('input'): + os.makedirs('input') + if not os.path.exists('preprocessed'): + os.makedirs('preprocessed') + if not os.path.exists('output'): + os.makedirs('output') + + series = requests.get(f'{API_PREFIX}List?allreleases=false').json() + codes = {s['code']: s['description'] for s in series} + + attributes = set() + dimensions = set() + with open('output/series.mcf', 'w') as f_series: + for code in sorted(codes): + print(code) + data = {'seriesCodes': code} + text = requests.post(f'{API_PREFIX}DataCSV', + data=data, + headers=HEADERS).text.rstrip('\x00') + with open(f'input/{code}.csv', 'w') as f_code: + f_code.write(text) + add_concepts(code, 'Attributes', attributes) + add_concepts(code, 'Dimensions', dimensions) + f_series.write( + SERIES_TEMPLATE.format_map({ + 'dcid': 'SDG_' + code, + 'description': format_description(codes[code]) + })) + + write_concepts('preprocessed/attributes.csv', attributes) + write_concepts('preprocessed/dimensions.csv', dimensions) diff --git a/scripts/un/sdg/sdmx/process.py b/scripts/un/sdg/sdmx/process.py new file mode 100644 index 0000000000..5004ccc44a --- /dev/null +++ b/scripts/un/sdg/sdmx/process.py @@ -0,0 +1,298 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
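Stepping back to `preprocess.py` above: this is roughly how it renders one entry of `output/series.mcf`, using `SERIES_TEMPLATE` and `format_description` from `util.py`. The series code and description here are illustrative, and the snippet assumes it is run from the `sdmx/` folder so that `util.py` is importable.

```python
from util import SERIES_TEMPLATE, format_description

code = 'SI_POV_DAY1'  # illustrative series code
description = 'Proportion of population below international poverty line (%)'

print(SERIES_TEMPLATE.format_map({
    'dcid': 'SDG_' + code,
    'description': format_description(description),
}))
# Expected output:
#
# Node: dcid:SDG_SI_POV_DAY1
# name: "Proportion of population below international poverty line"
# typeOf: dcs:SDG_Series
```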
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Produces CSV/TMCF + schema for UN Stats data. + +Produces: +* output/output.csv: cleaned CSV +* output/measurement_method.csv: measurement methods +* output/schema.mcf: properties and classes +* output/sv.mcf: statistical variables +* output/unit.mcf: units +Usage: python3 preprocess.py +''' +import collections +import csv +import os +import sys + +from util import * + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) + +module_dir_ = os.path.dirname(__file__) + +# Create map of M49 -> ISO-alpha3 for countries. +with open(os.path.join(module_dir_, 'm49.tsv')) as f: + PLACES = {} + reader = csv.DictReader(f, delimiter='\t') + for row in reader: + if not row['ISO-alpha3 code']: # Only countries for now. + continue + PLACES[int(row['M49 code'])] = row['ISO-alpha3 code'] + +# Create map of name -> dcid for supported cities. +with open(os.path.join(module_dir_, 'preprocessed/cities.csv')) as f: + reader = csv.DictReader(f) + CITIES = {row['name']: row['dcid'] for row in reader} + + +def write_templates(file, templates): + '''Write templates to file. + Args: + file: Input file path. + templates: Template strings. + ''' + with open(file, 'w') as f: + for template in sorted(templates): + f.write(template) + + +def add_concepts(file, concepts): + '''Adds concepts from file. + Args: + file: Input file path. + concepts: Dictionary of concepts: concept -> code -> (name, formatted code). + ''' + with open(file) as f: + reader = csv.reader(f) + for row in reader: + + # Skip totals (as indicated by SDMX). + if row[3] == '_T': + continue + concepts[row[0]][row[1]] = (row[2], make_value(row[1])) + + +def get_observation_about(country_code, country_name, city): + '''Returns dcid for place. + Args: + country_code: M49 for country. + country_name: Name of country. + city: Name of city. + Returns: + Dcid of place if found, else empty string. + ''' + if city: + formatted_city = city.replace('_', ' ').title() + ', ' + country_name + if formatted_city in CITIES and CITIES[formatted_city]: + return 'dcs:' + CITIES[formatted_city] + else: + return '' + if country_code in PLACES: + return 'dcs:country/' + PLACES[country_code] + else: + return '' + + +def get_variable_measured(row, properties, concepts): + '''Returns templated string for variable_measured. + Args: + row: Input csv dict row. + properties: List of properties for row. + concepts: Dictionary of concepts. + Returns: + Templated string. 
+ ''' + value_ids = [] + value_descriptions = [] + cprops = '' + for i in properties: + field = i[1:-1] + if not row[i] or field not in concepts or row[i] not in concepts[field]: + continue + value_ids.append(concepts[field][row[i]][1]) + value_descriptions.append(concepts[field][row[i]][0]) + enum = make_property(field) + if field in MAPPED_CONCEPTS: + prop = MAPPED_CONCEPTS[field] + else: + prop = 'sdg_' + enum[0].lower() + enum[1:] + val = enum + 'Enum_' + value_ids[-1] + cprops += f'\n{prop}: dcs:SDG_{val}' + sv = 'sdg/' + '_'.join([row['SeriesCode']] + value_ids) + pvs = ', '.join(value_descriptions) + description = format_description(row['SeriesDescription']) + if pvs: + description += ': ' + pvs + template = SV_TEMPLATE.format_map({ + 'dcid': sv, + 'popType': 'SDG_' + row['SeriesCode'], + 'name': '"' + description + '"', + 'cprops': cprops + }) + return template + + +def get_measurement_method(row, concepts): + '''Returns templated string for measurement_method. + Args: + row: Input csv dict row. + concepts: Dictionary of concepts. + Returns: + Templated string. + ''' + mmethod = '' + description = [] + for concept in [ + '[Nature]', '[Observation Status]', '[Report Ordinal]', + '[Reporting Type]' + ]: + field = concept[1:-1] + if concept in row: + mmethod += '_' + row[concept] + if field in concepts and row[concept] in concepts[field]: + description.append(concepts[field][row[concept]][0]) + if not mmethod: + return '' + mmethod = 'SDG' + mmethod + description = 'SDG Measurement Method: ' + ', '.join( + description) if description else '' + template = MMETHOD_TEMPLATE.format_map({ + 'dcid': mmethod, + 'description': description + }) + return template + + +def get_unit(row): + '''Returns templated string for unit. + Args: + row: Input csv dict row. + Returns: + Templated string. + ''' + if not '[Units]' in row: + return '' + unit = row['[Units]'].replace('^', '') + template = UNIT_TEMPLATE.format_map({ + 'dcid': unit, + 'name': format_unit_name(unit) + }) + return template + + +def write_schema(file, concepts): + '''Writes schema from concepts to file. + Args: + file: Input file path. + concepts: Dictionary of concepts. + ''' + with open(file, 'w') as f: + for concept in sorted(concepts): + if concept in SKIPPED_CONCEPTS: + continue + prop = make_property(concept) + enum = prop + 'Enum' + if concept not in MAPPED_CONCEPTS: + f.write( + PROPERTY_TEMPLATE.format_map({ + 'dcid': prop[0].lower() + prop[1:], + 'name': concept, + 'enum': enum + })) + f.write(ENUM_TEMPLATE.format_map({'enum': enum})) + for k in sorted(concepts[concept]): + v = concepts[concept][k] + f.write( + VALUE_TEMPLATE.format_map({ + 'dcid': v[1], + 'enum': enum, + 'name': v[0][0].upper() + v[0][1:], + })) + + +def process_input_file(file, writer, concepts, svs, measurement_methods, units): + '''Processes one input file and write csv rows. + Args: + file: Input file path. + writer: Csv DictWriter object. + concepts: Dictionary of concepts. + svs: Set of statistical variables. + measurement_methods: Set of measurement methods. + units: Set of units. 
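To make the SDMX -> MCF mapping concrete, here is an illustrative trace of how `process.py` derives the place and statistical-variable identifiers for a single row. The series code, dimension codes, and M49 code are placeholders chosen for the example.

```python
# get_observation_about(): M49 code 356 maps to 'IND' in m49.tsv, so the
# observation is attached to the Data Commons country node.
observation_about = 'dcs:country/' + 'IND'

# get_variable_measured(): the SV dcid joins the series code with the
# make_value()-formatted codes of the non-skipped dimensions/attributes,
# processed in sorted column order (here [Age] before [Sex]).
series_code = 'SE_ADT_LITRATE'          # hypothetical series code
value_ids = ['15-24', 'F']              # hypothetical [Age] and [Sex] codes
sv_dcid = 'sdg/' + '_'.join([series_code] + value_ids)

print(observation_about)  # dcs:country/IND
print(sv_dcid)            # sdg/SE_ADT_LITRATE_15-24_F
```

The SV node written to `output/sv.mcf` additionally carries these codes as constraint properties, using existing properties such as `age` and `gender` when they appear in `MAPPED_CONCEPTS` and provisional `sdg_`-prefixed properties otherwise.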
+ ''' + print(f'Starting {file}') + with open(file) as f_in: + reader = csv.DictReader(f_in) + properties = sorted([ + field for field in reader.fieldnames + if field[0] == '[' and field[1:-1] not in SKIPPED_CONCEPTS + ]) + try: + for row in reader: + if not int(row['GeoAreaCode']) in PLACES: + continue + if not is_float(row['Value']) or row['Value'] == 'NaN' or row[ + 'Value'] == 'Nan': + continue + observation_about = get_observation_about( + int(row['GeoAreaCode']), row['GeoAreaName'], + row['[Cities]'] if '[Cities]' in reader.fieldnames else '') + if not observation_about: + continue + sv = get_variable_measured(row, properties, concepts) + svs.add(sv) + measurement_method = get_measurement_method(row, concepts) + if measurement_method: + measurement_methods.add(measurement_method) + unit = get_unit(row) + if unit: + units.add(unit) + writer.writerow({ + 'variable_measured': + 'dcid:' + get_dcid(sv), + 'observation_about': + observation_about, + 'observation_date': + row['TimePeriod'], + 'value': + row['Value'], + 'measurement_method': + 'dcs:' + get_dcid(measurement_method) + if measurement_method else '', + 'unit': + 'dcs:' + get_dcid(unit) if unit else '', + 'scaling_factor': + row['[UnitMultiplier]'] + if '[UnitMultiplier]' in reader.fieldnames else '', + }) + except: + print(f'Finished processing {file}') + + +if __name__ == '__main__': + concepts = collections.defaultdict(dict) + add_concepts('preprocessed/attributes.csv', concepts) + add_concepts('preprocessed/dimensions.csv', concepts) + write_schema('output/schema.mcf', concepts) + + svs = set() + measurement_methods = set() + units = set() + with open('output/output.csv', 'w') as f: + writer = csv.DictWriter(f, fieldnames=FIELDNAMES) + writer.writeheader() + for file in sorted(os.listdir('input')): + process_input_file(os.path.join('input', file), writer, concepts, + svs, measurement_methods, units) + + write_templates('output/measurement_method.mcf', measurement_methods) + write_templates('output/sv.mcf', svs) + write_templates('output/unit.mcf', units) diff --git a/scripts/un/sdg/sdmx/util.py b/scripts/un/sdg/sdmx/util.py new file mode 100644 index 0000000000..061dc14974 --- /dev/null +++ b/scripts/un/sdg/sdmx/util.py @@ -0,0 +1,197 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''This script does not use the most up-to-date schema format. +It should only be used as an illustration of the SDMX -> MCF mapping. +Do not actually run! + +Shared util functions and constants. 
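For orientation, a single row of `output/output.csv` written by `process_input_file` is keyed by the `FIELDNAMES` list defined just below in `util.py`; with the illustrative identifiers from the earlier trace, such a row might look like the following (every value is hypothetical):

```python
# Hypothetical cleaned-CSV row, matching util.FIELDNAMES.
row = {
    'variable_measured': 'dcid:sdg/SE_ADT_LITRATE_15-24_F',
    'observation_about': 'dcs:country/IND',
    'observation_date': '2020',           # from TimePeriod
    'value': '91.7',                      # from Value
    'measurement_method': 'dcs:SDG_C_G',  # hypothetical Nature/Reporting Type codes
    'unit': 'dcs:PERCENT',                # from [Units], with '^' stripped
    'scaling_factor': '',                 # from [UnitMultiplier], when present
}
```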
+''' +import re + +FIELDNAMES = [ + 'variable_measured', 'observation_about', 'observation_date', 'value', + 'measurement_method', 'unit', 'scaling_factor' +] + +DCID_PREFIX = 'Node: dcid:' +TOTAL = '_T' + +SERIES_TEMPLATE = ''' +Node: dcid:{dcid} +name: "{description}" +typeOf: dcs:SDG_Series +''' +PROPERTY_TEMPLATE = ''' +Node: dcid:sdg_{dcid} +typeOf: schema:Property +domainIncludes: dcs:Thing +rangeIncludes: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +ENUM_TEMPLATE = ''' +Node: dcid:SDG_{enum} +typeOf: schema:Class +subClassOf: schema:Enumeration +name: "{enum}" +isProvisional: dcs:True +''' +VALUE_TEMPLATE = ''' +Node: dcid:SDG_{enum}_{dcid} +typeOf: dcs:SDG_{enum} +name: "{name}" +isProvisional: dcs:True +''' +SV_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: {name} +populationType: dcs:{popType} +statType: dcs:measuredValue{cprops} +''' +MMETHOD_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:SDG_MeasurementMethodEnum +name: "{dcid}" +description: "{description}" +''' +UNIT_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:UnitOfMeasure +name: "{name}" +description: "SDG Unit: {dcid}" +''' + +# Select concepts will be modeled differently. +SKIPPED_CONCEPTS = { + 'Cities', 'Freq', 'Nature', 'Observation Status', 'Report Ordinal', + 'Reporting Type', 'UnitMultiplier', 'Units' +} + +# Use existing properties when they exist. +# TODO: Also map enums to existing nodes. +MAPPED_CONCEPTS = { + 'Age': 'age', + 'Cause of death': 'causeOfDeath', + 'Disability status': 'disabilityStatus', + 'Education level': 'educationalAttainment', + 'Sex': 'gender', + 'AGE': 'age', + 'CAUSE_OF_DEATH': 'causeOfDeath', + 'DISABILITY_STATUS': 'disabilityStatus', + 'EDUCATION_LEVEL': 'educationalAttainment', + 'SEX': 'gender' +} + +FORMATTED_UNITS = { + 'INDEX': 'idx', + 'NUM_M': '#m', + 'NUMBER': '#', + 'PERCENT': '%', + 'PH': 'pH', + 'TONNES': 't', + 'TONNES_M': 'Metric Tonnes' +} + + +def format_description(s): + '''Formats input with curated style. + Args: + s: Input string. + Returns: + Curated string. + ''' + # Remove <=2 levels of (). + formatted = re.sub('\((?:[^)(]|\([^)(]*\))*\)', '', s) + # Remove <=2 levels of []. + formatted = re.sub('\[(?:[^)(]|\[[^)(]*\])*\]', '', formatted) + # Remove attributes indicated with 'by'. + formatted = formatted.split(', by')[0] + # Remove references indicated by 'million USD'. + formatted = formatted.split(', million USD')[0] + # Remove extra spaces + formatted = formatted.replace(' , ', ', ').replace(' ', ' ').strip() + # Remove trailing commas + if formatted[-1] == ',': + formatted = formatted[:-1] + # Replace 100,000 with 100K + formatted = formatted.replace('100,000', '100K') + # Make ascii + return formatted.replace('Â', + '').replace('’', '\'').replace('₂', '2').replace( + '\xa0', ' ').replace('−', '-') + + +def is_float(element): + '''Checks if value can be interpreted as float. + Args: + element: Input. + Returns: + Whether the value can be cast as a float. + ''' + if element is None: + return False + try: + float(element) + return True + except ValueError: + return False + + +def make_property(s): + '''Formats property string. + Args: + s: Input string. + Returns: + Formatted string. + ''' + return s.title().replace(' ', '').replace('-', + '').replace('_', + '').replace('/', '') + + +def make_value(s): + '''Formats value string. + Args: + s: Input string. + Returns: + Formatted string. 
+    '''
+    return s.replace('<=', 'LEQ').replace('<', 'LT').replace('+', 'GEQ').replace(
+        ' ', '').replace('_', '')
+
+
+def format_unit_name(dcid):
+    '''Formats unit name string.
+    Args:
+        dcid: Input dcid.
+    Returns:
+        Formatted string.
+    '''
+    if dcid in FORMATTED_UNITS:
+        return FORMATTED_UNITS[dcid]
+    return dcid.lower().replace('_', ' ').replace('1000000', '1M').replace(
+        '100000', '100K').replace('10000', '10K')
+
+
+def get_dcid(template):
+    '''Gets dcid from template.
+    Args:
+        template: Input templated string.
+    Returns:
+        Dcid.
+    '''
+    return template.split(DCID_PREFIX)[1].split('\n')[0]
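To illustrate the helpers in `util.py`, a few worked examples with illustrative inputs (assumes the snippet is run from the `sdmx/` folder so that `util.py` is importable; the expected results follow from the string manipulations above):

```python
from util import (SERIES_TEMPLATE, format_description, format_unit_name,
                  get_dcid, make_property, make_value)

# Description cleanup: parenthesized qualifiers are stripped.
print(format_description('Maternal mortality ratio (per 100,000 live births)'))
# -> Maternal mortality ratio

# Property / value formatting used when building the schema and SV dcids.
print(make_property('Education level'))  # -> EducationLevel
print(make_value('65+'))                 # -> 65GEQ

# Unit names not in FORMATTED_UNITS are lower-cased and abbreviated.
print(format_unit_name('PER_100000_LIVE_BIRTHS'))  # -> per 100K live births

# get_dcid() recovers the dcid from any rendered MCF template.
mcf = SERIES_TEMPLATE.format_map({
    'dcid': 'SDG_SH_STA_MORT',
    'description': 'Maternal mortality ratio'
})
print(get_dcid(mcf))  # -> SDG_SH_STA_MORT
```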