diff --git a/scripts/un/sdg/footnote.py b/scripts/un/sdg/footnote.py deleted file mode 100644 index d54684c09c..0000000000 --- a/scripts/un/sdg/footnote.py +++ /dev/null @@ -1,44 +0,0 @@ -import collections -import csv -import os - -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname( - os.path.abspath(__file__))))) -from un.sdg import util - -directory = 'sdg-dataset/output/observations' - - -def fix(s): - try: - return s.encode('latin1').decode('utf8') - except: - return s.encode('utf8').decode('utf8') - -sources = collections.defaultdict(set) -for filename in sorted(os.listdir(directory)): - file = os.path.join(directory, filename) - print(file) - with open(file) as f: - reader = csv.DictReader(f) - for row in reader: - dcid = 'sdg/' + util.format_variable_code(row['VARIABLE_CODE']) - if row['SOURCE']: - sources[dcid].add(fix(row['SOURCE']).removesuffix('.').strip().replace('"', "'").replace('\n', '').replace('\t', '').replace('__', '_')) - -with open('schema/sv.mcf') as f_in: - with open('sv_footnote.mcf', 'w') as f_out: - nodes = f_in.read().split('Node: ')[1:] - for node in nodes: - lines = node.split('\n') - for line in lines: - if line.startswith('dcid:'): - dcid = line.removeprefix('dcid:') - if not dcid: - continue - f_out.write('Node: ' + node.removesuffix('\n\n')) - if dcid in sources: - f_out.write('\nfootnote: "Includes data from the following sources: ' + '; '.join(sorted(sources[dcid])) + '"') - f_out.write('\n\n') - diff --git a/scripts/un/sdg/geographies.py b/scripts/un/sdg/geographies.py index a074ba3932..2afb3ad3d8 100644 --- a/scripts/un/sdg/geographies.py +++ b/scripts/un/sdg/geographies.py @@ -2,6 +2,19 @@ import csv import json +CONTAINMENT_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcid:{type}{containment} +''' +PLACE_TEMPLATE = ''' +Node: dcid:{dcid} +typeOf: dcs:{type} +name: "{name}" +unDataCode: "{code}" +unDataLabel: "{label}" +''' + +# Curated map of SDG GEOGRAPHY_CODE to UN data code. FIXED = { 'africa': '2', 'undata-geo/G99999999': '952', @@ -52,13 +65,6 @@ def should_include_containment(s, s_dcid, o, o_dcid): un2dc[row['unDataCode']] = (row['dcid'], type, row['dc_name']) # write base place mcf -PLACE_TEMPLATE = ''' -Node: dcid:{dcid} -typeOf: dcs:{type} -name: "{name}" -unDataCode: "{code}" -unDataLabel: "{label}" -''' un2dc2 = {} subjects = set() with open('geography/geographies.csv') as f_in: @@ -118,10 +124,7 @@ def should_include_containment(s, s_dcid, o, o_dcid): if should_include_containment(s_type, s_dcid, o_type, o_dcid): containment[(s_dcid, s_type)].append(o_dcid) -CONTAINMENT_TEMPLATE = ''' -Node: dcid:{dcid} -typeOf: dcid:{type}{containment} -''' + with open('geography/un_containment.mcf', 'w') as f: for s in sorted(containment): c = '' diff --git a/scripts/un/sdg/process.py b/scripts/un/sdg/process.py index 13057a15f2..e68b7bb22a 100644 --- a/scripts/un/sdg/process.py +++ b/scripts/un/sdg/process.py @@ -72,7 +72,18 @@ def get_measurement_method(row): mmethod += '_' + str(row['REPORTING_TYPE']) return 'SDG' + mmethod + def drop_null(value, series, footnote): + '''Returns value or '' if it should be dropped. + + Args: + value: Input value. + series: Series code. + footnote: Footnote for observation. + + Returns: + value or ''. + ''' if series not in util.ZERO_NULL: return value if footnote != util.ZERO_NULL_TEXT: @@ -89,6 +100,13 @@ def drop_special(value, variable, series): return value +def fix(s): + try: + return s.encode('latin1').decode('utf8') + except: + return s.encode('utf8').decode('utf8') + + def process(input_dir, schema_dir, csv_dir): '''Generates mcf, csv/tmcf artifacts. @@ -210,8 +228,11 @@ def process(input_dir, schema_dir, csv_dir): 'SG_SCP_PROCN_LS.LEVEL_STATUS--DEG_MLOW__GOVERNMENT_NAME--CITY_OF_WROCLAW' ) + #sv_frames.append(df.loc[:, + # ['VARIABLE_CODE', 'VARIABLE_DESCRIPTION'] + + # properties].drop_duplicates()) sv_frames.append(df.loc[:, - ['VARIABLE_CODE', 'VARIABLE_DESCRIPTION'] + + ['VARIABLE_CODE', 'VARIABLE_DESCRIPTION', 'SOURCE'] + properties].drop_duplicates()) measurement_method_frames.append( df.loc[:, ['NATURE', 'OBS_STATUS', 'REPORTING_TYPE']]. @@ -238,9 +259,15 @@ def process(input_dir, schema_dir, csv_dir): with open(os.path.join(schema_dir, 'sv.mcf'), 'w') as f: for df in sv_frames: - for _, row in df.iterrows(): + main = df.drop(['SOURCE'], axis=1).drop_duplicates() + for _, row in main.iterrows(): + sources = df.loc[df['VARIABLE_CODE'] == row['VARIABLE_CODE']] + sources = sources.loc[:, ['SOURCE']].drop_duplicates()['SOURCE'] + footnote = '' + if not sources.empty: + footnote = '\nfootnote: "Includes data from the following sources: ' + '; '.join(sorted([fix(str(s)).removesuffix('.').strip().replace('"', "'").replace('\n', '').replace('\t', '').replace('__', '_') for s in sources])) + '"' cprops = '' - for dimension in sorted(df.columns[2:]): + for dimension in sorted(main.columns[2:]): # Skip totals. if row[dimension] == util.TOTAL: continue @@ -272,6 +299,8 @@ def process(input_dir, schema_dir, csv_dir): '"' + row['VARIABLE_DESCRIPTION'] + '"', 'cprops': cprops, + 'footnote': + footnote, })) with open(os.path.join(schema_dir, 'schema.mcf'), 'w') as f: diff --git a/scripts/un/sdg/testdata/test_schema/sv.mcf b/scripts/un/sdg/testdata/test_schema/sv.mcf index 6e7ea414ca..1d973b73c7 100644 --- a/scripts/un/sdg/testdata/test_schema/sv.mcf +++ b/scripts/un/sdg/testdata/test_schema/sv.mcf @@ -5,6 +5,7 @@ measuredProperty: dcs:value name: "Food waste" populationType: dcs:SDG_AG_FOOD_WST statType: dcs:measuredValue +footnote: "Includes data from the following sources: Food Waste Index Report 2021 / WESR" Node: dcid:sdg/AG_FOOD_WST.FOOD_WASTE_SECTOR--FWS_HHS typeOf: dcs:StatisticalVariable @@ -13,3 +14,4 @@ name: "Food waste [Households]" populationType: dcs:SDG_AG_FOOD_WST statType: dcs:measuredValue sdg_foodWasteSector: dcs:SDG_FoodWasteSectorEnum_FWS_HHS +footnote: "Includes data from the following sources: Food Waste Index Report 2021 / WESR" diff --git a/scripts/un/sdg/util.py b/scripts/un/sdg/util.py index 8abf6f71c7..6090805ef3 100644 --- a/scripts/un/sdg/util.py +++ b/scripts/un/sdg/util.py @@ -59,7 +59,7 @@ measuredProperty: dcs:value name: {name} populationType: dcs:{popType} -statType: dcs:measuredValue{cprops} +statType: dcs:measuredValue{cprops}{footnote} ''' MMETHOD_TEMPLATE = ''' Node: dcid:{dcid}