Skip to content

Commit

Permalink
update util
Browse files Browse the repository at this point in the history
  • Loading branch information
n-h-diaz committed Sep 28, 2023
1 parent c571f42 commit bba3686
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 67 deletions.
3 changes: 0 additions & 3 deletions scripts/un/sdg/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,6 @@ def process(input_dir, schema_dir, csv_dir):
'SG_SCP_PROCN_LS.LEVEL_STATUS--DEG_MLOW__GOVERNMENT_NAME--CITY_OF_WROCLAW'
)

#sv_frames.append(df.loc[:,
# ['VARIABLE_CODE', 'VARIABLE_DESCRIPTION'] +
# properties].drop_duplicates())
sv_frames.append(
df.loc[:, ['VARIABLE_CODE', 'VARIABLE_DESCRIPTION', 'SOURCE'] +
properties].drop_duplicates())
Expand Down
152 changes: 88 additions & 64 deletions scripts/un/sdg/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,10 @@
# Footnote text indicating that a zero point should be treated as null and
# removed.
ZERO_NULL_TEXT = 'This data point is NIL for the submitting nation.'

# Variables that should be dropped.
DROP_VARIABLE = {'VC_DTH_TOTPT'}

# Series that should be dropped.
DROP_SERIES = {
'TX_IMP_GBMRCH',
'TX_EXP_GBMRCH',
Expand All @@ -125,14 +127,54 @@
'AG_PRD_XSUBDY',
}

MAP = {
# Map of input title text to output formatted text.
# Keys are property names as they appear to the left of '=' in a title's
# bracketed PV list; curate_pvs() appends the mapped word after the value
# (e.g. 'Deviation Level = Extreme' -> 'Extreme deviation').
TITLE_MAPPINGS = {
'Education level': 'education',
'Frequency of Chlorophyll-a concentration': 'frequency',
'Report Ordinal': 'ordinal',
'Grounds of discrimination': 'discrimination',
'Deviation Level': 'deviation'
}

# List of substrings to be deleted from titles.
# Entries are applied in list order, so a longer form must come before any
# shorter entry it starts with ('Type of skill = Skill: ' before
# 'Type of skill = '). Empty strings are deliberately absent: replacing ''
# with '' is a no-op.
TITLE_DELETIONS = [
    'Age = ',
    'Name of non-communicable disease = ',
    'Substance use disorders = ',
    'Quantile = ',
    'Type of skill = Skill: ',
    'Type of skill = ',
    'Sex = ',
    'Land cover = ',
    'Level/Status = ',
    'Policy instruments = ',
    'Type of product = ',
    'Type of waste treatment = ',
    'Activity = ',
    'Type of renewable technology = ',
    'Location = ',
    'Level_of_government = ',
    'Fiscal intervention stage = ',
    'Name of international institution = ',
    'Policy Domains = ',
    'Mode of transportation = ',
    'Food Waste Sector = ',
]

# Map of input title text to output replacement text.
# Replacements are applied in insertion order. Each key appears exactly once;
# a repeated key in a dict literal is silently overwritten and only hides
# mistakes.
TITLE_REPLACEMENTS = {
    '24 to 59 months old': '2 to 5 years old',
    '36 to 47 months old': '3 to 4 years old',
    '36 to 59 months old': '3 to 5 years old',
    '12 to 23 months': '1 to 2 years old',
    '24 to 35 months': '2 to 3 years old',
    '48 to 59 months': '4 to 5 years old',
}

# Map of SDG code -> dcid.
PLACE_MAPPINGS = {}
with open('geography/place_mappings.csv') as f:
reader = csv.DictReader(f)
Expand Down Expand Up @@ -206,44 +248,41 @@ def is_valid(v):
return v and not v == 'nan'


def replace_me(text, mappings):
new_text = text.split('[')
if len(new_text) == 1:
return text
next_text = new_text[1][0:-1]
new_string = new_text[0] + '['
raw_pairs = next_text.split('|')
def curate_pvs(text, mappings):
    '''Curates PVs based on custom mappings.

    The input is a bracketed, '|'-separated list of 'property = value' pairs,
    e.g. '[Age = 15 years old and over | Deviation Level = Extreme (75-100%)]'.
    Pairs whose property is a key of `mappings` are rewritten as
    '<value> <mapped word>', keeping any parenthesised suffix of the value;
    all other pairs are kept as-is (whitespace-stripped).

    Args:
      text: Input text.
      mappings: Custom mappings (property name -> word appended to the value).
    Returns:
      Formatted text: the curated pairs joined by ', ' inside '[...]'. Text
      that is not wrapped in '[' and ']' is returned unchanged.
    '''
    # Without the surrounding brackets the slice below would silently chop
    # the first and last characters, so leave such text alone.
    if len(text) < 2 or not (text.startswith('[') and text.endswith(']')):
        return text

    new_pairs = []
    for pair in text[1:-1].split('|'):
        # partition() tolerates a pair with no '=' and keeps any further '='
        # inside the value, where split('=')[1] would raise or truncate.
        p, sep, v = pair.partition('=')
        p, v = p.strip(), v.strip()
        if not sep or p not in mappings:
            new_pairs.append(pair.strip())
            continue

        # Split off everything from the first '(' so that it can be
        # re-attached after the mapped word.
        v_main, paren, v_rest = v.partition('(')
        v_main = v_main.strip()

        # Don't repeat 'education'.
        if p == 'Education level' and 'education' in v_main:
            new_pair = v_main
        else:
            new_pair = v_main + ' ' + mappings[p]

        # Keep () on the right.
        if paren:
            new_pair += ' (' + v_rest.strip()

        new_pairs.append(new_pair)
    return '[' + ', '.join(new_pairs) + ']'


def format_variable_description(variable, series):
Expand All @@ -257,41 +296,26 @@ def format_variable_description(variable, series):
Formatted description.
'''
head = format_description(series)
pvs = variable.removeprefix(series)
pvs = variable.removeprefix(series).strip()
if not pvs:
return head

# Remove ISIC code.
pvs = re.sub(r'\(ISIC[^)]*\)', '', pvs)

# Remove isco code.
pvs = re.sub(r'\(isco[^)]*\)', '', pvs)
pvs = replace_me(pvs, MAP)
pvs = pvs.replace('Age = ', '')
pvs = pvs.replace('Name of non-communicable disease = ', '')
pvs = pvs.replace('Substance use disorders = ', '')
pvs = pvs.replace('Quantile = ', '')
pvs = pvs.replace('Type of skill = Skill: ', '')
pvs = pvs.replace('Type of skill = ', '')
pvs = pvs.replace('Sex = ', '')
pvs = pvs.replace('Land cover = ', '')
pvs = pvs.replace('Level/Status = ', '')
pvs = pvs.replace('Policy instruments = ', '')
pvs = pvs.replace('Type of product = ', '')
pvs = pvs.replace('Type of waste treatment = ', '')
pvs = pvs.replace('Activity = ', '')
pvs = pvs.replace('Type of renewable technology = ', '')
pvs = pvs.replace('Location = ', '')
pvs = pvs.replace('Level_of_government = ', '')
pvs = pvs.replace('Fiscal intervention stage = ', '')
pvs = pvs.replace('Name of international institution = ', '')
pvs = pvs.replace('Policy Domains = ', '')
pvs = pvs.replace('Mode of transportation = ', '')
pvs = pvs.replace('Food Waste Sector = ', '')
pvs = pvs.replace('24 to 59 months old', '2 to 5 years old')
pvs = pvs.replace('36 to 47 months old', '3 to 4 years old')
pvs = pvs.replace('36 to 59 months old', '3 to 5 years old')
pvs = pvs.replace('12 to 23 months', '1 to 2 years old')
pvs = pvs.replace('24 to 35 months', '2 to 3 years old')
pvs = pvs.replace('36 to 47 months old', '3 to 4 years old')
pvs = pvs.replace('48 to 59 months', '4 to 5 years old')
return head + pvs

# Custom text formatting.
pvs = curate_pvs(pvs, TITLE_MAPPINGS)

# Custom replacements.
for s in TITLE_DELETIONS:
pvs = pvs.replace(s, '')
for s in TITLE_REPLACEMENTS:
pvs = pvs.replace(s, TITLE_REPLACEMENTS[s])

return head + ' ' + pvs


def format_variable_code(code):
Expand Down
11 changes: 11 additions & 0 deletions scripts/un/sdg/util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,17 @@ def test_is_valid(self):
self.assertFalse(util.is_valid(float('nan')))
self.assertFalse(util.is_valid(''))

def test_curate_pvs(self):
    '''Checks curate_pvs output for an unmapped/mapped pair mix and for a
    mapped value that carries a parenthesised suffix.'''
    cases = [
        ('[Age = 15 years old and over | Education level = Primary education or less]',
         '[Age = 15 years old and over, Primary education or less]'),
        ('[Deviation Level = Extreme (75-100%)]',
         '[Extreme deviation (75-100%)]'),
    ]
    for text, expected in cases:
        self.assertEqual(util.curate_pvs(text, util.TITLE_MAPPINGS), expected)

def test_format_variable_description(self):
self.assertEqual(
util.format_variable_description(
Expand Down

0 comments on commit bba3686

Please sign in to comment.