add scripts for HUD_IncomeLimits import (datacommonsorg#924)
* add scripts for HUD_IncomeLimits import

* fix

* fix

* comments

* fix

* fix
n-h-diaz authored Nov 16, 2023
1 parent 8702d2a commit abf61a8
Showing 10 changed files with 400 additions and 0 deletions.
Empty file added scripts/us_hud/__init__.py
18 changes: 18 additions & 0 deletions scripts/us_hud/income/README.md
@@ -0,0 +1,18 @@
# Income Limits

This import includes median income for households of different sizes at the 80th percentile, plus a computed 150th percentile, from the [HUD Income Limits dataset](https://www.huduser.gov/portal/datasets/il.html).
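
HUD does not publish the 150th percentile directly; it is derived by scaling the published 80th percentile values (see `compute_150` in `process.py`):

```
l150_n = round(l80_n / 80 * 150)
```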

To generate artifacts:

```
python3 process.py
```
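
The output directory defaults to `csv/` and can be overridden with the `--income_output_dir` flag defined in `process.py`.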

This produces a `csv/` folder containing one cleaned `output_[YEAR].csv` per year.

The `match_bq.csv` file maps places to additional dcids for which we also want to generate stats.
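
For reference, a minimal sketch of how this mapping is loaded (mirroring `main()` in `process.py`):

```
import csv

with open('match_bq.csv') as f:
    reader = csv.DictReader(f)
    # Map 'dcs:'-prefixed fips dcid -> 'dcs:'-prefixed city dcid.
    matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader}
```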

To run unit tests:
```
python3 -m unittest discover -v -s ../ -p "*_test.py"
```
Empty file added scripts/us_hud/income/__init__.py
189 changes: 189 additions & 0 deletions scripts/us_hud/income/match_bq.csv
@@ -0,0 +1,189 @@
fips,city
geoId/02110,geoId/0236400
geoId/02220,geoId/0270540
geoId/02275,geoId/0286380
geoId/0900108070,geoId/0908000
geoId/0900118500,geoId/0918430
geoId/0900156060,geoId/0955990
geoId/0900168170,geoId/0968100
geoId/0900173070,geoId/0973000
geoId/0900174190,geoId/0974260
geoId/0900308490,geoId/0908420
geoId/0900322630,geoId/0922700
geoId/0900337070,geoId/0937000
geoId/0900350440,geoId/0950370
geoId/0900382590,geoId/0982660
geoId/0900576570,geoId/0976500
geoId/0900747360,geoId/0947290
geoId/0900901220,geoId/0901150
geoId/0900919550,geoId/0919480
geoId/0900946520,geoId/0946450
geoId/0900947535,geoId/0947515
geoId/0900949950,geoId/0949880
geoId/0900952070,geoId/0952000
geoId/0900980070,geoId/0980000
geoId/0900982870,geoId/0982800
geoId/0901152350,geoId/0952280
geoId/0901156270,geoId/0956200
geoId/2300102060,geoId/2302060
geoId/2300138740,geoId/2338740
geoId/2300310565,geoId/2310565
geoId/2300360825,geoId/2360825
geoId/2300560545,geoId/2360545
geoId/2300571990,geoId/2371990
geoId/2300582105,geoId/2382105
geoId/2300923200,geoId/2323200
geoId/2301102100,geoId/2302100
geoId/2301127085,geoId/2327085
geoId/2301130550,geoId/2330550
geoId/2301180740,geoId/2380740
geoId/2301363590,geoId/2363590
geoId/2301902795,geoId/2302795
geoId/2301906925,geoId/2306925
geoId/2301955225,geoId/2355225
geoId/2302303355,geoId/2303355
geoId/2302703950,geoId/2303950
geoId/2302909585,geoId/2309585
geoId/2302921730,geoId/2321730
geoId/2303104860,geoId/2304860
geoId/2303164675,geoId/2364675
geoId/2303165725,geoId/2365725
geoId/24510,geoId/2404000
geoId/2500346225,geoId/2546225
geoId/2500353960,geoId/2553960
geoId/2500502690,geoId/2502690
geoId/2500523000,geoId/2523000
geoId/2500545000,geoId/2545000
geoId/2500562430,geoId/2562465
geoId/2500569170,geoId/2569170
geoId/2500905595,geoId/2505595
geoId/2500916250,geoId/2516285
geoId/2500926150,geoId/2526150
geoId/2500929405,geoId/2529405
geoId/2500934550,geoId/2534550
geoId/2500937490,geoId/2537490
geoId/2500938400,geoId/2538435
geoId/2500943580,geoId/2543615
geoId/2500945245,geoId/2545245
geoId/2500952490,geoId/2552490
geoId/2500959105,geoId/2559105
geoId/2500960015,geoId/2560050
geoId/2500968645,geoId/2568680
geoId/2501313660,geoId/2513660
geoId/2501330840,geoId/2530840
geoId/2501336300,geoId/2536335
geoId/2501352144,geoId/2552144
geoId/2501367000,geoId/2567000
geoId/2501376030,geoId/2576030
geoId/2501546330,geoId/2546330
geoId/2501701605,geoId/2501640
geoId/2501705070,geoId/2505105
geoId/2501709840,geoId/2509875
geoId/2501711000,geoId/2511000
geoId/2501721990,geoId/2521990
geoId/2501724960,geoId/2524960
geoId/2501735215,geoId/2535250
geoId/2501737000,geoId/2537000
geoId/2501737875,geoId/2537875
geoId/2501738715,geoId/2538715
geoId/2501739625,geoId/2539660
geoId/2501739835,geoId/2539835
geoId/2501740115,geoId/2540115
geoId/2501745560,geoId/2545560
geoId/2501756130,geoId/2556165
geoId/2501762535,geoId/2562535
geoId/2501767665,geoId/2567700
geoId/2501772215,geoId/2572250
geoId/2501772600,geoId/2572600
geoId/2501780510,geoId/2580545
geoId/2501781035,geoId/2581035
geoId/2502109175,geoId/2509210
geoId/2502130455,geoId/2530420
geoId/2502141690,geoId/2541725
geoId/2502144105,geoId/2544140
geoId/2502150250,geoId/2550285
geoId/2502155745,geoId/2555745
geoId/2502155955,geoId/2555990
geoId/2502174175,geoId/2574210
geoId/2502178972,geoId/2578972
geoId/2502300170,geoId/2500135
geoId/2502309000,geoId/2509000
geoId/2502331645,geoId/2531680
geoId/2502507000,geoId/2507000
geoId/2502513205,geoId/2513205
geoId/2502556585,geoId/2556585
geoId/2502581005,geoId/2581005
geoId/2502723875,geoId/2523875
geoId/2502725485,geoId/2525485
geoId/2502735075,geoId/2535075
geoId/2502763345,geoId/2563345
geoId/2502782000,geoId/2582000
geoId/29510,geoId/2965000
geoId/32510,geoId/3209700
geoId/3300140180,geoId/3340180
geoId/3300539300,geoId/3339300
geoId/3300705140,geoId/3305140
geoId/3300941300,geoId/3341300
geoId/3301145140,geoId/3345140
geoId/3301150260,geoId/3350260
geoId/3301314200,geoId/3314200
geoId/3301327380,geoId/3327380
geoId/3301562900,geoId/3362900
geoId/3301718820,geoId/3318820
geoId/3301765140,geoId/3365140
geoId/3301769940,geoId/3369940
geoId/3301912900,geoId/3312900
geoId/4400374300,geoId/4474300
geoId/4400549960,geoId/4449960
geoId/4400714140,geoId/4414140
geoId/4400719180,geoId/4419180
geoId/4400722960,geoId/4422960
geoId/4400754640,geoId/4454640
geoId/4400759000,geoId/4459000
geoId/4400780780,geoId/4480780
geoId/5000174650,geoId/5074650
geoId/5000710675,geoId/5010675
geoId/5000766175,geoId/5066175
geoId/5000785150,geoId/5085150
geoId/5001161675,geoId/5061675
geoId/5001948850,geoId/5048850
geoId/5002161225,geoId/5061225
geoId/5002303175,geoId/5003175
geoId/5002346000,geoId/5046000
geoId/51510,geoId/5101000
geoId/51520,geoId/5109816
geoId/51530,geoId/5111032
geoId/51550,geoId/5116000
geoId/51570,geoId/5118448
geoId/51580,geoId/5119728
geoId/51590,geoId/5121344
geoId/51595,geoId/5125808
geoId/51600,geoId/5126496
geoId/51610,geoId/5127200
geoId/51620,geoId/5129600
geoId/51630,geoId/5129744
geoId/51640,geoId/5130208
geoId/51650,geoId/5135000
geoId/51660,geoId/5135624
geoId/51670,geoId/5138424
geoId/51678,geoId/5145512
geoId/51680,geoId/5147672
geoId/51683,geoId/5148952
geoId/51685,geoId/5148968
geoId/51690,geoId/5149784
geoId/51700,geoId/5156000
geoId/51710,geoId/5157000
geoId/51720,geoId/5157688
geoId/51730,geoId/5161832
geoId/51735,geoId/5163768
geoId/51740,geoId/5164000
geoId/51750,geoId/5165392
geoId/51760,geoId/5167000
geoId/51770,geoId/5168000
geoId/51775,geoId/5170000
geoId/51790,geoId/5175216
geoId/51800,geoId/5176432
geoId/51810,geoId/5182000
geoId/51820,geoId/5183680
geoId/51830,geoId/5186160
geoId/51840,geoId/5186720
132 changes: 132 additions & 0 deletions scripts/us_hud/income/process.py
@@ -0,0 +1,132 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Generates cleaned CSVs for HUD Income Limits data.
Produces:
* csv/output_[YEAR].csv
Usage:
python3 process.py
'''
import csv
import datetime
import os
import pandas as pd
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('income_output_dir', 'csv', 'Path to write cleaned CSVs.')

URL_PREFIX = 'https://www.huduser.gov/portal/datasets/il/il'


def get_url(year):
    '''Return the Excel (xls/xlsx) URL for the given year.
    Args:
      year: Input year.
    Returns:
      URL for the given year, or '' if no data is available.
    '''
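    # For example (exercised in process_test.py): get_url(2022) returns
    # 'https://www.huduser.gov/portal/datasets/il/il22/Section8-FY22.xlsx',
    # while get_url(1997) returns ''.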
if year < 2006:
return ''
suffix = str(year)[-2:]
if year >= 2016:
return f'{URL_PREFIX}{suffix}/Section8-FY{suffix}.xlsx'
elif year == 2015:
return f'{URL_PREFIX}15/Section8_Rev.xlsx'
elif year == 2014:
return f'{URL_PREFIX}14/Poverty.xls'
elif year == 2011:
return f'{URL_PREFIX}11/Section8_v3.xls'
elif year >= 2009:
return f'{URL_PREFIX}{suffix}/Section8.xls'
elif year == 2008:
return f'{URL_PREFIX}08/Section8_FY08.xls'
elif year == 2007:
return f'{URL_PREFIX}07/Section8-rev.xls'
elif year == 2006:
return f'{URL_PREFIX}06/Section8FY2006.xls'
else:
return ''


def compute_150(df, person):
    '''Compute 150th percentile income in place, scaled from the 80th percentile.
Args:
df: Input dataframe (will be modified).
person: Number of people in household.
'''
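    # The 150% limit is derived from the published 80% limit.
    # Illustrative example: if l80_4 is 64000, then
    # l150_4 = round(64000 / 80 * 150) = 120000.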
df[f'l150_{person}'] = df.apply(
lambda x: round(x[f'l80_{person}'] / 80 * 150), axis=1)


def process(year, matches, output_dir):
'''Generate cleaned CSV.
Args:
year: Input year.
matches: Map of fips dcid -> city dcid.
output_dir: Directory to write cleaned CSV.
'''
url = get_url(year)
try:
df = pd.read_excel(url)
    except Exception:
print(f'No file found for {url}.')
return
if 'fips2010' in df:
df = df.rename(columns={'fips2010': 'fips'})

# Filter to 80th percentile income stats for each household size.
df = df.loc[:, [
'fips', 'l80_1', 'l80_2', 'l80_3', 'l80_4', 'l80_5', 'l80_6', 'l80_7',
'l80_8'
]]

    # Build geoId dcids by zero-padding FIPS codes to 10 digits; rows
    # ending in '99999' (county-level entries) are trimmed to the
    # 5-digit county geoId.
    df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10),
                          axis=1)
    df['fips'] = df.apply(lambda x: x['fips'][:-5]
                          if x['fips'][-5:] == '99999' else x['fips'],
                          axis=1)
for i in range(1, 9):
compute_150(df, i)
    df['year'] = year

# Add stats for matching dcids.
    df_match = df.loc[df['fips'].isin(matches)].copy()
if not df_match.empty:
df_match['fips'] = df_match.apply(lambda x: matches[x['fips']], axis=1)
df = pd.concat([df, df_match])

df.to_csv(os.path.join(output_dir, f'output_{year}.csv'), index=False)


def main(argv):
with open('match_bq.csv') as f:
reader = csv.DictReader(f)
matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader}
if not os.path.exists(FLAGS.income_output_dir):
os.makedirs(FLAGS.income_output_dir)
today = datetime.date.today()
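    # Process every year from 2006 (the earliest supported) up to,
    # but not including, the current year.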
for year in range(2006, today.year):
print(year)
process(year, matches, FLAGS.income_output_dir)


if __name__ == '__main__':
app.run(main)
55 changes: 55 additions & 0 deletions scripts/us_hud/income/process_test.py
@@ -0,0 +1,55 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Tests for process.py.
Usage: python3 -m unittest discover -v -s ../ -p "process_test.py"
'''
import os
import pandas as pd
import sys
import unittest
from unittest.mock import patch

sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(
os.path.abspath(__file__)))))
from us_hud.income import process

module_dir_ = os.path.dirname(__file__)

TEST_DIR = os.path.join(module_dir_, 'testdata')


class ProcessTest(unittest.TestCase):

def test_get_url(self):
self.assertEqual(
process.get_url(2022),
'https://www.huduser.gov/portal/datasets/il/il22/Section8-FY22.xlsx'
)
self.assertEqual(process.get_url(1997), '')

    def test_compute_150(self):
        df = pd.DataFrame({'l80_1': [80000]})
        process.compute_150(df, 1)
        self.assertEqual(df['l150_1'][0], 150000)

@patch('pandas.read_excel')
def test_process(self, mock_df):
        mock_df.return_value = pd.read_csv(
            os.path.join(TEST_DIR, 'test_input_2006.csv'))
matches = {'dcs:geoId/02110': 'dcs:geoId/0236400'}
process.process(2006, matches, TEST_DIR)
with open(os.path.join(TEST_DIR, 'output_2006.csv')) as result:
with open(os.path.join(TEST_DIR,
'expected_output_2006.csv')) as expected:
self.assertEqual(result.read(), expected.read())
