From abf61a8be19ae3e3a988acaa689c920b0e8bebcd Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:55:06 -0800 Subject: [PATCH] add scripts for HUD_IncomeLimits import (#924) * add scripts for HUD_IncomeLimits import * fix * fix * comments * fix * fix --- scripts/us_hud/__init__.py | 0 scripts/us_hud/income/README.md | 18 ++ scripts/us_hud/income/__init__.py | 0 scripts/us_hud/income/match_bq.csv | 189 ++++++++++++++++++ scripts/us_hud/income/process.py | 132 ++++++++++++ scripts/us_hud/income/process_test.py | 55 +++++ scripts/us_hud/income/testdata/__init__.py | 0 .../income/testdata/expected_output_2006.csv | 2 + .../us_hud/income/testdata/output_2006.csv | 2 + .../income/testdata/test_input_2006.csv | 2 + 10 files changed, 400 insertions(+) create mode 100644 scripts/us_hud/__init__.py create mode 100644 scripts/us_hud/income/README.md create mode 100644 scripts/us_hud/income/__init__.py create mode 100644 scripts/us_hud/income/match_bq.csv create mode 100644 scripts/us_hud/income/process.py create mode 100644 scripts/us_hud/income/process_test.py create mode 100644 scripts/us_hud/income/testdata/__init__.py create mode 100644 scripts/us_hud/income/testdata/expected_output_2006.csv create mode 100644 scripts/us_hud/income/testdata/output_2006.csv create mode 100644 scripts/us_hud/income/testdata/test_input_2006.csv diff --git a/scripts/us_hud/__init__.py b/scripts/us_hud/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/README.md b/scripts/us_hud/income/README.md new file mode 100644 index 0000000000..57522f078c --- /dev/null +++ b/scripts/us_hud/income/README.md @@ -0,0 +1,18 @@ +# Income Limits + +This import includes median income for households of different sizes for the 80th and 150th (computed) percentiles from the [HUD Income Limits dataset](https://www.huduser.gov/portal/datasets/il.html). + +To generate artifacts: + +``` +python3 process.py +``` + +This will produce a folder `csv/` with cleaned CSVs `output_[YEAR].csv`. + +The `match_bq.csv` file contains places that have additional dcids that we would like to generate stats for. 
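+
+The `l150_*` columns in the output are computed rather than taken directly
+from the HUD files: `process.py` scales the published 80% limits up to 150%
+(see `compute_150`). A minimal sketch of that derivation, using the
+household-size 1 and 2 values for Autauga County from the test data:
+
+```
+# Scale each 80%-of-median limit to 150% and round to the nearest dollar.
+l80 = {1: 31300, 2: 35750}  # size -> 80% limit (testdata/test_input_2006.csv)
+l150 = {size: round(limit / 80 * 150) for size, limit in l80.items()}
+print(l150)  # {1: 58688, 2: 67031}, matching expected_output_2006.csv
+```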
+ +To run unit tests: +``` +python3 -m unittest discover -v -s ../ -p "*_test.py" +``` diff --git a/scripts/us_hud/income/__init__.py b/scripts/us_hud/income/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/match_bq.csv b/scripts/us_hud/income/match_bq.csv new file mode 100644 index 0000000000..20e0286b24 --- /dev/null +++ b/scripts/us_hud/income/match_bq.csv @@ -0,0 +1,189 @@ +fips,city +geoId/02110,geoId/0236400 +geoId/02220,geoId/0270540 +geoId/02275,geoId/0286380 +geoId/0900108070,geoId/0908000 +geoId/0900118500,geoId/0918430 +geoId/0900156060,geoId/0955990 +geoId/0900168170,geoId/0968100 +geoId/0900173070,geoId/0973000 +geoId/0900174190,geoId/0974260 +geoId/0900308490,geoId/0908420 +geoId/0900322630,geoId/0922700 +geoId/0900337070,geoId/0937000 +geoId/0900350440,geoId/0950370 +geoId/0900382590,geoId/0982660 +geoId/0900576570,geoId/0976500 +geoId/0900747360,geoId/0947290 +geoId/0900901220,geoId/0901150 +geoId/0900919550,geoId/0919480 +geoId/0900946520,geoId/0946450 +geoId/0900947535,geoId/0947515 +geoId/0900949950,geoId/0949880 +geoId/0900952070,geoId/0952000 +geoId/0900980070,geoId/0980000 +geoId/0900982870,geoId/0982800 +geoId/0901152350,geoId/0952280 +geoId/0901156270,geoId/0956200 +geoId/2300102060,geoId/2302060 +geoId/2300138740,geoId/2338740 +geoId/2300310565,geoId/2310565 +geoId/2300360825,geoId/2360825 +geoId/2300560545,geoId/2360545 +geoId/2300571990,geoId/2371990 +geoId/2300582105,geoId/2382105 +geoId/2300923200,geoId/2323200 +geoId/2301102100,geoId/2302100 +geoId/2301127085,geoId/2327085 +geoId/2301130550,geoId/2330550 +geoId/2301180740,geoId/2380740 +geoId/2301363590,geoId/2363590 +geoId/2301902795,geoId/2302795 +geoId/2301906925,geoId/2306925 +geoId/2301955225,geoId/2355225 +geoId/2302303355,geoId/2303355 +geoId/2302703950,geoId/2303950 +geoId/2302909585,geoId/2309585 +geoId/2302921730,geoId/2321730 +geoId/2303104860,geoId/2304860 +geoId/2303164675,geoId/2364675 +geoId/2303165725,geoId/2365725 +geoId/24510,geoId/2404000 +geoId/2500346225,geoId/2546225 +geoId/2500353960,geoId/2553960 +geoId/2500502690,geoId/2502690 +geoId/2500523000,geoId/2523000 +geoId/2500545000,geoId/2545000 +geoId/2500562430,geoId/2562465 +geoId/2500569170,geoId/2569170 +geoId/2500905595,geoId/2505595 +geoId/2500916250,geoId/2516285 +geoId/2500926150,geoId/2526150 +geoId/2500929405,geoId/2529405 +geoId/2500934550,geoId/2534550 +geoId/2500937490,geoId/2537490 +geoId/2500938400,geoId/2538435 +geoId/2500943580,geoId/2543615 +geoId/2500945245,geoId/2545245 +geoId/2500952490,geoId/2552490 +geoId/2500959105,geoId/2559105 +geoId/2500960015,geoId/2560050 +geoId/2500968645,geoId/2568680 +geoId/2501313660,geoId/2513660 +geoId/2501330840,geoId/2530840 +geoId/2501336300,geoId/2536335 +geoId/2501352144,geoId/2552144 +geoId/2501367000,geoId/2567000 +geoId/2501376030,geoId/2576030 +geoId/2501546330,geoId/2546330 +geoId/2501701605,geoId/2501640 +geoId/2501705070,geoId/2505105 +geoId/2501709840,geoId/2509875 +geoId/2501711000,geoId/2511000 +geoId/2501721990,geoId/2521990 +geoId/2501724960,geoId/2524960 +geoId/2501735215,geoId/2535250 +geoId/2501737000,geoId/2537000 +geoId/2501737875,geoId/2537875 +geoId/2501738715,geoId/2538715 +geoId/2501739625,geoId/2539660 +geoId/2501739835,geoId/2539835 +geoId/2501740115,geoId/2540115 +geoId/2501745560,geoId/2545560 +geoId/2501756130,geoId/2556165 +geoId/2501762535,geoId/2562535 +geoId/2501767665,geoId/2567700 +geoId/2501772215,geoId/2572250 +geoId/2501772600,geoId/2572600 +geoId/2501780510,geoId/2580545 +geoId/2501781035,geoId/2581035 
+geoId/2502109175,geoId/2509210 +geoId/2502130455,geoId/2530420 +geoId/2502141690,geoId/2541725 +geoId/2502144105,geoId/2544140 +geoId/2502150250,geoId/2550285 +geoId/2502155745,geoId/2555745 +geoId/2502155955,geoId/2555990 +geoId/2502174175,geoId/2574210 +geoId/2502178972,geoId/2578972 +geoId/2502300170,geoId/2500135 +geoId/2502309000,geoId/2509000 +geoId/2502331645,geoId/2531680 +geoId/2502507000,geoId/2507000 +geoId/2502513205,geoId/2513205 +geoId/2502556585,geoId/2556585 +geoId/2502581005,geoId/2581005 +geoId/2502723875,geoId/2523875 +geoId/2502725485,geoId/2525485 +geoId/2502735075,geoId/2535075 +geoId/2502763345,geoId/2563345 +geoId/2502782000,geoId/2582000 +geoId/29510,geoId/2965000 +geoId/32510,geoId/3209700 +geoId/3300140180,geoId/3340180 +geoId/3300539300,geoId/3339300 +geoId/3300705140,geoId/3305140 +geoId/3300941300,geoId/3341300 +geoId/3301145140,geoId/3345140 +geoId/3301150260,geoId/3350260 +geoId/3301314200,geoId/3314200 +geoId/3301327380,geoId/3327380 +geoId/3301562900,geoId/3362900 +geoId/3301718820,geoId/3318820 +geoId/3301765140,geoId/3365140 +geoId/3301769940,geoId/3369940 +geoId/3301912900,geoId/3312900 +geoId/4400374300,geoId/4474300 +geoId/4400549960,geoId/4449960 +geoId/4400714140,geoId/4414140 +geoId/4400719180,geoId/4419180 +geoId/4400722960,geoId/4422960 +geoId/4400754640,geoId/4454640 +geoId/4400759000,geoId/4459000 +geoId/4400780780,geoId/4480780 +geoId/5000174650,geoId/5074650 +geoId/5000710675,geoId/5010675 +geoId/5000766175,geoId/5066175 +geoId/5000785150,geoId/5085150 +geoId/5001161675,geoId/5061675 +geoId/5001948850,geoId/5048850 +geoId/5002161225,geoId/5061225 +geoId/5002303175,geoId/5003175 +geoId/5002346000,geoId/5046000 +geoId/51510,geoId/5101000 +geoId/51520,geoId/5109816 +geoId/51530,geoId/5111032 +geoId/51550,geoId/5116000 +geoId/51570,geoId/5118448 +geoId/51580,geoId/5119728 +geoId/51590,geoId/5121344 +geoId/51595,geoId/5125808 +geoId/51600,geoId/5126496 +geoId/51610,geoId/5127200 +geoId/51620,geoId/5129600 +geoId/51630,geoId/5129744 +geoId/51640,geoId/5130208 +geoId/51650,geoId/5135000 +geoId/51660,geoId/5135624 +geoId/51670,geoId/5138424 +geoId/51678,geoId/5145512 +geoId/51680,geoId/5147672 +geoId/51683,geoId/5148952 +geoId/51685,geoId/5148968 +geoId/51690,geoId/5149784 +geoId/51700,geoId/5156000 +geoId/51710,geoId/5157000 +geoId/51720,geoId/5157688 +geoId/51730,geoId/5161832 +geoId/51735,geoId/5163768 +geoId/51740,geoId/5164000 +geoId/51750,geoId/5165392 +geoId/51760,geoId/5167000 +geoId/51770,geoId/5168000 +geoId/51775,geoId/5170000 +geoId/51790,geoId/5175216 +geoId/51800,geoId/5176432 +geoId/51810,geoId/5182000 +geoId/51820,geoId/5183680 +geoId/51830,geoId/5186160 +geoId/51840,geoId/5186720 diff --git a/scripts/us_hud/income/process.py b/scripts/us_hud/income/process.py new file mode 100644 index 0000000000..fb9fc767b9 --- /dev/null +++ b/scripts/us_hud/income/process.py @@ -0,0 +1,132 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Generates cleaned CSVs for HUD Income Limits data. 
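+
+Downloads the HUD spreadsheet for each year from 2006 through the year before
+the current year, keeps the 80%-of-median limits for household sizes 1-8, and
+derives the corresponding 150% limits from them.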
+ +Produces: +* csv/output_[YEAR].csv + +Usage: +python3 process.py +''' +import csv +import datetime +import os +import pandas as pd +from absl import app +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_string('income_output_dir', 'csv', 'Path to write cleaned CSVs.') + +URL_PREFIX = 'https://www.huduser.gov/portal/datasets/il/il' + + +def get_url(year): + '''Return xls url for year. + + Args: + year: Input year. + + Returns: + xls url for given year. + ''' + if year < 2006: + return '' + suffix = str(year)[-2:] + if year >= 2016: + return f'{URL_PREFIX}{suffix}/Section8-FY{suffix}.xlsx' + elif year == 2015: + return f'{URL_PREFIX}15/Section8_Rev.xlsx' + elif year == 2014: + return f'{URL_PREFIX}14/Poverty.xls' + elif year == 2011: + return f'{URL_PREFIX}11/Section8_v3.xls' + elif year >= 2009: + return f'{URL_PREFIX}{suffix}/Section8.xls' + elif year == 2008: + return f'{URL_PREFIX}08/Section8_FY08.xls' + elif year == 2007: + return f'{URL_PREFIX}07/Section8-rev.xls' + elif year == 2006: + return f'{URL_PREFIX}06/Section8FY2006.xls' + else: + return '' + + +def compute_150(df, person): + '''Compute 150th percentile income in-place. + + Args: + df: Input dataframe (will be modified). + person: Number of people in household. + ''' + df[f'l150_{person}'] = df.apply( + lambda x: round(x[f'l80_{person}'] / 80 * 150), axis=1) + + +def process(year, matches, output_dir): + '''Generate cleaned CSV. + + Args: + year: Input year. + matches: Map of fips dcid -> city dcid. + output_dir: Directory to write cleaned CSV. + ''' + url = get_url(year) + try: + df = pd.read_excel(url) + except: + print(f'No file found for {url}.') + return + if 'fips2010' in df: + df = df.rename(columns={'fips2010': 'fips'}) + + # Filter to 80th percentile income stats for each household size. + df = df.loc[:, [ + 'fips', 'l80_1', 'l80_2', 'l80_3', 'l80_4', 'l80_5', 'l80_6', 'l80_7', + 'l80_8' + ]] + + df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10), + axis=1) + df['fips'] = df.apply(lambda x: x['fips'][:-5] + if x['fips'][-5:] == '99999' else x['fips'], + axis=1) + for i in range(1, 9): + compute_150(df, i) + df['year'] = [year for i in range(len(df))] + + # Add stats for matching dcids. + df_match = df.copy().loc[df['fips'].isin(matches)] + if not df_match.empty: + df_match['fips'] = df_match.apply(lambda x: matches[x['fips']], axis=1) + df = pd.concat([df, df_match]) + + df.to_csv(os.path.join(output_dir, f'output_{year}.csv'), index=False) + + +def main(argv): + with open('match_bq.csv') as f: + reader = csv.DictReader(f) + matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader} + if not os.path.exists(FLAGS.income_output_dir): + os.makedirs(FLAGS.income_output_dir) + today = datetime.date.today() + for year in range(2006, today.year): + print(year) + process(year, matches, FLAGS.income_output_dir) + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/us_hud/income/process_test.py b/scripts/us_hud/income/process_test.py new file mode 100644 index 0000000000..6a2e68f13d --- /dev/null +++ b/scripts/us_hud/income/process_test.py @@ -0,0 +1,55 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Tests for process.py. + +Usage: python3 -m unittest discover -v -s ../ -p "process_test.py" +''' +import os +import pandas as pd +import sys +import unittest +from unittest.mock import patch + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) +from us_hud.income import process + +module_dir_ = os.path.dirname(__file__) + +TEST_DIR = os.path.join(module_dir_, 'testdata') + + +class ProcessTest(unittest.TestCase): + + def test_get_url(self): + self.assertEqual( + process.get_url(2022), + 'https://www.huduser.gov/portal/datasets/il/il22/Section8-FY22.xlsx' + ) + self.assertEqual(process.get_url(1997), '') + + def test_compute_150(self): + pass + + @patch('pandas.read_excel') + def test_process(self, mock_df): + mock_df.return_value = pd.DataFrame( + pd.read_csv(os.path.join(TEST_DIR, 'test_input_2006.csv'))) + matches = {'dcs:geoId/02110': 'dcs:geoId/0236400'} + process.process(2006, matches, TEST_DIR) + with open(os.path.join(TEST_DIR, 'output_2006.csv')) as result: + with open(os.path.join(TEST_DIR, + 'expected_output_2006.csv')) as expected: + self.assertEqual(result.read(), expected.read()) diff --git a/scripts/us_hud/income/testdata/__init__.py b/scripts/us_hud/income/testdata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/testdata/expected_output_2006.csv b/scripts/us_hud/income/testdata/expected_output_2006.csv new file mode 100644 index 0000000000..76e466eb5c --- /dev/null +++ b/scripts/us_hud/income/testdata/expected_output_2006.csv @@ -0,0 +1,2 @@ +fips,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,l150_1,l150_2,l150_3,l150_4,l150_5,l150_6,l150_7,l150_8,year +dcs:geoId/01001,31300,35750,40250,44700,48300,51850,55450,59000,58688,67031,75469,83812,90562,97219,103969,110625,2006 diff --git a/scripts/us_hud/income/testdata/output_2006.csv b/scripts/us_hud/income/testdata/output_2006.csv new file mode 100644 index 0000000000..76e466eb5c --- /dev/null +++ b/scripts/us_hud/income/testdata/output_2006.csv @@ -0,0 +1,2 @@ +fips,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,l150_1,l150_2,l150_3,l150_4,l150_5,l150_6,l150_7,l150_8,year +dcs:geoId/01001,31300,35750,40250,44700,48300,51850,55450,59000,58688,67031,75469,83812,90562,97219,103969,110625,2006 diff --git a/scripts/us_hud/income/testdata/test_input_2006.csv b/scripts/us_hud/income/testdata/test_input_2006.csv new file mode 100644 index 0000000000..5eb7f16ff1 --- /dev/null +++ b/scripts/us_hud/income/testdata/test_input_2006.csv @@ -0,0 +1,2 @@ +State_Alpha,fips,State,County_Town_Name,County,Metro_Area_Name,CBSASub,County_Name,median1999,median2006,State_Name,l50_1,l50_2,l50_3,l50_4,l50_5,l50_6,l50_7,l50_8,msa,l30_1,l30_2,l30_3,l30_4,l30_5,l30_6,l30_7,l30_8,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,metro +AL,100199999,1,Autauga County,1,"Montgomery, AL MSA",METRO33860M33860,Autauga County,45182,55900,Alabama,19550,22350,25150,27950,30200,32400,34650,36900,5240,11750,13400,15100,16750,18100,19450,20750,22100,31300,35750,40250,44700,48300,51850,55450,59000,1