From abf61a8be19ae3e3a988acaa689c920b0e8bebcd Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:55:06 -0800 Subject: [PATCH] add scripts for HUD_IncomeLimits import (#924) * add scripts for HUD_IncomeLimits import * fix * fix * comments * fix * fix --- scripts/us_hud/__init__.py | 0 scripts/us_hud/income/README.md | 18 ++ scripts/us_hud/income/__init__.py | 0 scripts/us_hud/income/match_bq.csv | 189 ++++++++++++++++++ scripts/us_hud/income/process.py | 132 ++++++++++++ scripts/us_hud/income/process_test.py | 55 +++++ scripts/us_hud/income/testdata/__init__.py | 0 .../income/testdata/expected_output_2006.csv | 2 + .../us_hud/income/testdata/output_2006.csv | 2 + .../income/testdata/test_input_2006.csv | 2 + 10 files changed, 400 insertions(+) create mode 100644 scripts/us_hud/__init__.py create mode 100644 scripts/us_hud/income/README.md create mode 100644 scripts/us_hud/income/__init__.py create mode 100644 scripts/us_hud/income/match_bq.csv create mode 100644 scripts/us_hud/income/process.py create mode 100644 scripts/us_hud/income/process_test.py create mode 100644 scripts/us_hud/income/testdata/__init__.py create mode 100644 scripts/us_hud/income/testdata/expected_output_2006.csv create mode 100644 scripts/us_hud/income/testdata/output_2006.csv create mode 100644 scripts/us_hud/income/testdata/test_input_2006.csv diff --git a/scripts/us_hud/__init__.py b/scripts/us_hud/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/README.md b/scripts/us_hud/income/README.md new file mode 100644 index 0000000000..57522f078c --- /dev/null +++ b/scripts/us_hud/income/README.md @@ -0,0 +1,18 @@ +# Income Limits + +This import includes median income for households of different sizes for the 80th and 150th (computed) percentiles from the [HUD Income Limits dataset](https://www.huduser.gov/portal/datasets/il.html). + +To generate artifacts: + +``` +python3 process.py +``` + +This will produce a folder `csv/` with cleaned CSVs `output_[YEAR].csv`. + +The `match_bq.csv` file contains places that have additional dcids that we would like to generate stats for. 
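+
+The `l150_*` columns in the output are computed rather than taken directly
+from the HUD files: `process.py` scales the published 80% limits up to 150%
+(see `compute_150`). A minimal sketch of that derivation, using the
+household-size 1 and 2 values for Autauga County from the test data:
+
+```
+# Scale each 80%-of-median limit to 150% and round to the nearest dollar.
+l80 = {1: 31300, 2: 35750}  # size -> 80% limit (testdata/test_input_2006.csv)
+l150 = {size: round(limit / 80 * 150) for size, limit in l80.items()}
+print(l150)  # {1: 58688, 2: 67031}, matching expected_output_2006.csv
+```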
+ +To run unit tests: +``` +python3 -m unittest discover -v -s ../ -p "*_test.py" +``` diff --git a/scripts/us_hud/income/__init__.py b/scripts/us_hud/income/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/match_bq.csv b/scripts/us_hud/income/match_bq.csv new file mode 100644 index 0000000000..20e0286b24 --- /dev/null +++ b/scripts/us_hud/income/match_bq.csv @@ -0,0 +1,189 @@ +fips,city +geoId/02110,geoId/0236400 +geoId/02220,geoId/0270540 +geoId/02275,geoId/0286380 +geoId/0900108070,geoId/0908000 +geoId/0900118500,geoId/0918430 +geoId/0900156060,geoId/0955990 +geoId/0900168170,geoId/0968100 +geoId/0900173070,geoId/0973000 +geoId/0900174190,geoId/0974260 +geoId/0900308490,geoId/0908420 +geoId/0900322630,geoId/0922700 +geoId/0900337070,geoId/0937000 +geoId/0900350440,geoId/0950370 +geoId/0900382590,geoId/0982660 +geoId/0900576570,geoId/0976500 +geoId/0900747360,geoId/0947290 +geoId/0900901220,geoId/0901150 +geoId/0900919550,geoId/0919480 +geoId/0900946520,geoId/0946450 +geoId/0900947535,geoId/0947515 +geoId/0900949950,geoId/0949880 +geoId/0900952070,geoId/0952000 +geoId/0900980070,geoId/0980000 +geoId/0900982870,geoId/0982800 +geoId/0901152350,geoId/0952280 +geoId/0901156270,geoId/0956200 +geoId/2300102060,geoId/2302060 +geoId/2300138740,geoId/2338740 +geoId/2300310565,geoId/2310565 +geoId/2300360825,geoId/2360825 +geoId/2300560545,geoId/2360545 +geoId/2300571990,geoId/2371990 +geoId/2300582105,geoId/2382105 +geoId/2300923200,geoId/2323200 +geoId/2301102100,geoId/2302100 +geoId/2301127085,geoId/2327085 +geoId/2301130550,geoId/2330550 +geoId/2301180740,geoId/2380740 +geoId/2301363590,geoId/2363590 +geoId/2301902795,geoId/2302795 +geoId/2301906925,geoId/2306925 +geoId/2301955225,geoId/2355225 +geoId/2302303355,geoId/2303355 +geoId/2302703950,geoId/2303950 +geoId/2302909585,geoId/2309585 +geoId/2302921730,geoId/2321730 +geoId/2303104860,geoId/2304860 +geoId/2303164675,geoId/2364675 +geoId/2303165725,geoId/2365725 +geoId/24510,geoId/2404000 +geoId/2500346225,geoId/2546225 +geoId/2500353960,geoId/2553960 +geoId/2500502690,geoId/2502690 +geoId/2500523000,geoId/2523000 +geoId/2500545000,geoId/2545000 +geoId/2500562430,geoId/2562465 +geoId/2500569170,geoId/2569170 +geoId/2500905595,geoId/2505595 +geoId/2500916250,geoId/2516285 +geoId/2500926150,geoId/2526150 +geoId/2500929405,geoId/2529405 +geoId/2500934550,geoId/2534550 +geoId/2500937490,geoId/2537490 +geoId/2500938400,geoId/2538435 +geoId/2500943580,geoId/2543615 +geoId/2500945245,geoId/2545245 +geoId/2500952490,geoId/2552490 +geoId/2500959105,geoId/2559105 +geoId/2500960015,geoId/2560050 +geoId/2500968645,geoId/2568680 +geoId/2501313660,geoId/2513660 +geoId/2501330840,geoId/2530840 +geoId/2501336300,geoId/2536335 +geoId/2501352144,geoId/2552144 +geoId/2501367000,geoId/2567000 +geoId/2501376030,geoId/2576030 +geoId/2501546330,geoId/2546330 +geoId/2501701605,geoId/2501640 +geoId/2501705070,geoId/2505105 +geoId/2501709840,geoId/2509875 +geoId/2501711000,geoId/2511000 +geoId/2501721990,geoId/2521990 +geoId/2501724960,geoId/2524960 +geoId/2501735215,geoId/2535250 +geoId/2501737000,geoId/2537000 +geoId/2501737875,geoId/2537875 +geoId/2501738715,geoId/2538715 +geoId/2501739625,geoId/2539660 +geoId/2501739835,geoId/2539835 +geoId/2501740115,geoId/2540115 +geoId/2501745560,geoId/2545560 +geoId/2501756130,geoId/2556165 +geoId/2501762535,geoId/2562535 +geoId/2501767665,geoId/2567700 +geoId/2501772215,geoId/2572250 +geoId/2501772600,geoId/2572600 +geoId/2501780510,geoId/2580545 +geoId/2501781035,geoId/2581035 
+geoId/2502109175,geoId/2509210 +geoId/2502130455,geoId/2530420 +geoId/2502141690,geoId/2541725 +geoId/2502144105,geoId/2544140 +geoId/2502150250,geoId/2550285 +geoId/2502155745,geoId/2555745 +geoId/2502155955,geoId/2555990 +geoId/2502174175,geoId/2574210 +geoId/2502178972,geoId/2578972 +geoId/2502300170,geoId/2500135 +geoId/2502309000,geoId/2509000 +geoId/2502331645,geoId/2531680 +geoId/2502507000,geoId/2507000 +geoId/2502513205,geoId/2513205 +geoId/2502556585,geoId/2556585 +geoId/2502581005,geoId/2581005 +geoId/2502723875,geoId/2523875 +geoId/2502725485,geoId/2525485 +geoId/2502735075,geoId/2535075 +geoId/2502763345,geoId/2563345 +geoId/2502782000,geoId/2582000 +geoId/29510,geoId/2965000 +geoId/32510,geoId/3209700 +geoId/3300140180,geoId/3340180 +geoId/3300539300,geoId/3339300 +geoId/3300705140,geoId/3305140 +geoId/3300941300,geoId/3341300 +geoId/3301145140,geoId/3345140 +geoId/3301150260,geoId/3350260 +geoId/3301314200,geoId/3314200 +geoId/3301327380,geoId/3327380 +geoId/3301562900,geoId/3362900 +geoId/3301718820,geoId/3318820 +geoId/3301765140,geoId/3365140 +geoId/3301769940,geoId/3369940 +geoId/3301912900,geoId/3312900 +geoId/4400374300,geoId/4474300 +geoId/4400549960,geoId/4449960 +geoId/4400714140,geoId/4414140 +geoId/4400719180,geoId/4419180 +geoId/4400722960,geoId/4422960 +geoId/4400754640,geoId/4454640 +geoId/4400759000,geoId/4459000 +geoId/4400780780,geoId/4480780 +geoId/5000174650,geoId/5074650 +geoId/5000710675,geoId/5010675 +geoId/5000766175,geoId/5066175 +geoId/5000785150,geoId/5085150 +geoId/5001161675,geoId/5061675 +geoId/5001948850,geoId/5048850 +geoId/5002161225,geoId/5061225 +geoId/5002303175,geoId/5003175 +geoId/5002346000,geoId/5046000 +geoId/51510,geoId/5101000 +geoId/51520,geoId/5109816 +geoId/51530,geoId/5111032 +geoId/51550,geoId/5116000 +geoId/51570,geoId/5118448 +geoId/51580,geoId/5119728 +geoId/51590,geoId/5121344 +geoId/51595,geoId/5125808 +geoId/51600,geoId/5126496 +geoId/51610,geoId/5127200 +geoId/51620,geoId/5129600 +geoId/51630,geoId/5129744 +geoId/51640,geoId/5130208 +geoId/51650,geoId/5135000 +geoId/51660,geoId/5135624 +geoId/51670,geoId/5138424 +geoId/51678,geoId/5145512 +geoId/51680,geoId/5147672 +geoId/51683,geoId/5148952 +geoId/51685,geoId/5148968 +geoId/51690,geoId/5149784 +geoId/51700,geoId/5156000 +geoId/51710,geoId/5157000 +geoId/51720,geoId/5157688 +geoId/51730,geoId/5161832 +geoId/51735,geoId/5163768 +geoId/51740,geoId/5164000 +geoId/51750,geoId/5165392 +geoId/51760,geoId/5167000 +geoId/51770,geoId/5168000 +geoId/51775,geoId/5170000 +geoId/51790,geoId/5175216 +geoId/51800,geoId/5176432 +geoId/51810,geoId/5182000 +geoId/51820,geoId/5183680 +geoId/51830,geoId/5186160 +geoId/51840,geoId/5186720 diff --git a/scripts/us_hud/income/process.py b/scripts/us_hud/income/process.py new file mode 100644 index 0000000000..fb9fc767b9 --- /dev/null +++ b/scripts/us_hud/income/process.py @@ -0,0 +1,132 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Generates cleaned CSVs for HUD Income Limits data. 
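+
+Downloads the HUD spreadsheet for each year from 2006 through the year before
+the current year, keeps the 80%-of-median limits for household sizes 1-8, and
+derives the corresponding 150% limits from them.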
+ +Produces: +* csv/output_[YEAR].csv + +Usage: +python3 process.py +''' +import csv +import datetime +import os +import pandas as pd +from absl import app +from absl import flags + +FLAGS = flags.FLAGS +flags.DEFINE_string('income_output_dir', 'csv', 'Path to write cleaned CSVs.') + +URL_PREFIX = 'https://www.huduser.gov/portal/datasets/il/il' + + +def get_url(year): + '''Return xls url for year. + + Args: + year: Input year. + + Returns: + xls url for given year. + ''' + if year < 2006: + return '' + suffix = str(year)[-2:] + if year >= 2016: + return f'{URL_PREFIX}{suffix}/Section8-FY{suffix}.xlsx' + elif year == 2015: + return f'{URL_PREFIX}15/Section8_Rev.xlsx' + elif year == 2014: + return f'{URL_PREFIX}14/Poverty.xls' + elif year == 2011: + return f'{URL_PREFIX}11/Section8_v3.xls' + elif year >= 2009: + return f'{URL_PREFIX}{suffix}/Section8.xls' + elif year == 2008: + return f'{URL_PREFIX}08/Section8_FY08.xls' + elif year == 2007: + return f'{URL_PREFIX}07/Section8-rev.xls' + elif year == 2006: + return f'{URL_PREFIX}06/Section8FY2006.xls' + else: + return '' + + +def compute_150(df, person): + '''Compute 150th percentile income in-place. + + Args: + df: Input dataframe (will be modified). + person: Number of people in household. + ''' + df[f'l150_{person}'] = df.apply( + lambda x: round(x[f'l80_{person}'] / 80 * 150), axis=1) + + +def process(year, matches, output_dir): + '''Generate cleaned CSV. + + Args: + year: Input year. + matches: Map of fips dcid -> city dcid. + output_dir: Directory to write cleaned CSV. + ''' + url = get_url(year) + try: + df = pd.read_excel(url) + except: + print(f'No file found for {url}.') + return + if 'fips2010' in df: + df = df.rename(columns={'fips2010': 'fips'}) + + # Filter to 80th percentile income stats for each household size. + df = df.loc[:, [ + 'fips', 'l80_1', 'l80_2', 'l80_3', 'l80_4', 'l80_5', 'l80_6', 'l80_7', + 'l80_8' + ]] + + df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10), + axis=1) + df['fips'] = df.apply(lambda x: x['fips'][:-5] + if x['fips'][-5:] == '99999' else x['fips'], + axis=1) + for i in range(1, 9): + compute_150(df, i) + df['year'] = [year for i in range(len(df))] + + # Add stats for matching dcids. + df_match = df.copy().loc[df['fips'].isin(matches)] + if not df_match.empty: + df_match['fips'] = df_match.apply(lambda x: matches[x['fips']], axis=1) + df = pd.concat([df, df_match]) + + df.to_csv(os.path.join(output_dir, f'output_{year}.csv'), index=False) + + +def main(argv): + with open('match_bq.csv') as f: + reader = csv.DictReader(f) + matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader} + if not os.path.exists(FLAGS.income_output_dir): + os.makedirs(FLAGS.income_output_dir) + today = datetime.date.today() + for year in range(2006, today.year): + print(year) + process(year, matches, FLAGS.income_output_dir) + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/us_hud/income/process_test.py b/scripts/us_hud/income/process_test.py new file mode 100644 index 0000000000..6a2e68f13d --- /dev/null +++ b/scripts/us_hud/income/process_test.py @@ -0,0 +1,55 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +'''Tests for process.py. + +Usage: python3 -m unittest discover -v -s ../ -p "process_test.py" +''' +import os +import pandas as pd +import sys +import unittest +from unittest.mock import patch + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) +from us_hud.income import process + +module_dir_ = os.path.dirname(__file__) + +TEST_DIR = os.path.join(module_dir_, 'testdata') + + +class ProcessTest(unittest.TestCase): + + def test_get_url(self): + self.assertEqual( + process.get_url(2022), + 'https://www.huduser.gov/portal/datasets/il/il22/Section8-FY22.xlsx' + ) + self.assertEqual(process.get_url(1997), '') + + def test_compute_150(self): + pass + + @patch('pandas.read_excel') + def test_process(self, mock_df): + mock_df.return_value = pd.DataFrame( + pd.read_csv(os.path.join(TEST_DIR, 'test_input_2006.csv'))) + matches = {'dcs:geoId/02110': 'dcs:geoId/0236400'} + process.process(2006, matches, TEST_DIR) + with open(os.path.join(TEST_DIR, 'output_2006.csv')) as result: + with open(os.path.join(TEST_DIR, + 'expected_output_2006.csv')) as expected: + self.assertEqual(result.read(), expected.read()) diff --git a/scripts/us_hud/income/testdata/__init__.py b/scripts/us_hud/income/testdata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/us_hud/income/testdata/expected_output_2006.csv b/scripts/us_hud/income/testdata/expected_output_2006.csv new file mode 100644 index 0000000000..76e466eb5c --- /dev/null +++ b/scripts/us_hud/income/testdata/expected_output_2006.csv @@ -0,0 +1,2 @@ +fips,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,l150_1,l150_2,l150_3,l150_4,l150_5,l150_6,l150_7,l150_8,year +dcs:geoId/01001,31300,35750,40250,44700,48300,51850,55450,59000,58688,67031,75469,83812,90562,97219,103969,110625,2006 diff --git a/scripts/us_hud/income/testdata/output_2006.csv b/scripts/us_hud/income/testdata/output_2006.csv new file mode 100644 index 0000000000..76e466eb5c --- /dev/null +++ b/scripts/us_hud/income/testdata/output_2006.csv @@ -0,0 +1,2 @@ +fips,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,l150_1,l150_2,l150_3,l150_4,l150_5,l150_6,l150_7,l150_8,year +dcs:geoId/01001,31300,35750,40250,44700,48300,51850,55450,59000,58688,67031,75469,83812,90562,97219,103969,110625,2006 diff --git a/scripts/us_hud/income/testdata/test_input_2006.csv b/scripts/us_hud/income/testdata/test_input_2006.csv new file mode 100644 index 0000000000..5eb7f16ff1 --- /dev/null +++ b/scripts/us_hud/income/testdata/test_input_2006.csv @@ -0,0 +1,2 @@ +State_Alpha,fips,State,County_Town_Name,County,Metro_Area_Name,CBSASub,County_Name,median1999,median2006,State_Name,l50_1,l50_2,l50_3,l50_4,l50_5,l50_6,l50_7,l50_8,msa,l30_1,l30_2,l30_3,l30_4,l30_5,l30_6,l30_7,l30_8,l80_1,l80_2,l80_3,l80_4,l80_5,l80_6,l80_7,l80_8,metro +AL,100199999,1,Autauga County,1,"Montgomery, AL MSA",METRO33860M33860,Autauga County,45182,55900,Alabama,19550,22350,25150,27950,30200,32400,34650,36900,5240,11750,13400,15100,16750,18100,19450,20750,22100,31300,35750,40250,44700,48300,51850,55450,59000,1