us_hud_income 20241213 changes
kurus21 committed Dec 13, 2024
1 parent 2f487cb commit 3e16d8a
Showing 11 changed files with 156 additions and 233 deletions.
Empty file removed scripts/us_hud/income/__init__.py
192 changes: 3 additions & 189 deletions scripts/us_hud/income/match_bq.csv
@@ -1,189 +1,3 @@
fips,city
geoId/02110,geoId/0236400
geoId/02220,geoId/0270540
geoId/02275,geoId/0286380
geoId/0900108070,geoId/0908000
geoId/0900118500,geoId/0918430
geoId/0900156060,geoId/0955990
geoId/0900168170,geoId/0968100
geoId/0900173070,geoId/0973000
geoId/0900174190,geoId/0974260
geoId/0900308490,geoId/0908420
geoId/0900322630,geoId/0922700
geoId/0900337070,geoId/0937000
geoId/0900350440,geoId/0950370
geoId/0900382590,geoId/0982660
geoId/0900576570,geoId/0976500
geoId/0900747360,geoId/0947290
geoId/0900901220,geoId/0901150
geoId/0900919550,geoId/0919480
geoId/0900946520,geoId/0946450
geoId/0900947535,geoId/0947515
geoId/0900949950,geoId/0949880
geoId/0900952070,geoId/0952000
geoId/0900980070,geoId/0980000
geoId/0900982870,geoId/0982800
geoId/0901152350,geoId/0952280
geoId/0901156270,geoId/0956200
geoId/2300102060,geoId/2302060
geoId/2300138740,geoId/2338740
geoId/2300310565,geoId/2310565
geoId/2300360825,geoId/2360825
geoId/2300560545,geoId/2360545
geoId/2300571990,geoId/2371990
geoId/2300582105,geoId/2382105
geoId/2300923200,geoId/2323200
geoId/2301102100,geoId/2302100
geoId/2301127085,geoId/2327085
geoId/2301130550,geoId/2330550
geoId/2301180740,geoId/2380740
geoId/2301363590,geoId/2363590
geoId/2301902795,geoId/2302795
geoId/2301906925,geoId/2306925
geoId/2301955225,geoId/2355225
geoId/2302303355,geoId/2303355
geoId/2302703950,geoId/2303950
geoId/2302909585,geoId/2309585
geoId/2302921730,geoId/2321730
geoId/2303104860,geoId/2304860
geoId/2303164675,geoId/2364675
geoId/2303165725,geoId/2365725
geoId/24510,geoId/2404000
geoId/2500346225,geoId/2546225
geoId/2500353960,geoId/2553960
geoId/2500502690,geoId/2502690
geoId/2500523000,geoId/2523000
geoId/2500545000,geoId/2545000
geoId/2500562430,geoId/2562465
geoId/2500569170,geoId/2569170
geoId/2500905595,geoId/2505595
geoId/2500916250,geoId/2516285
geoId/2500926150,geoId/2526150
geoId/2500929405,geoId/2529405
geoId/2500934550,geoId/2534550
geoId/2500937490,geoId/2537490
geoId/2500938400,geoId/2538435
geoId/2500943580,geoId/2543615
geoId/2500945245,geoId/2545245
geoId/2500952490,geoId/2552490
geoId/2500959105,geoId/2559105
geoId/2500960015,geoId/2560050
geoId/2500968645,geoId/2568680
geoId/2501313660,geoId/2513660
geoId/2501330840,geoId/2530840
geoId/2501336300,geoId/2536335
geoId/2501352144,geoId/2552144
geoId/2501367000,geoId/2567000
geoId/2501376030,geoId/2576030
geoId/2501546330,geoId/2546330
geoId/2501701605,geoId/2501640
geoId/2501705070,geoId/2505105
geoId/2501709840,geoId/2509875
geoId/2501711000,geoId/2511000
geoId/2501721990,geoId/2521990
geoId/2501724960,geoId/2524960
geoId/2501735215,geoId/2535250
geoId/2501737000,geoId/2537000
geoId/2501737875,geoId/2537875
geoId/2501738715,geoId/2538715
geoId/2501739625,geoId/2539660
geoId/2501739835,geoId/2539835
geoId/2501740115,geoId/2540115
geoId/2501745560,geoId/2545560
geoId/2501756130,geoId/2556165
geoId/2501762535,geoId/2562535
geoId/2501767665,geoId/2567700
geoId/2501772215,geoId/2572250
geoId/2501772600,geoId/2572600
geoId/2501780510,geoId/2580545
geoId/2501781035,geoId/2581035
geoId/2502109175,geoId/2509210
geoId/2502130455,geoId/2530420
geoId/2502141690,geoId/2541725
geoId/2502144105,geoId/2544140
geoId/2502150250,geoId/2550285
geoId/2502155745,geoId/2555745
geoId/2502155955,geoId/2555990
geoId/2502174175,geoId/2574210
geoId/2502178972,geoId/2578972
geoId/2502300170,geoId/2500135
geoId/2502309000,geoId/2509000
geoId/2502331645,geoId/2531680
geoId/2502507000,geoId/2507000
geoId/2502513205,geoId/2513205
geoId/2502556585,geoId/2556585
geoId/2502581005,geoId/2581005
geoId/2502723875,geoId/2523875
geoId/2502725485,geoId/2525485
geoId/2502735075,geoId/2535075
geoId/2502763345,geoId/2563345
geoId/2502782000,geoId/2582000
geoId/29510,geoId/2965000
geoId/32510,geoId/3209700
geoId/3300140180,geoId/3340180
geoId/3300539300,geoId/3339300
geoId/3300705140,geoId/3305140
geoId/3300941300,geoId/3341300
geoId/3301145140,geoId/3345140
geoId/3301150260,geoId/3350260
geoId/3301314200,geoId/3314200
geoId/3301327380,geoId/3327380
geoId/3301562900,geoId/3362900
geoId/3301718820,geoId/3318820
geoId/3301765140,geoId/3365140
geoId/3301769940,geoId/3369940
geoId/3301912900,geoId/3312900
geoId/4400374300,geoId/4474300
geoId/4400549960,geoId/4449960
geoId/4400714140,geoId/4414140
geoId/4400719180,geoId/4419180
geoId/4400722960,geoId/4422960
geoId/4400754640,geoId/4454640
geoId/4400759000,geoId/4459000
geoId/4400780780,geoId/4480780
geoId/5000174650,geoId/5074650
geoId/5000710675,geoId/5010675
geoId/5000766175,geoId/5066175
geoId/5000785150,geoId/5085150
geoId/5001161675,geoId/5061675
geoId/5001948850,geoId/5048850
geoId/5002161225,geoId/5061225
geoId/5002303175,geoId/5003175
geoId/5002346000,geoId/5046000
geoId/51510,geoId/5101000
geoId/51520,geoId/5109816
geoId/51530,geoId/5111032
geoId/51550,geoId/5116000
geoId/51570,geoId/5118448
geoId/51580,geoId/5119728
geoId/51590,geoId/5121344
geoId/51595,geoId/5125808
geoId/51600,geoId/5126496
geoId/51610,geoId/5127200
geoId/51620,geoId/5129600
geoId/51630,geoId/5129744
geoId/51640,geoId/5130208
geoId/51650,geoId/5135000
geoId/51660,geoId/5135624
geoId/51670,geoId/5138424
geoId/51678,geoId/5145512
geoId/51680,geoId/5147672
geoId/51683,geoId/5148952
geoId/51685,geoId/5148968
geoId/51690,geoId/5149784
geoId/51700,geoId/5156000
geoId/51710,geoId/5157000
geoId/51720,geoId/5157688
geoId/51730,geoId/5161832
geoId/51735,geoId/5163768
geoId/51740,geoId/5164000
geoId/51750,geoId/5165392
geoId/51760,geoId/5167000
geoId/51770,geoId/5168000
geoId/51775,geoId/5170000
geoId/51790,geoId/5175216
geoId/51800,geoId/5176432
geoId/51810,geoId/5182000
geoId/51820,geoId/5183680
geoId/51830,geoId/5186160
geoId/51840,geoId/5186720
fips,city
1001,City1
1002,City2
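
For context, match_bq.csv maps a place's county or county-subdivision geoId (the fips column) to the corresponding city geoId, and process.py looks these pairs up through the matches dict. A minimal sketch of a loader for this file follows; the helper name and the 'dcs:' prefixing are assumptions for illustration, not part of the commit.

import csv


def load_matches(path='match_bq.csv'):
    """Read fips,city rows into the dict shape that process() consumes.

    Hypothetical helper: keys and values are prefixed with 'dcs:' to match the
    'dcs:geoId/...' identifiers that process.py builds (compare the test's
    matches = {'dcs:geoId/02110': 'dcs:geoId/0236400'}).
    """
    matches = {}
    with open(path, newline='') as f:
        for row in csv.DictReader(f):
            matches['dcs:' + row['fips']] = 'dcs:' + row['city']
    return matches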
31 changes: 18 additions & 13 deletions scripts/us_hud/income/process.py
@@ -19,7 +19,6 @@
Usage:
python3 process.py
'''

import csv
import datetime
import os
@@ -97,26 +96,25 @@ def compute_150(df, person):
lambda x: round(x[f'l80_{person}'] / 80 * 150), axis=1)


def process(year, matches, output_data, input_folder):
def process(year, matches, input_folder):
'''Generate cleaned data and accumulate it in output_data.'''
url = get_url(year)

if year == 2023 or year == 2024:
try:
filename = f"Section8-FY{year}.xlsx"
download_file(url, filename, input_folder)
# Read the Excel file and process the generator output
with open(os.path.join(input_folder, filename), 'rb') as f:
rows = iter_excel_calamine(f)
data = [row for row in rows]
df = pd.DataFrame(data)
data = list(rows) # Convert the generator to a list of rows
df = pd.DataFrame(data) # Now create the DataFrame
except Exception as e:
logging.fatal(f'Error in the process method : {year}: {url} {e}.')
return
else:
# For other years, download via URL
try:
filename = f"Section8-FY{year}.xls"
download_file(url, filename, input_folder)
df = pd.read_excel(os.path.join(input_folder, filename))
except Exception as e:
logging.fatal(f'Error in the process method : {url} {e}.')
@@ -133,7 +131,7 @@ def process(year, matches, output_data, input_folder):
]]

# Format FIPS codes
df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10),
df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(5),
axis=1)
df['fips'] = df.apply(lambda x: x['fips'][:-5]
if x['fips'][-5:] == '99999' else x['fips'],
@@ -151,9 +149,7 @@
if not df_match.empty:
df_match['fips'] = df_match.apply(lambda x: matches[x['fips']], axis=1)
df = pd.concat([df, df_match])

# Append this year's data to the output_data list
output_data.append(df)
return df


def main(argv):
@@ -165,6 +161,7 @@ def main(argv):
# Ensure the output directory exists
if not os.path.exists(FLAGS.income_output_dir):
os.makedirs(FLAGS.income_output_dir)

today = datetime.date.today()

# List to accumulate all data
@@ -173,10 +170,18 @@
# Define input folder for downloaded files
input_folder = 'input'

# Process data for years 2006 to the current year
# First, download all files for years 2006 to current year
for year in range(2006, today.year + 1):
url = get_url(year)
if url:
filename = f"Section8-FY{year}.xlsx" if year >= 2016 else f"Section8-FY{year}.xls"
download_file(url, filename, input_folder)

# Now process the data after all files are downloaded
for year in range(2006, today.year + 1):
print(year)
process(year, matches, output_data, input_folder)
print(f"Processing data for year: {year}")
df = process(year, matches, input_folder)
output_data.append(df)

# Concatenate all DataFrames in output_data into one single DataFrame
final_df = pd.concat(output_data, ignore_index=True)
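The hunk above reads the 2023/2024 workbooks through iter_excel_calamine(), which is not shown in this diff. A common shape for that kind of helper, assuming the python-calamine package (an assumption; the repository's actual implementation may differ), is sketched below.

import pandas as pd
from python_calamine import CalamineWorkbook


def iter_excel_calamine(file):
    """Yield one dict per data row from the first sheet, keyed by the header row."""
    workbook = CalamineWorkbook.from_filelike(file)
    rows = iter(workbook.get_sheet_by_index(0).to_python())
    headers = list(map(str, next(rows)))  # first row is treated as the header
    for row in rows:
        yield dict(zip(headers, row))


# Usage mirroring the diff: materialize the generator, then build the DataFrame.
# with open('Section8-FY2024.xlsx', 'rb') as f:
#     df = pd.DataFrame(list(iter_excel_calamine(f)))
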
60 changes: 35 additions & 25 deletions scripts/us_hud/income/process_test.py
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,45 +11,55 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Tests for process.py.

Usage: python3 -m unittest discover -v -s ../ -p "process_test.py"
'''
import os
import pandas as pd
import sys
import unittest
from unittest.mock import patch
import filecmp

# Set the absolute path for the test data directory
TEST_DIR = '/usr/local/google/home/rbhande/Documents/income/data-master/scripts/us_hud/income/testdata'
OUTPUT_DIR = '/usr/local/google/home/rbhande/Documents/income/data-master/scripts/us_hud/income/testdata/output' # Directory to save output

# Ensure the module is loaded correctly
import sys

sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(
os.path.abspath(__file__)))))
from us_hud.income import process

module_dir_ = os.path.dirname(__file__)

TEST_DIR = os.path.join(module_dir_, 'testdata')
from us_hud.income import process


class ProcessTest(unittest.TestCase):

def test_get_url(self):
"""Test the get_url function and check if it returns the correct URL for the given year."""
year = 2022
print(f"Checking URL for year: {year}")
self.assertEqual(
process.get_url(2022),
process.get_url(year),
'https://www.huduser.gov/portal/datasets/il/il22/Section8-FY22.xlsx'
)
self.assertEqual(process.get_url(1997), '')

def test_compute_150(self):
pass
print(f"Success! Correct URL for year {year}.")
year = 1997
print(f"Checking URL for year: {year}")
self.assertEqual(process.get_url(year), '')

@patch('pandas.read_excel')
def test_process(self, mock_df):
mock_df.return_value = pd.DataFrame(
pd.read_csv(os.path.join(TEST_DIR, 'test_input_2006.csv')))
def test_process_with_dynamic_csv(self):
matches = {'dcs:geoId/02110': 'dcs:geoId/0236400'}
process.process(2006, matches, TEST_DIR)
with open(os.path.join(TEST_DIR, 'output_2006.csv')) as result:
with open(os.path.join(TEST_DIR,
'expected_output_2006.csv')) as expected:
self.assertEqual(result.read(), expected.read())
output_data = []
input_folder = TEST_DIR

if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
print("Calling process() function...")
df = process.process(2006, matches, input_folder)
df.to_csv(os.path.join(OUTPUT_DIR, "output_test.csv"), index=False)
same = filecmp.cmp(os.path.join(OUTPUT_DIR, "output_test.csv"),
os.path.join(TEST_DIR, "expected_output.csv"))
# Assert that the files are identical
self.assertTrue(same)


if __name__ == '__main__':
unittest.main()
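
The rewritten test compares the generated CSV with the golden file byte-for-byte via filecmp.cmp. If that ever proves too strict (for example, float formatting or a trailing newline), a more tolerant check can compare the frames with pandas instead; this is an alternative pattern, not what the commit uses.

import pandas as pd
from pandas.testing import assert_frame_equal


def assert_matches_golden(df, golden_csv):
    """Compare a processed DataFrame against an expected CSV with pandas.

    Reading the golden file back through pandas tolerates cosmetic
    differences (trailing newline, quoting) that a byte comparison flags.
    """
    expected = pd.read_csv(golden_csv)
    # check_dtype=False: CSV round-trips can turn integer columns into floats.
    assert_frame_equal(df.reset_index(drop=True),
                       expected.reset_index(drop=True),
                       check_dtype=False)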
Binary file not shown.
Empty file.