
Commit

us_hud income 20241211 changes
kurus21 committed Dec 11, 2024
1 parent e9c8261 commit 2f487cb
Showing 1 changed file with 32 additions and 28 deletions.
60 changes: 32 additions & 28 deletions scripts/us_hud/income/process.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
-
'''Generates cleaned CSVs for HUD Income Limits data.
Produces:
@@ -21,8 +20,6 @@
    python3 process.py
'''

-
-
import csv
import datetime
import os
@@ -77,14 +74,17 @@ def download_file(url: str, filename: str, input_folder: str):
                file.write(response.content)
            logging.info(f"Downloaded file: {file_path}")
        else:
-            logging.fatal(f"Failed to download from {url}, status code {response.status_code}")
+            logging.fatal(
+                f"Failed to download from {url}, status code {response.status_code}"
+            )
    except Exception as e:
        logging.fatal(f"Failed to download {url}: {str(e)}")


def iter_excel_calamine(file: IO[bytes]) -> Iterator[dict[str, object]]:
    '''Reads Excel file using python_calamine.'''
-    workbook = python_calamine.CalamineWorkbook.from_filelike(file)  # type: ignore[arg-type]
+    workbook = python_calamine.CalamineWorkbook.from_filelike(
+        file)  # type: ignore[arg-type]
    rows = iter(workbook.get_sheet_by_index(0).to_python())
    headers = list(map(str, next(rows)))  # Get headers from the first row
    for row in rows:
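
For context, the calamine-based reader above can be exercised on its own. The following is a minimal sketch, not part of this commit, assuming a hypothetical local Section8-FY2024.xlsx whose first row holds the column names, mirroring what iter_excel_calamine does:

import python_calamine

# Sketch: read the first sheet into dicts keyed by the header row.
with open('Section8-FY2024.xlsx', 'rb') as f:  # hypothetical local file
    workbook = python_calamine.CalamineWorkbook.from_filelike(f)
    rows = iter(workbook.get_sheet_by_index(0).to_python())
    headers = list(map(str, next(rows)))  # first row -> column names
    records = [dict(zip(headers, row)) for row in rows]
print(records[0])  # one dict per data row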
@@ -100,29 +100,28 @@ def compute_150(df, person):
def process(year, matches, output_data, input_folder):
    '''Generate cleaned data and accumulate it in output_data.'''
    url = get_url(year)

-
-
    if year == 2023 or year == 2024:
        try:
            filename = f"Section8-FY{year}.xlsx"
-            download_file(url, filename, input_folder)
+            download_file(url, filename, input_folder)
            with open(os.path.join(input_folder, filename), 'rb') as f:
                rows = iter_excel_calamine(f)
-                data = [row for row in rows]
+                data = [row for row in rows]
            df = pd.DataFrame(data)
        except Exception as e:
            logging.fatal(f'Error in the process method : {year}: {url} {e}.')
            return
    else:
        # For other years, download via URL
        try:
-            filename = f"Section8-FY{year}.xls"
-            download_file(url, filename, input_folder)
-            df = pd.read_excel(os.path.join(input_folder, filename))
-        except Exception as e :
+            filename = f"Section8-FY{year}.xls"
+            download_file(url, filename, input_folder)
+            df = pd.read_excel(os.path.join(input_folder, filename))
+        except Exception as e:
            logging.fatal(f'Error in the process method : {url} {e}.')
            return
+
    # Process the DataFrame (common code for all years)
    if 'fips2010' in df:
        df = df.rename(columns={'fips2010': 'fips'})
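
The year branch above exists because the FY2023/FY2024 workbooks ship as .xlsx and are read with calamine, while earlier years are legacy .xls files handed to pd.read_excel. Below is a condensed sketch of that dispatch, assuming the script's own download_file, get_url, and iter_excel_calamine are in scope; load_section8 is a hypothetical helper name, not part of the commit:

import os
import pandas as pd

def load_section8(year: int, input_folder: str) -> pd.DataFrame:
    # Hypothetical helper: the commit inlines this logic in process().
    if year in (2023, 2024):  # newer releases are .xlsx, read via calamine
        filename = f'Section8-FY{year}.xlsx'
        download_file(get_url(year), filename, input_folder)
        with open(os.path.join(input_folder, filename), 'rb') as f:
            return pd.DataFrame(list(iter_excel_calamine(f)))
    filename = f'Section8-FY{year}.xls'  # older releases are legacy .xls
    download_file(get_url(year), filename, input_folder)
    return pd.read_excel(os.path.join(input_folder, filename))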
@@ -134,13 +133,16 @@ def process(year, matches, output_data, input_folder):
    ]]

    # Format FIPS codes
-    df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10), axis=1)
-    df['fips'] = df.apply(lambda x: x['fips'][:-5] if x['fips'][-5:] == '99999' else x['fips'], axis=1)
+    df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10),
+                          axis=1)
+    df['fips'] = df.apply(lambda x: x['fips'][:-5]
+                          if x['fips'][-5:] == '99999' else x['fips'],
+                          axis=1)

    # Compute 150th percentile for each household size
    for i in range(1, 9):
        compute_150(df, i)

    # Add year column
    df['year'] = [year for _ in range(len(df))]
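
The two apply calls above first build a zero-padded 10-digit dcs:geoId/ identifier, then strip a trailing '99999', which in this dataset evidently marks county-level rows, down to the 5-digit county id. A standalone sketch of the same transformation on hypothetical raw values:

def format_fips(raw) -> str:
    # Mirrors the two apply calls above; input values are made up.
    fips = 'dcs:geoId/' + str(raw).zfill(10)
    # A '99999' sub-county suffix means a county-level record: keep 5 digits.
    return fips[:-5] if fips[-5:] == '99999' else fips

print(format_fips(600199999))  # dcs:geoId/06001 (county-level row)
print(format_fips(3651000))    # dcs:geoId/0003651000 (left unchanged)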

@@ -159,30 +161,32 @@ def main(argv):
    with open('match_bq.csv') as f:
        reader = csv.DictReader(f)
        matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader}

    # Ensure the output directory exists
    if not os.path.exists(FLAGS.income_output_dir):
        os.makedirs(FLAGS.income_output_dir)
    today = datetime.date.today()

    # List to accumulate all data
    output_data = []

    # Define input folder for downloaded files
-    input_folder = 'input'
-
-
+    input_folder = 'input'
+
    # Process data for years 2006 to the current year
    for year in range(2006, today.year + 1):
        print(year)
-        process(year, matches, output_data, input_folder)
+        process(year, matches, output_data, input_folder)

    # Concatenate all DataFrames in output_data into one single DataFrame
    final_df = pd.concat(output_data, ignore_index=True)

    # Save the merged data to a single CSV
-    final_df.to_csv(os.path.join(FLAGS.income_output_dir, 'output_all_years.csv'), index=False)
-    logging.info(f'Merged data saved to {FLAGS.income_output_dir}/output_all_years.csv')
+    final_df.to_csv(os.path.join(FLAGS.income_output_dir,
+                                 'output_all_years.csv'),
+                    index=False)
+    logging.info(
+        f'Merged data saved to {FLAGS.income_output_dir}/output_all_years.csv')


if __name__ == '__main__':
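main() accumulates one DataFrame per year in output_data and writes a single merged CSV at the end. A minimal sketch of that accumulate-then-concat pattern (frame contents and file name are hypothetical):

import pandas as pd

output_data = []
for year in (2006, 2007):  # stand-in for the script's 2006..today loop
    # Hypothetical per-year frame; process() appends the real ones.
    output_data.append(pd.DataFrame({'fips': ['dcs:geoId/06001'], 'year': [year]}))

final_df = pd.concat(output_data, ignore_index=True)  # one merged frame
final_df.to_csv('output_all_years.csv', index=False)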
