diff --git a/scripts/us_hud/income/process.py b/scripts/us_hud/income/process.py
index 9dc97771b..ed31d9023 100644
--- a/scripts/us_hud/income/process.py
+++ b/scripts/us_hud/income/process.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 '''Generates cleaned CSVs for HUD Income Limits data.
 
 Produces:
@@ -21,8 +20,6 @@
 
 python3 process.py
 '''
-
-
 import csv
 import datetime
 import os
@@ -77,14 +74,17 @@ def download_file(url: str, filename: str, input_folder: str):
                 file.write(response.content)
             logging.info(f"Downloaded file: {file_path}")
         else:
-            logging.fatal(f"Failed to download from {url}, status code {response.status_code}")
+            logging.fatal(
+                f"Failed to download from {url}, status code {response.status_code}"
+            )
     except Exception as e:
         logging.fatal(f"Failed to download {url}: {str(e)}")
 
 
 def iter_excel_calamine(file: IO[bytes]) -> Iterator[dict[str, object]]:
     '''Reads Excel file using python_calamine.'''
-    workbook = python_calamine.CalamineWorkbook.from_filelike(file)  # type: ignore[arg-type]
+    workbook = python_calamine.CalamineWorkbook.from_filelike(
+        file)  # type: ignore[arg-type]
     rows = iter(workbook.get_sheet_by_index(0).to_python())
     headers = list(map(str, next(rows)))  # Get headers from the first row
     for row in rows:
@@ -100,15 +100,14 @@ def compute_150(df, person):
 def process(year, matches, output_data, input_folder):
     '''Generate cleaned data and accumulate it in output_data.'''
     url = get_url(year)
-
-
+
     if year == 2023 or year == 2024:
         try:
             filename = f"Section8-FY{year}.xlsx"
-            download_file(url, filename, input_folder)
+            download_file(url, filename, input_folder)
             with open(os.path.join(input_folder, filename), 'rb') as f:
                 rows = iter_excel_calamine(f)
-                data = [row for row in rows]
+                data = [row for row in rows]
             df = pd.DataFrame(data)
         except Exception as e:
             logging.fatal(f'Error in the process method : {year}: {url} {e}.')
@@ -116,13 +115,13 @@ def process(year, matches, output_data, input_folder):
     else:
         # For other years, download via URL
         try:
-            filename = f"Section8-FY{year}.xls"
-            download_file(url, filename, input_folder)
-            df = pd.read_excel(os.path.join(input_folder, filename))
-        except Exception as e :
+            filename = f"Section8-FY{year}.xls"
+            download_file(url, filename, input_folder)
+            df = pd.read_excel(os.path.join(input_folder, filename))
+        except Exception as e:
             logging.fatal(f'Error in the process method : {url} {e}.')
             return
-
+
     # Process the DataFrame (common code for all years)
     if 'fips2010' in df:
         df = df.rename(columns={'fips2010': 'fips'})
@@ -134,13 +133,16 @@
     ]]
 
     # Format FIPS codes
-    df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10), axis=1)
-    df['fips'] = df.apply(lambda x: x['fips'][:-5] if x['fips'][-5:] == '99999' else x['fips'], axis=1)
+    df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10),
+                          axis=1)
+    df['fips'] = df.apply(lambda x: x['fips'][:-5]
+                          if x['fips'][-5:] == '99999' else x['fips'],
+                          axis=1)
 
     # Compute 150th percentile for each household size
     for i in range(1, 9):
         compute_150(df, i)
-
+
     # Add year column
     df['year'] = [year for _ in range(len(df))]
 
@@ -159,30 +161,32 @@ def main(argv):
     with open('match_bq.csv') as f:
         reader = csv.DictReader(f)
         matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader}
-
+
     # Ensure the output directory exists
     if not os.path.exists(FLAGS.income_output_dir):
         os.makedirs(FLAGS.income_output_dir)
 
     today = datetime.date.today()
-
+
     # List to accumulate all data
    output_data = []
-
+
     # Define input folder for downloaded files
-    input_folder = 'input'
-
-
+    input_folder = 'input'
+
     # Process data for years 2006 to the current year
     for year in range(2006, today.year + 1):
         print(year)
-        process(year, matches, output_data, input_folder)
-
+        process(year, matches, output_data, input_folder)
+
     # Concatenate all DataFrames in output_data into one single DataFrame
     final_df = pd.concat(output_data, ignore_index=True)
-
+
     # Save the merged data to a single CSV
-    final_df.to_csv(os.path.join(FLAGS.income_output_dir, 'output_all_years.csv'), index=False)
-    logging.info(f'Merged data saved to {FLAGS.income_output_dir}/output_all_years.csv')
+    final_df.to_csv(os.path.join(FLAGS.income_output_dir,
+                                 'output_all_years.csv'),
+                    index=False)
+    logging.info(
+        f'Merged data saved to {FLAGS.income_output_dir}/output_all_years.csv')
 
 if __name__ == '__main__':