Skip to content

Commit

Permalink
Population_Estimates_By_ASR_Changes (#1117)
Browse files Browse the repository at this point in the history
* Population_Estimates_By_ASR_Changes

* Modified README to remove double quotes from --mode

* Updated files by removing print and unwanted comments

* SCHEDULES=scripts/us_census/pep/population_estimates_by_asr:USCensusPEP_AgeSexRace

* SCHEDULES=scripts/us_census/pep/population_estimates_by_asr:USCensusPEP_AgeSexRace

* Updated code to remove drop duplicate

* Updated code to remove drop duplicate

* Updated code to remove drop duplicate

* Removed unwanted comment

* Addressed comments in PR, modified future_year_url method in process.py, added block of code to save raw input data

* Modified state.json to remove 2023 year url and deleted national_2020_2021.py as it is included in national_2020_2029.py

* Updated condition in process.py for checking number of files in input folder
  • Loading branch information
Bipnabraham authored Dec 12, 2024
1 parent d2552ef commit 80f6d01
Show file tree
Hide file tree
Showing 36 changed files with 6,813 additions and 761,951 deletions.
24 changes: 21 additions & 3 deletions scripts/us_census/pep/population_estimates_by_asr/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# US Census PEP: Population Estimates by Age, Sex and Race

## About the Dataset
This dataset has Population Estimates for the National, State and County geographic levels in United States from the year 1900 to 2020 on a yearly basis.
This dataset has yearly Population Estimates for the National, State and County geographic levels in the United States, from the year 1900 through the latest available year.

The population is categorized by various set of combinations as below:

Expand Down Expand Up @@ -53,3 +51,23 @@ Run the test cases
The script below downloads and cleans the data, then generates the final csv, mcf and tmcf files.

`/bin/python3 scripts/us_census/pep/population_estimates_by_asr/process.py`

Execute the 'process.py' script by using the following commands:

- if you want to perform "download and process", run the below command:

`python3 process.py`

- if you want to perform "only process", run the below command:

`python3 process.py --mode=process`
- if you want to perform "only download", run the below command:

`python3 process.py --mode=download`

### New Implementation:
- [Updated the script on October 29, 2024]
- Downloading input files is now integrated into preprocess.py, eliminating the need to run the separate download.sh script.
- All source file URLs, including future URLs adhering to the same structure, are centrally managed in the input_url.json file.
- All input files required for processing should be stored within the designated "input_files" folder.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
import os
import pandas as pd
import re


def input_url(file_name: str, key_name: str):
Expand Down Expand Up @@ -93,7 +94,10 @@ def gender_based_grouping(df: pd.DataFrame):
df['SVs'] = df['SVs'].str.replace\
('_NativeHawaiianAndOtherPacificIslanderAlone', '')
df['SVs'] = df['SVs'].str.replace('_TwoOrMoreRaces', '')
df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
df = df.groupby(['Year', 'geo_ID', 'SVs',
'Measurement_Method']).sum().reset_index()

return df


Expand All @@ -103,5 +107,15 @@ def race_based_grouping(df: pd.DataFrame):
"""
df['SVs'] = df['SVs'].str.replace('_Male', '')
df['SVs'] = df['SVs'].str.replace('_Female', '')
df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
df = df.groupby(['Year', 'geo_ID', 'SVs',
'Measurement_Method']).sum().reset_index()
return df


def extract_year(year_str):
    """Return the first run of four consecutive digits found in ``year_str``.

    Args:
        year_str: Value that may contain a year, e.g. ``"2010-20"``.
            Non-string values (such as an integer year) are converted with
            ``str()`` before searching.

    Returns:
        The first four consecutive digits as a string, or ``None`` when
        ``year_str`` is ``None`` or contains no four-digit run.
    """
    if year_str is None:
        return None
    # re.search only accepts str/bytes; coerce so that numeric years do not
    # raise TypeError.
    text = year_str if isinstance(year_str, str) else str(year_str)
    match = re.search(r'\d{4}', text)
    return match.group() if match else None
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{"1970-79":"https://www2.census.gov/programs-surveys/popest/tables/1900-1980/counties/asrh/co-asr-7079.csv",
"1980-89":"https://www2.census.gov/programs-surveys/popest/datasets/1980-1990/counties/asrh/pe-02.csv",
"2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA6.csv"}

Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def county1970(url_file: str, output_folder: str):
# Contains aggregated data for age and race.
df_ar = pd.DataFrame()
df = pd.read_csv(_url, names=_cols, low_memory=False, encoding='ISO-8859-1')
#Writing raw data to csv
df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
"raw_data", 'raw_data_county_1970_1979.csv'),
index=False)
df = (df.drop(_cols, axis=1).join(df[_cols]))
df['geo_ID'] = df['geo_ID'].astype(int)
df['geo_ID'] = [f'{x:05}' for x in df['geo_ID']]
Expand All @@ -58,14 +62,12 @@ def county1970(url_file: str, output_folder: str):
df['geo_ID'] = 'geoId/' + df['geo_ID']
# Making copies of the current DF to be aggregated upon.
final_df = pd.concat([final_df, df])
df_ar = pd.concat([df_ar, df])
final_df.insert(3, 'Measurement_Method', 'CensusPEPSurvey', True)
# DF sent to an external function for aggregation based on gender.
df = gender_based_grouping(df)
df.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
final_df = gender_based_grouping(final_df)
df_ar = pd.concat([df_ar, df])
# DF sent to an external function for aggregation based on race.
df_ar = race_based_grouping(df_ar)
df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
final_df = pd.concat([final_df, df_ar, df])
final_df = final_df[~final_df.SVs.str.contains('OtherRaces')]
final_df.to_csv(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ def county1980(url_file: str, output_folder: str):
,8,9,10,11,12,13,14,15,16,17]
df = pd.read_csv(_url,skiprows=7,names=cols,low_memory=False,\
encoding='ISO-8859-1')

#Writing raw data to csv
df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
"raw_data", 'raw_data_county_1980_1989.csv'),
index=False)
df = (df.drop(cols, axis=1).join(df[cols]))
df['geo_ID'] = df['geo_ID'].astype(int)
df['geo_ID'] = [f'{x:05}' for x in df['geo_ID']]
Expand All @@ -61,14 +64,13 @@ def county1980(url_file: str, output_folder: str):
df.drop(columns=['Race', 'sv'], inplace=True)
# Generating Aggregated Data by using Group by on rows.
df_as = pd.concat([df_as, df])
df_ar = pd.concat([df_ar, df])
df.insert(3, 'Measurement_Method', 'CensusPEPSurvey', True)
df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
# DF sent to an external function for aggregation based on gender.
df_as = gender_based_grouping(df_as)
df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
df_ar = pd.concat([df_ar, df])
df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
# DF sent to an external function for aggregation based on race.
df_ar = race_based_grouping(df_ar)
df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
df = pd.concat([df_as, df_ar, df])
df['geo_ID'] = 'geoId/' + df['geo_ID'].astype(str)
final_df = df[~df.SVs.str.contains('OtherRaces')]
Expand All @@ -77,10 +79,10 @@ def county1980(url_file: str, output_folder: str):
'county_1980_1989.csv'))
# Aggregating the County Data on geo_ID to make State Data.
final_df['geo_ID'] = final_df['geo_ID'].str[:-3]
final_df = final_df.groupby(['Year','geo_ID','SVs']).sum()\
final_df = final_df.groupby(['Year','geo_ID','SVs', 'Measurement_Method']).sum()\
.stack(0).reset_index()
final_df['observation'] = final_df[0]
final_df.drop(columns=['level_3', 0], inplace=True)
final_df.drop(columns=['level_4', 0], inplace=True)
final_df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
final_df.to_csv(
os.path.join(os.path.dirname(os.path.abspath(__file__)), output_folder,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,15 @@ def county1990(output_folder: str):
j = f'{i:02}'
url = 'https://www2.census.gov/programs-surveys'+\
'/popest/tables/1990-2000/counties/asrh/casrh'+str(j)+'.txt'

cols=['Year','geo_ID','Race',0,1,2,3,4,5,6,7\
,8,9,10,11,12,13,14,15,16,17]
df = pd.read_table(url,index_col=False,delim_whitespace=True\
,skiprows=16,skipfooter=14,engine='python',names=cols,\
encoding='ISO-8859-1')
#Writing raw data to csv
df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
"raw_data", 'raw_data_county_1990_2000.csv'),
index=False)
# Removing the lines that have false symbols.
num_df = (df.drop(cols, axis=1).join(df[cols]\
.apply(pd.to_numeric, errors='coerce')))
Expand Down Expand Up @@ -94,12 +97,9 @@ def county1990(output_folder: str):
df_ar = pd.concat([df_ar, final_df])
# DF sent to an external function for aggregation based on gender.
df_as = gender_based_grouping(df_as)
df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
# DF sent to an external function for aggregation based on race.
df_ar = race_based_grouping(df_ar)
df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
final_df = pd.concat([final_df, df_ar, df_as])

# Writing to output csv.
final_df.to_csv(
os.path.join(os.path.dirname(os.path.abspath(__file__)), output_folder,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

def county2000(output_folder: str):
"""
This Python Script Loads csv datasets from 2000-2010 on a County Level,
This Python Script Loads csv datasets from 2000-2009 on a County Level,
cleans it and create a cleaned csv.
"""
# Used to collect data after every loop for every file's df.
Expand All @@ -36,6 +36,11 @@ def county2000(output_folder: str):
url = 'https://www2.census.gov/programs-surveys/popest/datasets/2'+\
'000-2010/intercensal/county/co-est00int-alldata-'+str(j)+'.csv'
df = pd.read_csv(url, encoding='ISO-8859-1')
#Writing raw data to csv
df.to_csv(os.path.join(
os.path.dirname(os.path.abspath(__file__)), "raw_data",
'raw_data_county_2000_2009_file_' + str(i) + '.csv'),
index=False)
# Filter years 1 - 12.
df['Year'] = df['YEAR']
df.drop(columns=['YEAR'], inplace=True)
Expand Down Expand Up @@ -134,5 +139,5 @@ def county2000(output_folder: str):
# Write to final file.
final_df.to_csv(
os.path.join(os.path.dirname(
os.path.abspath(__file__)), output_folder,'county_2000_2010.csv'), \
os.path.abspath(__file__)), output_folder,'county_2000_2009.csv'), \
index=False)
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,12 @@ def county2010(url_file: str, output_folder: str):
'''
_url = input_url(url_file, "2010-20")
df = pd.read_csv(_url, encoding='ISO-8859-1', low_memory=False)
#Writing raw data to csv
df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
"raw_data", 'raw_data_county_2010_2020.csv'),
index=False)
# Filter by agegrp = 0.
df = df.query("YEAR not in [1, 2, 13]")
df = df.query("YEAR not in [1, 2, 13, 14]")
df = df.query("AGEGRP != 0")
# Filter years 3 - 14.
df['YEAR'] = df['YEAR'].astype(str)
Expand All @@ -43,8 +47,7 @@ def county2010(url_file: str, output_folder: str):
'9': '2016',
'10': '2017',
'11': '2018',
'12': '2019',
'14': '2020'
'12': '2019'
}
})
df.insert(6, 'geo_ID', 'geoId/', True)
Expand Down
Loading

0 comments on commit 80f6d01

Please sign in to comment.