Population_Estimates_By_ASR_Changes (#1117)

* Population_Estimates_By_ASR_Changes * Modified README to remove double quotes from --mode * Updated files by removing print and unwanted comments * SCHEDULES=scripts/us_census/pep/population_estimates_by_asr:USCensusPEP_AgeSexRace * SCHEDULES=scripts/us_census/pep/population_estimates_by_asr:USCensusPEP_AgeSexRace * Updated code to remove drop duplicate * Updated code to remove drop duplicate * Updated code to remove drop duplicate * Removed unwanted comment * Addressed comments in PR, modified future_year_url method in process.py, added block of code to save raw input data * Modified state.json to remove 2023 year url and deleted national_2020_2021.py as it is included in national_2020_2029.py * Updated condition in process.py for checking number of files in input folder
datacommonsorg · Dec 12, 2024 · 80f6d01 · 80f6d01
1 parent d2552ef
commit 80f6d01
Show file tree

Hide file tree

Showing 36 changed files with 6,813 additions and 761,951 deletions.
diff --git a/scripts/us_census/pep/population_estimates_by_asr/README.md b/scripts/us_census/pep/population_estimates_by_asr/README.md
@@ -1,7 +1,5 @@
-# US Census PEP: Population Estimates by Age, Sex and Race
-
 ## About the Dataset
-This dataset has Population Estimates for the National, State and County geographic levels in United States from the year 1900 to 2020 on a yearly basis.
+This dataset has Population Estimates for the National, State and County geographic levels in the United States, on a yearly basis from the year 1900 through the latest available year.
 
 The population is categorized by various set of combinations as below:
 
@@ -53,3 +51,23 @@ Run the test cases
 The script below downloads and cleans the data, and also generates the final csv, mcf and tmcf files.
 
 `/bin/python3 scripts/us_census/pep/population_estimates_by_asr/process.py`
+
+Execute the 'process.py' script by using the following commands:
+
+  - if you want to perform "download and process", run the below command:
+
+        `python3 process.py`
+
+  - if you want to perform "only process", run the below command:
+
+        `python3 process.py --mode=process`
+        
+  - if you want to perform "only download", run the below command:
+
+        `python3 process.py --mode=download`
+
+### New Implementation:
+- [Updated the script on October 29, 2024]
+- Downloading input files is now integrated into process.py, eliminating the need to run the separate download.sh script.
+- All source file URLs, including future URLs adhering to the same structure, are centrally managed in the input_url.json file.
+- All input files required for processing should be stored within the designated "input_files" folder.
diff --git a/scripts/us_census/pep/population_estimates_by_asr/common_functions.py b/scripts/us_census/pep/population_estimates_by_asr/common_functions.py
@@ -4,6 +4,7 @@
 import json
 import os
 import pandas as pd
+import re
 
 
 def input_url(file_name: str, key_name: str):
@@ -93,7 +94,10 @@ def gender_based_grouping(df: pd.DataFrame):
     df['SVs'] = df['SVs'].str.replace\
         ('_NativeHawaiianAndOtherPacificIslanderAlone', '')
     df['SVs'] = df['SVs'].str.replace('_TwoOrMoreRaces', '')
-    df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
+    df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
+    df = df.groupby(['Year', 'geo_ID', 'SVs',
+                     'Measurement_Method']).sum().reset_index()
+
     return df
 
 
@@ -103,5 +107,15 @@ def race_based_grouping(df: pd.DataFrame):
     """
     df['SVs'] = df['SVs'].str.replace('_Male', '')
     df['SVs'] = df['SVs'].str.replace('_Female', '')
-    df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
+    df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
+    df = df.groupby(['Year', 'geo_ID', 'SVs',
+                     'Measurement_Method']).sum().reset_index()
     return df
+
+
+def extract_year(year_str):
+    match = re.search(r'\d{4}', year_str)
+    if match:
+        return match.group()
+    else:
+        return None
diff --git a/scripts/us_census/pep/population_estimates_by_asr/county.json b/scripts/us_census/pep/population_estimates_by_asr/county.json
@@ -1,3 +1,4 @@
 {"1970-79":"https://www2.census.gov/programs-surveys/popest/tables/1900-1980/counties/asrh/co-asr-7079.csv",
 "1980-89":"https://www2.census.gov/programs-surveys/popest/datasets/1980-1990/counties/asrh/pe-02.csv",
 "2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA6.csv"}
+
diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_1970_1979.py b/scripts/us_census/pep/population_estimates_by_asr/county_1970_1979.py
@@ -34,6 +34,10 @@ def county1970(url_file: str, output_folder: str):
     # Contains aggregated data for age and race.
     df_ar = pd.DataFrame()
     df = pd.read_csv(_url, names=_cols, low_memory=False, encoding='ISO-8859-1')
+    #Writing raw data to csv
+    df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                           "raw_data", 'raw_data_county_1970_1979.csv'),
+              index=False)
     df = (df.drop(_cols, axis=1).join(df[_cols]))
     df['geo_ID'] = df['geo_ID'].astype(int)
     df['geo_ID'] = [f'{x:05}' for x in df['geo_ID']]
@@ -58,14 +62,12 @@ def county1970(url_file: str, output_folder: str):
     df['geo_ID'] = 'geoId/' + df['geo_ID']
     # Making copies of the current DF to be aggregated upon.
     final_df = pd.concat([final_df, df])
-    df_ar = pd.concat([df_ar, df])
     final_df.insert(3, 'Measurement_Method', 'CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on gender.
-    df = gender_based_grouping(df)
-    df.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
+    final_df = gender_based_grouping(final_df)
+    df_ar = pd.concat([df_ar, df])
     # DF sent to an external function for aggregation based on race.
     df_ar = race_based_grouping(df_ar)
-    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     final_df = pd.concat([final_df, df_ar, df])
     final_df = final_df[~final_df.SVs.str.contains('OtherRaces')]
     final_df.to_csv(

diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_1980_1989.py b/scripts/us_census/pep/population_estimates_by_asr/county_1980_1989.py
@@ -37,7 +37,10 @@ def county1980(url_file: str, output_folder: str):
                     ,8,9,10,11,12,13,14,15,16,17]
     df = pd.read_csv(_url,skiprows=7,names=cols,low_memory=False,\
         encoding='ISO-8859-1')
-
+    #Writing raw data to csv
+    df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                           "raw_data", 'raw_data_county_1980_1989.csv'),
+              index=False)
     df = (df.drop(cols, axis=1).join(df[cols]))
     df['geo_ID'] = df['geo_ID'].astype(int)
     df['geo_ID'] = [f'{x:05}' for x in df['geo_ID']]
@@ -61,14 +64,13 @@ def county1980(url_file: str, output_folder: str):
     df.drop(columns=['Race', 'sv'], inplace=True)
     # Generating Aggregated Data by using Group by on rows.
     df_as = pd.concat([df_as, df])
-    df_ar = pd.concat([df_ar, df])
-    df.insert(3, 'Measurement_Method', 'CensusPEPSurvey', True)
+    df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on gender.
     df_as = gender_based_grouping(df_as)
-    df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
+    df_ar = pd.concat([df_ar, df])
+    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on race.
     df_ar = race_based_grouping(df_ar)
-    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     df = pd.concat([df_as, df_ar, df])
     df['geo_ID'] = 'geoId/' + df['geo_ID'].astype(str)
     final_df = df[~df.SVs.str.contains('OtherRaces')]
@@ -77,10 +79,10 @@ def county1980(url_file: str, output_folder: str):
                      'county_1980_1989.csv'))
     # Aggregating the County Data on geo_ID to make State Data.
     final_df['geo_ID'] = final_df['geo_ID'].str[:-3]
-    final_df = final_df.groupby(['Year','geo_ID','SVs']).sum()\
+    final_df = final_df.groupby(['Year','geo_ID','SVs', 'Measurement_Method']).sum()\
     .stack(0).reset_index()
     final_df['observation'] = final_df[0]
-    final_df.drop(columns=['level_3', 0], inplace=True)
+    final_df.drop(columns=['level_4', 0], inplace=True)
     final_df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
     final_df.to_csv(
         os.path.join(os.path.dirname(os.path.abspath(__file__)), output_folder,

diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_1990_2000.py b/scripts/us_census/pep/population_estimates_by_asr/county_1990_2000.py
@@ -40,12 +40,15 @@ def county1990(output_folder: str):
             j = f'{i:02}'
             url = 'https://www2.census.gov/programs-surveys'+\
                 '/popest/tables/1990-2000/counties/asrh/casrh'+str(j)+'.txt'
-
             cols=['Year','geo_ID','Race',0,1,2,3,4,5,6,7\
                 ,8,9,10,11,12,13,14,15,16,17]
             df = pd.read_table(url,index_col=False,delim_whitespace=True\
                 ,skiprows=16,skipfooter=14,engine='python',names=cols,\
                     encoding='ISO-8859-1')
+            #Writing raw data to csv
+            df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                   "raw_data", 'raw_data_county_1990_2000.csv'),
+                      index=False)
             # Removing the lines that have false symbols.
             num_df = (df.drop(cols, axis=1).join(df[cols]\
                 .apply(pd.to_numeric, errors='coerce')))
@@ -94,12 +97,9 @@ def county1990(output_folder: str):
     df_ar = pd.concat([df_ar, final_df])
     # DF sent to an external function for aggregation based on gender.
     df_as = gender_based_grouping(df_as)
-    df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on race.
     df_ar = race_based_grouping(df_ar)
-    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     final_df = pd.concat([final_df, df_ar, df_as])
-
     # Writing to output csv.
     final_df.to_csv(
         os.path.join(os.path.dirname(os.path.abspath(__file__)), output_folder,

diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_2000_2010.py b/scripts/us_census/pep/population_estimates_by_asr/county_2000_2010.py
@@ -23,7 +23,7 @@
 
 def county2000(output_folder: str):
     """
-    This Python Script Loads csv datasets from 2000-2010 on a County Level,
+    This Python Script Loads csv datasets from 2000-2009 on a County Level,
     cleans it and create a cleaned csv.
     """
     # Used to collect data after every loop for every file's df.
@@ -36,6 +36,11 @@ def county2000(output_folder: str):
             url = 'https://www2.census.gov/programs-surveys/popest/datasets/2'+\
                 '000-2010/intercensal/county/co-est00int-alldata-'+str(j)+'.csv'
             df = pd.read_csv(url, encoding='ISO-8859-1')
+            #Writing raw data to csv
+            df.to_csv(os.path.join(
+                os.path.dirname(os.path.abspath(__file__)), "raw_data",
+                'raw_data_county_2000_2009_file_' + str(i) + '.csv'),
+                      index=False)
             # Filter years 1 - 12.
             df['Year'] = df['YEAR']
             df.drop(columns=['YEAR'], inplace=True)
@@ -134,5 +139,5 @@ def county2000(output_folder: str):
     # Write to final file.
     final_df.to_csv(
         os.path.join(os.path.dirname(
-        os.path.abspath(__file__)), output_folder,'county_2000_2010.csv'), \
+        os.path.abspath(__file__)), output_folder,'county_2000_2009.csv'), \
         index=False)
diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_2010_2020.py b/scripts/us_census/pep/population_estimates_by_asr/county_2010_2020.py
@@ -27,8 +27,12 @@ def county2010(url_file: str, output_folder: str):
     '''
     _url = input_url(url_file, "2010-20")
     df = pd.read_csv(_url, encoding='ISO-8859-1', low_memory=False)
+    #Writing raw data to csv
+    df.to_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                           "raw_data", 'raw_data_county_2010_2020.csv'),
+              index=False)
     # Filter by agegrp = 0.
-    df = df.query("YEAR not in [1, 2, 13]")
+    df = df.query("YEAR not in [1, 2, 13, 14]")
     df = df.query("AGEGRP != 0")
     # Map the remaining YEAR codes (3 - 12) to calendar years.
     df['YEAR'] = df['YEAR'].astype(str)
@@ -43,8 +47,7 @@ def county2010(url_file: str, output_folder: str):
             '9': '2016',
             '10': '2017',
             '11': '2018',
-            '12': '2019',
-            '14': '2020'
+            '12': '2019'
         }
     })
     df.insert(6, 'geo_ID', 'geoId/', True)