datacommonsorg · Bipnabraham · Dec 12, 2024 · Nov 14, 2024 · Nov 14, 2024 · Nov 15, 2024
diff --git a/scripts/us_census/pep/population_estimates_by_asr/README.md b/scripts/us_census/pep/population_estimates_by_asr/README.md
@@ -1,3 +1,18 @@
+<!-- Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. -->
+
+
 # US Census PEP: Population Estimates by Age, Sex and Race
 
 ## About the Dataset
@@ -19,7 +34,7 @@ These are the attributes that we will use
 |-------------------------------------------------------|---------------------------------------------------------------------------------------|
 | Year       					| The Year of the population estimates provided. 				|
 | Age   				| The Individual Ages or Age Buckets of the population in the US. 						|
-| Race   	| Races of the population in the US (https://www.census.gov/topics/population/race/about.html, https://www.census.gov/newsroom/blogs/random-samplings/2021/08/measuring-racial-ethnic-diversity-2020-census.html).  	|
+| Race   	| Races of the population in the US (https://www.census.gov/topics/population/race/about.html, https://www.census.gov/newsroom/blogs/random-samplings/2021/08/measuring-racial-ethnic-diversity-2020-census.html).  	|
 | Sex   				| Gender either Male or Female. 							|
 
 
@@ -53,3 +68,23 @@ Run the test cases
 The below script will download the data and clean the data, Also generate final csv, mcf and tmcf files.
 
 `/bin/python3 scripts/us_census/pep/Population_Estimate_by_ASR/process.py`
+
+Execute the 'process.py' script by using the following commands:
+
+  - if you want to perform "download and process", run the below command:
+
+        `python3 process.py`
+
+  - if you want to perform "only process", run the below command:
+
+        `python3 process.py --mode=process`
+
+  - if you want to perform "only download", run the below command:
+
+        `python3 process.py --mode=download`
+
+### New Implementation:
+- [Updated the script on October 29, 2024]
+- Downloading input files is now integrated into process.py, eliminating the need to run the separate download.sh script.
+- All source file URLs, including future URLs adhering to the same structure, are centrally managed in the input_url.json file.
+- All input files required for processing should be stored within the designated "input_files" folder.
diff --git a/scripts/us_census/pep/population_estimates_by_asr/common_functions.py b/scripts/us_census/pep/population_estimates_by_asr/common_functions.py
@@ -4,6 +4,7 @@
 import json
 import os
 import pandas as pd
+import re
 
 
 def input_url(file_name: str, key_name: str):
@@ -93,7 +94,10 @@ def gender_based_grouping(df: pd.DataFrame):
     df['SVs'] = df['SVs'].str.replace\
         ('_NativeHawaiianAndOtherPacificIslanderAlone', '')
     df['SVs'] = df['SVs'].str.replace('_TwoOrMoreRaces', '')
-    df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
+    df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
+    df = df.groupby(['Year', 'geo_ID', 'SVs',
+                     'Measurement_Method']).sum().reset_index()
+
     return df
 
 
@@ -103,5 +107,15 @@ def race_based_grouping(df: pd.DataFrame):
     """
     df['SVs'] = df['SVs'].str.replace('_Male', '')
     df['SVs'] = df['SVs'].str.replace('_Female', '')
-    df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
+    df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
+    df = df.groupby(['Year', 'geo_ID', 'SVs',
+                     'Measurement_Method']).sum().reset_index()
     return df
+
+
+def extract_year(year_str):
+    """Return the first run of four digits in `year_str` as a string, or None.
+
+    Input is coerced with str() so ints or NaN from a DataFrame do not raise."""
+    match = re.search(r'\d{4}', str(year_str))
+    return match.group() if match else None
diff --git a/scripts/us_census/pep/population_estimates_by_asr/county.json b/scripts/us_census/pep/population_estimates_by_asr/county.json
@@ -1,3 +1,5 @@
 {"1970-79":"https://www2.census.gov/programs-surveys/popest/tables/1900-1980/counties/asrh/co-asr-7079.csv",
 "1980-89":"https://www2.census.gov/programs-surveys/popest/datasets/1980-1990/counties/asrh/pe-02.csv",
-"2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA6.csv"}
+"2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA6.csv",
+"2020-23":"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/counties/asrh/cc-est2023-alldata.csv"}
+
diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_1970_1979.py b/scripts/us_census/pep/population_estimates_by_asr/county_1970_1979.py
@@ -58,14 +58,12 @@ def county1970(url_file: str, output_folder: str):
     df['geo_ID'] = 'geoId/' + df['geo_ID']
     # Making copies of the current DF to be aggregated upon.
     final_df = pd.concat([final_df, df])
-    df_ar = pd.concat([df_ar, df])
     final_df.insert(3, 'Measurement_Method', 'CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on gender.
-    df = gender_based_grouping(df)
-    df.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
+    final_df = gender_based_grouping(final_df)
+    df_ar = pd.concat([df_ar, df])
     # DF sent to an external function for aggregation based on race.
     df_ar = race_based_grouping(df_ar)
-    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     final_df = pd.concat([final_df, df_ar, df])
     final_df = final_df[~final_df.SVs.str.contains('OtherRaces')]
     final_df.to_csv(

diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_1980_1989.py b/scripts/us_census/pep/population_estimates_by_asr/county_1980_1989.py
@@ -61,14 +61,13 @@ def county1980(url_file: str, output_folder: str):
     df.drop(columns=['Race', 'sv'], inplace=True)
     # Generating Aggregated Data by using Group by on rows.
     df_as = pd.concat([df_as, df])
-    df_ar = pd.concat([df_ar, df])
-    df.insert(3, 'Measurement_Method', 'CensusPEPSurvey', True)
+    df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on gender.
     df_as = gender_based_grouping(df_as)
-    df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
+    df_ar = pd.concat([df_ar, df])
+    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on race.
     df_ar = race_based_grouping(df_ar)
-    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     df = pd.concat([df_as, df_ar, df])
     df['geo_ID'] = 'geoId/' + df['geo_ID'].astype(str)
     final_df = df[~df.SVs.str.contains('OtherRaces')]
@@ -77,10 +76,13 @@ def county1980(url_file: str, output_folder: str):
                      'county_1980_1989.csv'))
     # Aggregating the County Data on geo_ID to make State Data.
     final_df['geo_ID'] = final_df['geo_ID'].str[:-3]
-    final_df = final_df.groupby(['Year','geo_ID','SVs']).sum()\
+    final_df.to_csv("county1980_1.csv", index=False)
+
+    final_df = final_df.groupby(['Year','geo_ID','SVs', 'Measurement_Method']).sum()\
     .stack(0).reset_index()
     final_df['observation'] = final_df[0]
-    final_df.drop(columns=['level_3', 0], inplace=True)
+    final_df.to_csv("county1980_2.csv", index=False)
+    final_df.drop(columns=['level_4', 0], inplace=True)
     final_df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey'
     final_df.to_csv(
         os.path.join(os.path.dirname(os.path.abspath(__file__)), output_folder,

diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_1990_2000.py b/scripts/us_census/pep/population_estimates_by_asr/county_1990_2000.py
@@ -40,7 +40,6 @@ def county1990(output_folder: str):
             j = f'{i:02}'
             url = 'https://www2.census.gov/programs-surveys'+\
                 '/popest/tables/1990-2000/counties/asrh/casrh'+str(j)+'.txt'
-
             cols=['Year','geo_ID','Race',0,1,2,3,4,5,6,7\
                 ,8,9,10,11,12,13,14,15,16,17]
             df = pd.read_table(url,index_col=False,delim_whitespace=True\
@@ -94,12 +93,9 @@ def county1990(output_folder: str):
     df_ar = pd.concat([df_ar, final_df])
     # DF sent to an external function for aggregation based on gender.
     df_as = gender_based_grouping(df_as)
-    df_as.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     # DF sent to an external function for aggregation based on race.
     df_ar = race_based_grouping(df_ar)
-    df_ar.insert(3, 'Measurement_Method', 'dcAggregate/CensusPEPSurvey', True)
     final_df = pd.concat([final_df, df_ar, df_as])
-
     # Writing to output csv.
     final_df.to_csv(
         os.path.join(os.path.dirname(os.path.abspath(__file__)), output_folder,

diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_2010_2020.py b/scripts/us_census/pep/population_estimates_by_asr/county_2010_2020.py
@@ -28,7 +28,7 @@ def county2010(url_file: str, output_folder: str):
     _url = input_url(url_file, "2010-20")
     df = pd.read_csv(_url, encoding='ISO-8859-1', low_memory=False)
     # Filter by agegrp = 0.
-    df = df.query("YEAR not in [1, 2, 13]")
+    df = df.query("YEAR not in [1, 2, 13, 14]")
     df = df.query("AGEGRP != 0")
     # Filter years 3 - 14.
     df['YEAR'] = df['YEAR'].astype(str)
@@ -43,8 +43,7 @@ def county2010(url_file: str, output_folder: str):
             '9': '2016',
             '10': '2017',
             '11': '2018',
-            '12': '2019',
-            '14': '2020'
+            '12': '2019'
         }
     })
     df.insert(6, 'geo_ID', 'geoId/', True)

diff --git a/scripts/us_census/pep/population_estimates_by_asr/county_2020_2023.py b/scripts/us_census/pep/population_estimates_by_asr/county_2020_2023.py
@@ -0,0 +1,191 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+This Python Script is for County Level Data 2020-2023.
+'''
+import os
+import numpy as np
+import pandas as pd
+from common_functions import input_url, replace_agegrp
+
+
+def county2029(url_file: str, output_folder: str):
+    '''
+    Reads the county-level CSV at url_file, melts it to long form (adding SVs
+    and Measurement_Method) and writes output_folder/county_2020_2023.csv.
+    '''
+    # NOTE(review): url_file is passed straight to read_csv; input_url() is not used.
+    df = pd.read_csv(url_file, encoding='ISO-8859-1', low_memory=False)
+    # Drop rows with YEAR code 1 and rows with AGEGRP code 0.
+    df = df.query("YEAR not in [1]")
+    df = df.query("AGEGRP != 0")
+    # Shift YEAR codes to calendar years (code 2 -> 2020, 3 -> 2021, ...).
+    df['YEAR'] = df['YEAR'].astype(str)
+    base_year = 2020
+    df['YEAR'] = df['YEAR'].astype(int) + base_year - 2
+    df.insert(6, 'geo_ID', 'geoId/', True)
+    df['geo_ID'] = 'geoId/' + (df['STATE'].map(str)).str.zfill(2) + \
+        (df['COUNTY'].map(str)).str.zfill(3)
+    df['AGEGRP'] = df['AGEGRP'].astype(str)
+    # Replacing the numbers with more understandable metadata headings.
+    # Code 0 is sent if AGEGRP starts from 0 and 1 if it starts from 0To4
+    df = replace_agegrp(df, 1)
+    # Drop unwanted columns.
+    df.drop(columns=['SUMLEV', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME'], \
+        inplace=True)
+    df = df.drop(columns=[
+        'TOT_POP', 'NH_MALE', 'NH_FEMALE', 'NHWA_MALE', 'NHWA_FEMALE',
+        'NHBA_MALE', 'NHBA_FEMALE', 'NHIA_MALE', 'NHIA_FEMALE', 'NHAA_MALE',
+        'NHAA_FEMALE', 'NHNA_MALE', 'NHNA_FEMALE', 'NHTOM_MALE', 'NHTOM_FEMALE',
+        'H_MALE', 'H_FEMALE', 'HWA_MALE', 'HWA_FEMALE', 'HBA_MALE',
+        'HBA_FEMALE', 'HIA_MALE', 'HIA_FEMALE', 'HAA_MALE', 'HAA_FEMALE',
+        'HNA_MALE', 'HNA_FEMALE', 'HTOM_MALE', 'HTOM_FEMALE'
+    ])
+    # NOTE(review): `columns` below is never read again in this function.
+    columns = [
+        'Year', 'geo_ID', 'AGEGRP', 'TOT_MALE', 'TOT_FEMALE', 'WA_MALE',
+        'WA_FEMALE', 'BA_MALE', 'BA_FEMALE', 'IA_MALE', 'IA_FEMALE', 'AA_MALE',
+        'AA_FEMALE', 'NA_MALE', 'NA_FEMALE', 'TOM_MALE', 'TOM_FEMALE',
+        'WAC_MALE', 'WAC_FEMALE', 'BAC_MALE', 'BAC_FEMALE', 'IAC_MALE',
+        'IAC_FEMALE', 'AAC_MALE', 'AAC_FEMALE', 'NAC_MALE', 'NAC_FEMALE',
+        'NHWAC_MALE', 'NHWAC_FEMALE', 'NHBAC_MALE', 'NHBAC_FEMALE',
+        'NHIAC_MALE', 'NHIAC_FEMALE', 'NHAAC_MALE', 'NHAAC_FEMALE',
+        'NHNAC_MALE', 'NHNAC_FEMALE', 'HWAC_MALE', 'HWAC_FEMALE', 'HBAC_MALE',
+        'HBAC_FEMALE', 'HIAC_MALE', 'HIAC_FEMALE', 'HAAC_MALE', 'HAAC_FEMALE',
+        'HNAC_MALE', 'HNAC_FEMALE'
+    ]
+    # Rename YEAR to Year, then add per-race totals (male + female).
+    df['Year'] = df['YEAR']
+    df.drop(columns=['YEAR'], inplace=True)
+    df['WhiteAloneAgg'] = df['WA_MALE'].astype(int) + df['WA_FEMALE'].astype(
+        int)
+    df['BlackOrAfricanAmericanAlone'] = df['BA_MALE'].astype(int)\
+        +df['BA_FEMALE'].astype(int)  # NOTE(review): no 'Agg' suffix -> tagged CensusPEPSurvey below; confirm
+    df['AmericanIndianAndAlaskaNativeAlone'] = df['IA_MALE'].astype(int)\
+        +df['IA_FEMALE'].astype(int)  # NOTE(review): no 'Agg' suffix -> tagged CensusPEPSurvey below; confirm
+    df['AsianAloneAgg'] = df['AA_MALE'].astype(int) + df['AA_FEMALE'].astype(
+        int)
+    df['NativeHawaiianAndOtherPacificIslanderAloneAgg'] = df['NA_MALE']\
+        .astype(int)+df['NA_FEMALE'].astype(int)
+    df['TwoOrMoreRacesAgg'] = df['TOM_MALE'].astype(int)+\
+        df['TOM_FEMALE'].astype(int)
+    df = df.melt(id_vars=['Year','geo_ID' ,'AGEGRP'], var_name='sv' , \
+        value_name='observation')
+    # Map source column codes to the sex/race fragments used in SV names.
+    _sexrace_dict = {
+        'TOT_MALE':
+            'Male',
+        'TOT_FEMALE':
+            'Female',
+        'WA_MALE':
+            'Male_WhiteAlone',
+        'WA_FEMALE':
+            'Female_WhiteAlone',
+        'BA_MALE':
+            'Male_BlackOrAfricanAmericanAlone',
+        'BA_FEMALE':
+            'Female_BlackOrAfricanAmericanAlone',
+        'IA_MALE':
+            'Male_AmericanIndianAndAlaskaNativeAlone',
+        'IA_FEMALE':
+            'Female_AmericanIndianAndAlaskaNativeAlone',
+        'AA_MALE':
+            'Male_AsianAlone',
+        'AA_FEMALE':
+            'Female_AsianAlone',
+        'NA_MALE':
+            'Male_NativeHawaiianAndOtherPacificIslanderAlone',
+        'NA_FEMALE':
+            'Female_NativeHawaiianAndOtherPacificIslanderAlone',
+        'TOM_MALE':
+            'Male_TwoOrMoreRaces',
+        'TOM_FEMALE':
+            'Female_TwoOrMoreRaces',
+        'WAC_MALE':
+            "Male_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'WAC_FEMALE':
+            "Female_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'BAC_MALE':
+            "Male_BlackOrAfricanAmericanAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'BAC_FEMALE':
+            "Female_BlackOrAfricanAmericanAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'IAC_MALE':
+            "Male_AmericanIndianAndAlaskaNativeAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'IAC_FEMALE':
+            "Female_AmericanIndianAndAlaskaNativeAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'AAC_MALE':
+            "Male_AsianAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'AAC_FEMALE':
+            "Female_AsianAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NAC_MALE':
+            "Male_NativeHawaiianAndOtherPacificIslanderAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NAC_FEMALE':
+            "Female_NativeHawaiianAndOtherPacificIslanderAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHWAC_MALE':
+            "Male_NotHispanicOrLatino_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHWAC_FEMALE':
+            "Female_NotHispanicOrLatino_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHBAC_MALE':
+            "Male_NotHispanicOrLatino_BlackOrAfricanAmericanAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHBAC_FEMALE':
+            "Female_NotHispanicOrLatino_BlackOrAfricanAmericanAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHIAC_MALE':
+            "Male_NotHispanicOrLatino_AmericanIndianAndAlaskaNativeAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHIAC_FEMALE':
+            "Female_NotHispanicOrLatino_AmericanIndianAndAlaskaNativeAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHAAC_MALE':
+            "Male_NotHispanicOrLatino_AsianAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHAAC_FEMALE':
+            "Female_NotHispanicOrLatino_AsianAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHNAC_MALE':
+            "Male_NotHispanicOrLatino_NativeHawaiianAndOtherPacificIslanderAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHNAC_FEMALE':
+            "Female_NotHispanicOrLatino_NativeHawaiianAndOtherPacificIslanderAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHWAC_MALE':  # NOTE(review): repeats the NHWAC_MALE key above with the same value
+            "Male_NotHispanicOrLatino_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'NHWAC_FEMALE':  # NOTE(review): repeats the NHWAC_FEMALE key above with the same value
+            "Female_NotHispanicOrLatino_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HBAC_MALE':
+            "Male_HispanicOrLatino_BlackOrAfricanAmericanAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HBAC_FEMALE':
+            "Female_HispanicOrLatino_BlackOrAfricanAmericanAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HIAC_MALE':
+            "Male_HispanicOrLatino_AmericanIndianAndAlaskaNativeAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HIAC_FEMALE':
+            "Female_HispanicOrLatino_AmericanIndianAndAlaskaNativeAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HAAC_MALE':
+            "Male_HispanicOrLatino_AsianAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HAAC_FEMALE':
+            "Female_HispanicOrLatino_AsianAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HNAC_MALE':
+            "Male_HispanicOrLatino_NativeHawaiianAndOtherPacificIslanderAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HNAC_FEMALE':
+            "Female_HispanicOrLatino_NativeHawaiianAndOtherPacificIslanderAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HWAC_MALE':
+            "Male_HispanicOrLatino_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces",
+        'HWAC_FEMALE':
+            "Female_HispanicOrLatino_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces"
+    }
+    df = df.replace({"sv": _sexrace_dict})
+    df['SVs'] = 'Count_Person_' + df['AGEGRP'] + '_' + df['sv']
+    df = df.drop(columns=['AGEGRP', 'sv'])
+    df['Measurement_Method'] = np.where(df['SVs'].str.contains('Agg')\
+        , 'dcAggregate/CensusPEPSurvey', 'CensusPEPSurvey')  # SVs containing 'Agg' get the dcAggregate method
+    df['SVs'] = df['SVs'].str.replace('Agg', '')  # strip the 'Agg' marker from the final SV name
+
+    # Write to final file.
+    df.to_csv(
+        os.path.join(os.path.dirname(
+        os.path.abspath(__file__)), output_folder,'county_2020_2023.csv'),\
+        index=False)
diff --git a/scripts/us_census/pep/population_estimates_by_asr/input_url.json b/scripts/us_census/pep/population_estimates_by_asr/input_url.json
@@ -0,0 +1,11 @@
+[
+    {
+        "download_path": "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/counties/asrh/cc-est2023-alldata.csv"
+    },
+    {
+        "download_path": "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/national/asrh/nc-est2023-agesex-res.csv"
+    },
+    {
+        "download_path": "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/state/asrh/sc-est2023-alldata6.csv"
+    }
+]