Population_Estimates_By_ASR_Changes #1117

Merged
Changes from 4 commits
37 changes: 36 additions & 1 deletion scripts/us_census/pep/population_estimates_by_asr/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# US Census PEP: Population Estimates by Age, Sex and Race

## About the Dataset
@@ -19,7 +34,7 @@ These are the attributes that we will use
|-------------------------------------------------------|---------------------------------------------------------------------------------------|
| Year | The Year of the population estimates provided. |
| Age | The Individual Ages or Age Buckets of the population in the US. |
| Race | Races of the population in the US (https://www.census.gov/topics/population/race/about.html, https://www.census.gov/newsroom/blogs/random-samplings/2021/08/measuring-racial-ethnic-diversity-2020-census.html). |
| Race | Races of the population in the US (https://www.census.gov/topics/population/race/about.html, https://www.census.gov/newsroom/blogs/random-samplings/2021/08/measuring-racial-ethnic-diversity-2020-census.html). |
| Sex | Gender either Male or Female. |


@@ -53,3 +68,23 @@ Run the test cases
The script below downloads and cleans the data, and generates the final CSV, MCF, and TMCF files.

`/bin/python3 scripts/us_census/pep/population_estimates_by_asr/process.py`

Execute the 'process.py' script by using the following commands:

- To download and process the data, run:

`python3 process.py`

- To only process the data, run:

`python3 process.py --mode=process`

- To only download the data, run:

`python3 process.py --mode=download`

### New Implementation:
- [Updated the script on October 29, 2024]
- Downloading input files is now integrated into process.py, eliminating the need to run the separate download.sh script.
- All source file URLs, including future URLs adhering to the same structure, are centrally managed in the input_url.json file.
- All input files required for processing should be stored within the designated "input_files" folder.
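The centrally managed URL list described above can be consumed with a small helper along these lines (a sketch; `load_download_urls` is a hypothetical name, but the JSON shape matches the list of `{"download_path": ...}` objects added in this PR):

```python
import json


def load_download_urls(config_path):
    """Return the list of source URLs from an input_url.json-style file."""
    with open(config_path, 'r') as f:
        entries = json.load(f)
    # Each entry is an object with a single "download_path" key.
    return [entry["download_path"] for entry in entries]
```

Keeping the URLs in one JSON file means future-year sources can be added without touching the processing code.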
Original file line number Diff line number Diff line change
@@ -93,15 +93,18 @@ def gender_based_grouping(df: pd.DataFrame):
df['SVs'] = df['SVs'].str.replace\
('_NativeHawaiianAndOtherPacificIslanderAlone', '')
df['SVs'] = df['SVs'].str.replace('_TwoOrMoreRaces', '')
df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
df = df.groupby(['Year', 'geo_ID', 'SVs',
'Measurement_Method']).sum().reset_index()
return df


def race_based_grouping(df: pd.DataFrame):
"""
Aggregates the columns based on race by removing gender from SV
"""
"""
df['SVs'] = df['SVs'].str.replace('_Male', '')
df['SVs'] = df['SVs'].str.replace('_Female', '')
df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
# df = df.groupby(['Year', 'geo_ID', 'SVs']).sum().reset_index()
df = df.groupby(['Year', 'geo_ID', 'SVs',
'Measurement_Method']).sum().reset_index()
return df
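On a toy frame, the race-based grouping behaves like this (a sketch of the same strip-and-aggregate pattern, using made-up rows rather than the import's actual data):

```python
import pandas as pd

# Two rows that collapse into one once gender is stripped from the SV name.
df = pd.DataFrame({
    'Year': [2021, 2021],
    'geo_ID': ['geoId/01', 'geoId/01'],
    'SVs': ['Count_Person_5Years_Male', 'Count_Person_5Years_Female'],
    'Measurement_Method': ['CensusPEPSurvey', 'CensusPEPSurvey'],
    'observation': [100, 120],
})
df['SVs'] = df['SVs'].str.replace('_Male', '').str.replace('_Female', '')
df = df.groupby(['Year', 'geo_ID', 'SVs',
                 'Measurement_Method']).sum().reset_index()
# One row remains: Count_Person_5Years with observation 220.
```

Including `Measurement_Method` in the groupby keys (as this PR does) keeps rows from different measurement methods from being summed together.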
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
{"1970-79":"https://www2.census.gov/programs-surveys/popest/tables/1900-1980/counties/asrh/co-asr-7079.csv",
"1980-89":"https://www2.census.gov/programs-surveys/popest/datasets/1980-1990/counties/asrh/pe-02.csv",
"2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA6.csv"}
"2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA6.csv",
"2020-23":"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/counties/asrh/cc-est2023-alldata.csv"}

11 changes: 11 additions & 0 deletions scripts/us_census/pep/population_estimates_by_asr/input_url.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"download_path": "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/counties/asrh/cc-est2023-alldata.csv"
},
{
"download_path": "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/national/asrh/nc-est2023-agesex-res.csv"
},
{
"download_path": "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/state/asrh/sc-est2023-alldata6.csv"
}
]
19 changes: 19 additions & 0 deletions scripts/us_census/pep/population_estimates_by_asr/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"import_specifications": [
{
"import_name": "USCensusPEP_AgeSexRace",
"curator_emails": ["[email protected]"],
"provenance_url": "https://www2.census.gov/programs-surveys/popest/tables",
"provenance_description": "The Census Bureau's Population Estimates Program (PEP) produces estimates of the population for the United States.",
"scripts": ["process.py"],
"import_inputs": [
{
"template_mcf": "output/usa_population_asr.tmcf",
"cleaned_csv": "output/usa_population_asr.csv"
}
],
"cron_schedule": "0 05 * * 1"
}
]
}

Original file line number Diff line number Diff line change
@@ -14,4 +14,6 @@
"1980-90files":["E8081CQI.TXT", "E8182CQI.TXT", "E8283CQI.TXT", "E8384CQI.TXT",
"E8485CQI.TXT", "E8586CQI.TXT", "E8687CQI.TXT", "E8788CQI.TXT",
"E8889CQI.TXT", "E8990CQI.TXT"],
"2000-10":"https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/national/us-est00int-alldata.csv"}
"2000-10":"https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/national/us-est00int-alldata.csv",
"2020-23":"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/national/asrh/nc-est2023-agesex-res.csv"}

173 changes: 137 additions & 36 deletions scripts/us_census/pep/population_estimates_by_asr/process.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2022 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -19,6 +19,13 @@
import pandas as pd
import numpy as np

import requests
import shutil
import time
import json
from datetime import datetime as dt
from absl import logging

from absl import app
from absl import flags
from national_1900_1959 import national1900
@@ -27,20 +34,50 @@
from national_2000_2010 import national2000
from national_2010_2019 import national2010
from national_2020_2021 import national2020
from national_2020_2023 import national2023
from state_1970_1979 import state1970
from state_1990_2000 import state1990
from state_2000_2010 import state2000
from state_2010_2020 import state2010
from state_2020_2023 import state2020
from county_1970_1979 import county1970
from county_1980_1989 import county1980
from county_1990_2000 import county1990
from county_2000_2010 import county2000
from county_2010_2020 import county2010
from county_2020_2023 import county2020

FLAGS = flags.FLAGS
DEFAULT_INPUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"input_data")
flags.DEFINE_string('mode', '', 'Options: download or process')
flags.DEFINE_string("input_path", DEFAULT_INPUT_PATH, "Import Data File's List")
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
_INPUT_FILE_PATH = os.path.join(_MODULE_DIR, 'input_files')
_FILES_TO_DOWNLOAD = None


def add_future_year_urls():
global _FILES_TO_DOWNLOAD
with open(os.path.join(_MODULE_DIR, 'input_url.json'), 'r') as input_file:
_FILES_TO_DOWNLOAD = json.load(input_file)
urls_to_scan = [
"https://www2.census.gov/programs-surveys/popest/datasets/2020-{YEAR}/counties/asrh/cc-est{YEAR}-alldata.csv",
"https://www2.census.gov/programs-surveys/popest/datasets/2020-{YEAR}/national/asrh/nc-est{YEAR}-agesex-res.csv",
"https://www2.census.gov/programs-surveys/popest/datasets/2020-{YEAR}/state/asrh/sc-est{YEAR}-alldata6.csv"
]
if dt.now().year > 2023:
YEAR = dt.now().year
for url in urls_to_scan:
url_to_check = url.format(YEAR=YEAR)
try:
check_url = requests.head(url_to_check)
if check_url.status_code == 200:
_FILES_TO_DOWNLOAD.append({"download_path": url_to_check})

except requests.exceptions.RequestException:
logging.fatal(f"URL is not accessible {url_to_check}")


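The `{YEAR}` template URLs above can be expanded without touching the network; a minimal sketch (the function name and year range are illustrative, not part of the script):

```python
from datetime import datetime as dt


def future_year_urls(templates, start_year=2024, end_year=None):
    """Expand {YEAR} placeholders for every year from start_year through end_year."""
    if end_year is None:
        end_year = dt.now().year
    return [t.format(YEAR=year)
            for t in templates
            for year in range(start_year, end_year + 1)]
```

Each candidate URL can then be probed with `requests.head`, as the script does, appending only those that return 200.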
MCF_TEMPLATE = ("Node: dcid:{pv1}\n"
"typeOf: dcs:StatisticalVariable\n"
@@ -203,51 +240,107 @@ def process(self):
output_path = os.path.dirname(self._cleaned_csv_file_path)
if not os.path.exists(output_path):
os.mkdir(output_path)
processed_count = 0
total_files_to_process = len(self._input_files)
if total_files_to_process == 0:
logging.fatal(
f"No input files found in the directory: {self._input_files}")
logging.info(f"Number of files to be processed {len(self._input_files)}")
sv_list = []
# data_df is used to read every single file which has been generated.
# final_df concatenates all these files.
expected_columns = ['geo_ID', 'Year', 'observation', 'Measurement_Method', 'SVs']
for file_path in self._input_files:
logging.info(f"Processing --- {file_path}")
data_df = pd.read_csv(file_path)
final_df = pd.concat([final_df, data_df])
sv_list += data_df["SVs"].to_list()
# Drop the unwanted columns and NA.
final_df.drop(columns=['Unnamed: 0'], inplace=True)
final_df = final_df.dropna()
final_df['Year'] = final_df['Year'].astype(float).astype(int)
final_df = final_df.sort_values(by=['Year', 'geo_ID'])
final_df = _measurement_method(final_df)
final_df.to_csv(self._cleaned_csv_file_path, index=False)
sv_list = list(set(sv_list))
sv_list.sort()
self._generate_mcf(sv_list)
self._generate_tmcf()
for column in expected_columns:
if column not in data_df.columns:
logging.fatal(f"Error: {file_path} is missing column {column}")
if not data_df.empty:
processed_count += 1
final_df = pd.concat([final_df, data_df])
# final_df.to_csv("final_csv.csv")
sv_list += data_df["SVs"].to_list()
else:
logging.fatal(f"Failed to process {file_path}")

logging.info(f"Number of files processed {processed_count}")
# After processing all files, ensure all files were processed
if processed_count == total_files_to_process and total_files_to_process > 0:
# Drop the unwanted columns and NA.
logging.info(f"Dropping unwanted columns and NA")
final_df.drop(columns=['Unnamed: 0'], inplace=True)
final_df = final_df.dropna()
final_df['Year'] = final_df['Year'].astype(float).astype(int)
final_df = final_df.sort_values(by=['Year', 'geo_ID'])
final_df = _measurement_method(final_df)
# To remove inconsistent duplicate values, e.g. 'Count_Person_0Years_Male', observationDate: '2020', value1: 1891716.0, value2: 1876349.0
final_df = final_df.drop_duplicates(
subset=['geo_ID', 'Year', 'Measurement_Method', 'SVs'],
keep='last')
final_df.to_csv(self._cleaned_csv_file_path, index=False)
sv_list = list(set(sv_list))
sv_list.sort()
logging.info(f"----Generating MCF and TMCF----")
self._generate_mcf(sv_list)
self._generate_tmcf()
else:
logging.fatal(
f"File processing mismatch: expected {total_files_to_process} files, but processed {processed_count}. Output file generation aborted"
)
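The all-files-processed gate can be isolated into a tiny predicate (a sketch with a hypothetical name; note that Python's bitwise `&` binds tighter than `==`, so plain `and` is the safe operator for this check):

```python
def all_files_processed(processed_count, total_files):
    """True only when at least one file was expected and every one was processed."""
    return total_files > 0 and processed_count == total_files
```

Guarding on `total_files > 0` also prevents an empty input directory from silently producing an empty output CSV.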

def main(_):
input_path = FLAGS.input_path
if not os.path.exists(input_path):
os.mkdir(input_path)
# Running the functions in individual files by Year and Area.

def download_files():
"""
This method calls the download functions for state, county and country for each year
Returns:
True if there were no errors
"""
national_url_file = "national.json"
state_url_file = "state.json"
county_url_file = "county.json"
output_folder = "input_data"
national1900(output_folder)
national1960(output_folder)
national1980(national_url_file, output_folder)
national2000(national_url_file, output_folder)
national2010(national_url_file, output_folder)
national2020(national_url_file, output_folder)
state1970(state_url_file, output_folder)
state1990(state_url_file, output_folder)
state2000(state_url_file, output_folder)
state2010(state_url_file, output_folder)
county1970(county_url_file, output_folder)
county1980(county_url_file, output_folder)
county1990(output_folder)
county2000(output_folder)
county2010(county_url_file, output_folder)
try:
add_future_year_urls()
national1900(output_folder)
national1960(output_folder)
national1980(national_url_file, output_folder)
national2000(national_url_file, output_folder)
national2010(national_url_file, output_folder)
national2020(national_url_file, output_folder)

state1970(state_url_file, output_folder)
state1990(state_url_file, output_folder)
state2000(state_url_file, output_folder)
state2010(state_url_file, output_folder)

county1970(county_url_file, output_folder)
county1980(county_url_file, output_folder)
county1990(output_folder)
county2000(output_folder)
county2010(county_url_file, output_folder)


global _FILES_TO_DOWNLOAD
for file in _FILES_TO_DOWNLOAD:
#file_name_to_save = None
url = file['download_path']
if 'national' in url:
national2023(url, output_folder)
if 'state' in url:
state2020(url, output_folder)
if 'counties' in url:
county2020(url, output_folder)
except Exception as e:
logging.fatal(f"Error while downloading: {e}")
return True


def main(_):
mode = FLAGS.mode
input_path = FLAGS.input_path
if not os.path.exists(input_path):
os.mkdir(input_path)
ip_files = os.listdir(input_path)
ip_files = [input_path + os.sep + file for file in ip_files]
data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
@@ -256,8 +349,16 @@ def main(_):
cleaned_csv_path = os.path.join(data_file_path, "usa_population_asr.csv")
mcf_path = os.path.join(data_file_path, "usa_population_asr.mcf")
tmcf_path = os.path.join(data_file_path, "usa_population_asr.tmcf")
loader = USCensusPEPByASR(ip_files, cleaned_csv_path, mcf_path, tmcf_path)
loader.process()
# Running the functions in individual files by Year and Area
download_status = True
if mode == "" or mode == "download":
# download & process
add_future_year_urls()
download_status = download_files()
if download_status and (mode == "" or mode == "process"):
loader = USCensusPEPByASR(ip_files, cleaned_csv_path, mcf_path,
tmcf_path)
loader.process()


if __name__ == "__main__":
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2022 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -74,7 +74,6 @@ class TestPreprocess(unittest.TestCase):
base = USCensusPEPByASR(ip_data, cleaned_csv_file_path, mcf_file_path,
tmcf_file_path)
base.process()
shutil.rmtree(input_path, ignore_errors=True)

def test_mcf_tmcf_files(self):
"""
@@ -100,11 +99,6 @@ def test_mcf_tmcf_files(self):

with open(self.tmcf_file_path, encoding="UTF-8") as tmcf_file:
tmcf_data = tmcf_file.read()
if path.exists(self.mcf_file_path):
os.remove(self.mcf_file_path)
if path.exists(self.tmcf_file_path):
os.remove(self.tmcf_file_path)

self.assertEqual(expected_mcf_data.strip(), mcf_data.strip())
self.assertEqual(expected_tmcf_data.strip(), tmcf_data.strip())

@@ -123,7 +117,9 @@ def test_create_csv(self):

with open(self.cleaned_csv_file_path, encoding="utf-8-sig") as csv_file:
csv_data = csv_file.read()
if path.exists(self.cleaned_csv_file_path):
os.remove(self.cleaned_csv_file_path)

self.assertEqual(expected_csv_data.strip(), csv_data.strip())


if __name__ == '__main__':
unittest.main()
3 changes: 2 additions & 1 deletion scripts/us_census/pep/population_estimates_by_asr/state.json
Original file line number Diff line number Diff line change
@@ -10,4 +10,5 @@
"https://www2.census.gov/programs-surveys/popest/tables/1990-2000/state/asrh/sasrh98.txt",
"https://www2.census.gov/programs-surveys/popest/tables/1990-2000/state/asrh/sasrh99.txt"],
"2000-10":"https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/st-est00int-alldata.csv",
"2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/asrh/SC-EST2020-ALLDATA6.csv"}
"2010-20":"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/asrh/SC-EST2020-ALLDATA6.csv",
"2020-23":"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/state/asrh/sc-est2023-alldata6.csv"}