diff --git a/scripts/us_census/pep/monthly_population_estimate/README.md b/scripts/us_census/pep/monthly_population_estimate/README.md index f607a13e8c..e22b5690c7 100644 --- a/scripts/us_census/pep/monthly_population_estimate/README.md +++ b/scripts/us_census/pep/monthly_population_estimate/README.md @@ -1,12 +1,12 @@ # US Census PEP: National Population Count by Residential Status and Military Status ## About the Dataset -This dataset has Population Count Estimates for the United States from the year 1980 to 2022 on a monthly basis. +This dataset has monthly Population Count Estimates for the United States from the year 1980 through the latest available year. The population is categorized by residential status (resident,InArmedForcesOverseas), military status(Civilian,InArmedForces) and a combination of the same. ### Download URL -The data in txt/xls/xlsx formats are downloadable from within https://www2.census.gov/programs-surveys/popest/tables. The actual URLs are listed in file_urls.json. +The data in txt/xls/xlsx formats are downloadable from within https://www2.census.gov/programs-surveys/popest/tables. The actual URLs are listed in input_url.json. #### API Output These are the attributes that we will use @@ -44,12 +44,22 @@ Run the test cases ```/bin/python3 scripts/us_census/pep/monthly_population_estimate/preprocess_test.py ``` +### Import Procedure +[Updated the script on November 11, 2024] +Downloading input files is now integrated into preprocess.py, eliminating the need to run the separate download.py script. +All source file URLs, including future URLs adhering to the same structure, are centrally managed in the input_url.json file. +All input files required for processing should be stored within the designated "input_files" folder. -### Import Procedure +### Downloading and Processing Data -The below script make a new folder named as input_data (if not already present) where the download.py script is present and will download the data into this folder. 
-`/bin/python3 scripts/us_census/pep/monthly_population_estimate/download.py` +To perform "download and process", run the below command: + python3 preprocess.py +Running this command generates the input_files folder and the csv, mcf and tmcf files. -The below script will generate csv and mcf files. -`/bin/python3 scripts/us_census/pep/monthly_population_estimate/preprocess.py` +If you want to perform "only process", run the below command: + python3 preprocess.py --mode=process + +If you want to perform "only download", run the below command: + python3 preprocess.py --mode=download + \ No newline at end of file diff --git a/scripts/us_census/pep/monthly_population_estimate/download.py b/scripts/us_census/pep/monthly_population_estimate/download.py deleted file mode 100644 index 3c478367a4..0000000000 --- a/scripts/us_census/pep/monthly_population_estimate/download.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" A Script to download, perform some basic transformations to - USA Census PEP monthly population data from the URLS in - provided json file and save it as an xlsx file. 
-""" - -import os -import json -import pandas as pd -import numpy as np -from absl import app -from absl import flags - -_FLAGS = flags.FLAGS -_URLS_JSON_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "file_urls.json") - -_URLS_JSON = None -with open(_URLS_JSON_PATH, encoding="UTF-8") as file: - _URLS_JSON = json.load(file) - -# Flag names are globally defined! So in general, we need to be -# careful to pick names that are unlikely to be used by other libraries. -# If there is a conflict, we'll get an error at import time. -flags.DEFINE_list("us_census_pep_monthly_pop_estimate_url", \ - _URLS_JSON["urls"], "Import Data URL's List") - -_HEADER = 1 -_SCALING_FACTOR_TXT_FILE = 1000 - - -def _save_data(url: str, download_local_path: str) -> None: - """ - This method loads the Data from url to pandas Dataframe. - Writes the data to local path provided as one the parameter. - - Args: - url (str): Url of the dataset - download_local_path (str): LocalPath to save the datasets. 
- - Returns: - None - """ - df = None - file_name = url.split("/")[-1] - if ".xls" in url: - df = pd.read_excel(url, header=_HEADER) - df.to_excel(os.path.join(download_local_path, file_name), - index=False, - header=False, - engine='xlsxwriter') - elif ".csv" in url: - file_name = file_name.replace(".csv", ".xlsx") - df = pd.read_csv(url, header=None) - df = _clean_csv_file(df) - df.to_excel(os.path.join(download_local_path, file_name), - index=False, - engine='xlsxwriter') - elif ".txt" in url: - file_name = file_name.replace(".txt", ".xlsx") - cols = [ - "Year and Month", "Date", "Resident Population", - "Resident Population Plus Armed Forces Overseas", - "Civilian Population", "Civilian NonInstitutionalized Population" - ] - df = pd.read_table(url, - index_col=False, - delim_whitespace=True, - engine='python', - skiprows=17, - names=cols) - # Skipping 17 rows as the initial 17 rows contains the information about - # the file being used, heading files spread accross multiple lines and - # other irrelevant information like source/contact details. - df = _clean_txt_file(df) - # Multiplying the data with scaling factor 1000. - for col in df.columns: - if "year" not in col.lower(): - df[col] = df[col].apply(_mulitply_scaling_factor) - df.to_excel(os.path.join(download_local_path, file_name), - index=False, - engine='xlsxwriter') - - -def _concat_cols(col: pd.Series) -> pd.Series: - """ - This method concats two DataFrame column values - with space in-between. 
- - Args: - col[0] (Series) : DataFrame Column of dtype str - col[1] (Series) : DataFrame Column of dtype str - - Returns: - res (Series) : Concatenated DataFrame Columns - """ - # Looking at the data whenever col[0] has year, col[1] is None - # Thus concatinating Date with Month which is needed here - res = col[0] - if col[1] is None: - return res - res = col[0] + ' ' + col[1] - return res - - -def _mulitply_scaling_factor(col: pd.Series) -> pd.Series: - """ - This method multiply dataframe column with scaling factor. - - Args: - col (Series): DataFrame Column of dtype int - **kwargs (dict): Dict with key 'scaling_factor' and value type int - - Returns: - res (Series): DataFrame column values mulitplied by scaling_factor. - """ - res = col - if col not in [None, np.NAN]: - if col.isdigit(): - res = int(col) * _SCALING_FACTOR_TXT_FILE - return res - - -def _clean_csv_file(df: pd.DataFrame) -> pd.DataFrame: - """ - This method cleans the dataframe loaded from a csv file format. - Also, Performs transformations on the data. - - Args: - df (DataFrame) : DataFrame of csv dataset - - Returns: - df (DataFrame) : Transformed DataFrame for txt dataset. - """ - # Removal of file description and headers in the initial lines of the input - # - # Input Data: - # table with row headers in column A and column headers in rows 3 through 5 (leading dots indicate sub-parts) - # Table 1. 
Monthly Population Estimates for the United States: April 1, 2000 to December 1, 2010 - # Year and Month Resident Population Resident Population Plus Armed Forces Overseas Civilian Population Civilian Noninstitutionalized Population - # 2000 - # .April 1 28,14,24,602 28,16,52,670 28,02,00,922 27,61,62,490 - # .May 1 28,16,46,806 28,18,76,634 28,04,28,534 27,63,89,920 - # - # Output Data: - # (Made Headers) Year and Month Resident Population Resident Population Plus Armed Forces Overseas Civilian Population Civilian Noninstitutionalized Population - # 2000 - # .April 1 28,14,24,602 28,16,52,670 28,02,00,922 27,61,62,490 - # .May 1 28,16,46,806 28,18,76,634 28,04,28,534 27,63,89,920 - - idx = df[df[0] == "Year and Month"].index - df = df.iloc[idx.values[0] + 1:][:] - df = df.dropna(axis=1, how='all') - cols = [ - "Year and Month", "Resident Population", - "Resident Population Plus Armed Forces Overseas", "Civilian Population", - "Civilian NonInstitutionalized Population" - ] - df.columns = cols - for col in df.columns: - df[col] = df[col].str.replace(",", "") - return df - - -def _clean_txt_file(df: pd.DataFrame) -> pd.DataFrame: - """ - This method cleans the dataframe loaded from a txt file format. - Also, Performs transformations on the data. - - Args: - df (DataFrame) : DataFrame of txt dataset - scaling_factor_txt_file (int) : Scaling factor for text file - - Returns: - df (DataFrame) : Transformed DataFrame for txt dataset. 
- """ - # Month and Year are concatenated into a single column if they are not None - df['Year and Month'] = df[['Year and Month', 'Date']]\ - .apply(_concat_cols, axis=1) - df.drop(columns=['Date'], inplace=True) - for col in df.columns: - df[col] = df[col].str.replace(",", "") - - # The index numbers alotted as per where the columns are present to - # move the columns left - resident_population = 1 - resident_population_plus_armed_forces_overseas = 2 - civilian_population = 3 - civilian_noninstitutionalized_population = 4 - # Moving the row data left upto one index value. - # As the text file has (census) mentioned in some rows and it makes the - # other column's data shift by one place, we need to shift it back to the - # original place. - idx = df[df['Resident Population'] == "(census)"].index - df.iloc[idx, resident_population] = df.iloc[idx][ - "Resident Population Plus Armed Forces Overseas"] - df.iloc[idx, resident_population_plus_armed_forces_overseas] = df.iloc[idx][ - "Civilian Population"] - df.iloc[idx, civilian_population] = df.iloc[idx][ - "Civilian NonInstitutionalized Population"] - df.iloc[idx, civilian_noninstitutionalized_population] = np.NAN - return df - - -def download(download_path: str, file_urls: list) -> None: - """ - This method iterates on each url and calls the above defined - functions to download and clean the data. - - Args: - download_path (str) : Local Path to download datasets from URLS - file_urls (list) : List of dataset URLS. - - Returns: - df (DataFrame) : Transformed DataFrame for txt dataset. 
- """ - if not os.path.exists(download_path): - os.mkdir(download_path) - for url in file_urls: - _save_data(url, download_path) - - -def main(_): - file_urls = _FLAGS.us_census_pep_monthly_pop_estimate_url - path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "input_data") - download(path, file_urls) - - -if __name__ == "__main__": - app.run(main) diff --git a/scripts/us_census/pep/monthly_population_estimate/download_test.py b/scripts/us_census/pep/monthly_population_estimate/download_test.py deleted file mode 100644 index 8f097d3ffd..0000000000 --- a/scripts/us_census/pep/monthly_population_estimate/download_test.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Script to automate the testing for USA Population preprocess script. -""" - -import os -import unittest -from os import path -from download import download -import pandas as pd -# module_dir_ is the path to where this test is running from. 
-module_dir_ = os.path.dirname(__file__) -_TEST_DATA_FOLDER = os.path.join(module_dir_, "test_data") -_OP_DATA_FOLDER = os.path.join(module_dir_, "test_output_data_download") - - -class TestPreprocess(unittest.TestCase): - """ - TestPreprocess is inherting unittest class - properties which further requried for unit testing - """ - - def test_create_xlsx(self): - """ - This method is required to test between output generated - preprocess script and excepted output files like XLSX - """ - ip_data_path = [os.path.join(_TEST_DATA_FOLDER, "test_census_data.csv")] - download(_OP_DATA_FOLDER, ip_data_path) - expected_xlsx_file_path = os.path.join( - _TEST_DATA_FOLDER, "download_expected_USA_Population_Count.xlsx") - - expected_df = pd.read_excel(expected_xlsx_file_path) - actual_df = pd.read_excel(_OP_DATA_FOLDER + "/test_census_data.xlsx") - - if path.exists(_OP_DATA_FOLDER): - os.remove(_OP_DATA_FOLDER) - - self.assertEqual(True, actual_df.equals(expected_df)) diff --git a/scripts/us_census/pep/monthly_population_estimate/file_urls.json b/scripts/us_census/pep/monthly_population_estimate/file_urls.json deleted file mode 100644 index 6316eae3bb..0000000000 --- a/scripts/us_census/pep/monthly_population_estimate/file_urls.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "urls" : ["https://www2.census.gov/programs-surveys/popest/tables/2000-2009/state/totals/na-est2009-01.csv", - "https://www2.census.gov/programs-surveys/popest/tables/2020-2021/national/totals/NA-EST2021-POP.xlsx", - "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/national/totals/na-est2019-01.xlsx", - "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/national/totals/nat-total.txt"] -} \ No newline at end of file diff --git a/scripts/us_census/pep/monthly_population_estimate/input_url.json b/scripts/us_census/pep/monthly_population_estimate/input_url.json new file mode 100644 index 0000000000..34acb3089d --- /dev/null +++ 
b/scripts/us_census/pep/monthly_population_estimate/input_url.json @@ -0,0 +1,12 @@ +[ + { + "download_path": "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/national/totals/nat-total.txt" + }, + { + "download_path": "https://www2.census.gov/programs-surveys/popest/tables/2000-2009/state/totals/na-est2009-01.csv" + }, + { + "download_path": "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/national/totals/na-est2019-01.xlsx" + } + +] \ No newline at end of file diff --git a/scripts/us_census/pep/monthly_population_estimate/manifest.json b/scripts/us_census/pep/monthly_population_estimate/manifest.json new file mode 100644 index 0000000000..78fd2eb75f --- /dev/null +++ b/scripts/us_census/pep/monthly_population_estimate/manifest.json @@ -0,0 +1,22 @@ +{ + "import_specifications": [ + { + "import_name": "USCensusPEP_MonthlyPopulation", + "curator_emails": [ + "mogalluru@google.com" + ], + "provenance_url": "https://www2.census.gov/programs-surveys/popest/tables/", + "provenance_description": "The Census Bureau's Population Estimates Program (PEP) produces estimates of the population for the United States.", + "scripts": [ + "preprocess.py" + ], + "import_inputs": [ + { + "template_mcf": "output/USA_Population_Count.tmcf", + "cleaned_csv": "output/USA_Population_Count.csv" + } + ], + "cron_schedule": "0 07 * * 1" + } + ] +} diff --git a/scripts/us_census/pep/monthly_population_estimate/preprocess.py b/scripts/us_census/pep/monthly_population_estimate/preprocess.py index 924eadc517..51ab2a1bca 100644 --- a/scripts/us_census/pep/monthly_population_estimate/preprocess.py +++ b/scripts/us_census/pep/monthly_population_estimate/preprocess.py @@ -15,22 +15,43 @@ from the datasets in the provided local path. Typical usage: 1. python3 preprocess.py - 2. python3 preprocess.py -i input_data + 2. python3 preprocess.py -mode='download' + 3. 
python3 preprocess.py -mode='process' """ from dataclasses import replace import os import re - +import warnings +import requests +import numpy as np +import time +import json +import sys +from datetime import datetime as dt + +warnings.filterwarnings('ignore') import pandas as pd from absl import app from absl import flags +from absl import logging pd.set_option("display.max_columns", None) FLAGS = flags.FLAGS +flags.DEFINE_string('mode', '', 'Options: download or process') +_MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) +_INPUT_FILE_PATH = os.path.join(_MODULE_DIR, 'input_files') default_input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "input_data") + "input_files") flags.DEFINE_string("input_path", default_input_path, "Import Data File's List") +_HEADER = 1 +_SCALING_FACTOR_TXT_FILE = 1000 + +#Creating folder to store the raw data from source +raw_data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "raw_data") +if not os.path.exists(raw_data_path): + os.mkdir(raw_data_path) _MCF_TEMPLATE = ("Node: dcid:{dcid}\n" "typeOf: dcs:StatisticalVariable\n" @@ -174,9 +195,9 @@ class CensusUSACountryPopulation: Files using pre-defined templates. """ - def __init__(self, input_files: list, csv_file_path: str, - mcf_file_path: str, tmcf_file_path: str) -> None: - self._input_files = input_files + def __init__(self, input_path: str, csv_file_path: str, mcf_file_path: str, + tmcf_file_path: str) -> None: + self.input_path = input_path #added self._cleaned_csv_file_path = csv_file_path self._mcf_file_path = mcf_file_path self._tmcf_file_path = tmcf_file_path @@ -267,28 +288,37 @@ def _transform_data(self, df: pd.DataFrame) -> None: CSV file format. Arguments: - file (str) : Dataset File Path + df (DataFrame): Input DataFrame containing the raw data to be transformed. Returns: - df (DataFrame) : DataFrame. + bool: Returns True if the transformation and file saving are successful, + False if an error occurs during processing. 
""" - df = self._transform_df(df) + try: + df = self._transform_df(df) - if self._df is None: - self._df = df - else: - self._df = self._df.append(df, ignore_index=True) - - self._df.sort_values(by=['Date', 'date_range'], - ascending=False, - inplace=True) - self._df.drop_duplicates("Date", keep="first", inplace=True) - self._df.drop(['date_range'], axis=1, inplace=True) - float_col = self._df.select_dtypes(include=['float64']) - for col in float_col.columns.values: - self._df[col] = self._df[col].astype('int64') - self._df.to_csv(self._cleaned_csv_file_path, index=False) + if self._df is None: + self._df = df + else: + self._df = pd.concat([self._df, df], ignore_index=True) + + self._df.sort_values(by=['Date', 'date_range'], + ascending=False, + inplace=True) + # Data for 2020 exists in two sources, causing overlap. We'll eliminate duplicates + self._df.drop_duplicates("Date", keep="first", inplace=True) + self._df.drop(['date_range'], axis=1, inplace=True) + float_col = self._df.select_dtypes(include=['float64']) + for col in float_col.columns.values: + try: + self._df[col] = self._df[col].astype('int64') + except: + pass + self._df.to_csv(self._cleaned_csv_file_path, index=False) + except Exception as e: + logging.fatal(f'Error when processing file:-{e}') + return True def _generate_mcf(self, df_cols: list) -> None: """ @@ -301,47 +331,49 @@ def _generate_mcf(self, df_cols: list) -> None: Returns: None """ - - mcf_nodes = [] - for col in df_cols: - pvs = [] - residence = "" - status = "" - armedf = "" - if col.lower() in ["date", "location"]: - continue - if re.findall('Resident', col): - if re.findall('InUSArmedForcesOverseas', col): - status = "USResident__InUSArmedForcesOverseas" - else: - status = "USResident" - residence = "residentStatus: dcs:" + status - pvs.append(residence) - elif re.findall('ArmedForces', col): - residence = "residentStatus: dcs:" + "InUSArmedForcesOverseas" - pvs.append(residence) - if re.findall('Resides', col): - if 
re.findall('Household', col): - residence = "residenceType: dcs:" + "Household" + try: + mcf_nodes = [] + for col in df_cols: + pvs = [] + residence = "" + status = "" + armedf = "" + if col.lower() in ["date", "location"]: + continue + if re.findall('Resident', col): + if re.findall('InUSArmedForcesOverseas', col): + status = "USResident__InUSArmedForcesOverseas" + else: + status = "USResident" + residence = "residentStatus: dcs:" + status pvs.append(residence) - if re.findall('Civilian', col): - armedf = "armedForcesStatus: dcs:Civilian" - pvs.append(armedf) - if re.findall('NonInstitutionalized', col): - residence = ("institutionalization: dcs:" + - "USC_NonInstitutionalized") + elif re.findall('ArmedForces', col): + residence = "residentStatus: dcs:" + "InUSArmedForcesOverseas" pvs.append(residence) - if re.findall('Count_Person_InUSArmedForcesOverseas', col): - armedf = "armedForcesStatus: dcs:InArmedForces" - pvs.append(armedf) - node = _MCF_TEMPLATE.format(dcid=col, xtra_pvs='\n'.join(pvs)) - mcf_nodes.append(node) - - mcf = '\n'.join(mcf_nodes) - - # Writing Genereated MCF to local path. - with open(self._mcf_file_path, 'w+', encoding='utf-8') as f_out: - f_out.write(mcf.rstrip('\n')) + if re.findall('Resides', col): + if re.findall('Household', col): + residence = "residenceType: dcs:" + "Household" + pvs.append(residence) + if re.findall('Civilian', col): + armedf = "armedForcesStatus: dcs:Civilian" + pvs.append(armedf) + if re.findall('NonInstitutionalized', col): + residence = ("institutionalization: dcs:" + + "USC_NonInstitutionalized") + pvs.append(residence) + if re.findall('Count_Person_InUSArmedForcesOverseas', col): + armedf = "armedForcesStatus: dcs:InArmedForces" + pvs.append(armedf) + node = _MCF_TEMPLATE.format(dcid=col, xtra_pvs='\n'.join(pvs)) + mcf_nodes.append(node) + + mcf = '\n'.join(mcf_nodes) + + # Writing Genereated MCF to local path. 
+ with open(self._mcf_file_path, 'w+', encoding='utf-8') as f_out: + f_out.write(mcf.rstrip('\n')) + except Exception as e: + logging.fatal(f'Error when Generating MCF file:-{e}') def _generate_tmcf(self, df_cols: list) -> None: """ @@ -378,30 +410,289 @@ def process(self): calls defined methods to clean, generate final cleaned CSV file, MCF file and TMCF file. """ - for file in self._input_files: + #input_path = FLAGS.input_path + ip_files = os.listdir(self.input_path) + self.input_files = [ + self.input_path + os.sep + file for file in ip_files + ] + if len(self.input_files) == 0: + logging.info("No files to process") + return + processed_count = 0 + total_files_to_process = len(self.input_files) + logging.info(f"No of files to be processed {len(self.input_files)}") + for file in self.input_files: df = self._load_data(file) - self._transform_data(df) - self._generate_mcf(self._df.columns) - self._generate_tmcf(self._df.columns) + result = self._transform_data(df) + if result: + processed_count += 1 + else: + logging.fatal(f'Failed to process {file}') + logging.info(f"No of files processed {processed_count}") + if total_files_to_process > 0 & (processed_count + == total_files_to_process): + self._generate_mcf(self._df.columns) + self._generate_tmcf(self._df.columns) + else: + logging.fatal( + "Aborting output files as no of files to process not matching processed files" + ) -def main(_): - input_path = FLAGS.input_path +def add_future_year_urls(): + """ + This method scans the download URLs for future years. 
+ """ + global _FILES_TO_DOWNLOAD + with open(os.path.join(_MODULE_DIR, 'input_url.json'), 'r') as inpit_file: + _FILES_TO_DOWNLOAD = json.load(inpit_file) + urls_to_scan = [ + "https://www2.census.gov/programs-surveys/popest/tables/2020-{YEAR}/national/totals/NA-EST{YEAR}-POP.xlsx" + ] + # This method will generate URLs for the years 2021 to 2029 + # need to the latest avaibale year + for future_year in range(2030, 2021, -1): + if dt.now().year > future_year: + YEAR = future_year + for url in urls_to_scan: + url_to_check = url.format(YEAR=YEAR) + try: + check_url = requests.head(url_to_check) + if check_url.status_code == 200: + _FILES_TO_DOWNLOAD.append( + {"download_path": url_to_check}) + break + + except: + logging.error(f"URL is not accessable {url_to_check}") + + +def _clean_csv_file(df: pd.DataFrame) -> pd.DataFrame: + """ + This method cleans the dataframe loaded from a csv file format. + Also, Performs transformations on the data. - ip_files = os.listdir(input_path) - ip_files = [input_path + os.sep + file for file in ip_files] + Args: + df (DataFrame) : DataFrame of csv dataset - # Defining Output file names - data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "output") - cleaned_csv_path = os.path.join(data_file_path, "USA_Population_Count.csv") - mcf_path = os.path.join(data_file_path, "USA_Population_Count.mcf") - tmcf_path = os.path.join(data_file_path, "USA_Population_Count.tmcf") + Returns: + df (DataFrame) : Transformed DataFrame for txt dataset. + """ + # Removal of file description and headers in the initial lines of the input + # + # Input Data: + # table with row headers in column A and column headers in rows 3 through 5 (leading dots indicate sub-parts) + # Table 1. 
Monthly Population Estimates for the United States: April 1, 2000 to December 1, 2010 + # Year and Month Resident Population Resident Population Plus Armed Forces Overseas Civilian Population Civilian Noninstitutionalized Population + # 2000 + # .April 1 28,14,24,602 28,16,52,670 28,02,00,922 27,61,62,490 + # .May 1 28,16,46,806 28,18,76,634 28,04,28,534 27,63,89,920 + # + # Output Data: + # (Made Headers) Year and Month Resident Population Resident Population Plus Armed Forces Overseas Civilian Population Civilian Noninstitutionalized Population + # 2000 + # .April 1 28,14,24,602 28,16,52,670 28,02,00,922 27,61,62,490 + # .May 1 28,16,46,806 28,18,76,634 28,04,28,534 27,63,89,920 + + idx = df[df[0] == "Year and Month"].index + df = df.iloc[idx.values[0] + 1:][:] + df = df.dropna(axis=1, how='all') + cols = [ + "Year and Month", "Resident Population", + "Resident Population Plus Armed Forces Overseas", "Civilian Population", + "Civilian NonInstitutionalized Population" + ] + df.columns = cols + for col in df.columns: + df[col] = df[col].str.replace(",", "") + return df + + +def _clean_txt_file(df: pd.DataFrame) -> pd.DataFrame: + """ + This method cleans the dataframe loaded from a txt file format. + Also, Performs transformations on the data. - loader = CensusUSACountryPopulation(ip_files, cleaned_csv_path, mcf_path, - tmcf_path) + Arguments: + df (DataFrame): DataFrame representing the loaded TXT dataset. + + Returns: + DataFrame: Transformed DataFrame after cleaning operations. 
+ """ + df['Year and Month'] = df[['Year and Month', 'Date']]\ + .apply(_concat_cols, axis=1) + df.drop(columns=['Date'], inplace=True) + for col in df.columns: + df[col] = df[col].str.replace(",", "") + + # The index numbers alotted as per where the columns are present to + # move the columns left + resident_population = 1 + resident_population_plus_armed_forces_overseas = 2 + civilian_population = 3 + civilian_noninstitutionalized_population = 4 + # Moving the row data left upto one index value. + # As the text file has (census) mentioned in some rows and it makes the + # other column's data shift by one place, we need to shift it back to the + # original place. + idx = df[df['Resident Population'] == "(census)"].index + df.iloc[idx, resident_population] = df.iloc[idx][ + "Resident Population Plus Armed Forces Overseas"] + df.iloc[idx, resident_population_plus_armed_forces_overseas] = df.iloc[idx][ + "Civilian Population"] + df.iloc[idx, civilian_population] = df.iloc[idx][ + "Civilian NonInstitutionalized Population"] + df.iloc[idx, civilian_noninstitutionalized_population] = np.NAN + return df + + +def _mulitply_scaling_factor(col: pd.Series) -> pd.Series: + """ + This method multiply dataframe column with scaling factor. + + Arguments: + col (Series): A DataFrame column of dtype int, containing the values to be scaled. + + Returns: + Series: A DataFrame column with values multiplied by the scaling factor. + """ + res = col + if col not in [None, np.NAN]: + if col.isdigit(): + res = int(col) * _SCALING_FACTOR_TXT_FILE + return res + + +def _concat_cols(col: pd.Series) -> pd.Series: + """ + This method concats two DataFrame column values + with space in-between. - loader.process() + Args: + col (Series): A pandas Series containing two values from the DataFrame. 
+ + Returns: + res (Series) : Concatenated DataFrame Columns + """ + res = col[0] + if col[1] is None: + return res + res = col[0] + ' ' + col[1] + return res + + +def download_files(): + """ + This method allows to download the input files. + """ + global _FILES_TO_DOWNLOAD + session = requests.session() + max_retry = 5 + for file_to_dowload in _FILES_TO_DOWNLOAD: + file_name = None + url = file_to_dowload['download_path'] + if 'file_name' in file_to_dowload and len( + file_to_dowload['file_name'] > 5): + file_name = file_to_dowload['file_name'] + else: + file_name = url.split('/')[-1] + retry_number = 0 + + is_file_downloaded = False + while is_file_downloaded == False: + try: + df = None + file_name = url.split("/")[-1] + + if ".xls" in url: + df = pd.read_excel(url, header=_HEADER) + df.to_excel(os.path.join(raw_data_path, file_name), + index=False, + header=False, + engine='xlsxwriter') + df.to_excel(os.path.join(_INPUT_FILE_PATH, file_name), + index=False, + header=False, + engine='xlsxwriter') + elif ".csv" in url: + with requests.get(url, stream=True) as response: + response.raise_for_status() + if response.status_code == 200: + with open(os.path.join(raw_data_path, file_name), + 'wb') as f: + f.write(response.content) + file_name = file_name.replace(".csv", ".xlsx") + df = pd.read_csv(url, header=None) + df = _clean_csv_file(df) + df.to_excel(os.path.join(_INPUT_FILE_PATH, file_name), + index=False, + engine='xlsxwriter') + elif ".txt" in url: + with requests.get(url, stream=True) as response: + response.raise_for_status() + if response.status_code == 200: + with open(os.path.join(raw_data_path, file_name), + 'wb') as f: + f.write(response.content) + file_name = file_name.replace(".txt", ".xlsx") + cols = [ + "Year and Month", "Date", "Resident Population", + "Resident Population Plus Armed Forces Overseas", + "Civilian Population", + "Civilian NonInstitutionalized Population" + ] + df = pd.read_table(url, + index_col=False, + delim_whitespace=True, + 
engine='python', + skiprows=17, + names=cols) + # Skipping 17 rows as the initial 17 rows contains the information about + # the file being used, heading files spread accross multiple lines and + # other irrelevant information like source/contact details. + df = _clean_txt_file(df) + # Multiplying the data with scaling factor 1000. + for col in df.columns: + if "year" not in col.lower(): + df[col] = df[col].apply(_mulitply_scaling_factor) + df.to_excel(os.path.join(_INPUT_FILE_PATH, file_name), + index=False, + engine='xlsxwriter') + + is_file_downloaded = True + logging.info(f"Downloaded file : {url}") + + except Exception as e: + logging.error(f"Retry file download {url} - {e}") + time.sleep(5) + retry_number += 1 + if retry_number > max_retry: + logging.fatal(f"Error downloading {url}") + logging.error("Exit from script") + sys.exit(1) + return True + + +def main(_): + mode = FLAGS.mode + # Defining Output file names + output_path = os.path.join(_MODULE_DIR, "output") + input_path = os.path.join(_MODULE_DIR, "input_files") + if not os.path.exists(input_path): + os.mkdir(input_path) + if not os.path.exists(output_path): + os.mkdir(output_path) + cleaned_csv_path = os.path.join(output_path, "USA_Population_Count.csv") + mcf_path = os.path.join(output_path, "USA_Population_Count.mcf") + tmcf_path = os.path.join(output_path, "USA_Population_Count.tmcf") + download_status = True + if mode == "" or mode == "download": + add_future_year_urls() + download_status = download_files() + if download_status and (mode == "" or mode == "process"): + loader = CensusUSACountryPopulation(FLAGS.input_path, cleaned_csv_path, + mcf_path, tmcf_path) + loader.process() if __name__ == "__main__": diff --git a/scripts/us_census/pep/monthly_population_estimate/preprocess_test.py b/scripts/us_census/pep/monthly_population_estimate/preprocess_test.py index fbe7121dd7..23a821b0ff 100644 --- a/scripts/us_census/pep/monthly_population_estimate/preprocess_test.py +++ 
b/scripts/us_census/pep/monthly_population_estimate/preprocess_test.py @@ -14,15 +14,16 @@ """ Script to automate the testing for USA Population preprocess script. """ - import os import unittest from os import path from preprocess import CensusUSACountryPopulation +from absl import flags # module_dir_ is the path to where this test is running from. -module_dir_ = os.path.dirname(__file__) -test_data_folder = os.path.join(module_dir_, "test_data") -op_data_folder = os.path.join(module_dir_, "test_output_data") +_MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) +_INPUT_FILE_PATH = os.path.join(_MODULE_DIR, "test_data", "input_files") +test_output_folder = os.path.join(_MODULE_DIR, "test_data", "output_data") +expected_op_folder = os.path.join(_MODULE_DIR, "test_data", "expected_output") class TestPreprocess(unittest.TestCase): @@ -31,13 +32,11 @@ class TestPreprocess(unittest.TestCase): properties which further requried for unit testing """ - cleaned_csv_file_path = os.path.join(op_data_folder, "data.csv") - mcf_file_path = os.path.join(op_data_folder, "test_census.mcf") - tmcf_file_path = os.path.join(op_data_folder, "test_census.tmcf") - - ip_data_path = [os.path.join(test_data_folder, "test_census_data.xlsx")] + cleaned_csv_file_path = os.path.join(test_output_folder, "data.csv") + mcf_file_path = os.path.join(test_output_folder, "test_census.mcf") + tmcf_file_path = os.path.join(test_output_folder, "test_census.tmcf") - base = CensusUSACountryPopulation(ip_data_path, cleaned_csv_file_path, + base = CensusUSACountryPopulation(_INPUT_FILE_PATH, cleaned_csv_file_path, mcf_file_path, tmcf_file_path) base.process() @@ -46,11 +45,11 @@ def test_mcf_tmcf_files(self): This method is required to test between output generated preprocess script and excepted output files like MCF File """ - expected_mcf_file_path = os.path.join( - test_data_folder, "expected_USA_Population_Count.mcf") + expected_mcf_file_path = os.path.join(expected_op_folder, + 
"USA_Population_Count.mcf") - expected_tmcf_file_path = os.path.join( - test_data_folder, "expected_USA_Population_Count.tmcf") + expected_tmcf_file_path = os.path.join(expected_op_folder, + "USA_Population_Count.tmcf") with open(expected_mcf_file_path, encoding="UTF-8") as expected_mcf_file: @@ -79,8 +78,8 @@ def test_create_csv(self): This method is required to test between output generated preprocess script and excepted output files like CSV """ - expected_csv_file_path = os.path.join( - test_data_folder, "expected_USA_Population_Count.csv") + expected_csv_file_path = os.path.join(expected_op_folder, + "USA_Population_Count.csv") expected_csv_data = "" with open(expected_csv_file_path, @@ -94,3 +93,7 @@ def test_create_csv(self): os.remove(self.cleaned_csv_file_path) self.assertEqual(expected_csv_data.strip(), csv_data.strip()) + + +if __name__ == "__main__": + unittest.main() diff --git a/scripts/us_census/pep/monthly_population_estimate/test_data/download_expected_USA_Population_Count.xlsx b/scripts/us_census/pep/monthly_population_estimate/test_data/download_expected_USA_Population_Count.xlsx deleted file mode 100644 index 7249a32103..0000000000 Binary files a/scripts/us_census/pep/monthly_population_estimate/test_data/download_expected_USA_Population_Count.xlsx and /dev/null differ diff --git a/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.csv b/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.csv deleted file mode 100644 index be8bc94191..0000000000 --- a/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.csv +++ /dev/null @@ -1,25 +0,0 @@ -Date,Location,Count_Person_InUSArmedForcesOverseas,Count_Person_USResident,Count_Person_USResidentOrInUSArmedForcesOverseas,Count_Person_Civilian,Count_Person_Civilian_NonInstitutionalized -2002-12,country/USA,308953,288910708,289219661,287657759,283567193 
-2002-11,country/USA,299808,288716345,289016153,287455156,283367384 -2002-10,country/USA,298850,288484263,288783113,287215377,283130237 -2002-09,country/USA,300697,288233967,288534664,286963476,282881204 -2002-08,country/USA,305751,287973348,288279099,286707011,282627594 -2002-07,country/USA,300904,287726647,288027551,286460080,282383435 -2002-06,country/USA,295655,287493821,287789476,286230294,282154728 -2002-05,country/USA,301620,287250118,287551738,285998163,281923687 -2002-04,country/USA,302569,287025879,287328448,285783873,281710480 -2002-03,country/USA,306955,286817724,287124679,285584303,281512166 -2002-02,country/USA,296700,286634747,286931447,285395443,281324390 -2002-01,country/USA,298260,286429775,286728035,285203166,281132861 -2001-12,country/USA,287595,286225459,286513054,284988967,280920258 -2001-11,country/USA,283100,286004703,286287803,284777261,280709686 -2001-10,country/USA,277453,285770015,286047468,284562038,280495455 -2001-09,country/USA,223757,285571270,285795027,284324928,280259557 -2001-08,country/USA,227016,285298620,285525636,284058779,279994583 -2001-07,country/USA,227463,285039803,285267266,283803584,279740517 -2001-06,country/USA,227626,284795649,285023275,283566906,279505761 -2001-05,country/USA,225333,284550030,284775363,283321122,279261915 -2001-04,country/USA,222221,284327688,284549909,283092935,279035703 -2001-03,country/USA,221303,284100861,284322164,282863438,278808323 -2001-02,country/USA,221982,283890737,284112719,282654413,278601238 -2001-01,country/USA,208561,283690962,283899523,282442439,278390935 diff --git a/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.mcf b/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.mcf deleted file mode 100644 index 996fef3bda..0000000000 --- a/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.mcf +++ /dev/null @@ -1,36 +0,0 @@ -Node: 
dcid:Count_Person_InUSArmedForcesOverseas -typeOf: dcs:StatisticalVariable -populationType: dcs:Person -statType: dcs:measuredValue -measuredProperty: dcs:count -residentStatus: dcs:InUSArmedForcesOverseas -armedForcesStatus: dcs:InArmedForces - -Node: dcid:Count_Person_USResident -typeOf: dcs:StatisticalVariable -populationType: dcs:Person -statType: dcs:measuredValue -measuredProperty: dcs:count -residentStatus: dcs:USResident - -Node: dcid:Count_Person_USResidentOrInUSArmedForcesOverseas -typeOf: dcs:StatisticalVariable -populationType: dcs:Person -statType: dcs:measuredValue -measuredProperty: dcs:count -residentStatus: dcs:USResident__InUSArmedForcesOverseas - -Node: dcid:Count_Person_Civilian -typeOf: dcs:StatisticalVariable -populationType: dcs:Person -statType: dcs:measuredValue -measuredProperty: dcs:count -armedForcesStatus: dcs:Civilian - -Node: dcid:Count_Person_Civilian_NonInstitutionalized -typeOf: dcs:StatisticalVariable -populationType: dcs:Person -statType: dcs:measuredValue -measuredProperty: dcs:count -armedForcesStatus: dcs:Civilian -institutionalization: dcs:USC_NonInstitutionalized \ No newline at end of file diff --git a/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.tmcf b/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.tmcf deleted file mode 100644 index 906e93b366..0000000000 --- a/scripts/us_census/pep/monthly_population_estimate/test_data/expected_USA_Population_Count.tmcf +++ /dev/null @@ -1,44 +0,0 @@ -Node: E:USA_Population_Count->E0 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Count_Person_InUSArmedForcesOverseas -measurementMethod: dcs:dcAggregate/CensusPEPSurvey -observationAbout: C:USA_Population_Count->Location -observationDate: C:USA_Population_Count->Date -observationPeriod: "P1M" -value: C:USA_Population_Count->Count_Person_InUSArmedForcesOverseas - -Node: E:USA_Population_Count->E1 -typeOf: dcs:StatVarObservation -variableMeasured: 
dcs:Count_Person_USResident -measurementMethod: dcs:CensusPEPSurvey -observationAbout: C:USA_Population_Count->Location -observationDate: C:USA_Population_Count->Date -observationPeriod: "P1M" -value: C:USA_Population_Count->Count_Person_USResident - -Node: E:USA_Population_Count->E2 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Count_Person_USResidentOrInUSArmedForcesOverseas -measurementMethod: dcs:CensusPEPSurvey -observationAbout: C:USA_Population_Count->Location -observationDate: C:USA_Population_Count->Date -observationPeriod: "P1M" -value: C:USA_Population_Count->Count_Person_USResidentOrInUSArmedForcesOverseas - -Node: E:USA_Population_Count->E3 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Count_Person_Civilian -measurementMethod: dcs:CensusPEPSurvey -observationAbout: C:USA_Population_Count->Location -observationDate: C:USA_Population_Count->Date -observationPeriod: "P1M" -value: C:USA_Population_Count->Count_Person_Civilian - -Node: E:USA_Population_Count->E4 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Count_Person_Civilian_NonInstitutionalized -measurementMethod: dcs:CensusPEPSurvey -observationAbout: C:USA_Population_Count->Location -observationDate: C:USA_Population_Count->Date -observationPeriod: "P1M" -value: C:USA_Population_Count->Count_Person_Civilian_NonInstitutionalized \ No newline at end of file diff --git a/scripts/us_census/pep/monthly_population_estimate/test_data/test_census_data.csv b/scripts/us_census/pep/monthly_population_estimate/test_data/test_census_data.csv deleted file mode 100644 index b92efb590e..0000000000 --- a/scripts/us_census/pep/monthly_population_estimate/test_data/test_census_data.csv +++ /dev/null @@ -1,151 +0,0 @@ -table with row headers in column A and column headers in rows 3 through 5 (leading dots indicate sub-parts),,,,,,,,,,,,,, -"Table 1. 
Monthly Population Estimates for the United States: April 1, 2000 to December 1, 2010",,,,,,,,,,,,,, -Year and Month,Resident Population,"Resident -Population Plus Armed Forces Overseas",Civilian Population,Civilian Noninstitutionalized Population,,,,,,,,,, -2000,,,,,,,,,,,,,, -.April 1,"281,424,602","281,652,670","280,200,922","276,162,490",,,,,,,,,, -.May 1,"281,646,806","281,876,634","280,428,534","276,389,920",,,,,,,,,, -.June 1,"281,894,718","282,126,112","280,675,165","276,636,370",,,,,,,,,, -.July 1,"282,171,957","282,384,579","280,927,342","276,888,367",,,,,,,,,, -.August 1,"282,441,258","282,652,794","281,192,111","277,151,151",,,,,,,,,, -.September 1,"282,721,654","282,932,017","281,467,352","277,424,407",,,,,,,,,, -.October 1,"282,995,517","283,200,677","281,732,198","277,687,268",,,,,,,,,, -.November 1,"283,243,960","283,452,914","281,985,624","277,938,709",,,,,,,,,, -.December 1,"283,493,503","283,695,587","282,230,629","278,181,729",,,,,,,,,, -2001,,,,,,,,,,,,,, -.January 1,"283,711,841","283,920,402","282,463,318","278,412,433",,,,,,,,,, -.February 1,"283,915,092","284,137,074","282,678,768","278,625,898",,,,,,,,,, -.March 1,"284,128,687","284,349,990","282,891,264","278,836,409",,,,,,,,,, -.April 1,"284,359,005","284,581,226","283,124,252","279,067,412",,,,,,,,,, -.May 1,"284,584,820","284,810,153","283,355,912","279,297,087",,,,,,,,,, -.June 1,"284,833,913","285,061,539","283,605,170","279,544,360",,,,,,,,,, -.July 1,"285,081,556","285,309,019","283,845,337","279,782,541",,,,,,,,,, -.August 1,"285,343,325","285,570,341","284,103,484","280,039,084",,,,,,,,,, -.September 1,"285,618,928","285,842,685","284,372,586","280,306,582",,,,,,,,,, -.October 1,"285,820,656","286,098,109","284,612,679","280,545,071",,,,,,,,,, -.November 1,"286,058,306","286,341,406","284,830,864","280,761,652",,,,,,,,,, -.December 1,"286,282,016","286,569,611","285,045,524","280,974,708",,,,,,,,,, -2002,,,,,,,,,,,,,, -.January 
1,"286,489,300","286,787,560","285,262,691","281,190,271",,,,,,,,,, -.February 1,"286,697,229","286,993,929","285,457,925","281,383,901",,,,,,,,,, -.March 1,"286,883,155","287,190,110","285,649,734","281,574,106",,,,,,,,,, -.April 1,"287,094,280","287,396,849","285,852,274","281,775,042",,,,,,,,,, -.May 1,"287,321,472","287,623,092","286,069,517","281,990,681",,,,,,,,,, -.June 1,"287,568,123","287,863,778","286,304,596","282,224,156",,,,,,,,,, -.July 1,"287,803,914","288,104,818","286,537,347","282,455,312",,,,,,,,,, -.August 1,"288,053,796","288,359,547","286,787,459","282,702,455",,,,,,,,,, -.September 1,"288,317,604","288,618,301","287,047,113","282,959,140",,,,,,,,,, -.October 1,"288,571,092","288,869,942","287,302,206","283,211,264",,,,,,,,,, -.November 1,"288,806,336","289,106,144","287,545,147","283,451,236",,,,,,,,,, -.December 1,"289,003,868","289,312,821","287,750,919","283,654,039",,,,,,,,,, -2003,,,,,,,,,,,,,, -.January 1,"289,201,322","289,517,581","287,959,845","283,859,996",,,,,,,,,, -.February 1,"289,337,916","289,713,718","288,112,710","284,009,892",,,,,,,,,, -.March 1,"289,466,699","289,910,879","288,253,351","284,147,564",,,,,,,,,, -.April 1,"289,610,324","290,124,662","288,428,590","284,319,834",,,,,,,,,, -.May 1,"289,767,993","290,345,733","288,640,931","284,529,206",,,,,,,,,, -.June 1,"290,035,557","290,583,692","288,862,293","284,747,599",,,,,,,,,, -.July 1,"290,326,418","290,819,634","289,106,845","284,989,188",,,,,,,,,, -.August 1,"290,612,993","291,071,932","289,358,000","285,238,604",,,,,,,,,, -.September 1,"290,880,511","291,321,180","289,619,144","285,498,009",,,,,,,,,, -.October 1,"291,142,815","291,574,033","289,889,750","285,766,876",,,,,,,,,, -.November 1,"291,388,995","291,807,038","290,124,107","285,999,494",,,,,,,,,, -.December 1,"291,595,413","292,007,848","290,326,068","286,199,716",,,,,,,,,, -2004,,,,,,,,,,,,,, -.January 1,"291,786,304","292,191,890","290,503,644","286,375,553",,,,,,,,,, -.February 
1,"291,950,419","292,367,612","290,674,712","286,544,882",,,,,,,,,, -.March 1,"292,123,354","292,560,692","290,872,867","286,741,298",,,,,,,,,, -.April 1,"292,344,890","292,778,691","291,097,274","286,963,966",,,,,,,,,, -.May 1,"292,588,044","292,997,480","291,326,299","287,191,252",,,,,,,,,, -.June 1,"292,811,010","293,222,756","291,555,947","287,419,161",,,,,,,,,, -.July 1,"293,045,739","293,463,185","291,784,900","287,646,373",,,,,,,,,, -.August 1,"293,299,261","293,718,707","292,038,514","287,898,986",,,,,,,,,, -.September 1,"293,551,697","293,971,409","292,289,303","288,148,774",,,,,,,,,, -.October 1,"293,816,859","294,229,581","292,548,449","288,406,919",,,,,,,,,, -.November 1,"294,044,976","294,466,162","292,779,083","288,636,552",,,,,,,,,, -.December 1,"294,258,671","294,694,170","293,005,004","288,861,472",,,,,,,,,, -2005,,,,,,,,,,,,,, -.January 1,"294,473,116","294,914,085","293,232,478","289,087,945",,,,,,,,,, -.February 1,"294,621,503","295,104,691","293,420,384","289,274,850",,,,,,,,,, -.March 1,"294,799,196","295,286,533","293,612,973","289,466,438",,,,,,,,,, -.April 1,"295,023,274","295,490,295","293,838,588","289,691,052",,,,,,,,,, -.May 1,"295,269,221","295,704,131","294,069,370","289,920,833",,,,,,,,,, -.June 1,"295,500,040","295,936,147","294,309,989","290,160,451",,,,,,,,,, -.July 1,"295,753,151","296,186,216","294,562,297","290,411,763",,,,,,,,,, -.August 1,"296,007,421","296,439,994","294,816,126","290,662,875",,,,,,,,,, -.September 1,"296,274,716","296,706,566","295,079,303","290,923,335",,,,,,,,,, -.October 1,"296,540,050","296,972,335","295,340,260","291,181,575",,,,,,,,,, -.November 1,"296,745,685","297,206,602","295,565,897","291,404,495",,,,,,,,,, -.December 1,"296,991,295","297,431,095","295,820,132","291,656,013",,,,,,,,,, -2006,,,,,,,,,,,,,, -.January 1,"297,213,401","297,646,557","296,048,518","291,881,682",,,,,,,,,, -.February 1,"297,442,801","297,854,109","296,263,090","292,093,537",,,,,,,,,, -.March 
1,"297,645,240","298,059,587","296,474,308","292,302,038",,,,,,,,,, -.April 1,"297,869,107","298,281,380","296,699,593","292,524,606",,,,,,,,,, -.May 1,"298,087,150","298,496,496","296,920,413","292,742,709",,,,,,,,,, -.June 1,"298,333,748","298,738,764","297,162,428","292,982,007",,,,,,,,,, -.July 1,"298,593,212","298,995,825","297,413,314","293,230,183",,,,,,,,,, -.August 1,"298,855,875","299,263,434","297,674,257","293,488,861",,,,,,,,,, -.September 1,"299,145,336","299,554,491","297,958,769","293,771,108",,,,,,,,,, -.October 1,"299,408,794","299,835,212","298,241,553","294,051,627",,,,,,,,,, -.November 1,"299,656,685","300,094,448","298,505,133","294,312,942",,,,,,,,,, -.December 1,"299,931,461","300,339,839","298,755,324","294,560,868",,,,,,,,,, -2007,,,,,,,,,,,,,, -.January 1,"300,175,309","300,574,481","299,002,526","294,805,805",,,,,,,,,, -.February 1,"300,392,246","300,802,220","299,225,358","295,026,372",,,,,,,,,, -.March 1,"300,599,974","301,021,235","299,446,725","295,245,474",,,,,,,,,, -.April 1,"300,829,936","301,254,227","299,690,117","295,486,601",,,,,,,,,, -.May 1,"301,056,888","301,483,168","299,921,501","295,715,720",,,,,,,,,, -.June 1,"301,305,789","301,738,673","300,169,356","295,961,310",,,,,,,,,, -.July 1,"301,579,895","302,003,917","300,424,617","296,214,295",,,,,,,,,, -.August 1,"301,843,290","302,266,771","300,684,407","296,473,686",,,,,,,,,, -.September 1,"302,114,496","302,546,314","300,959,882","296,748,762",,,,,,,,,, -.October 1,"302,369,037","302,806,716","301,220,870","297,009,351",,,,,,,,,, -.November 1,"302,624,386","303,053,874","301,472,857","297,260,939",,,,,,,,,, -.December 1,"302,868,731","303,287,359","301,710,949","297,498,632",,,,,,,,,, -2008,,,,,,,,,,,,,, -.January 1,"303,088,358","303,506,469","301,934,471","297,721,755",,,,,,,,,, -.February 1,"303,290,150","303,710,955","302,123,393","297,910,278",,,,,,,,,, -.March 1,"303,491,865","303,907,397","302,317,725","298,104,211",,,,,,,,,, -.April 
1,"303,684,948","304,116,991","302,528,079","298,314,166",,,,,,,,,, -.May 1,"303,901,726","304,323,167","302,733,464","298,519,152",,,,,,,,,, -.June 1,"304,127,454","304,555,519","302,966,162","298,751,451",,,,,,,,,, -.July 1,"304,374,846","304,797,761","303,202,282","298,987,171",,,,,,,,,, -.August 1,"304,628,302","305,045,094","303,446,271","299,231,160",,,,,,,,,, -.September 1,"304,892,254","305,308,941","303,699,448","299,484,337",,,,,,,,,, -.October 1,"305,127,551","305,554,049","303,938,005","299,722,894",,,,,,,,,, -.November 1,"305,359,225","305,785,716","304,163,876","299,948,765",,,,,,,,,, -.December 1,"305,583,122","306,003,990","304,379,842","300,164,731",,,,,,,,,, -20091,,,,,,,,,,,,,, -.January 1,"305,794,227","306,207,719","304,583,861","300,368,750",,,,,,,,,, -.February 1,"305,980,358","306,401,755","304,772,530","300,557,419",,,,,,,,,, -.March 1,"306,170,830","306,588,055","304,952,066","300,736,955",,,,,,,,,, -.April 1,"306,360,603","306,787,200","305,148,179","300,933,068",,,,,,,,,, -.May 1,"306,554,396","306,983,561","305,338,759","301,123,648",,,,,,,,,, -.June 1,"306,772,254","307,206,368","305,549,284","301,334,173",,,,,,,,,, -.July 1,"307,006,550","307,439,406","305,781,933","301,570,342",,,,,,,,,, -.August 1,"307,251,662","307,684,518","306,027,045","301,815,454",,,,,,,,,, -.September 1,"307,513,569","307,946,425","306,288,952","302,077,361",,,,,,,,,, -.October 1,"307,756,577","308,189,433","306,531,960","302,320,369",,,,,,,,,, -.November 1,"307,985,264","308,418,120","306,760,647","302,549,056",,,,,,,,,, -.December 1,"308,200,409","308,633,265","306,975,792","302,764,201",,,,,,,,,, -20101,,,,,,,,,,,,,, -.January 1,"308,400,408","308,833,264","307,175,791","302,964,200",,,,,,,,,, -.February 1,"308,593,755","309,026,611","307,369,138","303,157,547",,,,,,,,,, -.March 1,"308,779,455","309,212,311","307,554,838","303,343,247",,,,,,,,,, -.April 1,"308,977,944","309,410,800","307,753,327","303,541,736",,,,,,,,,, -.May 
1,"309,173,793","309,606,649","307,949,176","303,737,585",,,,,,,,,, -.June 1,"309,396,380","309,829,236","308,171,763","303,960,172",,,,,,,,,, -.July 1,"309,629,415","310,062,271","308,404,798","304,193,207",,,,,,,,,, -.August 1,"309,874,567","310,307,423","308,649,950","304,438,359",,,,,,,,,, -.September 1,"310,136,722","310,569,578","308,912,105","304,700,514",,,,,,,,,, -.October 1,"310,379,841","310,812,697","309,155,224","304,943,633",,,,,,,,,, -.November 1,"310,608,332","311,041,188","309,383,715","305,172,124",,,,,,,,,, -.December 1,"310,823,152","311,256,008","309,598,535","305,386,944",,,,,,,,,, -"1The monthly estimates beginning with August 1, 2009 and forward are short-term projections.",,,,,,,,,,,,,, -"Note: The estimates are based on Census 2000 and any modifications as documented in the Count Question Resolution program. The April 1, 2000 population in this table is the Population Estimates base. ",,,,,,,,,,,,,, -Suggested Citation:,,,,,,,,,,,,,, -"Table 1. Monthly Population Estimates for the United States: April 1, 2000 to December 1, 2010 (NA-EST2009-01)",,,,,,,,,,,,,, -"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,, -Release Date: December 2009,,,,,,,,,,,,,, -,,,,,,,,,,,,,, diff --git a/scripts/us_census/pep/monthly_population_estimate/test_data/test_census_data.xlsx b/scripts/us_census/pep/monthly_population_estimate/test_data/test_census_data.xlsx deleted file mode 100644 index 5a934f2b9d..0000000000 Binary files a/scripts/us_census/pep/monthly_population_estimate/test_data/test_census_data.xlsx and /dev/null differ