From 3c6ef25e053c513b03b4429b9bda806523e10af2 Mon Sep 17 00:00:00 2001 From: Sudhisha K Date: Thu, 14 Nov 2024 09:01:32 +0000 Subject: [PATCH] USCensusPEP_Annual_Population_changes --- .../pep/annual_population/preprocess.py | 262 ++++++++++++------ 1 file changed, 171 insertions(+), 91 deletions(-) diff --git a/scripts/us_census/pep/annual_population/preprocess.py b/scripts/us_census/pep/annual_population/preprocess.py index 167168edb..bc840b76f 100644 --- a/scripts/us_census/pep/annual_population/preprocess.py +++ b/scripts/us_census/pep/annual_population/preprocess.py @@ -55,8 +55,9 @@ _FLAGS = flags.FLAGS flags.DEFINE_string('mode', '', 'Options: download or process') -flags.DEFINE_bool('is_summary_levels', False, - 'Options: True for all summary_levels and False for only 162') +flags.DEFINE_bool( + 'is_summary_levels', False, + 'Options: True for all summary_levels and False for only 162') _MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) _INPUT_FILE_PATH = os.path.join(_MODULE_DIR, 'input_files') @@ -64,11 +65,13 @@ sys.path.insert(1, _MODULE_DIR) # pylint: disable=wrong-import-position # pylint: disable=import-error -from clean import (clean_data_df, clean_1970_1989_county_txt, process_states_1900_1969, - process_states_1970_1979, process_states_1980_1989, process_states_1990_1999) +from clean import (clean_data_df, clean_1970_1989_county_txt, + process_states_1900_1969, process_states_1970_1979, + process_states_1980_1989, process_states_1990_1999) -from constants import (INPUT_DIRS, OUTPUT_DIR, SCALING_FACTOR_STATE_1900_1960, USA, USA_GEO_ID, - DISTRICT_OF_COLUMBIA_STATE_CODE, DISTRICT_OF_COLUMBIA_COUNTY_CODE, INPUT_DIR) +from constants import (INPUT_DIRS, OUTPUT_DIR, SCALING_FACTOR_STATE_1900_1960, + USA, USA_GEO_ID, DISTRICT_OF_COLUMBIA_STATE_CODE, + DISTRICT_OF_COLUMBIA_COUNTY_CODE, INPUT_DIR) sys.path.insert(1, os.path.join(_MODULE_DIR, '../../../../')) import util.alpha2_to_dcid as alpha2todcid @@ -236,10 +239,8 @@ def _process_csv_file(data_df: pd.DataFrame, area: str) -> pd.DataFrame: res_data_df = pd.concat([res_data_df, tmp_data_df]) res_data_df = res_data_df.dropna(subset=["Count_Person"]) for col in res_data_df.columns: - res_data_df[col] = res_data_df[col].str.replace(",", "", - regex=False).str.replace(".", - "", - regex=False) + res_data_df[col] = res_data_df[col].str.replace( + ",", "", regex=False).str.replace(".", "", regex=False) res_data_df = res_data_df.reset_index().drop(columns=['index']) return res_data_df @@ -250,7 +251,8 @@ def _states_full_to_short_form(data_df: pd.DataFrame, replace_key: str = " ") -> pd.DataFrame: short_forms = _USSTATE_SHORT_FORM data_df[new_col] = data_df[data_col].str.replace( - replace_key, "", regex=False).apply(lambda row: short_forms.get(row, row)) + replace_key, "", + regex=False).apply(lambda row: short_forms.get(row, row)) return data_df @@ -258,8 +260,10 @@ def _state_to_geo_id(state: str) -> str: return USSTATE_MAP.get(state, state) -def _add_geo_id(data_df: pd.DataFrame, data_col: str, new_col: str) -> pd.DataFrame: - data_df[new_col] = data_df[data_col].apply(lambda rec: USSTATE_MAP.get(rec, pd.NA)) +def _add_geo_id(data_df: pd.DataFrame, data_col: str, + new_col: str) -> pd.DataFrame: + data_df[new_col] = data_df[data_col].apply( + lambda rec: USSTATE_MAP.get(rec, pd.NA)) data_df = data_df.dropna(subset=[new_col]) return data_df @@ -302,8 +306,9 @@ def _process_nationals_1900_1979(ip_file: str, op_file: str) -> None: year = int(line[9:13]) if year >= 1980: continue - national_pop_stats.write("\n" + str(year) + 
",country/USA," + - line[14:30].replace(",", "").lstrip().rstrip()) + national_pop_stats.write( + "\n" + str(year) + ",country/USA," + + line[14:30].replace(",", "").lstrip().rstrip()) def _process_nationals_1980_1989(ip_file: str) -> pd.DataFrame: @@ -324,8 +329,13 @@ def _process_nationals_1980_1989(ip_file: str) -> pd.DataFrame: for line in data: if "United States" in line: usa_rows += line.replace("00000", "").strip() + " " - usa_cleaned_row = [int(val) for val in usa_rows.split(" ") if val.isnumeric()] - year = ["1980", "1981", "1982", "1983", "1984", "1985", "1986", "1987", "1988", "1989"] + usa_cleaned_row = [ + int(val) for val in usa_rows.split(" ") if val.isnumeric() + ] + year = [ + "1980", "1981", "1982", "1983", "1984", "1985", "1986", "1987", + "1988", "1989" + ] geo_id = "country/USA" #df_cols = [["Year", "Count_Person"] data_df = pd.DataFrame(usa_cleaned_row, columns=["Count_Person"]) @@ -345,12 +355,17 @@ def _process_nationals_1990_1999(ip_file: str) -> pd.DataFrame: """ data_df = _load_data_df(path=ip_file, file_format="csv", header=1) - df_cols = ["Year", "Age", "Count_Person", "Count_Person_Male", "Count_Person_Female"] + df_cols = [ + "Year", "Age", "Count_Person", "Count_Person_Male", + "Count_Person_Female" + ] data_df.columns = df_cols data_df = data_df[(data_df["Age"] == "All Age") & - (data_df["Year"].str.startswith("July"))].reset_index(drop=True) + (data_df["Year"].str.startswith("July"))].reset_index( + drop=True) data_df["Year"] = data_df["Year"].str.replace("July 1, ", "") - data_df = data_df.drop(columns=["Age", "Count_Person_Male", "Count_Person_Female"]) + data_df = data_df.drop( + columns=["Age", "Count_Person_Male", "Count_Person_Female"]) data_df["Location"] = "country/USA" data_df = data_df[["Year", "Location", "Count_Person"]] return data_df @@ -412,7 +427,10 @@ def _process_nationals_2000_2009(file_path: str) -> pd.DataFrame: data_df = pd.DataFrame() data_df = _load_data_df(path=file_path, file_format="csv", header=3) - pop_cols = ["2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009"] + pop_cols = [ + "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", + "2009" + ] df_cols = ["Region", "042000"] + pop_cols + ["042010", "072010"] data_df.columns = df_cols data_df = data_df[data_df["Region"] == USA] @@ -446,7 +464,8 @@ def _process_states_2000_2009(file_path: str) -> pd.DataFrame: data_df[col] = data_df[col].str.replace(",", "") data_df = data_df.dropna() data_df['Region'] = data_df['Region'].apply(_remove_initial_dot_values) - data_df = _states_full_to_short_form(data_df, "Region", "Location_short_form") + data_df = _states_full_to_short_form(data_df, "Region", + "Location_short_form") data_df = _add_geo_id(data_df, "Location_short_form", "Location") data_df = _unpivot_data_df(data_df, ["Location"], pop_cols) return data_df @@ -490,7 +509,8 @@ def _process_states_2021(file_path: str) -> pd.DataFrame: df_cols = ["Region", "042020", "072020", "2021"] data_df.columns = df_cols data_df["Region"] = data_df["Region"].str.replace(".", "", regex=False) - data_df = _states_full_to_short_form(data_df, "Region", "Location_short_form") + data_df = _states_full_to_short_form(data_df, "Region", + "Location_short_form") data_df = _add_geo_id(data_df, "Location_short_form", "Location") data_df = _unpivot_data_df(data_df, ["Location"], ["2021"]) return data_df @@ -513,7 +533,8 @@ def _process_states_2029(file_path: str) -> pd.DataFrame: df_cols.append(str(dt.now().year)) data_df["Region"] = 
data_df["Region"].str.replace(".", "", regex=False) - data_df = _states_full_to_short_form(data_df, "Region", "Location_short_form") + data_df = _states_full_to_short_form(data_df, "Region", + "Location_short_form") data_df = _add_geo_id(data_df, "Location_short_form", "Location") data_df = _unpivot_data_df(data_df, ["Location"], ["2022", "2023"]) return data_df @@ -548,7 +569,8 @@ def _process_county_file_99c8_00(file_path: str) -> pd.DataFrame: if idx == 0: continue outfile.write( - str(val) + ',' + str(fips_code) + "," + str(tmp_line[idx + 2]) + "\n") + str(val) + ',' + str(fips_code) + "," + + str(tmp_line[idx + 2]) + "\n") data_df = pd.read_csv("outfile.csv") data_df = data_df[data_df["Location"] != "country/USA"] os.remove("outfile.csv") @@ -567,25 +589,26 @@ def _process_county_e8089co_e7079co(file_path: str) -> pd.DataFrame: # skip_rows is helpful in skipping intial unwanted rows from the source. skip_rows = 23 first_data_df_cols = [ - "Fips_Code", "Location", "extra_Location", "1970", "1971", "1972", "1973", "1974", - "extra_data_col_1", "extra_data_col_2" + "Fips_Code", "Location", "extra_Location", "1970", "1971", "1972", + "1973", "1974", "extra_data_col_1", "extra_data_col_2" ] second_data_df_cols = [ - "Fips_Code", "Location", "extra_Location", "1975", "1976", "1977", "1978", "1979", - "extra_data_col_1", "extra_data_col_2" + "Fips_Code", "Location", "extra_Location", "1975", "1976", "1977", + "1978", "1979", "extra_data_col_1", "extra_data_col_2" ] if "e8089co.txt" in file_path: skip_rows = 0 first_data_df_cols = [ - "Fips_Code", "Location", "extra_Location", "1980", "1981", "1982", "1983", "1984", - "extra_data_col_1", "extra_data_col_2" + "Fips_Code", "Location", "extra_Location", "1980", "1981", "1982", + "1983", "1984", "extra_data_col_1", "extra_data_col_2" ] second_data_df_cols = [ - "Fips_Code", "Location", "extra_Location", "1985", "1986", "1987", "1988", "1989", - "extra_data_col_1", "extra_data_col_2" + "Fips_Code", "Location", "extra_Location", "1985", "1986", "1987", + "1988", "1989", "extra_data_col_1", "extra_data_col_2" ] data_df = _load_data_df(file_path, "txt", None, skip_rows) - data_df = clean_1970_1989_county_txt(data_df, first_data_df_cols, second_data_df_cols) + data_df = clean_1970_1989_county_txt(data_df, first_data_df_cols, + second_data_df_cols) data_df = _unpivot_data_df(data_df, "Location", data_df.columns[1:]) return data_df @@ -602,9 +625,10 @@ def _process_county_coest2020(file_path: str) -> pd.DataFrame: data_df = _load_data_df(file_path, "csv", header=0, encoding='ISO-8859-1') cols = [ - "STATE", "COUNTY", "STNAME", "CTYNAME", "POPESTIMATE2010", "POPESTIMATE2011", - "POPESTIMATE2012", "POPESTIMATE2013", "POPESTIMATE2014", "POPESTIMATE2015", - "POPESTIMATE2016", "POPESTIMATE2017", "POPESTIMATE2018", "POPESTIMATE2019", + "STATE", "COUNTY", "STNAME", "CTYNAME", "POPESTIMATE2010", + "POPESTIMATE2011", "POPESTIMATE2012", "POPESTIMATE2013", + "POPESTIMATE2014", "POPESTIMATE2015", "POPESTIMATE2016", + "POPESTIMATE2017", "POPESTIMATE2018", "POPESTIMATE2019", "POPESTIMATE042020", "POPESTIMATE2020" ] data_df = data_df[cols] @@ -615,16 +639,23 @@ def _process_county_coest2020(file_path: str) -> pd.DataFrame: .index.values[0] data_df.loc[idx, "CTYNAME"] = "Washington County" - data_df['COUNTY'] = data_df['COUNTY'].astype('str').str.pad(3, side='left', fillchar='0') - data_df['STATE'] = data_df['STATE'].astype('str').str.pad(2, side='left', fillchar='0') - - data_df.insert(0, 'Location', data_df[["STATE", "COUNTY"]].apply(_geo_id, axis=1)) - if 
data_df.shape != data_df[data_df['Location'].str.startswith('geo')].shape: + data_df['COUNTY'] = data_df['COUNTY'].astype('str').str.pad(3, + side='left', + fillchar='0') + data_df['STATE'] = data_df['STATE'].astype('str').str.pad(2, + side='left', + fillchar='0') + + data_df.insert(0, 'Location', data_df[["STATE", "COUNTY"]].apply(_geo_id, + axis=1)) + if data_df.shape != data_df[data_df['Location'].str.startswith( + 'geo')].shape: logging.info(f"Check this file {file_path}") data_df.columns = data_df.columns.str.replace('POPESTIMATE', '') # Dropping Unwanted Columns - data_df = data_df.drop(columns=["STATE", "COUNTY", "STNAME", "CTYNAME", "042020"]) + data_df = data_df.drop( + columns=["STATE", "COUNTY", "STNAME", "CTYNAME", "042020"]) data_df = _unpivot_data_df(data_df, ["Location"], data_df.columns[1:]) return data_df @@ -641,8 +672,8 @@ def _process_county_coest2029(file_path: str) -> pd.DataFrame: data_df = _load_data_df(file_path, "csv", header=0, encoding='ISO-8859-1') cols = [ - "STATE", "COUNTY", "STNAME", "CTYNAME", "POPESTIMATE2020", "POPESTIMATE2021", - "POPESTIMATE2022", "POPESTIMATE2023" + "STATE", "COUNTY", "STNAME", "CTYNAME", "POPESTIMATE2020", + "POPESTIMATE2021", "POPESTIMATE2022", "POPESTIMATE2023" ] data_df = data_df[cols] # Modifying actual city name for State: District of Columbia @@ -652,11 +683,17 @@ def _process_county_coest2029(file_path: str) -> pd.DataFrame: .index.values[0] data_df.loc[idx, "CTYNAME"] = "Washington County" - data_df['COUNTY'] = data_df['COUNTY'].astype('str').str.pad(3, side='left', fillchar='0') - data_df['STATE'] = data_df['STATE'].astype('str').str.pad(2, side='left', fillchar='0') - - data_df.insert(0, 'Location', data_df[["STATE", "COUNTY"]].apply(_geo_id, axis=1)) - if data_df.shape != data_df[data_df['Location'].str.startswith('geo')].shape: + data_df['COUNTY'] = data_df['COUNTY'].astype('str').str.pad(3, + side='left', + fillchar='0') + data_df['STATE'] = data_df['STATE'].astype('str').str.pad(2, + side='left', + fillchar='0') + + data_df.insert(0, 'Location', data_df[["STATE", "COUNTY"]].apply(_geo_id, + axis=1)) + if data_df.shape != data_df[data_df['Location'].str.startswith( + 'geo')].shape: logging.info(f"Check this file {file_path}") data_df.columns = data_df.columns.str.replace('POPESTIMATE', '') @@ -706,15 +743,16 @@ def _process_counties(file_path: str) -> pd.DataFrame: data_df = clean_data_df(data_df, "csv") data_df = data_df.dropna(subset=[11, 12]) cols = [ - "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", - "Location" + "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", + "2008", "2009", "Location" ] geo, data_df = data_df[0], data_df.iloc[:, 2:12] data_df['Location'] = geo data_df.columns = cols data_df = data_df.reset_index().drop(columns=["index"]) - data_df["Location"] = data_df["Location"].apply(_remove_initial_dot_values) + data_df["Location"] = data_df["Location"].apply( + _remove_initial_dot_values) state = data_df.loc[0, 'Location'] data_df['State'] = state data_df['State'] = data_df['State'].str.replace(" ", "", regex=False) @@ -728,7 +766,8 @@ def _process_counties(file_path: str) -> pd.DataFrame: data_df["Location"] = data_df["Location"].apply(_state_to_geo_id) data_df = data_df.iloc[1:, :] data_df = _unpivot_data_df(data_df, ["Location"], data_df.columns[:-2]) - data_df["Count_Person"] = data_df["Count_Person"].str.replace(",", "", regex=False) + data_df["Count_Person"] = data_df["Count_Person"].str.replace( + ",", "", regex=False) data_df = data_df[["Year", 
"Location", "Count_Person"]] data_df = data_df[data_df["Location"] != "country/USA"] return data_df @@ -755,7 +794,8 @@ def _process_city_1990_1999(file_path: str) -> pd.DataFrame: if len(line.strip()) == 0: continue # Skipping Unwanted Lines - if line.startswith("Block 2 of 2:") or line.startswith("Abbreviations:"): + if line.startswith("Block 2 of 2:") or line.startswith( + "Abbreviations:"): flag = False continue if search_str1 in line.strip(): @@ -786,7 +826,8 @@ def _process_city_1990_1999(file_path: str) -> pd.DataFrame: return data_df -def _process_cities(file_path: str, is_summary_levels: bool = False) -> pd.DataFrame: +def _process_cities(file_path: str, + is_summary_levels: bool = False) -> pd.DataFrame: """ Process DataFrame of Cities dataset Args: @@ -804,9 +845,13 @@ def _process_cities(file_path: str, is_summary_levels: bool = False) -> pd.DataF if file_name_without_ext == 'su-99-7_us': data_df = _process_city_1990_1999(file_path) if file_name_without_ext in [ - "sub-est2010-alt", "SUB-EST2020_ALL", "sub-est2021_all", "sub-est2023" + "sub-est2010-alt", "SUB-EST2020_ALL", "sub-est2021_all", + "sub-est2023" ]: - data_df = _load_data_df(file_path, file_format="csv", header=0, encoding="ISO-8859-1") + data_df = _load_data_df(file_path, + file_format="csv", + header=0, + encoding="ISO-8859-1") # excluding SUMLEV=170 as no placed mapping in DC data_df = data_df[data_df['SUMLEV'] != 170] # drop place code 99990 codes @@ -816,7 +861,8 @@ def _process_cities(file_path: str, is_summary_levels: bool = False) -> pd.DataF data_df = data_df[data_df['SUMLEV'] == 162] # generate FIPS code as perSUMLEV 040 - data_df['Location'] = data_df.apply(lambda x: _generate_fips_code(x), axis=1) + data_df['Location'] = data_df.apply(lambda x: _generate_fips_code(x), + axis=1) data_df.dropna(subset=['Location'], inplace=True) @@ -824,15 +870,26 @@ def _process_cities(file_path: str, is_summary_levels: bool = False) -> pd.DataF dup = data_df.copy() dup['cnt'] = dup.groupby(['Location'])['Location'].transform('size') #dup = dup[dup['cnt'] > 2] - dup['STATE'] = dup['STATE'].astype('str').str.pad(2, side='left', fillchar='0') - dup['COUNTY'] = dup['COUNTY'].astype('str').str.pad(3, side='left', fillchar='0') - dup['PLACE'] = dup['PLACE'].astype('str').str.pad(5, side='left', fillchar='0') - dup['COUSUB'] = dup['COUSUB'].astype('str').str.pad(5, side='left', fillchar='0') - dup.to_csv(f"dup_{file_name_without_ext}.csv", index=False, quotechar="'") + dup['STATE'] = dup['STATE'].astype('str').str.pad(2, + side='left', + fillchar='0') + dup['COUNTY'] = dup['COUNTY'].astype('str').str.pad(3, + side='left', + fillchar='0') + dup['PLACE'] = dup['PLACE'].astype('str').str.pad(5, + side='left', + fillchar='0') + dup['COUSUB'] = dup['COUSUB'].astype('str').str.pad(5, + side='left', + fillchar='0') + dup.to_csv(f"dup_{file_name_without_ext}.csv", + index=False, + quotechar="'") if "sub-est2010-alt" == file_name_without_ext: key = "POPESTIMATE07" pop_cols = [ - "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009" + "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", + "2008", "2009" ] final_cols = ["Location"] + pop_cols elif "sub-est2023" == file_name_without_ext: @@ -842,7 +899,8 @@ def _process_cities(file_path: str, is_summary_levels: bool = False) -> pd.DataF elif "SUB-EST2020_ALL" == file_name_without_ext: key = "POPESTIMATE" pop_cols = [ - "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019" + "2010", "2011", "2012", "2013", "2014", "2015", "2016", 
"2017", + "2018", "2019" ] final_cols = ["Location"] + pop_cols elif "sub-est2021_all" == file_name_without_ext: @@ -865,12 +923,14 @@ def _generate_fips_code(row_data) -> str: if row_data['SUMLEV'] == 40: geo_id = str(row_data['STATE']).zfill(2) elif row_data['SUMLEV'] == 50: - geo_id = str(row_data['STATE']).zfill(2) + str(row_data['COUNTY']).zfill(3) + geo_id = str(row_data['STATE']).zfill(2) + str( + row_data['COUNTY']).zfill(3) elif row_data['SUMLEV'] == 61: - geo_id = str(row_data['STATE']).zfill(2) + str(row_data['COUNTY']).zfill(3) + str( - row_data['COUSUB']).zfill(5) + geo_id = str(row_data['STATE']).zfill(2) + str( + row_data['COUNTY']).zfill(3) + str(row_data['COUSUB']).zfill(5) elif row_data['SUMLEV'] in [71, 157, 162, 172]: - geo_id = str(row_data['STATE']).zfill(2) + str(row_data['PLACE']).zfill(5) + geo_id = str(row_data['STATE']).zfill(2) + str( + row_data['PLACE']).zfill(5) if geo_id: geo_id = "geoId/" + geo_id return geo_id @@ -902,15 +962,16 @@ def _generate_tmcf(tmcf_file_path) -> None: f_out.write(_TMCF_TEMPLATE.rstrip('\n')) -def process(input_path, cleaned_csv_file_path: str, mcf_file_path: str, tmcf_file_path: str, - is_summary_levels: bool): +def process(input_path, cleaned_csv_file_path: str, mcf_file_path: str, + tmcf_file_path: str, is_summary_levels: bool): """ This Method calls the required methods to generate cleaned CSV, MCF, and TMCF file """ input_files = [] input_files += [ - os.path.join(input_path, file) for file in sorted(os.listdir(input_path)) + os.path.join(input_path, file) + for file in sorted(os.listdir(input_path)) ] final_df = pd.DataFrame() @@ -954,29 +1015,36 @@ def process(input_path, cleaned_csv_file_path: str, mcf_file_path: str, tmcf_fil state_df = _process_states_2029(file) data_df = pd.concat([nat_df, state_df]) elif file_name in [ - "st0009ts.txt", "st1019ts.txt", "st2029ts.txt", "st3039ts.txt", "st4049ts.txt", - "st5060ts.txt", "st6070ts.txt" + "st0009ts.txt", "st1019ts.txt", "st2029ts.txt", + "st3039ts.txt", "st4049ts.txt", "st5060ts.txt", + "st6070ts.txt" ]: - data_df = process_states_1900_1969(_STATE_CONFIG, file, file_name, - SCALING_FACTOR_STATE_1900_1960) + data_df = process_states_1900_1969( + _STATE_CONFIG, file, file_name, + SCALING_FACTOR_STATE_1900_1960) elif "st7080ts" in file: data_df = process_states_1970_1979(file) elif "st8090ts" in file: data_df = process_states_1980_1989(file) - data_df["Location"] = data_df["Location"].apply(_state_to_geo_id) + data_df["Location"] = data_df["Location"].apply( + _state_to_geo_id) elif "st-99-03" in file: data_df = process_states_1990_1999(file) - data_df = _states_full_to_short_form(data_df, "Location", "Location") - data_df["Location"] = data_df["Location"].apply(_state_to_geo_id) + data_df = _states_full_to_short_form(data_df, "Location", + "Location") + data_df["Location"] = data_df["Location"].apply( + _state_to_geo_id) elif "e8089co.txt" in file: nat_df = _process_nationals_1980_1989(file) county_df = _process_counties(file) data_df = pd.concat([nat_df, county_df]) - elif file_name in ["e7079co.txt", "99c8_00.txt"] or "co-est" in file_name: + elif file_name in ["e7079co.txt", "99c8_00.txt" + ] or "co-est" in file_name: data_df = _process_counties(file) elif file_name in [ - 'su-99-7_us.txt', "sub-est2010-alt.csv", "SUB-EST2020_ALL.csv", - "sub-est2021_all.csv", "sub-est2023.csv" + 'su-99-7_us.txt', "sub-est2010-alt.csv", + "SUB-EST2020_ALL.csv", "sub-est2021_all.csv", + "sub-est2023.csv" ]: data_df = _process_cities(file, is_summary_levels) @@ -994,7 +1062,8 @@ def 
process(input_path, cleaned_csv_file_path: str, mcf_file_path: str, tmcf_fil
         final_df = final_df.sort_values(by=["Location", "Year"])
         final_df = final_df.drop_duplicates(["Year", "Location"])
         final_df["Count_Person"] = final_df["Count_Person"].astype("int")
-        final_df[["Year", "Location", "Count_Person"]].to_csv(cleaned_csv_file_path, index=False)
+        final_df[["Year", "Location",
+                  "Count_Person"]].to_csv(cleaned_csv_file_path, index=False)
         _generate_mcf(mcf_file_path)
         _generate_tmcf(tmcf_file_path)
     else:
@@ -1028,7 +1097,8 @@ def add_future_year_urls():
                 try:
                     check_url = requests.head(url_to_check)
                     if check_url.status_code == 200:
-                        _FILES_TO_DOWNLOAD.append({"download_path": url_to_check})
+                        _FILES_TO_DOWNLOAD.append(
+                            {"download_path": url_to_check})
                 except:
                     logging.error(f"URL is not accessible {url_to_check}")

@@ -1050,14 +1120,20 @@ def download_files():
     for file_to_dowload in _FILES_TO_DOWNLOAD:
         file_name_to_save = None
         url = file_to_dowload['download_path']
-        if 'file_name' in file_to_dowload and len(file_to_dowload['file_name'] > 5):
+        if 'file_name' in file_to_dowload and len(
+                file_to_dowload['file_name']) > 5:
             file_name_to_save = file_to_dowload['file_name']
         else:
             file_name_to_save = url.split('/')[-1]
         if 'file_path' in file_to_dowload:
-            if not os.path.exists(os.path.join(_INPUT_FILE_PATH, file_to_dowload['file_path'])):
-                os.makedirs(os.path.join(_INPUT_FILE_PATH, file_to_dowload['file_path']))
-            file_name_to_save = file_to_dowload['file_path'] + file_name_to_save
+            if not os.path.exists(
+                    os.path.join(_INPUT_FILE_PATH,
+                                 file_to_dowload['file_path'])):
+                os.makedirs(
+                    os.path.join(_INPUT_FILE_PATH,
+                                 file_to_dowload['file_path']))
+            file_name_to_save = file_to_dowload[
+                'file_path'] + file_name_to_save

         retry_number = 0
         is_file_downloaded = False
@@ -1066,7 +1142,9 @@
             with session.get(url, stream=True) as response:
                 response.raise_for_status()
                 if response.status_code == 200:
-                    with open(os.path.join(_INPUT_FILE_PATH, file_name_to_save), 'wb') as f:
+                    with open(
+                            os.path.join(_INPUT_FILE_PATH,
+                                         file_name_to_save), 'wb') as f:
                         # shutil.copyfileobj(response.raw, f)
                         f.write(response.content)
                         file_to_dowload['is_downloaded'] = True
@@ -1103,8 +1181,10 @@ def main(_):
         add_future_year_urls()
         download_files()
     if mode == "" or mode == "process":
-        process(_INPUT_FILE_PATH, cleaned_csv_path, mcf_path, tmcf_path, is_summary_levels)
+        process(_INPUT_FILE_PATH, cleaned_csv_path, mcf_path, tmcf_path,
+                is_summary_levels)
         logging.info("Processing completed!")
+

 if __name__ == "__main__":
     app.run(main)
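
Note on _generate_fips_code: the SUMLEV-to-geoId mapping is the pivot of the county and city processing in this patch (state = 2-digit FIPS; county = state + 3-digit county; county subdivision = state + county + 5-digit COUSUB; place = state + 5-digit PLACE). The sketch below restates those padding rules standalone so they can be sanity-checked in isolation; the sample rows are hypothetical, not taken from the Census files.

    import pandas as pd

    def generate_fips_code(row) -> str:
        # Mirrors _generate_fips_code from the patch: build a dcid-style
        # geoId from zero-padded FIPS parts, keyed on the summary level.
        geo_id = None
        if row['SUMLEV'] == 40:  # state
            geo_id = str(row['STATE']).zfill(2)
        elif row['SUMLEV'] == 50:  # county
            geo_id = str(row['STATE']).zfill(2) + str(row['COUNTY']).zfill(3)
        elif row['SUMLEV'] == 61:  # county subdivision
            geo_id = (str(row['STATE']).zfill(2) +
                      str(row['COUNTY']).zfill(3) +
                      str(row['COUSUB']).zfill(5))
        elif row['SUMLEV'] in [71, 157, 162, 172]:  # place
            geo_id = str(row['STATE']).zfill(2) + str(row['PLACE']).zfill(5)
        return "geoId/" + geo_id if geo_id else geo_id

    # Hypothetical rows: a state, a county, and a SUMLEV-162 place.
    rows = pd.DataFrame([
        {'SUMLEV': 40, 'STATE': 6, 'COUNTY': 0, 'COUSUB': 0, 'PLACE': 0},
        {'SUMLEV': 50, 'STATE': 6, 'COUNTY': 37, 'COUSUB': 0, 'PLACE': 0},
        {'SUMLEV': 162, 'STATE': 6, 'COUNTY': 0, 'COUSUB': 0, 'PLACE': 44000},
    ])
    print(rows.apply(generate_fips_code, axis=1).tolist())
    # ['geoId/06', 'geoId/06037', 'geoId/0644000']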
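Note on _unpivot_data_df: the helper is called throughout the patch but defined outside these hunks. Judging only from its call sites (id columns plus wide year columns in; Year, Location, Count_Person rows out), it behaves like a pandas melt. A minimal equivalent under that assumption, with dummy values for illustration:

    import pandas as pd

    def unpivot_data_df(data_df, id_cols, year_cols):
        # Wide-to-long reshape: one (Location, Year, Count_Person) row per
        # year column. Assumed stand-in for the patch's _unpivot_data_df.
        return pd.melt(data_df,
                       id_vars=id_cols,
                       value_vars=list(year_cols),
                       var_name='Year',
                       value_name='Count_Person')

    wide = pd.DataFrame({
        'Location': ['geoId/06', 'geoId/48'],
        '2022': [100, 200],
        '2023': [110, 210],
    })
    print(unpivot_data_df(wide, ['Location'], ['2022', '2023']))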