EuroStatHealth_AlcoholConsumption_Modifications

datacommonsorg · Dec 19, 2024 · d4b7173 · d4b7173
1 parent 149fd2b
commit d4b7173
Show file tree

Hide file tree

Showing 5 changed files with 149 additions and 79 deletions.
diff --git a/scripts/eurostat/health_determinants/alcohol_consumption/README.md b/scripts/eurostat/health_determinants/alcohol_consumption/README.md
@@ -18,18 +18,6 @@ The population is categorized by various set of combinations as below:
         10. Alcohol Consumption by Sex and Country of Birth.
         11. Alcohol Consumption by Sex and Country of Citizenship.
 
-
-### Download URL
-Input files are available for download from url: https://ec.europa.eu/eurostat/web/main/data/database -> Health -> Health determinants (hlth_det).
-
-### Import Procedure
-The below script will download the data and extract it.
-
-`python scripts/eurostat/health_determinants/common/download_eurostat_input_files.py --import_name alcohol_consumption`
-
-Files are created inside 'input_files' directory.
-
-
 #### Output
 Statistical variables for alcohol consumption are based on below properties available in input files.
 | Attribute                                     | Description                                                   	|
@@ -45,11 +33,6 @@ Statistical variables for alcohol consumption are based on below properties avai
 | Country of Citizenship                	| The citizenship of the population.                			|
 
 
-Below script will generate cleansed observation file (csv), mcf and tmcf files.
-
-`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`
-
-
 #### Cleaned Observation File
 Cleaned data will be persisted as a CSV file in output/eurostat_population_alcohol_consumption.csv with the following columns.
 
@@ -65,9 +48,31 @@ MCF and tMCF files are presisted in below mentioned path.
 - [output/eurostat_population_alcohol_consumption.mcf]
 - [output/eurostat_population_alcohol_consumption.tmcf]
 
+### Download URL
+
+The data in tsv.gz format can be downloaded from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det).
+The actual URLs are listed in common/import_download_details.py.
 
 ### Running Tests
 
 Run the test cases
 
 `python3 -m unittest discover -v -s scripts/eurostat/health_determinants/alcohol_consumption/ -p process_test.py`
+
+### Import Procedure
+
+The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files.
+
+`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`
+
+To perform only the download step of this import, run the command below:
+
+`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=download`
+
+To perform only the processing step of this import, run the command below:
+
+`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=process`
+
+Downloaded files are created inside the 'input_files' directory.
+
+
diff --git a/scripts/eurostat/health_determinants/alcohol_consumption/process.py b/scripts/eurostat/health_determinants/alcohol_consumption/process.py
@@ -18,20 +18,23 @@
 import os
 import sys
 import pandas as pd
+from absl import app, flags, logging
 
 _COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.insert(1, _COMMON_PATH)
 # pylint: disable=wrong-import-position
 from common.euro_stat import EuroStat
+from common import import_download_details, download
 # pylint: enable=wrong-import-position
 
+_FLAGS = flags.FLAGS
+flags.DEFINE_string('mode', '', 'Options: download or process')
 
 class EuroStatAlcoholConsumption(EuroStat):
     """
     This Class has requried methods to generate Cleaned CSV,
     MCF and TMCF Files.
     """
-    _import_name = "alcohol_consumption"
 
     _mcf_template = ("Node: dcid:{sv}"
                      "\n{sv_name}"
@@ -88,6 +91,25 @@ class EuroStatAlcoholConsumption(EuroStat):
         "NotACitizen": "citizenship",
     }
 
+    @staticmethod
+    def download_data(import_name):
+        """Downloads raw data from Eurostat website and stores it in instance data frame.
+        
+            Args:
+            import_name(str): A string representing the import name.
+            
+            Returns:True
+            
+        """
+        download_details = import_download_details.download_details[import_name]
+        download_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', import_name,"input_files"))
+        os.makedirs(download_path, exist_ok=True)
+
+        for file in download_details["filenames"]:
+            download_files_urls = [download_details["input_url"] + str(file) +download_details["file_extension"]]
+            download.download_files(download_files_urls, download_path)
+        return True
+
     # over-ridden parent abstract method
     def _property_correction(self):
         for k, v in self._sv_properties.items():
@@ -119,26 +141,36 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame:
         return df.rename(columns={'frequenc': 'frequenc_alcohol'})
 
 
-if __name__ == '__main__':
-    input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                              "input_files")
-    ip_files = os.listdir(input_path)
-    ip_files = [input_path + os.sep + file for file in ip_files]
+def main(_):
+    mode = _FLAGS.mode
+    global import_name
+    import_name = "alcohol_consumption"
+    if mode == "" or mode == "download":
+        EuroStatAlcoholConsumption.download_data(import_name)
+    if mode == "" or mode == "process":
+        try:
+            input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"input_files")
+            ip_files = os.listdir(input_path)
+            ip_files = [input_path + os.sep + file for file in ip_files]
+
+            # Defining Output Files
+            data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"output")
 
-    # Defining Output Files
-    data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                                  "output")
+            csv_name = "eurostat_population_alcoholconsumption.csv"
+            mcf_name = "eurostat_population_alcoholconsumption.mcf"
+            tmcf_name = "eurostat_population_alcoholconsumption.tmcf"
 
-    csv_name = "eurostat_population_alcoholconsumption.csv"
-    mcf_name = "eurostat_population_alcoholconsumption.mcf"
-    tmcf_name = "eurostat_population_alcoholconsumption.tmcf"
+            cleaned_csv_path = os.path.join(data_file_path, csv_name)
+            mcf_path = os.path.join(data_file_path, mcf_name)
+            tmcf_path = os.path.join(data_file_path, tmcf_name)
 
-    cleaned_csv_path = os.path.join(data_file_path, csv_name)
-    mcf_path = os.path.join(data_file_path, mcf_name)
-    tmcf_path = os.path.join(data_file_path, tmcf_name)
+            loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path, mcf_path,tmcf_path,import_name)
+            loader.generate_csv()
+            loader.generate_mcf()
+            loader.generate_tmcf()
+            print("Processing completed!")
+        except Exception as e:
+            logging.fatal(f'Download error')
 
-    loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path, mcf_path,
-                                        tmcf_path)
-    loader.generate_csv()
-    loader.generate_mcf()
-    loader.generate_tmcf()
+if __name__ == "__main__":
+    app.run(main)  
diff --git a/scripts/eurostat/health_determinants/common/euro_stat.py b/scripts/eurostat/health_determinants/common/euro_stat.py
@@ -76,11 +76,13 @@ def __init__(self,
                  input_files: list,
                  csv_file_path: str = None,
                  mcf_file_path: str = None,
-                 tmcf_file_path: str = None) -> None:
+                 tmcf_file_path: str = None,
+                 import_name: str = None) -> None:
         self._input_files = input_files
         self._cleaned_csv_file_path = csv_file_path
         self._mcf_file_path = mcf_file_path
         self._tmcf_file_path = tmcf_file_path
+        self._import_name = import_name
         self._df = pd.DataFrame()
 
     # pylint: disable=pointless-statement
@@ -151,8 +153,8 @@ def _parse_file(self, file_name: str, df: pd.DataFrame,
         df = df[df['age'] == 'TOTAL']
         df = replace_col_values(df)
 
-        if file_name in file_to_sv_mapping[import_name]:
-            df['SV'] = eval(file_to_sv_mapping[import_name][file_name])
+        if file_name in file_to_sv_mapping[self._import_name]:
+            df['SV'] = eval(file_to_sv_mapping[self._import_name][file_name])
         else:
             print(
                 '#########\nERROR: File (', file_name,

diff --git a/scripts/eurostat/health_determinants/tobacco_consumption/README.md b/scripts/eurostat/health_determinants/tobacco_consumption/README.md
@@ -22,12 +22,6 @@ The population is categorized by various set of combinations as below:
         15. Daily Smokers of Cigarettes by Sex and Educational Attainment level
 	16. Daily Smokers of Cigarettes by Sex and Income Quintile
 	17. Daily Smokers by number of Cigarettes by Sex and Educational Attainment level
-
-
-### Download URL
-The data in tsv.gz formats are downloadable from https://ec.europa.eu/eurostat/web/main/data/database -> 	Health -> Health determinants (hlth_det).
-The actual URLs are listed in input_files.py.
-
 
 #### API Output
 These are the attributes that will be used
@@ -44,9 +38,6 @@ These are the attributes that will be used
 | Country of Citizenship   				| The citizenship of the population.				|
 | Degree of Activity Limitation   				|  							|
 
-
-
-
 #### Cleaned Data
 Cleaned data will be inside [output/eurostat_population_tobaccoconsumption.csv] as a CSV file with the following columns.
 
@@ -56,7 +47,9 @@ Cleaned data will be inside [output/eurostat_population_tobaccoconsumption.csv]
 - Measurement_Method
 - Observation
 
-
+### Download URL
+The data in tsv.gz format can be downloaded from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det).
+The actual URLs are listed in common/import_download_details.py.
 
 #### MCFs and Template MCFs
 - [output/eurostat_population_tobaccoconsumption.mcf]
@@ -68,15 +61,18 @@ Run the test cases
 
 `python3 -m unittest scripts/eurostat/health_determinants/Tobacco_consumption/process_test.py`
 
+### Import Procedure
 
+The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files.
 
+`python scripts/eurostat/health_determinants/tobacco_consumption/process.py`
 
-### Import Procedure
+To perform only the download step of this import, run the command below:
 
-The below script will download the data and extract it.
+`python scripts/eurostat/health_determinants/tobacco_consumption/process.py --mode=download`
 
-`/bin/python scripts/eurostat/health_determinants/Tobacco_consumption/input_files.py`
+To perform only the processing step of this import, run the command below:
 
-The below script will clean the data, Also generate final csv, mcf and tmcf files.
+`python scripts/eurostat/health_determinants/tobacco_consumption/process.py --mode=process`
 
-`/bin/python scripts/eurostat/health_determinants/Tobacco_consumption/process.py`
+Downloaded files are created inside the 'input_files' directory.
diff --git a/scripts/eurostat/health_determinants/tobacco_consumption/process.py b/scripts/eurostat/health_determinants/tobacco_consumption/process.py
@@ -18,20 +18,23 @@
 import os
 import sys
 import pandas as pd
+from absl import app, flags, logging
 
 _COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.insert(1, _COMMON_PATH)
 # pylint: disable=wrong-import-position
 from common.euro_stat import EuroStat
 # pylint: enable=wrong-import-position
+from common import import_download_details, download
 
+_FLAGS = flags.FLAGS
+flags.DEFINE_string('mode', '', 'Options: download or process')
 
 class EuroStatTobaccoConsumption(EuroStat):
     """
     This Class has requried methods to generate Cleaned CSV,
     MCF and TMCF Files.
     """
-    _import_name = "tobacco_consumption"
 
     _mcf_template = ("Node: dcid:{sv}"
                      "\n{sv_name}"
@@ -103,6 +106,25 @@ class EuroStatTobaccoConsumption(EuroStat):
         "NotACitizen": "citizenship",
     }
 
+    @staticmethod
+    def download_data(import_name):
+        """Downloads raw data from Eurostat website and stores it in instance data frame.
+        
+            Args:
+            import_name(str): A string representing the import name.
+            
+            Returns:True
+            
+        """
+        download_details = import_download_details.download_details[import_name]
+        download_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', import_name,"input_files"))
+        os.makedirs(download_path, exist_ok=True)
+
+        for file in download_details["filenames"]:
+            download_files_urls = [download_details["input_url"] + str(file) +download_details["file_extension"]]
+            download.download_files(download_files_urls, download_path)
+        return True
+
     # over-ridden parent abstract method
     def _property_correction(self):
         """
@@ -157,27 +179,40 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame:
 
     # pylint: enable=no-self-use
 
+def main(_):
+    mode = _FLAGS.mode
+    global import_name
+    import_name = "tobacco_consumption"
+    if mode == "" or mode == "download":
+        EuroStatTobaccoConsumption.download_data(import_name)
+    if mode == "" or mode == "process":
+        try:
+            input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                    "input_files")
+            ip_files = os.listdir(input_path)
+            ip_files = [input_path + os.sep + file for file in ip_files]
+
+            # Defining Output Files
+            data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                        "output")
+
+            csv_name = "eurostat_population_tobaccoconsumption.csv"
+            mcf_name = "eurostat_population_tobaccoconsumption.mcf"
+            tmcf_name = "eurostat_population_tobaccoconsumption.tmcf"
+
+            cleaned_csv_path = os.path.join(data_file_path, csv_name)
+            mcf_path = os.path.join(data_file_path, mcf_name)
+            tmcf_path = os.path.join(data_file_path, tmcf_name)
+
+            loader = EuroStatTobaccoConsumption(ip_files, cleaned_csv_path, mcf_path,
+                                                tmcf_path, import_name)
+            loader.generate_csv()
+            loader.generate_mcf()
+            loader.generate_tmcf()
+            print("Processing completed!")
+        except Exception as e:
+            logging.fatal(f'Download error')
+
+if __name__ == "__main__":
+    app.run(main) 
 
-if __name__ == '__main__':
-    input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                              "input_files")
-    ip_files = os.listdir(input_path)
-    ip_files = [input_path + os.sep + file for file in ip_files]
-
-    # Defining Output Files
-    data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                                  "output")
-
-    csv_name = "eurostat_population_tobaccoconsumption.csv"
-    mcf_name = "eurostat_population_tobaccoconsumption.mcf"
-    tmcf_name = "eurostat_population_tobaccoconsumption.tmcf"
-
-    cleaned_csv_path = os.path.join(data_file_path, csv_name)
-    mcf_path = os.path.join(data_file_path, mcf_name)
-    tmcf_path = os.path.join(data_file_path, tmcf_name)
-
-    loader = EuroStatTobaccoConsumption(ip_files, cleaned_csv_path, mcf_path,
-                                        tmcf_path)
-    loader.generate_csv()
-    loader.generate_mcf()
-    loader.generate_tmcf()