From d4b7173d8bedb822c67f77a4a9c80aabf7ac9a9f Mon Sep 17 00:00:00 2001 From: Sudhisha K Date: Thu, 19 Dec 2024 12:28:13 +0000 Subject: [PATCH] EuroStatHealth_AlcoholConsumption_Modifications --- .../alcohol_consumption/README.md | 39 +++++---- .../alcohol_consumption/process.py | 72 +++++++++++----- .../health_determinants/common/euro_stat.py | 8 +- .../tobacco_consumption/README.md | 26 +++--- .../tobacco_consumption/process.py | 83 +++++++++++++------ 5 files changed, 149 insertions(+), 79 deletions(-) diff --git a/scripts/eurostat/health_determinants/alcohol_consumption/README.md b/scripts/eurostat/health_determinants/alcohol_consumption/README.md index ddc17ca1b1..27164f59e9 100644 --- a/scripts/eurostat/health_determinants/alcohol_consumption/README.md +++ b/scripts/eurostat/health_determinants/alcohol_consumption/README.md @@ -18,18 +18,6 @@ The population is categorized by various set of combinations as below: 10. Alcohol Consumption by Sex and Country of Birth. 11. Alcohol Consumption by Sex and Country of Citizenship. - -### Download URL -Input files are available for download from url: https://ec.europa.eu/eurostat/web/main/data/database -> Health -> Health determinants (hlth_det). - -### Import Procedure -The below script will download the data and extract it. - -`python scripts/eurostat/health_determinants/common/download_eurostat_input_files.py --import_name alcohol_consumption` - -Files are created inside 'input_files' directory. - - #### Output Statistical variables for alcohol consumption are based on below properties available in input files. | Attribute | Description | @@ -45,11 +33,6 @@ Statistical variables for alcohol consumption are based on below properties avai | Country of Citizenship | The citizenship of the population. | -Below script will generate cleansed observation file (csv), mcf and tmcf files. 
- -`python scripts/eurostat/health_determinants/alcohol_consumption/process.py` - - #### Cleaned Observation File Cleaned data will be persisted as a CSV file in output/eurostat_population_alcohol_consumption.csv with the following columns. @@ -65,9 +48,31 @@ MCF and tMCF files are presisted in below mentioned path. - [output/eurostat_population_alcohol_consumption.mcf] - [output/eurostat_population_alcohol_consumption.tmcf] +### Download URL + +The data in tsv.gz format is downloadable from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det). +The actual URLs are listed in import_download_details.py. ### Running Tests Run the test cases `python3 -m unittest discover -v -s scripts/eurostat/health_determinants/alcohol_consumption/ -p process_test.py` + +### Import Procedure + +The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files. + +`python scripts/eurostat/health_determinants/alcohol_consumption/process.py` + +To perform only the download step of this import, run the command below: + +`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=download` + +To perform only the processing step of this import, run the command below: + +`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=process` + +Downloaded files are created inside the 'input_files' directory.
+ + diff --git a/scripts/eurostat/health_determinants/alcohol_consumption/process.py b/scripts/eurostat/health_determinants/alcohol_consumption/process.py index 08687ca8d9..93e8571a0b 100644 --- a/scripts/eurostat/health_determinants/alcohol_consumption/process.py +++ b/scripts/eurostat/health_determinants/alcohol_consumption/process.py @@ -18,20 +18,23 @@ import os import sys import pandas as pd +from absl import app, flags, logging _COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) sys.path.insert(1, _COMMON_PATH) # pylint: disable=wrong-import-position from common.euro_stat import EuroStat +from common import import_download_details, download # pylint: enable=wrong-import-position +_FLAGS = flags.FLAGS +flags.DEFINE_string('mode', '', 'Options: download or process') class EuroStatAlcoholConsumption(EuroStat): """ This Class has requried methods to generate Cleaned CSV, MCF and TMCF Files. """ - _import_name = "alcohol_consumption" _mcf_template = ("Node: dcid:{sv}" "\n{sv_name}" @@ -88,6 +91,25 @@ class EuroStatAlcoholConsumption(EuroStat): "NotACitizen": "citizenship", } + @staticmethod + def download_data(import_name): + """Downloads raw data from Eurostat website and stores it in instance data frame. + + Args: + import_name(str): A string representing the import name. 
+ + Returns:True + + """ + download_details = import_download_details.download_details[import_name] + download_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', import_name,"input_files")) + os.makedirs(download_path, exist_ok=True) + + for file in download_details["filenames"]: + download_files_urls = [download_details["input_url"] + str(file) +download_details["file_extension"]] + download.download_files(download_files_urls, download_path) + return True + # over-ridden parent abstract method def _property_correction(self): for k, v in self._sv_properties.items(): @@ -119,26 +141,36 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame: return df.rename(columns={'frequenc': 'frequenc_alcohol'}) -if __name__ == '__main__': - input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "input_files") - ip_files = os.listdir(input_path) - ip_files = [input_path + os.sep + file for file in ip_files] +def main(_): + mode = _FLAGS.mode + global import_name + import_name = "alcohol_consumption" + if mode == "" or mode == "download": + EuroStatAlcoholConsumption.download_data(import_name) + if mode == "" or mode == "process": + try: + input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"input_files") + ip_files = os.listdir(input_path) + ip_files = [input_path + os.sep + file for file in ip_files] + + # Defining Output Files + data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),"output") - # Defining Output Files - data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "output") + csv_name = "eurostat_population_alcoholconsumption.csv" + mcf_name = "eurostat_population_alcoholconsumption.mcf" + tmcf_name = "eurostat_population_alcoholconsumption.tmcf" - csv_name = "eurostat_population_alcoholconsumption.csv" - mcf_name = "eurostat_population_alcoholconsumption.mcf" - tmcf_name = "eurostat_population_alcoholconsumption.tmcf" + cleaned_csv_path = 
os.path.join(data_file_path, csv_name) + mcf_path = os.path.join(data_file_path, mcf_name) + tmcf_path = os.path.join(data_file_path, tmcf_name) - cleaned_csv_path = os.path.join(data_file_path, csv_name) - mcf_path = os.path.join(data_file_path, mcf_name) - tmcf_path = os.path.join(data_file_path, tmcf_name) + loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path, mcf_path,tmcf_path,import_name) + loader.generate_csv() + loader.generate_mcf() + loader.generate_tmcf() + print("Processing completed!") + except Exception as e: + logging.fatal(f'Download error') - loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path, mcf_path, - tmcf_path) - loader.generate_csv() - loader.generate_mcf() - loader.generate_tmcf() +if __name__ == "__main__": + app.run(main) diff --git a/scripts/eurostat/health_determinants/common/euro_stat.py b/scripts/eurostat/health_determinants/common/euro_stat.py index ab3fa69354..cf2cc93187 100644 --- a/scripts/eurostat/health_determinants/common/euro_stat.py +++ b/scripts/eurostat/health_determinants/common/euro_stat.py @@ -76,11 +76,13 @@ def __init__(self, input_files: list, csv_file_path: str = None, mcf_file_path: str = None, - tmcf_file_path: str = None) -> None: + tmcf_file_path: str = None, + import_name: str = None) -> None: self._input_files = input_files self._cleaned_csv_file_path = csv_file_path self._mcf_file_path = mcf_file_path self._tmcf_file_path = tmcf_file_path + self._import_name = import_name self._df = pd.DataFrame() # pylint: disable=pointless-statement @@ -151,8 +153,8 @@ def _parse_file(self, file_name: str, df: pd.DataFrame, df = df[df['age'] == 'TOTAL'] df = replace_col_values(df) - if file_name in file_to_sv_mapping[import_name]: - df['SV'] = eval(file_to_sv_mapping[import_name][file_name]) + if file_name in file_to_sv_mapping[self._import_name]: + df['SV'] = eval(file_to_sv_mapping[self._import_name][file_name]) else: print( '#########\nERROR: File (', file_name, diff --git 
a/scripts/eurostat/health_determinants/tobacco_consumption/README.md b/scripts/eurostat/health_determinants/tobacco_consumption/README.md index 6823882ffd..ab77460336 100644 --- a/scripts/eurostat/health_determinants/tobacco_consumption/README.md +++ b/scripts/eurostat/health_determinants/tobacco_consumption/README.md @@ -22,12 +22,6 @@ The population is categorized by various set of combinations as below: 15. Daily Smokers of Cigarettes by Sex and Educational Attainment level 16. Daily Smokers of Cigarettes by Sex and Income Quintile 17. Daily Smokers by number of Cigarettes by Sex and Educational Attainment level - - -### Download URL -The data in tsv.gz formats are downloadable from https://ec.europa.eu/eurostat/web/main/data/database -> Health -> Health determinants (hlth_det). -The actual URLs are listed in input_files.py. - #### API Output These are the attributes that will be used @@ -44,9 +38,6 @@ These are the attributes that will be used | Country of Citizenship | The citizenship of the population. | | Degree of Activity Limitation | | - - - #### Cleaned Data Cleaned data will be inside [output/eurostat_population_tobaccoconsumption.csv] as a CSV file with the following columns. @@ -56,7 +47,9 @@ Cleaned data will be inside [output/eurostat_population_tobaccoconsumption.csv] - Measurement_Method - Observation - +### Download URL +The data in tsv.gz format is downloadable from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det). +The actual URLs are listed in import_download_details.py. #### MCFs and Template MCFs - [output/eurostat_population_tobaccoconsumption.mcf] @@ -68,15 +61,18 @@ Run the test cases `python3 -m unittest scripts/eurostat/health_determinants/Tobacco_consumption/process_test.py` +### Import Procedure +The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files.
+`python scripts/eurostat/health_determinants/tobacco_consumption/process.py` -### Import Procedure +To perform only the download step of this import, run the command below: -The below script will download the data and extract it. +`python scripts/eurostat/health_determinants/tobacco_consumption/process.py --mode=download` -`/bin/python scripts/eurostat/health_determinants/Tobacco_consumption/input_files.py` +To perform only the processing step of this import, run the command below: -The below script will clean the data, Also generate final csv, mcf and tmcf files. +`python scripts/eurostat/health_determinants/tobacco_consumption/process.py --mode=process` -`/bin/python scripts/eurostat/health_determinants/Tobacco_consumption/process.py` +Downloaded files are created inside the 'input_files' directory. diff --git a/scripts/eurostat/health_determinants/tobacco_consumption/process.py b/scripts/eurostat/health_determinants/tobacco_consumption/process.py index 626c093a33..1e2f750bfa 100644 --- a/scripts/eurostat/health_determinants/tobacco_consumption/process.py +++ b/scripts/eurostat/health_determinants/tobacco_consumption/process.py @@ -18,20 +18,23 @@ import os import sys import pandas as pd +from absl import app, flags, logging _COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) sys.path.insert(1, _COMMON_PATH) # pylint: disable=wrong-import-position from common.euro_stat import EuroStat # pylint: enable=wrong-import-position +from common import import_download_details, download +_FLAGS = flags.FLAGS +flags.DEFINE_string('mode', '', 'Options: download or process') class EuroStatTobaccoConsumption(EuroStat): """ This Class has requried methods to generate Cleaned CSV, MCF and TMCF Files.
""" - _import_name = "tobacco_consumption" _mcf_template = ("Node: dcid:{sv}" "\n{sv_name}" @@ -103,6 +106,25 @@ class EuroStatTobaccoConsumption(EuroStat): "NotACitizen": "citizenship", } + @staticmethod + def download_data(import_name): + """Downloads raw data from Eurostat website and stores it in instance data frame. + + Args: + import_name(str): A string representing the import name. + + Returns:True + + """ + download_details = import_download_details.download_details[import_name] + download_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', import_name,"input_files")) + os.makedirs(download_path, exist_ok=True) + + for file in download_details["filenames"]: + download_files_urls = [download_details["input_url"] + str(file) +download_details["file_extension"]] + download.download_files(download_files_urls, download_path) + return True + # over-ridden parent abstract method def _property_correction(self): """ @@ -157,27 +179,40 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame: # pylint: enable=no-self-use +def main(_): + mode = _FLAGS.mode + global import_name + import_name = "tobacco_consumption" + if mode == "" or mode == "download": + EuroStatTobaccoConsumption.download_data(import_name) + if mode == "" or mode == "process": + try: + input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "input_files") + ip_files = os.listdir(input_path) + ip_files = [input_path + os.sep + file for file in ip_files] + + # Defining Output Files + data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "output") + + csv_name = "eurostat_population_tobaccoconsumption.csv" + mcf_name = "eurostat_population_tobaccoconsumption.mcf" + tmcf_name = "eurostat_population_tobaccoconsumption.tmcf" + + cleaned_csv_path = os.path.join(data_file_path, csv_name) + mcf_path = os.path.join(data_file_path, mcf_name) + tmcf_path = os.path.join(data_file_path, tmcf_name) + + loader = 
EuroStatTobaccoConsumption(ip_files, cleaned_csv_path, mcf_path, + tmcf_path, import_name) + loader.generate_csv() + loader.generate_mcf() + loader.generate_tmcf() + print("Processing completed!") + except Exception as e: + logging.fatal(f'Download error') + +if __name__ == "__main__": + app.run(main) -if __name__ == '__main__': - input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "input_files") - ip_files = os.listdir(input_path) - ip_files = [input_path + os.sep + file for file in ip_files] - - # Defining Output Files - data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "output") - - csv_name = "eurostat_population_tobaccoconsumption.csv" - mcf_name = "eurostat_population_tobaccoconsumption.mcf" - tmcf_name = "eurostat_population_tobaccoconsumption.tmcf" - - cleaned_csv_path = os.path.join(data_file_path, csv_name) - mcf_path = os.path.join(data_file_path, mcf_name) - tmcf_path = os.path.join(data_file_path, tmcf_name) - - loader = EuroStatTobaccoConsumption(ip_files, cleaned_csv_path, mcf_path, - tmcf_path) - loader.generate_csv() - loader.generate_mcf() - loader.generate_tmcf()