Skip to content

Commit

Permalink
EuroStatHealth_AlcoholConsumption_Modifications
Browse files Browse the repository at this point in the history
  • Loading branch information
SudhishaK committed Dec 19, 2024
1 parent 149fd2b commit d4b7173
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 79 deletions.
39 changes: 22 additions & 17 deletions scripts/eurostat/health_determinants/alcohol_consumption/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,6 @@ The population is categorized by various set of combinations as below:
10. Alcohol Consumption by Sex and Country of Birth.
11. Alcohol Consumption by Sex and Country of Citizenship.


### Download URL
Input files are available for download from url: https://ec.europa.eu/eurostat/web/main/data/database -> Health -> Health determinants (hlth_det).

### Import Procedure
The below script will download the data and extract it.

`python scripts/eurostat/health_determinants/common/download_eurostat_input_files.py --import_name alcohol_consumption`

Files are created inside 'input_files' directory.


#### Output
Statistical variables for alcohol consumption are based on below properties available in input files.
| Attribute | Description |
Expand All @@ -45,11 +33,6 @@ Statistical variables for alcohol consumption are based on below properties avai
| Country of Citizenship | The citizenship of the population. |


Below script will generate cleansed observation file (csv), mcf and tmcf files.

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`


#### Cleaned Observation File
Cleaned data will be persisted as a CSV file in output/eurostat_population_alcoholconsumption.csv with the following columns.

Expand All @@ -65,9 +48,31 @@ MCF and tMCF files are presisted in below mentioned path.
- [output/eurostat_population_alcoholconsumption.mcf]
- [output/eurostat_population_alcoholconsumption.tmcf]

### Download URL

The data, in tsv.gz format, can be downloaded from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det).
The actual URLs are listed in import_download_details.py.

### Running Tests

Run the test cases

`python3 -m unittest discover -v -s scripts/eurostat/health_determinants/alcohol_consumption/ -p process_test.py`

### Import Procedure

The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files.

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`

To perform only the download step of this import, run the command below:

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=download`

To perform only the processing step of this import, run the command below:

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=process`

Downloaded files are created inside the 'input_files' directory.


72 changes: 52 additions & 20 deletions scripts/eurostat/health_determinants/alcohol_consumption/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,23 @@
import os
import sys
import pandas as pd
from absl import app, flags, logging

_COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(1, _COMMON_PATH)
# pylint: disable=wrong-import-position
from common.euro_stat import EuroStat
from common import import_download_details, download
# pylint: enable=wrong-import-position

_FLAGS = flags.FLAGS
flags.DEFINE_string('mode', '', 'Options: download or process')

class EuroStatAlcoholConsumption(EuroStat):
"""
This Class has requried methods to generate Cleaned CSV,
MCF and TMCF Files.
"""
_import_name = "alcohol_consumption"

_mcf_template = ("Node: dcid:{sv}"
"\n{sv_name}"
Expand Down Expand Up @@ -88,6 +91,25 @@ class EuroStatAlcoholConsumption(EuroStat):
"NotACitizen": "citizenship",
}

@staticmethod
def download_data(import_name):
    """Download the raw Eurostat input files for the given import.

    Reads the entry for ``import_name`` from
    ``import_download_details.download_details`` and downloads every file
    listed under ``"filenames"`` into
    ``<parent of this script's directory>/<import_name>/input_files``,
    creating that directory when it does not exist. Nothing is loaded into
    a data frame here; the files are only written to disk.

    Args:
        import_name (str): Key into ``download_details``; also used as the
            name of the import's directory.

    Returns:
        bool: Always True (kept so existing callers keep working).
    """
    download_details = import_download_details.download_details[import_name]
    download_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', import_name,
                     "input_files"))
    os.makedirs(download_path, exist_ok=True)

    # Each URL is assembled as <input_url><file name><file_extension>.
    input_url = download_details["input_url"]
    file_extension = download_details["file_extension"]
    for file_name in download_details["filenames"]:
        # download_files is given a one-element URL list per file, exactly
        # as before, so files are still fetched one call at a time.
        download.download_files([input_url + str(file_name) + file_extension],
                                download_path)
    return True

# over-ridden parent abstract method
def _property_correction(self):
for k, v in self._sv_properties.items():
Expand Down Expand Up @@ -119,26 +141,36 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame:
return df.rename(columns={'frequenc': 'frequenc_alcohol'})


if __name__ == '__main__':
input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"input_files")
ip_files = os.listdir(input_path)
ip_files = [input_path + os.sep + file for file in ip_files]
def main(_):
    """Run the download and/or processing stage of the alcohol consumption import.

    The stage is chosen with the ``--mode`` flag: ``download`` only fetches
    the input files, ``process`` only generates the cleaned CSV, MCF and TMCF
    files, and the default empty value runs both, download first.

    Args:
        _: Positional arguments passed by ``app.run``; unused.
    """
    mode = _FLAGS.mode
    # NOTE(review): the module-level global is kept only for backward
    # compatibility; nothing visible here reads it.
    global import_name
    import_name = "alcohol_consumption"
    if mode in ("", "download"):
        EuroStatAlcoholConsumption.download_data(import_name)
    if mode in ("", "process"):
        try:
            module_dir = os.path.dirname(os.path.abspath(__file__))
            input_path = os.path.join(module_dir, "input_files")
            ip_files = [
                os.path.join(input_path, file_name)
                for file_name in os.listdir(input_path)
            ]

            # Defining Output Files
            data_file_path = os.path.join(module_dir, "output")

            csv_name = "eurostat_population_alcoholconsumption.csv"
            mcf_name = "eurostat_population_alcoholconsumption.mcf"
            tmcf_name = "eurostat_population_alcoholconsumption.tmcf"

            cleaned_csv_path = os.path.join(data_file_path, csv_name)
            mcf_path = os.path.join(data_file_path, mcf_name)
            tmcf_path = os.path.join(data_file_path, tmcf_name)

            loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path,
                                                mcf_path, tmcf_path,
                                                import_name)
            loader.generate_csv()
            loader.generate_mcf()
            loader.generate_tmcf()
            print("Processing completed!")
        except Exception as e:
            # This branch guards the processing stage, not the download, so
            # report it as such and include the underlying error instead of
            # discarding it.
            logging.fatal('Processing error: %s', e)

loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path, mcf_path,
tmcf_path)
loader.generate_csv()
loader.generate_mcf()
loader.generate_tmcf()
if __name__ == "__main__":
app.run(main)
8 changes: 5 additions & 3 deletions scripts/eurostat/health_determinants/common/euro_stat.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,13 @@ def __init__(self,
input_files: list,
csv_file_path: str = None,
mcf_file_path: str = None,
tmcf_file_path: str = None) -> None:
tmcf_file_path: str = None,
import_name: str = None) -> None:
self._input_files = input_files
self._cleaned_csv_file_path = csv_file_path
self._mcf_file_path = mcf_file_path
self._tmcf_file_path = tmcf_file_path
self._import_name = import_name
self._df = pd.DataFrame()

# pylint: disable=pointless-statement
Expand Down Expand Up @@ -151,8 +153,8 @@ def _parse_file(self, file_name: str, df: pd.DataFrame,
df = df[df['age'] == 'TOTAL']
df = replace_col_values(df)

if file_name in file_to_sv_mapping[import_name]:
df['SV'] = eval(file_to_sv_mapping[import_name][file_name])
if file_name in file_to_sv_mapping[self._import_name]:
df['SV'] = eval(file_to_sv_mapping[self._import_name][file_name])
else:
print(
'#########\nERROR: File (', file_name,
Expand Down
26 changes: 11 additions & 15 deletions scripts/eurostat/health_determinants/tobacco_consumption/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@ The population is categorized by various set of combinations as below:
15. Daily Smokers of Cigarettes by Sex and Educational Attainment level
16. Daily Smokers of Cigarettes by Sex and Income Quintile
17. Daily Smokers by number of Cigarettes by Sex and Educational Attainment level


### Download URL
The data in tsv.gz formats are downloadable from https://ec.europa.eu/eurostat/web/main/data/database -> Health -> Health determinants (hlth_det).
The actual URLs are listed in input_files.py.


#### API Output
These are the attributes that will be used
Expand All @@ -44,9 +38,6 @@ These are the attributes that will be used
| Country of Citizenship | The citizenship of the population. |
| Degree of Activity Limitation | |




#### Cleaned Data
Cleaned data will be inside [output/eurostat_population_tobaccoconsumption.csv] as a CSV file with the following columns.

Expand All @@ -56,7 +47,9 @@ Cleaned data will be inside [output/eurostat_population_tobaccoconsumption.csv]
- Measurement_Method
- Observation


### Download URL
The data, in tsv.gz format, can be downloaded from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det).
The actual URLs are listed in import_download_details.py.

#### MCFs and Template MCFs
- [output/eurostat_population_tobaccoconsumption.mcf]
Expand All @@ -68,15 +61,18 @@ Run the test cases

`python3 -m unittest scripts/eurostat/health_determinants/tobacco_consumption/process_test.py`

### Import Procedure

The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files.

`python scripts/eurostat/health_determinants/tobacco_consumption/process.py`

### Import Procedure
To perform only the download step of this import, run the command below:

The below script will download the data and extract it.
`python scripts/eurostat/health_determinants/tobacco_consumption/process.py --mode=download`

`/bin/python scripts/eurostat/health_determinants/Tobacco_consumption/input_files.py`
To perform only the processing step of this import, run the command below:

The below script will clean the data, Also generate final csv, mcf and tmcf files.
`python scripts/eurostat/health_determinants/tobacco_consumption/process.py --mode=process`

`/bin/python scripts/eurostat/health_determinants/Tobacco_consumption/process.py`
Downloaded files are created inside the 'input_files' directory.
83 changes: 59 additions & 24 deletions scripts/eurostat/health_determinants/tobacco_consumption/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,23 @@
import os
import sys
import pandas as pd
from absl import app, flags, logging

_COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(1, _COMMON_PATH)
# pylint: disable=wrong-import-position
from common.euro_stat import EuroStat
# pylint: enable=wrong-import-position
from common import import_download_details, download

_FLAGS = flags.FLAGS
flags.DEFINE_string('mode', '', 'Options: download or process')

class EuroStatTobaccoConsumption(EuroStat):
"""
This Class has requried methods to generate Cleaned CSV,
MCF and TMCF Files.
"""
_import_name = "tobacco_consumption"

_mcf_template = ("Node: dcid:{sv}"
"\n{sv_name}"
Expand Down Expand Up @@ -103,6 +106,25 @@ class EuroStatTobaccoConsumption(EuroStat):
"NotACitizen": "citizenship",
}

@staticmethod
def download_data(import_name):
    """Download the raw Eurostat input files for the given import.

    Reads the entry for ``import_name`` from
    ``import_download_details.download_details`` and downloads every file
    listed under ``"filenames"`` into
    ``<parent of this script's directory>/<import_name>/input_files``,
    creating that directory when it does not exist. Nothing is loaded into
    a data frame here; the files are only written to disk.

    Args:
        import_name (str): Key into ``download_details``; also used as the
            name of the import's directory.

    Returns:
        bool: Always True (kept so existing callers keep working).
    """
    download_details = import_download_details.download_details[import_name]
    download_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', import_name,
                     "input_files"))
    os.makedirs(download_path, exist_ok=True)

    # Each URL is assembled as <input_url><file name><file_extension>.
    input_url = download_details["input_url"]
    file_extension = download_details["file_extension"]
    for file_name in download_details["filenames"]:
        # download_files is given a one-element URL list per file, exactly
        # as before, so files are still fetched one call at a time.
        download.download_files([input_url + str(file_name) + file_extension],
                                download_path)
    return True

# over-ridden parent abstract method
def _property_correction(self):
"""
Expand Down Expand Up @@ -157,27 +179,40 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame:

# pylint: enable=no-self-use

def main(_):
    """Run the download and/or processing stage of the tobacco consumption import.

    The stage is chosen with the ``--mode`` flag: ``download`` only fetches
    the input files, ``process`` only generates the cleaned CSV, MCF and TMCF
    files, and the default empty value runs both, download first.

    Args:
        _: Positional arguments passed by ``app.run``; unused.
    """
    mode = _FLAGS.mode
    # NOTE(review): the module-level global is kept only for backward
    # compatibility; nothing visible here reads it.
    global import_name
    import_name = "tobacco_consumption"
    if mode in ("", "download"):
        EuroStatTobaccoConsumption.download_data(import_name)
    if mode in ("", "process"):
        try:
            module_dir = os.path.dirname(os.path.abspath(__file__))
            input_path = os.path.join(module_dir, "input_files")
            ip_files = [
                os.path.join(input_path, file_name)
                for file_name in os.listdir(input_path)
            ]

            # Defining Output Files
            data_file_path = os.path.join(module_dir, "output")

            csv_name = "eurostat_population_tobaccoconsumption.csv"
            mcf_name = "eurostat_population_tobaccoconsumption.mcf"
            tmcf_name = "eurostat_population_tobaccoconsumption.tmcf"

            cleaned_csv_path = os.path.join(data_file_path, csv_name)
            mcf_path = os.path.join(data_file_path, mcf_name)
            tmcf_path = os.path.join(data_file_path, tmcf_name)

            loader = EuroStatTobaccoConsumption(ip_files, cleaned_csv_path,
                                                mcf_path, tmcf_path,
                                                import_name)
            loader.generate_csv()
            loader.generate_mcf()
            loader.generate_tmcf()
            print("Processing completed!")
        except Exception as e:
            # This branch guards the processing stage, not the download, so
            # report it as such and include the underlying error instead of
            # discarding it.
            logging.fatal('Processing error: %s', e)

if __name__ == "__main__":
app.run(main)

if __name__ == '__main__':
input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"input_files")
ip_files = os.listdir(input_path)
ip_files = [input_path + os.sep + file for file in ip_files]

# Defining Output Files
data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"output")

csv_name = "eurostat_population_tobaccoconsumption.csv"
mcf_name = "eurostat_population_tobaccoconsumption.mcf"
tmcf_name = "eurostat_population_tobaccoconsumption.tmcf"

cleaned_csv_path = os.path.join(data_file_path, csv_name)
mcf_path = os.path.join(data_file_path, mcf_name)
tmcf_path = os.path.join(data_file_path, tmcf_name)

loader = EuroStatTobaccoConsumption(ip_files, cleaned_csv_path, mcf_path,
tmcf_path)
loader.generate_csv()
loader.generate_mcf()
loader.generate_tmcf()

0 comments on commit d4b7173

Please sign in to comment.