Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EuroStatHealth_AlcoholConsumption_Modifications #1145

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 22 additions & 17 deletions scripts/eurostat/health_determinants/alcohol_consumption/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,6 @@ The population is categorized by various set of combinations as below:
10. Alcohol Consumption by Sex and Country of Birth.
11. Alcohol Consumption by Sex and Country of Citizenship.


### Download URL
Input files are available for download from url: https://ec.europa.eu/eurostat/web/health/data/database -> Health -> Health determinants (hlth_det).

### Import Procedure
The below script will download the data and extract it.

`python scripts/eurostat/health_determinants/common/download_eurostat_input_files.py --import_name alcohol_consumption`

Files are created inside 'input_files' directory.


#### Output
Statistical variables for alcohol consumption are based on the properties below, which are available in the input files.
| Attribute | Description |
Expand All @@ -45,11 +33,6 @@ Statistical variables for alcohol consumption are based on below properties avai
| Country of Citizenship | The citizenship of the population. |


Below script will generate cleansed observation file (csv), mcf and tmcf files.

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`


#### Cleaned Observation File
Cleaned data will be persisted as a CSV file in output/eurostat_population_alcohol_consumption.csv with the following columns.

Expand All @@ -65,9 +48,31 @@ MCF and tMCF files are persisted in below mentioned path.
- [output/eurostat_population_alcohol_consumption.mcf]
- [output/eurostat_population_alcohol_consumption.tmcf]

### Download URL

The data, in tsv.gz format, can be downloaded from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det).
The actual URLs are listed in import_download_details.py.

### Running Tests

Run the test cases

`python3 -m unittest discover -v -s scripts/eurostat/health_determinants/alcohol_consumption/ -p process_test.py`

### Import Procedure

The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files.

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`

To download data for this import, run:

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=download`

To process the downloaded data, run:

`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=process`

Downloaded files are created inside the 'input_files' directory.


83 changes: 63 additions & 20 deletions scripts/eurostat/health_determinants/alcohol_consumption/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,24 @@
import os
import sys
import pandas as pd
from absl import app, flags, logging

_COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(1, _COMMON_PATH)
# pylint: disable=wrong-import-position
from common.euro_stat import EuroStat
from common import import_download_details, download
# pylint: enable=wrong-import-position

_FLAGS = flags.FLAGS
flags.DEFINE_string('mode', '', 'Options: download or process')


class EuroStatAlcoholConsumption(EuroStat):
"""
This Class has required methods to generate Cleaned CSV,
MCF and TMCF Files.
"""
_import_name = "alcohol_consumption"

_mcf_template = ("Node: dcid:{sv}"
"\n{sv_name}"
Expand Down Expand Up @@ -88,6 +92,30 @@ class EuroStatAlcoholConsumption(EuroStat):
"NotACitizen": "citizenship",
}

@staticmethod
def download_data(import_name):
    """Download the raw Eurostat input files for the given import.

    The download configuration is read from
    import_download_details.download_details[import_name]. Files are saved
    in the 'input_files' directory under '../<import_name>' relative to
    this module; the directory is created if it does not exist yet.

    Args:
        import_name (str): Key of the import in
            import_download_details.download_details, also used as the
            name of the import's directory.

    Returns:
        bool: True once every download call has returned.
    """
    details = import_download_details.download_details[import_name]
    module_dir = os.path.dirname(__file__)
    target_dir = os.path.abspath(
        os.path.join(module_dir, '..', import_name, "input_files"))
    os.makedirs(target_dir, exist_ok=True)

    # One download call per file, each with a single-URL list.
    for filename in details["filenames"]:
        url = (details["input_url"] + str(filename) +
               details["file_extension"])
        download.download_files([url], target_dir)
    return True

# over-ridden parent abstract method
def _property_correction(self):
for k, v in self._sv_properties.items():
Expand Down Expand Up @@ -119,26 +147,41 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame:
return df.rename(columns={'frequenc': 'frequenc_alcohol'})


if __name__ == '__main__':
input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"input_files")
ip_files = os.listdir(input_path)
ip_files = [input_path + os.sep + file for file in ip_files]
def main(_):
    """Run the stage(s) of the import selected by the --mode flag.

    An empty mode runs both stages, "download" only fetches the input
    files, and "process" only generates the cleaned CSV, MCF and TMCF
    files from the files found in the local 'input_files' directory.
    Any exception raised while processing is reported via logging.fatal.

    Args:
        _: Unused positional argument (required by app.run).
    """
    # The import name is published as a module-level global.
    global import_name
    mode = _FLAGS.mode
    import_name = "alcohol_consumption"

    if mode in ("", "download"):
        EuroStatAlcoholConsumption.download_data(import_name)

    if mode not in ("", "process"):
        return

    try:
        module_dir = os.path.dirname(os.path.abspath(__file__))

        # Every file present in input_files is handed to the loader.
        input_dir = os.path.join(module_dir, "input_files")
        ip_files = [
            input_dir + os.sep + name for name in os.listdir(input_dir)
        ]

        # Output artefacts all live in the local 'output' directory.
        output_dir = os.path.join(module_dir, "output")
        cleaned_csv_path = os.path.join(
            output_dir, "eurostat_population_alcoholconsumption.csv")
        mcf_path = os.path.join(
            output_dir, "eurostat_population_alcoholconsumption.mcf")
        tmcf_path = os.path.join(
            output_dir, "eurostat_population_alcoholconsumption.tmcf")

        loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path,
                                            mcf_path, tmcf_path,
                                            import_name)
        loader.generate_csv()
        loader.generate_mcf()
        loader.generate_tmcf()
        logging.info("Processing completed!")
    except Exception as err:
        logging.fatal(f'Processing error - {err}')

cleaned_csv_path = os.path.join(data_file_path, csv_name)
mcf_path = os.path.join(data_file_path, mcf_name)
tmcf_path = os.path.join(data_file_path, tmcf_name)

loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path, mcf_path,
tmcf_path)
loader.generate_csv()
loader.generate_mcf()
loader.generate_tmcf()
if __name__ == "__main__":
app.run(main)
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@

class AlcoholConsumptionTest(CommonTestClass.CommonTestCases):
    # Configuration for the alcohol consumption import's tests; presumably
    # consumed by the inherited CommonTestCases tests -- confirm in the
    # common test module.

    # Import class under test.
    _import_class = EuroStatAlcoholConsumption
    # Import name; matches the name used by process.py for this import.
    _import_name = "alcohol_consumption"
    # Directory containing this test module.
    _test_module_directory = os.path.dirname(__file__)
Loading
Loading