Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

converted new function from notebook into package #13

Merged
merged 1 commit into from
Jun 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\python.exe",
"python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\envs\\conflict_model\\python.exe",
"restructuredtext.confPath": "${workspaceFolder}\\docs"
}
6 changes: 3 additions & 3 deletions conflict_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from . import selection
from . import utils
from . import analysis
from . import env_vars_nc
from . import get_boolean_conflict
from . import get_var_from_nc

__author__ = """Jannis M. Hoch"""
__email__ = '[email protected]'
__version__ = '0.0.1-beta'
__version__ = '0.0.1'
64 changes: 0 additions & 64 deletions conflict_model/analysis.py

This file was deleted.

83 changes: 0 additions & 83 deletions conflict_model/env_vars_nc.py

This file was deleted.

49 changes: 49 additions & 0 deletions conflict_model/get_boolean_conflict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year):
    """Flag, per polygon, whether at least one conflict was recorded in a given year.

    The conflict records of ``sim_year`` are spatially joined with the polygons in
    ``extent_gdf``. Every polygon whose ``watprovID`` appears in the join result is
    flagged with 1, all other polygons with 0. The number of fatalities is not
    needed for this yes/no decision and is therefore not aggregated.

    Args:
        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data); its ``year`` column is used for the selection.
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted; its ``watprovID`` column identifies the polygons.
        config (config): parsed configuration settings of run. Currently not read by this function.
        sim_year (int): year for which data is extracted

    Raises:
        AssertionError: raised if the length of output list does not match length of input geo-dataframe

    Returns:
        list: list containing 0/1 per polygon depending on conflict occurrence, in the row order of extent_gdf
    """

    print('determining whether a conflict took place or not')

    # select the entries which occurred in this year
    conflicts_in_year = conflict_gdf.loc[conflict_gdf.year == sim_year]

    if conflicts_in_year.empty:
        # nothing to join: no polygon can contain a conflict, so every flag will be 0
        ids_with_conflict = []
    else:
        # spatial join keeps only conflict records that fall inside a polygon and
        # attaches that polygon's attributes (incl. watprovID) to each record
        data_merged = gpd.sjoin(conflicts_in_year, extent_gdf)
        ids_with_conflict = data_merged['watprovID'].dropna().unique()

    # vectorised membership test: 1 if the polygon ID occurs in the join result, else 0
    list_out = extent_gdf['watprovID'].isin(ids_with_conflict).astype(int).tolist()

    # sanity check kept from the original implementation
    if not len(extent_gdf) == len(list_out):
        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))

    print('...DONE' + os.linesep)

    return list_out
130 changes: 130 additions & 0 deletions conflict_model/get_var_from_nc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import xarray as xr
import rasterio as rio
import pandas as pd
import geopandas as gpd
import rasterstats as rstats
import numpy as np
import matplotlib.pyplot as plt
import os, sys

def nc_with_integer_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'):
    """Extract one statistical value per polygon from a netCDF-file for a given year.

    The path to the netCDF-file is read from the config-file. For each polygon in
    extent_gdf, the statistic ``stat_func`` of all cells within the polygon is computed.
    By default, this is the mean value.
    The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.

    NOTE:
        The var_name must be identical to the key in the config-file.

    NOTE:
        This function is specifically written for netCDF-files where the time variable contains integer (year-)values, e.g. 1995, 1996, ...

    NOTE:
        Works only with nc-files with annual data.

    Args:
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
        config (config): parsed configuration settings of run; must provide ``input_dir`` in section ``general`` and ``var_name`` in section ``env_vars``
        var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file
        sim_year (int): year for which data is extracted
        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.

    Raises:
        ValueError: raised if the extracted variable at a time step does not contain data

    Returns:
        list: list containing statistical value per polygon, i.e. with same length as extent_gdf
    """
    # get path to netCDF-file.
    nc_fo = os.path.join(config.get('general', 'input_dir'),
                         config.get('env_vars', var_name))

    print('calculating {0} {1} per aggregation unit from file {2} for year {3}'.format(stat_func, var_name, nc_fo, sim_year))

    # open nc-file with xarray and pull the values of the requested year into memory
    # while the file is open; the context manager closes the file handle afterwards
    with xr.open_dataset(nc_fo) as nc_ds:
        nc_arr_vals = nc_ds[var_name].sel(time=sim_year).values
    if nc_arr_vals.size == 0:
        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

    # open nc-file with rasterio only to get the affine transform, then close it again
    with rio.open(nc_fo) as nc_rio:
        affine = nc_rio.transform

    # compute the statistic for each polygon; zonal_stats returns a list with one
    # dict per input geometry, hence the [0] for a single polygon
    list_out = []
    for polygon in extent_gdf.geometry:
        zonal_stats = rstats.zonal_stats(polygon, nc_arr_vals, affine=affine, stats=stat_func)
        list_out.append(zonal_stats[0][stat_func])

    print('...DONE' + os.linesep)

    return list_out

def nc_with_continous_regular_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'):
    """Extract one statistical value per polygon from a netCDF-file for a given year.

    The path to the netCDF-file is read from the config-file. The time stamps in the
    file are converted to calendar years, and the time step matching ``sim_year`` is
    selected. For each polygon in extent_gdf, the statistic ``stat_func`` of all cells
    within the polygon is computed. By default, this is the mean value.
    The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.

    NOTE:
        The var_name must be identical to the key in the config-file.

    NOTE:
        Works only with nc-files with annual data.

    Args:
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
        config (config): parsed configuration settings of run; must provide ``input_dir`` in section ``general`` and ``var_name`` in section ``env_vars``
        var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file
        sim_year (int): year for which data is extracted
        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.

    Raises:
        ValueError: raised if specified year cannot be found in years in nc-file
        ValueError: raised if more than one time step in the nc-file falls into the specified year
        ValueError: raised if the extracted variable at a time step does not contain data

    Returns:
        list: list containing statistical value per polygon, i.e. with same length as extent_gdf
    """
    # get path to netCDF-file.
    nc_fo = os.path.join(config.get('general', 'input_dir'),
                         config.get('env_vars', var_name))

    print('calculating {0} {1} per aggregation unit from file {2} for year {3}'.format(stat_func, var_name, nc_fo, sim_year))

    # open nc-file with xarray; all reads happen inside the context manager so the
    # file handle is closed again once the values are in memory
    with xr.open_dataset(nc_fo) as nc_ds:
        # get xarray data-array for specified variable
        nc_var = nc_ds[var_name]

        # get years contained in nc-file as integer array to be compatible with sim_year
        time_stamps = nc_ds.time.values
        years = pd.to_datetime(time_stamps).year.to_numpy()

        # get indices of all time steps which correspond with sim_year
        matching_idx = np.flatnonzero(years == sim_year)
        if matching_idx.size == 0:
            raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
        if matching_idx.size > 1:
            # only annual data is supported, so more than one hit is ambiguous
            raise ValueError('found {0} time steps for simulation year {1} in file {2}, but only annual data is supported'.format(matching_idx.size, sim_year, nc_fo))

        # get values from data-array for specified year based on index
        nc_arr_vals = nc_var.sel(time=time_stamps[matching_idx[0]]).values

    if nc_arr_vals.size == 0:
        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

    # open nc-file with rasterio only to get the affine transform, then close it again
    with rio.open(nc_fo) as nc_rio:
        affine = nc_rio.transform

    # compute the statistic for each polygon; zonal_stats returns a list with one
    # dict per input geometry, hence the [0] for a single polygon
    list_out = []
    for polygon in extent_gdf.geometry:
        zonal_stats = rstats.zonal_stats(polygon, nc_arr_vals, affine=affine, stats=stat_func)
        list_out.append(zonal_stats[0][stat_func])

    print('...DONE' + os.linesep)

    return list_out
4 changes: 2 additions & 2 deletions data/run_setting.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,5 @@ zones=BWh,BSh
code2class=KoeppenGeiger/classification_codes.txt

[env_vars]
GDP_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc
evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
GDP_per_capita_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc
total_evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
Loading