Skip to content

Commit

Permalink
Merge pull request #12 from JannisHoch/dev
Browse files Browse the repository at this point in the history
first working version ML-SVC model
  • Loading branch information
JannisHoch authored Jun 17, 2020
2 parents aa7e9c5 + 3dd30b3 commit 57843f2
Show file tree
Hide file tree
Showing 36 changed files with 3,086 additions and 286 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ dmypy.json
.pyre/

# run settings
# */run_setting.cfg
*/run_setting.cfg

#output folders
OUT*/
13 changes: 13 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Config file for automatic testing at travis-ci.com

language: python
python:
- 3.8
- 3.7
- 3.6

# Command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install: pip install -U tox-travis

# Command to run tests, e.g. python setup.py test
script: tox
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\envs\\conflict_model\\python.exe",
"python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\python.exe",
"restructuredtext.confPath": "${workspaceFolder}\\docs"
}
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ conflict_model
----------------
(Machine learning) model for mapping environmental drivers of conflict risk

.. image:: https://travis-ci.com/JannisHoch/conflict_model.svg?token=BnX1oxxHRbyd1dPyXAp2&branch=dev
:target: https://travis-ci.com/JannisHoch/conflict_model


installation
----------------

Expand Down
1 change: 1 addition & 0 deletions conflict_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from . import selection
from . import utils
from . import analysis
from . import env_vars_nc

__author__ = """Jannis M. Hoch"""
__email__ = '[email protected]'
Expand Down
128 changes: 50 additions & 78 deletions conflict_model/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,89 +4,61 @@
import matplotlib.pyplot as plt
import os

def conflict_in_year_bool(conflict_gdf, continent_gdf, config, saving_plots=False, showing_plots=False, out_dir=None):
"""Determins per year the number of fatalities per country and derivates a boolean value whether conflict has occured in one year in one country or not.
def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year, out_dir, saving_plots=False, showing_plots=False):
"""Determines whether conflict took place in a region in one year and, if so, assigns a value of 1 to this region.
Arguments:
conflict_gdf {geodataframe}: geodataframe containing final selection of georeferenced conflicts
continent_gdf {geodataframe}: geodataframe containing country polygons of selected continent
config {configuration}: parsed configuration settings
conflict_gdf {[type]} -- [description]
extent_gdf {[type]} -- [description]
config {[type]} -- [description]
sim_year {[type]} -- [description]
out_dir {[type]} -- [description]
Keyword Arguments:
plotting {bool}: whether or not to make annual plots of boolean conflict and conflict fatalities (default: False)
"""

#out_dir
#if not set as keyword argument, then taken from cfg-file
if out_dir==None:
out_dir = config.get('general','output_dir')
else:
out_dir = out_dir
if not os.path.isdir(out_dir):
os.makedirs(out_dir)

print('output directory is', out_dir)
saving_plots (bool): whether or not to save the plot (default: False)
showing_plots (bool): whether or not to show the plot (default: False)
Returns:
dataframe: dataframe containing column with boolean information about conflict for each year
"""

print('determining whether a conflict took place or not')

out_df = extent_gdf.copy()

# each year initialize new column with default value 0 (=False)
out_df['boolean_conflict_' + str(sim_year)] = 0

# select the entries which occurred in this year
temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]

# merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
data_merged = gpd.sjoin(temp_sel_year, out_df)

# determine the aggregated amount of fatalities in one region (e.g. water province)
fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})

# loop through all regions and check if exists in sub-set
# if so, this means that there was conflict and thus assign value 1
for i in range(len(out_df)):
i_watProv = out_df.iloc[i]['watprovID']
if i_watProv in fatalities_per_watProv.index.values:
fats = int(fatalities_per_watProv.loc[i_watProv])
out_df.loc[i, 'boolean_conflict_' + str(sim_year)] = 1

print('...DONE' + os.linesep)

# plotting
fig, ax = plt.subplots(1, 1, figsize=(20,10))
ax.set_title('boolean_conflict_' + str(sim_year))
out_df.plot(ax=ax, column='boolean_conflict_' + str(sim_year), legend=True, categorical=True)
plt.tight_layout()

if saving_plots:
print('saving plots')
else:
print('not saving plots')

# get all years in the dataframe
years = conflict_gdf.year.unique()

# go through all years found
for year in np.sort(years):

# select the entries which occured in this year
temp_sel_year = conflict_gdf.loc[conflict_gdf.year == year]

# merge this selection with the continent data
data_merged = gpd.sjoin(temp_sel_year, continent_gdf, how="inner", op='within')

# per country the annual total fatalities are computed and stored in a separate column
annual_fatalities_sum = pd.merge(continent_gdf,
data_merged['best'].groupby(data_merged['name']).sum().\
to_frame().rename(columns={"best": "best_SUM"}),
on='name')

# if the fatalities exceed 0.0, this entry is assigned a value 1, otherwise 0
annual_fatalities_sum['conflict_bool'] = np.where(annual_fatalities_sum['best_SUM']>0.0, 1, 0)

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(20,10), sharey=True)

annual_fatalities_sum.plot(ax=ax1,column='conflict_bool',
vmin=0,
vmax=2,
categorical=True,
legend=True)

continent_gdf.boundary.plot(ax=ax1,
color='0.5',
linestyle=':')

ax1.set_xlim(continent_gdf.total_bounds[0]-1, continent_gdf.total_bounds[2]+1)
ax1.set_ylim(continent_gdf.total_bounds[1]-1, continent_gdf.total_bounds[3]+1)
ax1.set_title('conflict_bool ' + str(year))

annual_fatalities_sum.plot(ax=ax2, column='best_SUM',
vmin=0,
vmax=1500)

continent_gdf.boundary.plot(ax=ax2,
color='0.5',
linestyle=':')

ax2.set_xlim(continent_gdf.total_bounds[0]-1, continent_gdf.total_bounds[2]+1)
ax2.set_ylim(continent_gdf.total_bounds[1]-1, continent_gdf.total_bounds[3]+1)
ax2.set_title('aggr. fatalities ' + str(year))

fn_out = os.path.join(out_dir, 'plot' + str(year) + '.png')

if saving_plots:
plt.savefig(fn_out, dpi=300)
fn_out = os.path.join(out_dir, 'boolean_conflict_map_' + str(sim_year) + '.png')
plt.savefig(fn_out, dpi=300)

if not showing_plots:
plt.close()
if not showing_plots:
plt.close()

return
return out_df
83 changes: 83 additions & 0 deletions conflict_model/env_vars_nc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import xarray as xr
import rasterio as rio
import pandas as pd
import geopandas as gpd
import rasterstats as rstats
import numpy as np
import matplotlib.pyplot as plt
import os, sys

def rasterstats_GDP_PPP(gdf, config, sim_year, out_dir, saving_plots=False, showing_plots=False):
    """Calculates the mean GDP PPP value per polygon of a geodataframe for one year.

    The nc-file is located by joining the 'input_dir' entry of section 'general' with the
    'GDP_PPP' entry of section 'env_vars'. From its variable 'GDP_per_capita_PPP', the slice
    at time=sim_year is taken and the zonal mean is computed per geometry with rasterstats.

    Arguments:
        gdf {geodataframe}: geodataframe whose geometries are used as aggregation units
        config {configuration}: parsed configuration settings
        sim_year {int}: year for which the data is selected from the nc-file
        out_dir {str}: output directory (currently not used by this function)

    Keyword Arguments:
        saving_plots (bool): currently not used by this function (default: False)
        showing_plots (bool): currently not used by this function (default: False)

    Raises:
        ValueError: if the selection for sim_year contains no values

    Returns:
        list: mean value per geometry, in the row order of gdf
    """

    print('calculating GDP PPP mean per aggregation unit')

    nc_fo = os.path.join(config.get('general', 'input_dir'),
                         config.get('env_vars', 'GDP_PPP'))

    # both handles are opened as context managers so that the file is released again;
    # the values are read into memory before the dataset is closed
    with xr.open_dataset(nc_fo) as nc_ds:
        nc_var = nc_ds['GDP_per_capita_PPP']

        with rio.open(nc_fo) as src:
            affine = src.transform

        nc_arr_vals = nc_var.sel(time=sim_year).values

    if nc_arr_vals.size == 0:
        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

    # zonal_stats returns a list with one dictionary per input geometry, hence the [0]
    list_GDP_PPP = [
        rstats.zonal_stats(geom, nc_arr_vals, affine=affine, stats="mean")[0]['mean']
        for geom in gdf.geometry
    ]

    print('...DONE' + os.linesep)

    return list_GDP_PPP

def rasterstats_totalEvap(gdf_in, config, sim_year, out_dir):
    """Calculates the mean total evaporation per polygon of a geodataframe for one year.

    The nc-file is located by joining the 'input_dir' entry of section 'general' with the
    'evaporation' entry of section 'env_vars'. From its variable 'total_evaporation', the
    slice at time=sim_year is taken and the zonal mean is computed per geometry with rasterstats.
    The input geodataframe is not modified; a copy with one additional column is returned.

    Arguments:
        gdf_in {geodataframe}: geodataframe whose geometries are used as aggregation units
        config {configuration}: parsed configuration settings
        sim_year {int}: year for which the data is selected from the nc-file
        out_dir {str}: output directory (currently not used by this function)

    Raises:
        ValueError: if the selection for sim_year contains no values

    Returns:
        geodataframe: copy of gdf_in with the additional column 'evap_mean_<sim_year>'
    """

    print('calculating evaporation mean per aggregation unit')

    nc_fo = os.path.join(config.get('general', 'input_dir'),
                         config.get('env_vars', 'evaporation'))

    # both handles are opened as context managers so that the file is released again;
    # the values are read into memory before the dataset is closed
    with xr.open_dataset(nc_fo) as nc_ds:
        nc_var = nc_ds['total_evaporation']

        with rio.open(nc_fo) as src:
            affine = src.transform

        nc_arr_vals = nc_var.sel(time=sim_year).values

    if nc_arr_vals.size == 0:
        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

    gdf = gdf_in.copy()

    # zonal_stats returns a list with one dictionary per input geometry, hence the [0]
    evap_means = [
        rstats.zonal_stats(geom, nc_arr_vals, affine=affine, stats="mean")[0]['mean']
        for geom in gdf.geometry
    ]

    # assign by position via the frame's own index: a label-based .loc[i, ...] with a
    # running counter hits the wrong rows (or appends new ones) if the index is not 0..n-1;
    # dtype=float turns a missing mean (None) into NaN
    gdf['evap_mean_' + str(sim_year)] = pd.Series(evap_means, index=gdf.index, dtype=float)

    print('...DONE' + os.linesep)

    return gdf
17 changes: 17 additions & 0 deletions conflict_model/machine_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pandas as pd
import seaborn as sbs
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np
import os, sys

def prepare_data(df, Xvars, Yvar):
    """Extracts the feature matrix and the target vector from a dataframe.

    Arguments:
        df {dataframe}: dataframe containing the columns named in Xvars and Yvar
        Xvars {list}: names of the columns used as features, at least 2
        Yvar {list}: name of the column used as target, exactly 1 (a single string is accepted too)

    Raises:
        ValueError: if fewer than 2 feature variables or not exactly 1 target variable are specified

    Returns:
        array: 2D array with one row per row of df and one column per entry in Xvars
        array: 1D array with the values of the target column
    """

    # a bare string is treated as one column name and not as a sequence of characters
    if isinstance(Xvars, str):
        Xvars = [Xvars]
    if isinstance(Yvar, str):
        Yvar = [Yvar]

    if len(Xvars) < 2:
        raise ValueError('at least 2 variables need to be specified!')
    if len(Yvar) > 1:
        raise ValueError('maximum 1 target variable must be specified!')
    if len(Yvar) < 1:
        raise ValueError('a target variable must be specified!')

    X = df[list(Xvars)].values
    # selecting with a list yields a 2D array of shape (n, 1), which is flattened to 1D
    Y = df[list(Yvar)].values.ravel()

    return X, Y
33 changes: 19 additions & 14 deletions conflict_model/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def select_period(gdf, config):

return gdf

def clip_to_continent(gdf, config):
def clip_to_extent(gdf, config):
"""As the original conflict data has global extent, this function clips the database to those entries which have occured on a specified continent.
Arguments:
Expand All @@ -73,14 +73,18 @@ def clip_to_continent(gdf, config):
geodataframe: geodataframe containing country polygons of selected continent
"""

world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
continent_gdf = world[world["continent"] == config.get('settings', 'continent')]
shp_fo = os.path.join(config.get('general', 'input_dir'),
config.get('extent', 'shp'))

print('clipping dataset to continent', str(config.get('settings', 'continent')) + os.linesep)

gdf = gpd.clip(gdf, continent_gdf)
print('reading extent and spatial aggregation level from file {}'.format(shp_fo))
extent_gdf = gpd.read_file(shp_fo)
print('...DONE' + os.linesep)

print('clipping datasets to extent')
gdf = gpd.clip(gdf, extent_gdf)
print('...DONE' + os.linesep)

return gdf, continent_gdf
return gdf, extent_gdf

def climate_zoning(gdf, config):
"""Only those conflicts falling in certain climate zones may be of interest and this functions keeps only those falling into the specified zones.
Expand Down Expand Up @@ -109,13 +113,14 @@ def climate_zoning(gdf, config):
code_nr = int(code2class['code'].loc[code2class['class'] == entry])
code_nrs.append(code_nr)

print('clipping to climate zones' + os.linesep)
KG_gdf = KG_gdf.loc[KG_gdf['GRIDCODE'].isin(code_nrs)]

if KG_gdf.crs != 'EPSG:4326':
KG_gdf = KG_gdf.to_crs('EPSG:4326')

print('clipping to climate zones{}'.format(look_up_classes))
gdf = gpd.clip(gdf, KG_gdf.buffer(0))
print('...DONE' + os.linesep)

return gdf

Expand All @@ -138,17 +143,17 @@ def select(gdf, config, plotting=False):

gdf = select_period(gdf, config)

gdf, continent_gdf = clip_to_continent(gdf, config)
gdf, extent_gdf = clip_to_extent(gdf, config)

gdf = climate_zoning(gdf, config)

# if specified, plot the result
if plotting:
print('plotting result' + os.linesep)
ax = continent_conflict_gdf.plot(figsize=(10,5), legend=True, label='PRIO/UCDP events')
continent_gdf.boundary.plot(ax=ax, color='0.5', linestyle=':')
ax = gdf.plot(figsize=(10,5), legend=True, label='PRIO/UCDP events')
extent_gdf.boundary.plot(ax=ax, color='0.5', linestyle=':')
plt.legend()
ax.set_xlim(continent_gdf.total_bounds[0]-1, continent_gdf.total_bounds[2]+1)
ax.set_ylim(continent_gdf.total_bounds[1]-1, continent_gdf.total_bounds[3]+1)
ax.set_xlim(extent_gdf.total_bounds[0]-1, extent_gdf.total_bounds[2]+1)
ax.set_ylim(extent_gdf.total_bounds[1]-1, extent_gdf.total_bounds[3]+1)

return gdf, continent_gdf
return gdf, extent_gdf
Loading

0 comments on commit 57843f2

Please sign in to comment.