Merge pull request #12 from JannisHoch/dev

first working version ML-SVC model
JannisHoch · Jun 17, 2020 · 57843f2 · 57843f2
2 parents aa7e9c5 + 3dd30b3
commit 57843f2
Show file tree

Hide file tree

Showing 36 changed files with 3,086 additions and 286 deletions.
diff --git a/.gitignore b/.gitignore
@@ -129,7 +129,7 @@ dmypy.json
 .pyre/
 
 # run settings
-# */run_setting.cfg
+*/run_setting.cfg
 
 #output folders
 OUT*/
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,13 @@
+# Config file for automatic testing at travis-ci.com
+
+language: python
+python:
+  - 3.8
+  - 3.7
+  - 3.6
+
+# Command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install: pip install -U tox-travis
+
+# Command to run tests, e.g. python setup.py test
+script: tox
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,4 +1,4 @@
 {
-    "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\envs\\conflict_model\\python.exe",
+    "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\python.exe",
     "restructuredtext.confPath": "${workspaceFolder}\\docs"
 }
diff --git a/README.rst b/README.rst
@@ -6,6 +6,10 @@ conflict_model
 ----------------
 (Machine learning) model for mapping environmental drivers of conflict risk
 
+.. image:: https://travis-ci.com/JannisHoch/conflict_model.svg?token=BnX1oxxHRbyd1dPyXAp2&branch=dev
+    :target: https://travis-ci.com/JannisHoch/conflict_model
+
+
 installation
 ----------------
 

diff --git a/conflict_model/__init__.py b/conflict_model/__init__.py
@@ -3,6 +3,7 @@
 from . import selection
 from . import utils
 from . import analysis
+from . import env_vars_nc
 
 __author__ = """Jannis M. Hoch"""
 __email__ = '[email protected]'

diff --git a/conflict_model/analysis.py b/conflict_model/analysis.py
@@ -4,89 +4,61 @@
 import matplotlib.pyplot as plt
 import os
 
-def conflict_in_year_bool(conflict_gdf, continent_gdf, config, saving_plots=False, showing_plots=False, out_dir=None):
-    """Determins per year the number of fatalities per country and derivates a boolean value whether conflict has occured in one year in one country or not.
+def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year, out_dir, saving_plots=False, showing_plots=False):
+    """Determines whether conflict took place in a region in one year and, if so, assigns a value of 1 to this region.
 
     Arguments:
-        conflict_gdf {geodataframe}: geodataframe containing final selection of georeferenced conflicts
-        continent_gdf {geodataframe}: geodataframe containing country polygons of selected continent
-        config {configuration}: parsed configuration settings
+        conflict_gdf {geodataframe} -- geodataframe with the conflict entries; the columns 'year' and 'best' are read
+        extent_gdf {geodataframe} -- geodataframe with the polygons of the regions; the column 'watprovID' is read
+        config {configuration} -- parsed configuration settings (not read by the code of this function)
+        sim_year {int, presumably} -- year for which the conflict entries are selected; also used in the name of the new column
+        out_dir {str} -- directory in which the plot is stored if saving_plots is True
 
     Keyword Arguments:
-        plotting {bool}: whether or not to make annual plots of boolean conflict and conflict fatalities (default: False)
-    """    
-
-    #out_dir
-    #if not set as keyword argument, then taken from cfg-file
-    if out_dir==None:
-        out_dir = config.get('general','output_dir')
-    else:
-        out_dir = out_dir
-    if not os.path.isdir(out_dir):
-            os.makedirs(out_dir)
-
-    print('output directory is', out_dir)
+        saving_plots (bool): whether or not to save the plot (default: False)
+        showing_plots (bool): whether or not to show the plot (default: False)
 
+    Returns:
+        dataframe: dataframe containing column with boolean information about conflict for each year
+    """    
+
+    print('determining whether a conflict took place or not')
+
+    out_df = extent_gdf.copy()
+
+    # each year initialize new column with default value 0 (=False)
+    out_df['boolean_conflict_' + str(sim_year)] = 0
+
+    # select the entries which occurred in this year
+    temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]   
+
+    # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
+    data_merged = gpd.sjoin(temp_sel_year, out_df)
+
+    # determine the aggregated amount of fatalities in one region (e.g. water province)
+    fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
+
+    # loop through all regions and check if exists in sub-set
+    # if so, this means that there was conflict and thus assign value 1
+    for i in range(len(out_df)):
+        i_watProv = out_df.iloc[i]['watprovID']
+        if i_watProv in fatalities_per_watProv.index.values:
+            # NOTE(review): fats is not used after this assignment
+            fats = int(fatalities_per_watProv.loc[i_watProv])
+            # NOTE(review): .loc addresses rows by label while i is a position;
+            # this presumably relies on out_df having a default 0..n-1 index -- TODO confirm
+            out_df.loc[i, 'boolean_conflict_' + str(sim_year)] = 1
+
+    print('...DONE' + os.linesep)
+
+    # plotting
+    fig, ax = plt.subplots(1, 1, figsize=(20,10))
+    ax.set_title('boolean_conflict_' + str(sim_year))
+    out_df.plot(ax=ax, column='boolean_conflict_' + str(sim_year), legend=True, categorical=True)
+    plt.tight_layout()
+
     if saving_plots:
-        print('saving plots')
-    else:
-        print('not saving plots')
-
-    # get all years in the dataframe
-    years = conflict_gdf.year.unique()
-
-    # go through all years found
-    for year in np.sort(years):
-
-        # select the entries which occured in this year
-        temp_sel_year = conflict_gdf.loc[conflict_gdf.year == year]
-
-        # merge this selection with the continent data
-        data_merged = gpd.sjoin(temp_sel_year, continent_gdf, how="inner", op='within')
-
-        # per country the annual total fatalities are computed and stored in a separate column
-        annual_fatalities_sum = pd.merge(continent_gdf,
-                                         data_merged['best'].groupby(data_merged['name']).sum().\
-                                         to_frame().rename(columns={"best": "best_SUM"}),
-                                         on='name')
-
-        # if the fatalities exceed 0.0, this entry is assigned a value 1, otherwise 0
-        annual_fatalities_sum['conflict_bool'] = np.where(annual_fatalities_sum['best_SUM']>0.0, 1, 0)
-
-        fig, (ax1, ax2) = plt.subplots(1,2, figsize=(20,10), sharey=True)
-
-        annual_fatalities_sum.plot(ax=ax1,column='conflict_bool',
-                                        vmin=0,
-                                        vmax=2,
-                                        categorical=True,
-                                        legend=True)
-
-        continent_gdf.boundary.plot(ax=ax1,
-                                    color='0.5',
-                                    linestyle=':')
-
-        ax1.set_xlim(continent_gdf.total_bounds[0]-1, continent_gdf.total_bounds[2]+1)
-        ax1.set_ylim(continent_gdf.total_bounds[1]-1, continent_gdf.total_bounds[3]+1)
-        ax1.set_title('conflict_bool ' + str(year))
-
-        annual_fatalities_sum.plot(ax=ax2, column='best_SUM',
-                                        vmin=0,
-                                        vmax=1500)
-
-        continent_gdf.boundary.plot(ax=ax2,
-                                    color='0.5',
-                                    linestyle=':')
-
-        ax2.set_xlim(continent_gdf.total_bounds[0]-1, continent_gdf.total_bounds[2]+1)
-        ax2.set_ylim(continent_gdf.total_bounds[1]-1, continent_gdf.total_bounds[3]+1)
-        ax2.set_title('aggr. fatalities ' + str(year))
-
-        fn_out = os.path.join(out_dir, 'plot' + str(year) + '.png')
-
-        if saving_plots:
-            plt.savefig(fn_out, dpi=300)
+        fn_out = os.path.join(out_dir, 'boolean_conflict_map_' + str(sim_year) + '.png')
+        plt.savefig(fn_out, dpi=300)
 
-        if not showing_plots:
-            plt.close()
+    if not showing_plots:
+        plt.close()
 
-    return 
+    return out_df
diff --git a/conflict_model/env_vars_nc.py b/conflict_model/env_vars_nc.py
@@ -0,0 +1,83 @@
+import xarray as xr
+import rasterio as rio
+import pandas as pd
+import geopandas as gpd
+import rasterstats as rstats
+import numpy as np
+import matplotlib.pyplot as plt
+import os, sys
+
+def rasterstats_GDP_PPP(gdf, config, sim_year, out_dir, saving_plots=False, showing_plots=False):
+    """Computes the mean of the variable 'GDP_per_capita_PPP' per geometry of a geodataframe for one year.
+
+    The nc-file is located by joining the entries ('general', 'input_dir') and ('env_vars', 'GDP_PPP') of the configuration.
+
+    Arguments:
+        gdf {geodataframe} -- geodataframe whose geometries are used as aggregation units
+        config {configuration} -- parsed configuration settings
+        sim_year {int, presumably} -- label selected along the 'time' dimension of the variable
+        out_dir {str} -- not read by the code of this function
+
+    Keyword Arguments:
+        saving_plots (bool): not read by the code of this function (default: False)
+        showing_plots (bool): not read by the code of this function (default: False)
+
+    Raises:
+        ValueError: if the selection for sim_year does not contain any values
+
+    Returns:
+        list: mean value per geometry, in the row order of gdf
+    """
+
+    print('calculating GDP PPP mean per aggregation unit')
+
+    nc_fo = os.path.join(config.get('general', 'input_dir'),
+                         config.get('env_vars', 'GDP_PPP'))
+
+    # context managers close both file handles again, also if the selection fails;
+    # .values loads the data into memory before the dataset is closed
+    with xr.open_dataset(nc_fo) as nc_ds:
+        nc_arr_vals = nc_ds['GDP_per_capita_PPP'].sel(time=sim_year).values
+
+    # the affine transformation is read from the same file via rasterio
+    with rio.open(nc_fo) as nc_rio:
+        affine = nc_rio.transform
+
+    if nc_arr_vals.size == 0:
+        raise ValueError('no data was found for year {} in the nc-file {}, check if all is correct'.format(sim_year, nc_fo))
+
+    # zonal_stats is called per geometry and returns a list with one dictionary, hence the [0]
+    list_GDP_PPP = [rstats.zonal_stats(geom, nc_arr_vals, affine=affine, stats="mean")[0]['mean']
+                    for geom in gdf.geometry]
+
+    print('...DONE' + os.linesep)
+
+    return list_GDP_PPP
+
+def rasterstats_totalEvap(gdf_in, config, sim_year, out_dir):
+    """Computes the mean of the variable 'total_evaporation' per geometry of a geodataframe for one year.
+
+    The nc-file is located by joining the entries ('general', 'input_dir') and ('env_vars', 'evaporation') of the configuration.
+
+    Arguments:
+        gdf_in {geodataframe} -- geodataframe whose geometries are used as aggregation units; it is not modified
+        config {configuration} -- parsed configuration settings
+        sim_year {int, presumably} -- label selected along the 'time' dimension of the variable
+        out_dir {str} -- not read by the code of this function
+
+    Raises:
+        ValueError: if the selection for sim_year does not contain any values
+
+    Returns:
+        geodataframe: copy of gdf_in with the additional column 'evap_mean_<sim_year>'
+    """
+
+    print('calculating evaporation mean per aggregation unit')
+
+    nc_fo = os.path.join(config.get('general', 'input_dir'),
+                         config.get('env_vars', 'evaporation'))
+
+    # context managers close both file handles again, also if the selection fails;
+    # .values loads the data into memory before the dataset is closed
+    with xr.open_dataset(nc_fo) as nc_ds:
+        nc_arr_vals = nc_ds['total_evaporation'].sel(time=sim_year).values
+
+    # the affine transformation is read from the same file via rasterio
+    with rio.open(nc_fo) as nc_rio:
+        affine = nc_rio.transform
+
+    if nc_arr_vals.size == 0:
+        raise ValueError('no data was found for year {} in the nc-file {}, check if all is correct'.format(sim_year, nc_fo))
+
+    # zonal_stats is called per geometry and returns a list with one dictionary, hence the [0]
+    evap_means = [rstats.zonal_stats(geom, nc_arr_vals, affine=affine, stats="mean")[0]['mean']
+                  for geom in gdf_in.geometry]
+
+    gdf = gdf_in.copy()
+
+    # the column is assigned at once and by position, so the result does not depend on
+    # the index labels of gdf_in; dtype=float turns None entries (if any) into NaN
+    gdf['evap_mean_' + str(sim_year)] = np.array(evap_means, dtype=float)
+
+    print('...DONE' + os.linesep)
+
+    return gdf
diff --git a/conflict_model/machine_learning.py b/conflict_model/machine_learning.py
@@ -0,0 +1,17 @@
+import pandas as pd
+import seaborn as sbs
+from sklearn import svm
+import matplotlib.pyplot as plt
+import numpy as np
+import os, sys
+
+def prepare_data(df, Xvars, Yvar):
+    """Extracts the sample array and the target array from a dataframe.
+
+    Arguments:
+        df {dataframe} -- dataframe containing all columns named in Xvars and Yvar
+        Xvars {list} -- names of at least two columns used as variables
+        Yvar {list} -- name of the one column used as target variable; a plain string is accepted as well
+
+    Raises:
+        ValueError: if fewer than 2 variables or more than 1 target variable are specified
+
+    Returns:
+        tuple: array X with one column per entry in Xvars, and 1D-array Y with the values of the target variable
+    """
+
+    # a plain string is one column name and not a sequence of characters
+    if isinstance(Yvar, str):
+        Yvar = [Yvar]
+
+    if len(Xvars) < 2:
+        raise ValueError('at least 2 variables need to be specified!')
+    if len(Yvar) > 1:
+        raise ValueError('maximum 1 target variable must be specified!')
+
+    # selecting with a list keeps X two-dimensional (samples x variables)
+    X = df[list(Xvars)].values
+    # flattened to 1D, which is what the np.append call of the first draft would have produced
+    Y = df[list(Yvar)].values.ravel()
+
+    return X, Y
diff --git a/conflict_model/selection.py b/conflict_model/selection.py
@@ -61,7 +61,7 @@ def select_period(gdf, config):
 
     return gdf
 
-def clip_to_continent(gdf, config):
+def clip_to_extent(gdf, config):
     """As the original conflict data has global extent, this function clips the database to those entries which have occured on a specified continent.
 
     Arguments:
@@ -73,14 +73,18 @@ def clip_to_continent(gdf, config):
         geodataframe: geodataframe containing country polygons of selected continent
     """    
 
-    world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
-    continent_gdf = world[world["continent"] == config.get('settings', 'continent')]
+    shp_fo = os.path.join(config.get('general', 'input_dir'), 
+                          config.get('extent', 'shp'))
 
-    print('clipping dataset to continent', str(config.get('settings', 'continent')) + os.linesep)
-
-    gdf = gpd.clip(gdf, continent_gdf)
+    print('reading extent and spatial aggregation level from file {}'.format(shp_fo))
+    extent_gdf = gpd.read_file(shp_fo)
+    print('...DONE' + os.linesep)
+
+    print('clipping datasets to extent')    
+    gdf = gpd.clip(gdf, extent_gdf)
+    print('...DONE' + os.linesep)
 
-    return gdf, continent_gdf
+    return gdf, extent_gdf
 
 def climate_zoning(gdf, config):
     """Only those conflicts falling in certain climate zones may be of interest and this functions keeps only those falling into the specified zones.
@@ -109,13 +113,14 @@ def climate_zoning(gdf, config):
         code_nr = int(code2class['code'].loc[code2class['class'] == entry])
         code_nrs.append(code_nr)
 
-    print('clipping to climate zones' + os.linesep)
     KG_gdf = KG_gdf.loc[KG_gdf['GRIDCODE'].isin(code_nrs)]
 
     if KG_gdf.crs != 'EPSG:4326':
         KG_gdf = KG_gdf.to_crs('EPSG:4326')
 
+    print('clipping to climate zones{}'.format(look_up_classes))
     gdf = gpd.clip(gdf, KG_gdf.buffer(0))
+    print('...DONE' + os.linesep)
 
     return gdf
 
@@ -138,17 +143,17 @@ def select(gdf, config, plotting=False):
 
     gdf = select_period(gdf, config)
 
-    gdf, continent_gdf = clip_to_continent(gdf, config)
+    gdf, extent_gdf = clip_to_extent(gdf, config)
 
     gdf = climate_zoning(gdf, config)
 
     # if specified, plot the result
     if plotting:
         print('plotting result' + os.linesep)
-        ax = continent_conflict_gdf.plot(figsize=(10,5), legend=True, label='PRIO/UCDP events')
-        continent_gdf.boundary.plot(ax=ax, color='0.5', linestyle=':')
+        ax = gdf.plot(figsize=(10,5), legend=True, label='PRIO/UCDP events')
+        extent_gdf.boundary.plot(ax=ax, color='0.5', linestyle=':')
         plt.legend()
-        ax.set_xlim(continent_gdf.total_bounds[0]-1, continent_gdf.total_bounds[2]+1)
-        ax.set_ylim(continent_gdf.total_bounds[1]-1, continent_gdf.total_bounds[3]+1)
+        ax.set_xlim(extent_gdf.total_bounds[0]-1, extent_gdf.total_bounds[2]+1)
+        ax.set_ylim(extent_gdf.total_bounds[1]-1, extent_gdf.total_bounds[3]+1)
 
-    return gdf, continent_gdf
+    return gdf, extent_gdf