diff --git a/.vscode/settings.json b/.vscode/settings.json index ef0cd5e..6bd136b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,4 @@ { - "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\python.exe", + "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\envs\\conflict_model\\python.exe", "restructuredtext.confPath": "${workspaceFolder}\\docs" } \ No newline at end of file diff --git a/conflict_model/__init__.py b/conflict_model/__init__.py index 3770c2c..56496f9 100644 --- a/conflict_model/__init__.py +++ b/conflict_model/__init__.py @@ -2,9 +2,9 @@ from . import selection from . import utils -from . import analysis -from . import env_vars_nc +from . import get_boolean_conflict +from . import get_var_from_nc __author__ = """Jannis M. Hoch""" __email__ = 'j.m.hoch@uu.nl' -__version__ = '0.0.1-beta' +__version__ = '0.0.1' diff --git a/conflict_model/analysis.py b/conflict_model/analysis.py deleted file mode 100644 index 6817c39..0000000 --- a/conflict_model/analysis.py +++ /dev/null @@ -1,64 +0,0 @@ -import geopandas as gpd -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import os - -def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year, out_dir, saving_plots=False, showing_plots=False): - """Determines whether conflict took place in a region in one year and, if so, assigns a value of 1 to this region. 
- - Arguments: - conflict_gdf {[type]} -- [description] - extent_gdf {[type]} -- [description] - config {[type]} -- [description] - sim_year {[type]} -- [description] - out_dir {[type]} -- [description] - - Keyword Arguments: - saving_plots (bool): whether or not to save the plot (default: False) - showing_plots (bool): whether or not to show the plot (default: False) - - Returns: - dataframe: dataframe containing column with boolean information about conflict for each year - """ - - print('determining whether a conflict took place or not') - - out_df = extent_gdf.copy() - - # each year initialize new column with default value 0 (=False) - out_df['boolean_conflict_' + str(sim_year)] = 0 - - # select the entries which occured in this year - temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year] - - # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions - data_merged = gpd.sjoin(temp_sel_year, out_df) - - # determine the aggregated amount of fatalities in one region (e.g. 
water province) - fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'}) - - # loop through all regions and check if exists in sub-set - # if so, this means that there was conflict and thus assign value 1 - for i in range(len(out_df)): - i_watProv = out_df.iloc[i]['watprovID'] - if i_watProv in fatalities_per_watProv.index.values: - fats = int(fatalities_per_watProv.loc[i_watProv]) - out_df.loc[i, 'boolean_conflict_' + str(sim_year)] = 1 - - print('...DONE' + os.linesep) - - # plotting - fig, ax = plt.subplots(1, 1, figsize=(20,10)) - ax.set_title('boolean_conflict_' + str(sim_year)) - out_df.plot(ax=ax, column='boolean_conflict_' + str(sim_year), legend=True, categorical=True) - plt.tight_layout() - - if saving_plots: - fn_out = os.path.join(out_dir, 'boolean_conflict_map_' + str(sim_year) + '.png') - plt.savefig(fn_out, dpi=300) - - if not showing_plots: - plt.close() - - return out_df \ No newline at end of file diff --git a/conflict_model/env_vars_nc.py b/conflict_model/env_vars_nc.py deleted file mode 100644 index fd11798..0000000 --- a/conflict_model/env_vars_nc.py +++ /dev/null @@ -1,83 +0,0 @@ -import xarray as xr -import rasterio as rio -import pandas as pd -import geopandas as gpd -import rasterstats as rstats -import numpy as np -import matplotlib.pyplot as plt -import os, sys - -def rasterstats_GDP_PPP(gdf, config, sim_year, out_dir, saving_plots=False, showing_plots=False): - - print('calculating GDP PPP mean per aggregation unit') - - nc_fo = os.path.join(config.get('general', 'input_dir'), - config.get('env_vars', 'GDP_PPP')) - - nc_ds = xr.open_dataset(nc_fo) - - nc_var = nc_ds['GDP_per_capita_PPP'] - - # years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int) - # if sim_year not in years: - # raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo)) - # sim_year_idx = 
int(np.where(years == sim_year)[0]) - - affine = rio.open(nc_fo).transform - - # gdf['zonal_stats_min_' + str(sim_year)] = np.nan - # gdf['zonal_stats_max_' + str(sim_year)] = np.nan - # gdf['GDP_PPP_mean_' + str(sim_year)] = np.nan - - nc_arr = nc_var.sel(time=sim_year) - nc_arr_vals = nc_arr.values - if nc_arr_vals.size == 0: - raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo)) - - list_GDP_PPP = [] - - for i in range(len(gdf)): - prov = gdf.iloc[i] - zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean") - # gdf.loc[i, 'zonal_stats_min_' + str(sim_year)] = zonal_stats[0]['min'] - # gdf.loc[i, 'zonal_stats_max_' + str(sim_year)] = zonal_stats[0]['max'] - list_GDP_PPP.append(zonal_stats[0]['mean']) - - print('...DONE' + os.linesep) - - return list_GDP_PPP - -def rasterstats_totalEvap(gdf_in, config, sim_year, out_dir): - - print('calculating evaporation mean per aggregation unit') - - nc_fo = os.path.join(config.get('general', 'input_dir'), - config.get('env_vars', 'evaporation')) - - nc_ds = xr.open_dataset(nc_fo) - - nc_var = nc_ds['total_evaporation'] - - years = nc_ds['time'].values - years = years[years>=config.getint('settings', 'y_start')] - years = years[years<=config.getint('settings', 'y_end')] - - affine = rio.open(nc_fo).transform - - gdf = gdf_in.copy() - - gdf['evap_mean_' + str(sim_year)] = np.nan - - nc_arr = nc_var.sel(time=sim_year) - nc_arr_vals = nc_arr.values - if nc_arr_vals.size == 0: - raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo)) - - for i in range(len(gdf)): - prov = gdf.iloc[i] - zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean") - gdf.loc[i, 'evap_mean_' + str(sim_year)] = zonal_stats[0]['mean'] - - print('...DONE' + os.linesep) - - return gdf \ No newline at end of file diff --git a/conflict_model/get_boolean_conflict.py 
b/conflict_model/get_boolean_conflict.py new file mode 100644 index 0000000..5de14c6 --- /dev/null +++ b/conflict_model/get_boolean_conflict.py @@ -0,0 +1,49 @@ +import geopandas as gpd +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import os + +def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year): + """Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not. + + Args: + conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data) + extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted + config (config): parsed configuration settings of run + sim_year (int): year for which data is extracted + + Raises: + AssertionError: raised if the length of output list does not match length of input geo-dataframe + + Returns: + list: list containing 0/1 per polygon depending on conflict occurrence + """ + + print('determining whether a conflict took place or not') + + # select the entries which occurred in this year + temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year] + + # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions + data_merged = gpd.sjoin(temp_sel_year, extent_gdf) + + # determine the aggregated amount of fatalities in one region (e.g. 
water province) + fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'}) + + # loop through all regions and check if exists in sub-set + # if so, this means that there was conflict and thus assign value 1 + list_out = [] + for i in range(len(extent_gdf)): + i_watProv = extent_gdf.iloc[i]['watprovID'] + if i_watProv in fatalities_per_watProv.index.values: + list_out.append(1) + else: + list_out.append(0) + + if not len(extent_gdf) == len(list_out): + raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out))) + + print('...DONE' + os.linesep) + + return list_out \ No newline at end of file diff --git a/conflict_model/get_var_from_nc.py b/conflict_model/get_var_from_nc.py new file mode 100644 index 0000000..e7fffb6 --- /dev/null +++ b/conflict_model/get_var_from_nc.py @@ -0,0 +1,130 @@ +import xarray as xr +import rasterio as rio +import pandas as pd +import geopandas as gpd +import rasterstats as rstats +import numpy as np +import matplotlib.pyplot as plt +import os, sys + +def nc_with_integer_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'): + """This function extracts a statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year. + By default, the mean value of all cells within a polygon is computed. + The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning. + + NOTE: + The var_name must be identical to the key in the config-file. + + NOTE: + This function is specifically written for netCDF-files where the time variable contains integer (year-)values, e.g. 1995, 1996, ... + + NOTE: + Works only with nc-files with annual data. 
+ + Args: + extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted + config (config): parsed configuration settings of run + var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file + sim_year (int): year for which data is extracted + stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'. + + Raises: + ValueError: raised if the extracted variable at a time step does not contain data + + Returns: + list: list containing statistical value per polygon, i.e. with same length as extent_gdf + """ + # get path to netCDF-file. + nc_fo = os.path.join(config.get('general', 'input_dir'), + config.get('env_vars', var_name)) + + print('calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year)) + + # open nc-file with xarray as dataset + nc_ds = xr.open_dataset(nc_fo) + # get xarray data-array for specified variable + nc_var = nc_ds[var_name] + + # open nc-file with rasterio to get affine information + affine = rio.open(nc_fo).transform + + # get values from data-array for specified year + nc_arr = nc_var.sel(time=sim_year) + nc_arr_vals = nc_arr.values + if nc_arr_vals.size == 0: + raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo)) + + # initialize output list + list_out = [] + # loop through all polygons in geo-dataframe and compute statistics, then append to output file + for i in range(len(extent_gdf)): + prov = extent_gdf.iloc[i] + zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func) + list_out.append(zonal_stats[0][stat_func]) + + print('...DONE' + os.linesep) + + return list_out + +def nc_with_continous_regular_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'): + """This function extracts a 
statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year. + By default, the mean value of all cells within a polygon is computed. + The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning. + + NOTE: + The var_name must be identical to the key in the config-file. + + NOTE: + Works only with nc-files with annual data. + + Args: + extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted + config (config): parsed configuration settings of run + var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file + sim_year (int): year for which data is extracted + stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'. + + Raises: + ValueError: raised if specified year cannot be found in years in nc-file + ValueError: raised if the extracted variable at a time step does not contain data + + Returns: + list: list containing statistical value per polygon, i.e. with same length as extent_gdf + """ + # get path to netCDF-file. 
+ nc_fo = os.path.join(config.get('general', 'input_dir'), + config.get('env_vars', var_name)) + + print('calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year)) + + # open nc-file with xarray as dataset + nc_ds = xr.open_dataset(nc_fo) + # get xarray data-array for specified variable + nc_var = nc_ds[var_name] + # get years contained in nc-file as integer array to be compatible with sim_year + years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int) + if sim_year not in years: + raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo)) + + # get first index which corresponds with sim_year in years in nc-file (scalar, not a 1-element array) + sim_year_idx = int(np.where(years == sim_year)[0][0]) + # get values from data-array for specified year based on index + nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx]) + nc_arr_vals = nc_arr.values + if nc_arr_vals.size == 0: + raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo)) + + # open nc-file with rasterio to get affine information + affine = rio.open(nc_fo).transform + + # initialize output list + list_out = [] + # loop through all polygons in geo-dataframe and compute statistics, then append to output file + for i in range(len(extent_gdf)): + prov = extent_gdf.iloc[i] + zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func) + list_out.append(zonal_stats[0][stat_func]) + + print('...DONE' + os.linesep) + + return list_out \ No newline at end of file diff --git a/data/run_setting.cfg b/data/run_setting.cfg index 574c0a6..e3214a0 100644 --- a/data/run_setting.cfg +++ b/data/run_setting.cfg @@ -4,7 +4,7 @@ output_dir=C:\Users\hoch0001\Documents\_code\conflict_model\data\OUT [settings] y_start=2000 -y_end=2011 +y_end=2015 [extent] shp=waterProvinces/waterProvinces_Africa.shp @@ -21,5 +21,6 @@ zones=BWh,BSh 
code2class=KoeppenGeiger/classification_codes.txt [env_vars] -GDP_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc -evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc \ No newline at end of file +#variable name here needs to be identical with variable name in nc-file +GDP_per_capita_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc +total_evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc \ No newline at end of file diff --git a/example/example_notebook.html b/example/example_notebook.html index d81d928..552a3fe 100644 --- a/example/example_notebook.html +++ b/example/example_notebook.html @@ -13139,7 +13139,7 @@

Import libraries and file with
Python version: 3.7.7 (default, Apr 15 2020, 05:09:04) [MSC v.1916 64 bit (AMD64)]
-conflict_model version: 0.0.1-beta
+conflict_model version: 0.0.1
 geopandas version: 0.7.0
 xarray version: 0.15.1
 rasterio version: 1.1.0
@@ -13364,140 +13364,6 @@ 

Applying functionsFunctions

- -
-
-
In [9]:
-
-
-
def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year): 
-    
-    print('determining whether a conflict took place or not')
-
-    # each year initialize new column with default value 0 (=False)
-    list_boolConflict = []
-    
-    # select the entries which occured in this year
-    temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]   
-    
-    # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
-    data_merged = gpd.sjoin(temp_sel_year, extent_gdf)
-    
-    # determine the aggregated amount of fatalities in one region (e.g. water province)
-    fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
- 
-    # loop through all regions and check if exists in sub-set
-    # if so, this means that there was conflict and thus assign value 1
-    for i in range(len(extent_gdf)):
-        i_watProv = extent_gdf.iloc[i]['watprovID']
-        if i_watProv in fatalities_per_watProv.index.values:
-            list_boolConflict.append(1)
-        else:
-            list_boolConflict.append(0)
-            
-    if not len(extent_gdf) == len(list_boolConflict):
-        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_boolConflict)))
-    
-    print('...DONE' + os.linesep)
-
-    return list_boolConflict
-
- -
-
-
- -
-
-
-
In [10]:
-
-
-
def rasterstats_GDP_PPP(gdf, config, sim_year):
-
-    print('calculating GDP PPP mean per aggregation unit')
-    
-    nc_fo = os.path.join(config.get('general', 'input_dir'), 
-                         config.get('env_vars', 'GDP_PPP'))
-
-    nc_ds = xr.open_dataset(nc_fo)
-
-    nc_var = nc_ds['GDP_per_capita_PPP']
-
-    affine = rio.open(nc_fo).transform
-
-    nc_arr = nc_var.sel(time=sim_year)
-    nc_arr_vals = nc_arr.values
-    if nc_arr_vals.size == 0:
-        raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
-
-    list_GDP_PPP = []
-    
-    for i in range(len(gdf)):
-        prov = gdf.iloc[i]
-        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean")
-        list_GDP_PPP.append(zonal_stats[0]['mean'])
-
-    print('...DONE' + os.linesep)
-
-    return list_GDP_PPP
-
- -
-
-
- -
-
-
-
In [11]:
-
-
-
def rasterstats_totalEvap(gdf, config, sim_year):
-    
-    nc_fo = os.path.join(config.get('general', 'input_dir'), 
-                         config.get('env_vars', 'evaporation'))
-    
-    print('calculating evaporation mean per aggregation unit from {}'.format(nc_fo))
-
-    nc_ds = xr.open_dataset(nc_fo)
-
-    nc_var = nc_ds.total_evaporation
-
-    years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int)
-
-    if sim_year not in years:
-        raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
-
-    affine = rio.open(nc_fo).transform
-
-    gdf['evap_mean_' + str(sim_year)] = np.nan
-    
-    sim_year_idx = int(np.where(years == sim_year)[0])
-
-    nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx])
-    
-    nc_arr_vals = nc_arr.values
-    
-    if nc_arr_vals.size == 0:
-        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
-
-    list_Evap = []
-    
-    for i in range(len(gdf)):
-        prov = gdf.iloc[i]
-        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean")
-        list_Evap.append(zonal_stats[0]["mean"])
-
-    print('...DONE' + os.linesep)
-
-    return list_Evap
-
- -
-
-
-
@@ -13509,7 +13375,7 @@

Analysis per year
-
In [12]:
+
In [9]:
print('simulation period from', str(config.getint('settings', 'y_start')), 'to', str(config.getint('settings', 'y_end')))
@@ -13524,16 +13390,16 @@ 

Analysis per yearprint('entering year {}'.format(sim_year) + os.linesep) - list_boolConflict = conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year) + list_boolConflict = conflict_model.get_boolean_conflict.conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year) Y = Y.append(pd.Series(list_boolConflict, dtype=int), ignore_index=True) - list_GDP_PPP = rasterstats_GDP_PPP(extent_gdf, config, sim_year) + list_GDP_PPP = conflict_model.get_var_from_nc.nc_with_integer_timestamp(extent_gdf, config, 'GDP_per_capita_PPP', sim_year) X1 = X1.append(pd.Series(list_GDP_PPP), ignore_index=True) if not len(list_GDP_PPP) == len(list_boolConflict): raise AssertionError('length of lists do not match, they are {0} and {1}'.format(len(list_GDP_PPP), len(list_boolConflict))) - list_Evap = rasterstats_totalEvap(extent_gdf, config, sim_year) + list_Evap = conflict_model.get_var_from_nc.nc_with_continous_regular_timestamp(extent_gdf, config, 'total_evaporation', sim_year) X2 = X2.append(pd.Series(list_Evap), ignore_index=True) if not len(list_Evap) == len(list_boolConflict): @@ -13563,7 +13429,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2000
 

@@ -13618,7 +13484,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2001
 

@@ -13673,7 +13539,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2002
 

@@ -13728,7 +13594,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2003
 
@@ -13783,7 +13649,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2004
 
@@ -13838,7 +13704,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2005
 
@@ -13893,7 +13759,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2006
 
@@ -13948,7 +13814,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2007
 
@@ -14003,7 +13869,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2008
 
@@ -14058,7 +13924,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2009
 
@@ -14113,7 +13979,7 @@

Analysis per yearAnalysis per year
...DONE

 
-calculating evaporation mean per aggregation unit from C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+calculating mean total_evaporation per aggregation unit from file C:\Users\hoch0001\Documents\_code\conflict_model\data\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc for year 2010
 
@@ -14182,14 +14048,14 @@

Machine Learning
-

First, create the X (variables) from all columns sampled and Y (target) and concatenate/copy.

+

First, create a pandas dataframe from all variables and targets and kick out rows with missing values (they do not work with ML)

-
In [13]:
+
In [10]:
XY_data = list(zip(X1, X2, Y))
@@ -14225,7 +14091,7 @@ 

Machine Learning
-
In [14]:
+
In [11]:
XY_data
@@ -14241,7 +14107,7 @@ 

Machine Learning -
Out[14]:
+
Out[11]:
@@ -14358,7 +14224,7 @@

Machine Learning
-
In [15]:
+
In [12]:
X = XY_data[['GDP_PPP', 'ET']].to_numpy()
@@ -14375,7 +14241,7 @@ 

Machine Learning -
Out[15]:
+
Out[12]:
@@ -14398,7 +14264,7 @@

Machine Learning
-
In [16]:
+
In [13]:
+
+
+
+

The scatterplot of the (two) variables in X looks like this. Also the sample size n is provided.

+ +
+
-
In [18]:
+
In [15]:
@@ -14505,7 +14371,7 @@

Machine Learning
-
In [19]:
+
In [16]:
preprocessing.scale(X).mean(axis=0), preprocessing.scale(X).std(axis=0)
@@ -14521,7 +14387,7 @@ 

Machine Learning -
Out[19]:
+
Out[16]:
@@ -14553,10 +14419,10 @@

Model

-
In [20]:
+
In [17]:
-
clf = svm.SVC(class_weight='balanced')
+
clf = svm.SVC(class_weight='balanced', C=0.9)
 
@@ -14574,7 +14440,7 @@

Model

-
In [21]:
+
In [18]:
clf.fit(X_train, y_train)
@@ -14590,13 +14456,13 @@ 

Model

-
Out[21]:
+
Out[18]:
-
SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
+
SVC(C=0.9, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False)
@@ -14618,7 +14484,7 @@

Model

-
In [22]:
+
In [19]:
y_pred = clf.predict(X_test)
@@ -14635,13 +14501,13 @@ 

Model

-
Out[22]:
+
Out[19]:
-
array([1, 1, 0, ..., 0, 0, 0])
+
array([1, 0, 0, ..., 0, 1, 0])

@@ -14649,10 +14515,18 @@

Model

+
+
+

No clue right now what this does... look it up!

+ +
+

-
In [23]:
+
In [20]:
y_score = clf.decision_function(X_test)
@@ -14669,14 +14543,14 @@ 

Model

-
Out[23]:
+
Out[20]:
-
array([ 0.30562033,  0.78824461, -1.62267313, ..., -0.46900522,
-       -1.36638482, -1.35364663])
+
array([ 0.71345701, -2.26722865, -1.34963997, ..., -1.12775605,
+        0.6524443 , -0.24453507])

@@ -14704,7 +14578,7 @@

Evaluation
-
In [24]:
+
In [21]:
@@ -14749,7 +14623,7 @@

Evaluation
-
In [25]:
+
In [22]:
average_precision = metrics.average_precision_score(y_test, y_score)
@@ -14782,7 +14656,7 @@ 

Evaluation
-
In [26]:
+
In [23]:
diff --git a/example/example_notebook.ipynb b/example/example_notebook.ipynb index 30e9ff6..c45e3eb 100644 --- a/example/example_notebook.ipynb +++ b/example/example_notebook.ipynb @@ -49,7 +49,7 @@ "output_type": "stream", "text": [ "Python version: 3.7.7 (default, Apr 15 2020, 05:09:04) [MSC v.1916 64 bit (AMD64)]\n", - "conflict_model version: 0.0.1-beta\n", + "conflict_model version: 0.0.1\n", "geopandas version: 0.7.0\n", "xarray version: 0.15.1\n", "rasterio version: 1.1.0\n", @@ -209,128 +209,6 @@ "# Functions" ] }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year): \n", - " \n", - " print('determining whether a conflict took place or not')\n", - "\n", - " # each year initialize new column with default value 0 (=False)\n", - " list_boolConflict = []\n", - " \n", - " # select the entries which occured in this year\n", - " temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year] \n", - " \n", - " # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions\n", - " data_merged = gpd.sjoin(temp_sel_year, extent_gdf)\n", - " \n", - " # determine the aggregated amount of fatalities in one region (e.g. 
water province)\n", - " fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={\"best\": 'total_fatalities'})\n", - " \n", - " # loop through all regions and check if exists in sub-set\n", - " # if so, this means that there was conflict and thus assign value 1\n", - " for i in range(len(extent_gdf)):\n", - " i_watProv = extent_gdf.iloc[i]['watprovID']\n", - " if i_watProv in fatalities_per_watProv.index.values:\n", - " list_boolConflict.append(1)\n", - " else:\n", - " list_boolConflict.append(0)\n", - " \n", - " if not len(extent_gdf) == len(list_boolConflict):\n", - " raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_boolConflict)))\n", - " \n", - " print('...DONE' + os.linesep)\n", - "\n", - " return list_boolConflict" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "def rasterstats_GDP_PPP(gdf, config, sim_year):\n", - "\n", - " print('calculating GDP PPP mean per aggregation unit')\n", - " \n", - " nc_fo = os.path.join(config.get('general', 'input_dir'), \n", - " config.get('env_vars', 'GDP_PPP'))\n", - "\n", - " nc_ds = xr.open_dataset(nc_fo)\n", - "\n", - " nc_var = nc_ds['GDP_per_capita_PPP']\n", - "\n", - " affine = rio.open(nc_fo).transform\n", - "\n", - " nc_arr = nc_var.sel(time=sim_year)\n", - " nc_arr_vals = nc_arr.values\n", - " if nc_arr_vals.size == 0:\n", - " raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))\n", - "\n", - " list_GDP_PPP = []\n", - " \n", - " for i in range(len(gdf)):\n", - " prov = gdf.iloc[i]\n", - " zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=\"mean\")\n", - " list_GDP_PPP.append(zonal_stats[0]['mean'])\n", - "\n", - " print('...DONE' + os.linesep)\n", - "\n", - " return list_GDP_PPP" - ] - }, - { - "cell_type": "code", 
- "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "def rasterstats_totalEvap(gdf, config, sim_year):\n", - " \n", - " nc_fo = os.path.join(config.get('general', 'input_dir'), \n", - " config.get('env_vars', 'evaporation'))\n", - " \n", - " print('calculating evaporation mean per aggregation unit from {}'.format(nc_fo))\n", - "\n", - " nc_ds = xr.open_dataset(nc_fo)\n", - "\n", - " nc_var = nc_ds.total_evaporation\n", - "\n", - " years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int)\n", - "\n", - " if sim_year not in years:\n", - " raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))\n", - "\n", - " affine = rio.open(nc_fo).transform\n", - "\n", - " gdf['evap_mean_' + str(sim_year)] = np.nan\n", - " \n", - " sim_year_idx = int(np.where(years == sim_year)[0])\n", - "\n", - " nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx])\n", - " \n", - " nc_arr_vals = nc_arr.values\n", - " \n", - " if nc_arr_vals.size == 0:\n", - " raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))\n", - "\n", - " list_Evap = []\n", - " \n", - " for i in range(len(gdf)):\n", - " prov = gdf.iloc[i]\n", - " zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=\"mean\")\n", - " list_Evap.append(zonal_stats[0][\"mean\"])\n", - "\n", - " print('...DONE' + os.linesep)\n", - "\n", - " return list_Evap" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -342,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -356,19 +234,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating 
mean GDP_per_capita_PPP per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -385,7 +251,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -407,19 +273,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -436,7 +290,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -458,19 +312,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit 
from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -487,7 +329,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -509,19 +351,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -538,7 +368,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -560,19 +390,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file 
C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -589,7 +407,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -611,19 +429,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -640,7 +446,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -662,19 +468,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file 
C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -691,7 +485,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -713,19 +507,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -742,7 +524,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -764,19 +546,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file 
C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -793,7 +563,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -815,19 +585,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -844,7 +602,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -866,19 +624,7 @@ "determining whether a conflict took place or not\n", "...DONE\r\n", "\n", - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - " ..\n", - "381 0\n", - "382 0\n", - "383 0\n", - "384 0\n", - "385 0\n", - "Length: 386, dtype: int32\n", - "calculating GDP PPP mean per aggregation unit\n" + "calculating mean GDP_per_capita_PPP per aggregation unit from file 
C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc\n" ] }, { @@ -895,7 +641,7 @@ "text": [ "...DONE\r\n", "\n", - "calculating evaporation mean per aggregation unit from C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" + "calculating mean total_evaporation per aggregation unit from file C:\\Users\\hoch0001\\Documents\\_code\\conflict_model\\data\\PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc\n" ] }, { @@ -929,16 +675,16 @@ " \n", " print('entering year {}'.format(sim_year) + os.linesep)\n", " \n", - " list_boolConflict = conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year)\n", + " list_boolConflict = conflict_model.get_boolean_conflict.conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year)\n", " Y = Y.append(pd.Series(list_boolConflict, dtype=int), ignore_index=True)\n", " \n", - " list_GDP_PPP = rasterstats_GDP_PPP(extent_gdf, config, sim_year)\n", + " list_GDP_PPP = conflict_model.get_var_from_nc.nc_with_integer_timestamp(extent_gdf, config, 'GDP_per_capita_PPP', sim_year)\n", " X1 = X1.append(pd.Series(list_GDP_PPP), ignore_index=True)\n", " \n", " if not len(list_GDP_PPP) == len(list_boolConflict):\n", " raise AssertionError('length of lists do not match, they are {0} and {1}'.format(len(list_GDP_PPP), len(list_boolConflict)))\n", " \n", - " list_Evap = rasterstats_totalEvap(extent_gdf, config, sim_year)\n", + " list_Evap = conflict_model.get_var_from_nc.nc_with_continous_regular_timestamp(extent_gdf, config, 'total_evaporation', sim_year)\n", " X2 = X2.append(pd.Series(list_Evap), ignore_index=True)\n", " \n", " if not len(list_Evap) == len(list_boolConflict):\n", @@ -960,12 +706,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, create the X (variables) from all columns sampled and Y (target) and concatenate/copy." 
+ "First, create a pandas dataframe from all variables and targets and kick out rows with missing values (they do not work with ML)" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -987,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1021,31 +767,31 @@ " 0\n", " 2361.934264\n", " 0.042316\n", - " [0]\n", + " 0\n", " \n", " \n", " 1\n", " 3104.051687\n", " 0.040520\n", - " [0]\n", + " 0\n", " \n", " \n", " 2\n", " 1192.025215\n", " 0.039277\n", - " [0]\n", + " 0\n", " \n", " \n", " 3\n", " 1275.859490\n", " 0.025305\n", - " [0]\n", + " 0\n", " \n", " \n", " 4\n", " 1182.202026\n", " 0.036308\n", - " [0]\n", + " 0\n", " \n", " \n", " ...\n", @@ -1057,31 +803,31 @@ " 4241\n", " 3277.156738\n", " 0.060997\n", - " [0]\n", + " 0\n", " \n", " \n", " 4242\n", " 3277.156738\n", " 0.068696\n", - " [0]\n", + " 0\n", " \n", " \n", " 4243\n", " 1381.966901\n", " 0.048329\n", - " [0]\n", + " 0\n", " \n", " \n", " 4244\n", " 1390.211488\n", " 0.052157\n", - " [0]\n", + " 0\n", " \n", " \n", " 4245\n", " 1391.689834\n", " 0.052872\n", - " [0]\n", + " 0\n", " \n", " \n", "\n", @@ -1089,23 +835,23 @@ "
" ], "text/plain": [ - " GDP_PPP ET conflict\n", - "0 2361.934264 0.042316 [0]\n", - "1 3104.051687 0.040520 [0]\n", - "2 1192.025215 0.039277 [0]\n", - "3 1275.859490 0.025305 [0]\n", - "4 1182.202026 0.036308 [0]\n", - "... ... ... ...\n", - "4241 3277.156738 0.060997 [0]\n", - "4242 3277.156738 0.068696 [0]\n", - "4243 1381.966901 0.048329 [0]\n", - "4244 1390.211488 0.052157 [0]\n", - "4245 1391.689834 0.052872 [0]\n", + " GDP_PPP ET conflict\n", + "0 2361.934264 0.042316 0\n", + "1 3104.051687 0.040520 0\n", + "2 1192.025215 0.039277 0\n", + "3 1275.859490 0.025305 0\n", + "4 1182.202026 0.036308 0\n", + "... ... ... ...\n", + "4241 3277.156738 0.060997 0\n", + "4242 3277.156738 0.068696 0\n", + "4243 1381.966901 0.048329 0\n", + "4244 1390.211488 0.052157 0\n", + "4245 1391.689834 0.052872 0\n", "\n", "[4224 rows x 3 columns]" ] }, - "execution_count": 83, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1123,7 +869,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1138,7 +884,7 @@ " [1.39168983e+03, 5.28718745e-02]])" ] }, - "execution_count": 88, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1150,7 +896,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1159,7 +905,7 @@ "array([0, 0, 0, ..., 0, 0, 0])" ] }, - "execution_count": 86, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1169,13 +915,6 @@ "Y" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The scatterplot of the (two) variables in X looks like this. Also the sample size n is provided." 
- ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1185,23 +924,30 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = model_selection.train_test_split(preprocessing.scale(X),\n", " Y,\n", - " test_size=0.5)" + " test_size=0.7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The scatterplot of the (two) variables in X looks like this. Also the sample size n is provided." ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1225,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1234,7 +980,7 @@ "(array([-4.71003707e-17, 6.39219317e-17]), array([1., 1.]))" ] }, - "execution_count": 96, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1261,11 +1007,11 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "clf = svm.SVC(class_weight='balanced')" + "clf = svm.SVC(class_weight='balanced', C=0.9)" ] }, { @@ -1277,19 +1023,19 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,\n", + "SVC(C=0.9, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,\n", " decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',\n", " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", " tol=0.001, verbose=False)" ] }, - "execution_count": 98, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -1307,16 +1053,16 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([0, 0, 0, ..., 0, 0, 1])" + "array([0, 0, 0, ..., 0, 1, 0])" ] }, - "execution_count": 99, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -1326,19 +1072,26 @@ "y_pred" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No clue right now what this does... look it up!" 
+ ] + }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([-2.43595112, -2.90152701, -0.41222322, ..., -0.47675971,\n", - " -0.26396479, 0.03287877])" + "array([-0.60452387, -1.03416277, -0.95833466, ..., -0.6758252 ,\n", + " 1.07185603, -1.04955169])" ] }, - "execution_count": 100, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1368,16 +1121,16 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.6875\n", - "Precision: 0.09971509971509972\n", - "Recall: 0.7142857142857143\n" + "Accuracy: 0.6604666892120392\n", + "Precision: 0.09620721554116558\n", + "Recall: 0.7938931297709924\n" ] } ], @@ -1400,14 +1153,14 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Average precision-recall score: 0.11\n" + "Average precision-recall score: 0.12\n" ] } ], @@ -1419,12 +1172,12 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] diff --git a/scripts/runner.py b/scripts/runner.py index 270a241..1e63f5d 100644 --- a/scripts/runner.py +++ b/scripts/runner.py @@ -6,18 +6,13 @@ from os.path import isdir, dirname, abspath from os import makedirs import geopandas as gpd +import pandas as pd import numpy as np +import seaborn as sbs +from sklearn import svm, preprocessing, model_selection, metrics +import matplotlib.pyplot as plt import os, sys -# ad-hoc functions -def parse_dir(param, path): - try: - path = abspath(path) - if not isdir(path): - os.makedirs(path) - except: - raise click.BadParameter("Couldn't understand or create folder directory for the '{}' argument.".format(param)) - return path @click.group() def cli(): @@ -25,8 +20,9 @@ def cli(): @click.command() @click.argument('cfg',) +@click.option('-so', '--safe-output', default=False, help='whether or not to save output', type=click.BOOL) -def main(cfg, out_dir=None, safe_plots=False): +def main(cfg, safe_output=False): """ Runs the conflict_model from command line with several options and the settings cfg-file as argument. 
@@ -41,27 +37,86 @@ def main(cfg, out_dir=None, safe_plots=False): config = RawConfigParser(allow_no_value=True) config.read(cfg) - #out_dir - out_dir = config.get('general','output_dir') - if not os.path.isdir(out_dir): + if safe_output: + out_dir = config.get('general','output_dir') + if not os.path.isdir(out_dir): os.makedirs(out_dir) - print('for the record, saving output to folder {}'.format(out_dir) + os.linesep) + print('saving output to folder {}'.format(out_dir) + os.linesep) + else: + print('not saving output' + os.linesep) - conflict_gdf = conflict_model.utils.get_geodataframe(config) + gdf = conflict_model.utils.get_geodataframe(config) - selected_conflict_gdf, extent_gdf = conflict_model.selection.select(conflict_gdf, config) + conflict_gdf, extent_gdf = conflict_model.selection.select(gdf, config) - sim_years = np.arange(config.getint('settings', 'y_start'), config.getint('settings', 'y_end'), 1) + print('data retrieval period from', str(config.getint('settings', 'y_start')), 'to', str(config.getint('settings', 'y_end'))) + print('') - print('preps are all done, now entering annual analysis' + os.linesep) + X1 = pd.Series(dtype=float) + X2 = pd.Series(dtype=float) + Y = pd.Series(dtype=int) for sim_year in np.arange(config.getint('settings', 'y_start'), config.getint('settings', 'y_end'), 1): print('entering year {}'.format(sim_year) + os.linesep) - conflict_gdf_perYear, extent_conflict_merged, fatalities_per_waterProvince, extent_waterProvinces_with_boolFatalities = conflict_model.analysis.conflict_in_year_bool(selected_conflict_gdf, extent_gdf, config, sim_year, out_dir, saving_plots=True) + list_boolConflict = conflict_model.get_boolean_conflict.conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year) + Y = Y.append(pd.Series(list_boolConflict, dtype=int), ignore_index=True) + + list_GDP_PPP = conflict_model.get_var_from_nc.nc_with_integer_timestamp(extent_gdf, config, 'GDP_per_capita_PPP', sim_year) + X1 = 
X1.append(pd.Series(list_GDP_PPP), ignore_index=True) + + if not len(list_GDP_PPP) == len(list_boolConflict): + raise AssertionError('length of lists do not match, they are {0} and {1}'.format(len(list_GDP_PPP), len(list_boolConflict))) + + list_Evap = conflict_model.get_var_from_nc.nc_with_continous_regular_timestamp(extent_gdf, config, 'total_evaporation', sim_year) + X2 = X2.append(pd.Series(list_Evap), ignore_index=True) + + if not len(list_Evap) == len(list_boolConflict): + raise AssertionError('length of lists do not match, they are {0} and {1}'.format(len(list_Evap), len(list_boolConflict))) + + print('...all data retrieved' + os.linesep) + + print('preparing data for Machine Learning model' + os.linesep) + XY_data = list(zip(X1, X2, Y)) + XY_data = pd.DataFrame(XY_data, columns=['GDP_PPP', 'ET', 'conflict']) + XY_data = XY_data.dropna() + X = XY_data[['GDP_PPP', 'ET']].to_numpy() + Y = XY_data.conflict.astype(int).to_numpy() + + print('scaling data and splitting into trainings and test samples' + os.linesep) + X_train, X_test, y_train, y_test = model_selection.train_test_split(preprocessing.scale(X), + Y, + test_size=0.7) + + plt.figure(figsize=(10,10)) + sbs.scatterplot(x=X_train[:,0], + y=X_train[:,1], + hue=y_train) + + plt.title('n=' + str(len(X_train))) + if safe_output: + plt.savefig(os.path.join(out_dir, 'scatter_plot.png'), dpi=300) + + print('initializing Support Vector Classification model' + os.linesep) + clf = svm.SVC(class_weight='balanced') + + print('fitting model with trainings data' + os.linesep) + clf.fit(X_train, y_train) + + print('making a prediction' + os.linesep) + y_pred = clf.predict(X_test) + + y_score = clf.decision_function(X_test) + + print('Model evaluation') + print("...Accuracy:", metrics.accuracy_score(y_test, y_pred)) + print("...Precision:", metrics.precision_score(y_test, y_pred)) + print("...Recall:", metrics.recall_score(y_test, y_pred)) + print('...Average precision-recall score: 
{0:0.2f}'.format(metrics.average_precision_score(y_test, y_score))) - GDP_PPP_gdf = conflict_model.env_vars_nc.rasterstats_GDP_PPP(extent_waterProvinces_with_boolFatalities, extent_gdf, config, sim_year, out_dir, saving_plots=True) + disp = metrics.plot_precision_recall_curve(clf, X_test, y_test) + disp.ax_.set_title('2-class Precision-Recall curve: AP={0:0.2f}'.format(metrics.average_precision_score(y_test, y_score))) if __name__ == '__main__': main() \ No newline at end of file