diff --git a/.vscode/settings.json b/.vscode/settings.json index 6bd136b..ef0cd5e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,4 @@ { - "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\envs\\conflict_model\\python.exe", + "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\python.exe", "restructuredtext.confPath": "${workspaceFolder}\\docs" } \ No newline at end of file diff --git a/conflict_model/analysis.py b/conflict_model/analysis.py index e03a47b..6817c39 100644 --- a/conflict_model/analysis.py +++ b/conflict_model/analysis.py @@ -22,7 +22,7 @@ def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year, out_dir, s dataframe: dataframe containing column with boolean information about conflict for each year """ - print('determining whether a conflict took place or not...') + print('determining whether a conflict took place or not') out_df = extent_gdf.copy() diff --git a/conflict_model/env_vars_nc.py b/conflict_model/env_vars_nc.py index 06b2561..fd11798 100644 --- a/conflict_model/env_vars_nc.py +++ b/conflict_model/env_vars_nc.py @@ -1,14 +1,15 @@ import xarray as xr import rasterio as rio +import pandas as pd import geopandas as gpd import rasterstats as rstats import numpy as np import matplotlib.pyplot as plt import os, sys -def rasterstats_GDP_PPP(gdf_in, config, sim_year, out_dir, saving_plots=False, showing_plots=False): +def rasterstats_GDP_PPP(gdf, config, sim_year, out_dir, saving_plots=False, showing_plots=False): - print('calculating zonal statistics per aggregation unit') + print('calculating GDP PPP mean per aggregation unit') nc_fo = os.path.join(config.get('general', 'input_dir'), config.get('env_vars', 'GDP_PPP')) @@ -17,6 +18,46 @@ def rasterstats_GDP_PPP(gdf_in, config, sim_year, out_dir, saving_plots=False, s nc_var = nc_ds['GDP_per_capita_PPP'] + # years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int) + # if sim_year not in years: + # raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo)) + # sim_year_idx = int(np.where(years == sim_year)[0]) + + affine = rio.open(nc_fo).transform + + # gdf['zonal_stats_min_' + str(sim_year)] = np.nan + # gdf['zonal_stats_max_' + str(sim_year)] = np.nan + # gdf['GDP_PPP_mean_' + str(sim_year)] = np.nan + + nc_arr = nc_var.sel(time=sim_year) + nc_arr_vals = nc_arr.values + if nc_arr_vals.size == 0: + raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo)) + + list_GDP_PPP = [] + + for i in range(len(gdf)): + prov = gdf.iloc[i] + zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean") + # gdf.loc[i, 'zonal_stats_min_' + str(sim_year)] = zonal_stats[0]['min'] + # gdf.loc[i, 'zonal_stats_max_' + str(sim_year)] = zonal_stats[0]['max'] + list_GDP_PPP.append(zonal_stats[0]['mean']) + + print('...DONE' + os.linesep) + + return list_GDP_PPP + +def rasterstats_totalEvap(gdf_in, config, sim_year, out_dir): + + print('calculating evaporation mean per aggregation unit') + + nc_fo = os.path.join(config.get('general', 'input_dir'), + config.get('env_vars', 'evaporation')) + + nc_ds = xr.open_dataset(nc_fo) + + nc_var = nc_ds['total_evaporation'] + years = nc_ds['time'].values years = years[years>=config.getint('settings', 'y_start')] years = years[years<=config.getint('settings', 'y_end')] @@ -25,9 +66,7 @@ def rasterstats_GDP_PPP(gdf_in, config, sim_year, out_dir, saving_plots=False, s gdf = gdf_in.copy() - gdf['zonal_stats_min_' + str(sim_year)] = np.nan - gdf['zonal_stats_max_' + str(sim_year)] = np.nan - gdf['zonal_stats_mean_' + str(sim_year)] = np.nan + gdf['evap_mean_' + str(sim_year)] = np.nan nc_arr = nc_var.sel(time=sim_year) nc_arr_vals = nc_arr.values @@ -36,68 +75,9 @@ def rasterstats_GDP_PPP(gdf_in, config, sim_year, out_dir, saving_plots=False, s for i in range(len(gdf)): prov = gdf.iloc[i] - zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean min max") - gdf.loc[i, 'zonal_stats_min_' + str(sim_year)] = zonal_stats[0]['min'] - gdf.loc[i, 'zonal_stats_max_' + str(sim_year)] = zonal_stats[0]['max'] - gdf.loc[i, 'zonal_stats_mean_' + str(sim_year)] = zonal_stats[0]['mean'] + zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean") + gdf.loc[i, 'evap_mean_' + str(sim_year)] = zonal_stats[0]['mean'] print('...DONE' + os.linesep) - fig, axes = plt.subplots(1, 3 , figsize=(20, 10)) - - fig.suptitle(str(int(sim_year)), y=0.78) - - gdf.plot(ax=axes[0], - column='zonal_stats_min_' + str(sim_year), - vmin=2000, - vmax=15000, - legend=True, - legend_kwds={'label': "min GDP_PPP", - 'orientation': "vertical", - 'shrink': 0.5, - 'extend': 'both'}) - gdf.boundary.plot(ax=axes[0], - color='0.5', - linestyle=':', - label='water province borders') - - gdf.plot(ax=axes[1], - column='zonal_stats_max_' + str(sim_year), - vmin=2000, - vmax=15000, - legend=True, - legend_kwds={'label': "max GDP_PPP", - 'orientation': "vertical", - 'shrink': 0.5, - 'extend': 'both'}) - gdf.boundary.plot(ax=axes[1], - color='0.5', - linestyle=':', - label='water province borders') - - gdf.plot(ax=axes[2], - column='zonal_stats_mean_' + str(sim_year), - vmin=2000, - vmax=15000, - legend=True, - legend_kwds={'label': "mean GDP_PPP", - 'orientation': "vertical", - 'shrink': 0.5, - 'extend': 'both'}) - gdf.boundary.plot(ax=axes[2], - color='0.5', - linestyle=':', - label='water province borders') - - plt.tight_layout() - - plt_name = 'GDP_PPP_zonal_stats_' + str(int(sim_year)) + '.png' - plt_name = os.path.join(out_dir, plt_name) - - if saving_plots: - plt.savefig(plt_name, dpi=300) - - if showing_plots == False: - plt.close() - return gdf \ No newline at end of file diff --git a/data/run_setting.cfg b/data/run_setting.cfg index 52c9f14..574c0a6 100644 --- a/data/run_setting.cfg +++ b/data/run_setting.cfg @@ -21,4 +21,5 @@ zones=BWh,BSh code2class=KoeppenGeiger/classification_codes.txt [env_vars] -GDP_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc \ No newline at end of file +GDP_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc +evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc \ No newline at end of file diff --git a/example/example_notebook.html b/example/example_notebook.html index 40243ef..d81d928 100644 --- a/example/example_notebook.html +++ b/example/example_notebook.html @@ -13102,6 +13102,7 @@
def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year):
+
+ print('determining whether a conflict took place or not')
+
+ # each year initialize new column with default value 0 (=False)
+ list_boolConflict = []
+
+ # select the entries which occured in this year
+ temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]
+
+ # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
+ data_merged = gpd.sjoin(temp_sel_year, extent_gdf)
+
+ # determine the aggregated amount of fatalities in one region (e.g. water province)
+ fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
+
+ # loop through all regions and check if exists in sub-set
+ # if so, this means that there was conflict and thus assign value 1
+ for i in range(len(extent_gdf)):
+ i_watProv = extent_gdf.iloc[i]['watprovID']
+ if i_watProv in fatalities_per_watProv.index.values:
+ list_boolConflict.append(1)
+ else:
+ list_boolConflict.append(0)
+
+ if not len(extent_gdf) == len(list_boolConflict):
+ raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_boolConflict)))
+
+ print('...DONE' + os.linesep)
+
+ return list_boolConflict
+
def rasterstats_GDP_PPP(gdf, config, sim_year):
+
+ print('calculating GDP PPP mean per aggregation unit')
+
+ nc_fo = os.path.join(config.get('general', 'input_dir'),
+ config.get('env_vars', 'GDP_PPP'))
+
+ nc_ds = xr.open_dataset(nc_fo)
+
+ nc_var = nc_ds['GDP_per_capita_PPP']
+
+ affine = rio.open(nc_fo).transform
+
+ nc_arr = nc_var.sel(time=sim_year)
+ nc_arr_vals = nc_arr.values
+ if nc_arr_vals.size == 0:
+ raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
+
+ list_GDP_PPP = []
+
+ for i in range(len(gdf)):
+ prov = gdf.iloc[i]
+ zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean")
+ list_GDP_PPP.append(zonal_stats[0]['mean'])
+
+ print('...DONE' + os.linesep)
+
+ return list_GDP_PPP
+
def rasterstats_totalEvap(gdf, config, sim_year):
+
+ nc_fo = os.path.join(config.get('general', 'input_dir'),
+ config.get('env_vars', 'evaporation'))
+
+ print('calculating evaporation mean per aggregation unit from {}'.format(nc_fo))
+
+ nc_ds = xr.open_dataset(nc_fo)
+
+ nc_var = nc_ds.total_evaporation
+
+ years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int)
+
+ if sim_year not in years:
+ raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
+
+ affine = rio.open(nc_fo).transform
+
+ gdf['evap_mean_' + str(sim_year)] = np.nan
+
+ sim_year_idx = int(np.where(years == sim_year)[0])
+
+ nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx])
+
+ nc_arr_vals = nc_arr.values
+
+ if nc_arr_vals.size == 0:
+ raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
+
+ list_Evap = []
+
+ for i in range(len(gdf)):
+ prov = gdf.iloc[i]
+ zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats="mean")
+ list_Evap.append(zonal_stats[0]["mean"])
+
+ print('...DONE' + os.linesep)
+
+ return list_Evap
+
print('simulation period from', str(config.getint('settings', 'y_start')), 'to', str(config.getint('settings', 'y_end')))
print('')
-X1 = pd.DataFrame()
-X2 = pd.DataFrame()
-Y = pd.DataFrame() # []
+X1 = pd.Series(dtype=float)
+X2 = pd.Series(dtype=float)
+Y = pd.Series(dtype=int) # not bool, because otherwise 0 is converted to False and 1 to True but we need 0/1
# go through all simulation years as specified in config-file
for sim_year in np.arange(config.getint('settings', 'y_start'), config.getint('settings', 'y_end'), 1):
print('entering year {}'.format(sim_year) + os.linesep)
- # add column whether there was conflict/non-conflict in one year in one region
- out_df = conflict_model.analysis.conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year, out_dir, saving_plots=True)
+ list_boolConflict = conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year)
+ Y = Y.append(pd.Series(list_boolConflict, dtype=int), ignore_index=True)
- # add column with zonal statistics of GDP per year per region
- out_df = conflict_model.env_vars_nc.rasterstats_GDP_PPP(out_df, config, sim_year, out_dir, saving_plots=True)
+ list_GDP_PPP = rasterstats_GDP_PPP(extent_gdf, config, sim_year)
+ X1 = X1.append(pd.Series(list_GDP_PPP), ignore_index=True)
- # drop all rows with at least one MVs since sklearn does not like NaNs
- out_df = out_df.dropna()
+ if not len(list_GDP_PPP) == len(list_boolConflict):
+ raise AssertionError('length of lists do not match, they are {0} and {1}'.format(len(list_GDP_PPP), len(list_boolConflict)))
- print(len(X1), len(X2), len(Y))
+ list_Evap = rasterstats_totalEvap(extent_gdf, config, sim_year)
+ X2 = X2.append(pd.Series(list_Evap), ignore_index=True)
- # create arrays with input variables X and target variable Y
- X1 = pd.concat([X1, out_df['zonal_stats_min_' + str(sim_year)]])
- X2 = pd.concat([X2, out_df['zonal_stats_max_' + str(sim_year)]])
- Y = pd.concat([Y, out_df['boolean_conflict_' + str(sim_year)]])
+ if not len(list_Evap) == len(list_boolConflict):
+ raise AssertionError('length of lists do not match, they are {0} and {1}'.format(len(list_Evap), len(list_boolConflict)))
- extent_gdf = out_df.copy()
-
print('...simulation DONE')
...DONE -0 0 0 entering year 2001 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++
...DONE -384 384 384 entering year 2002 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++
...DONE -766 766 766 entering year 2003 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -1146 1146 1146 entering year 2004 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -1524 1524 1524 entering year 2005 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -1900 1900 1900 entering year 2006 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -2274 2274 2274 entering year 2007 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -2646 2646 2646 entering year 2008 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -3016 3016 3016 entering year 2009 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -3384 3384 3384 entering year 2010 -determining whether a conflict took place or not... +determining whether a conflict took place or not ...DONE -calculating zonal statistics per aggregation unit +calculating GDP PPP mean per aggregation unit ++ + + +
...DONE -3750 3750 3750 ...simulation DONE@@ -13793,75 +14189,220 @@
X_df = pd.concat([X1,
- X2], axis=1)
-
-Y_df = Y.copy()
+XY_data = list(zip(X1, X2, Y))
+XY_data = pd.DataFrame(XY_data, columns=['GDP_PPP', 'ET', 'conflict'])
+print(len(XY_data))
+XY_data = XY_data.dropna()
+print(len(XY_data))
Then, convert them to numpy arrays
X = X_df.to_numpy()
+XY_data
Then, convert them to numpy arrays
+ +Y = Y_df.to_numpy()
+X = XY_data[['GDP_PPP', 'ET']].to_numpy()
+X
The scatterplot of the (two) variables in X looks like this. Also the sample size n is provided.
plt.figure(figsize=(10,10))
-sbs.scatterplot(x=X[:,0],
- y=X[:,1],
- hue=Y[:,0])
-
-plt.title('n=' + str(len(X1)))
-plt.savefig(os.path.join(out_dir, 'scatter_plot.png'), dpi=300)
-plt.show()
+Y = XY_data.conflict.astype(int).to_numpy()
+Y
The scatterplot of the (two) variables in X looks like this. Also the sample size n is provided.
+ +X_train, X_test, y_train, y_test = model_selection.train_test_split(preprocessing.scale(X),
- Y[:,0],
+ Y,
test_size=0.5)
plt.figure(figsize=(10,10))
+sbs.scatterplot(x=X_train[:,0],
+ y=X_train[:,1],
+ hue=y_train)
+
+plt.title('n=' + str(len(X_train)))
+plt.savefig(os.path.join(out_dir, 'scatter_plot.png'), dpi=300)
+plt.show()
+
preprocessing.scale(X).mean(axis=0), preprocessing.scale(X).std(axis=0)
@@ -13932,13 +14521,13 @@ Machine Learning
- Out[15]:
+ Out[19]:
clf = svm.SVC(class_weight='balanced')
@@ -13985,7 +14574,7 @@ Model¶
-In [17]:
+In [21]:
clf.fit(X_train, y_train)
@@ -14001,7 +14590,7 @@ Model¶
- Out[17]:
+ Out[21]:
@@ -14029,7 +14618,7 @@ Model¶
-In [18]:
+In [22]:
y_pred = clf.predict(X_test)
@@ -14046,13 +14635,13 @@ Model¶
- Out[18]:
+ Out[22]:
@@ -14063,7 +14652,7 @@ Model¶
-In [19]:
+In [23]:
y_score = clf.decision_function(X_test)
@@ -14080,14 +14669,14 @@ Model¶
- Out[19]:
+ Out[23]:
@@ -14115,7 +14704,7 @@ Evaluation¶
-In [20]:
+In [24]: