Merge pull request #15 from JannisHoch/dev

aligned runner.py with notebook
JannisHoch · Jun 18, 2020 · c35568a · c35568a
2 parents 57843f2 + 350e3b2
commit c35568a
Show file tree

Hide file tree

Showing 10 changed files with 435 additions and 720 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,4 +1,4 @@
 {
-    "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\python.exe",
+    "python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\envs\\conflict_model\\python.exe",
     "restructuredtext.confPath": "${workspaceFolder}\\docs"
 }
diff --git a/conflict_model/__init__.py b/conflict_model/__init__.py
@@ -2,9 +2,9 @@
 
 from . import selection
 from . import utils
-from . import analysis
-from . import env_vars_nc
+from . import get_boolean_conflict
+from . import get_var_from_nc
 
 __author__ = """Jannis M. Hoch"""
 __email__ = '[email protected]'
-__version__ = '0.0.1-beta'
+__version__ = '0.0.1'
diff --git a/conflict_model/analysis.py b/conflict_model/analysis.py
diff --git a/conflict_model/env_vars_nc.py b/conflict_model/env_vars_nc.py
diff --git a/conflict_model/get_boolean_conflict.py b/conflict_model/get_boolean_conflict.py
@@ -0,0 +1,49 @@
+import geopandas as gpd
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+
+def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year): 
+    """Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not.
+
+    Args:
+        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data)
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
+        config (config): parsed configuration settings of run
+        sim_year (int): year for which data is extracted
+
+    Raises:
+        AssertionError: raised if the length of output list does not match length of input geo-dataframe
+
+    Returns:
+        list: list containing 0/1 per polygon depending on conflict occurence
+    """    
+
+    print('determining whether a conflict took place or not')
+
+    # select the entries which occured in this year
+    temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]   
+
+    # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
+    data_merged = gpd.sjoin(temp_sel_year, extent_gdf)
+
+    # determine the aggregated amount of fatalities in one region (e.g. water province)
+    fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
+
+    # loop through all regions and check if exists in sub-set
+    # if so, this means that there was conflict and thus assign value 1
+    list_out = []
+    for i in range(len(extent_gdf)):
+        i_watProv = extent_gdf.iloc[i]['watprovID']
+        if i_watProv in fatalities_per_watProv.index.values:
+            list_out.append(1)
+        else:
+            list_out.append(0)
+
+    if not len(extent_gdf) == len(list_out):
+        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))
+
+    print('...DONE' + os.linesep)
+
+    return list_out
diff --git a/conflict_model/get_var_from_nc.py b/conflict_model/get_var_from_nc.py
@@ -0,0 +1,130 @@
+import xarray as xr
+import rasterio as rio
+import pandas as pd
+import geopandas as gpd
+import rasterstats as rstats
+import numpy as np
+import matplotlib.pyplot as plt
+import os, sys
+
+def nc_with_integer_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'):
+    """This function extracts a statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year.
+    By default, the mean value of all cells within a polygon is computed.
+    The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.
+
+    NOTE:
+    The var_name must be identical to the key in the config-file. 
+
+    NOTE:
+    This function is specifically written for netCDF-files where the time variable contains integer (year-)values, e.g. 1995, 1996, ...
+
+    NOTE:
+    Works only with nc-files with annual data.
+
+    Args:
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
+        config (config): parsed configuration settings of run 
+        var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file
+        sim_year (int): year for which data is extracted
+        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.
+
+    Raises:
+        ValueError: raised if the extracted variable at a time step does not contain data
+
+    Returns:
+        list: list containing statistical value per polygon, i.e. with same length as extent_gdf
+    """   
+    # get path to netCDF-file.
+    nc_fo = os.path.join(config.get('general', 'input_dir'), 
+                         config.get('env_vars', var_name))
+
+    print('calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year))
+
+    # open nc-file with xarray as dataset
+    nc_ds = xr.open_dataset(nc_fo)
+    # get xarray data-array for specified variable
+    nc_var = nc_ds[var_name]
+
+    # open nc-file with rasterio to get affine information
+    affine = rio.open(nc_fo).transform
+
+    # get values from data-array for specified year
+    nc_arr = nc_var.sel(time=sim_year)
+    nc_arr_vals = nc_arr.values
+    if nc_arr_vals.size == 0:
+        raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
+
+    # initialize output list
+    list_out = []
+    # loop through all polygons in geo-dataframe and compute statistics, then append to output file
+    for i in range(len(extent_gdf)):
+        prov = extent_gdf.iloc[i]
+        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
+        list_out.append(zonal_stats[0][stat_func])
+
+    print('...DONE' + os.linesep)
+
+    return list_out
+
+def nc_with_continous_regular_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'):
+    """This function extracts a statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year.
+    By default, the mean value of all cells within a polygon is computed.
+    The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.
+
+    NOTE:
+    The var_name must be identical to the key in the config-file. 
+
+    NOTE:
+    Works only with nc-files with annual data.
+
+    Args:
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
+        config (config): parsed configuration settings of run 
+        var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file
+        sim_year (int): year for which data is extracted
+        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.
+
+    Raises:
+        ValueError: raised if specfied year cannot be found in years in nc-file
+        ValueError: raised if the extracted variable at a time step does not contain data
+
+    Returns:
+        list: list containing statistical value per polygon, i.e. with same length as extent_gdf
+    """   
+    # get path to netCDF-file.
+    nc_fo = os.path.join(config.get('general', 'input_dir'), 
+                         config.get('env_vars', var_name))
+
+    print('calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year))
+
+    # open nc-file with xarray as dataset
+    nc_ds = xr.open_dataset(nc_fo)
+    # get xarray data-array for specified variable
+    nc_var = nc_ds[var_name]
+    # get years contained in nc-file as integer array to be compatible with sim_year
+    years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int)
+    if sim_year not in years:
+        raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
+
+    # get index which corresponds with sim_year in years in nc-file
+    sim_year_idx = int(np.where(years == sim_year)[0])
+    # get values from data-array for specified year based on index
+    nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx])
+    nc_arr_vals = nc_arr.values
+    if nc_arr_vals.size == 0:
+        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
+
+    # open nc-file with rasterio to get affine information
+    affine = rio.open(nc_fo).transform
+
+    # initialize output list
+    list_out = []
+    # loop through all polygons in geo-dataframe and compute statistics, then append to output file
+    for i in range(len(extent_gdf)):
+        prov = extent_gdf.iloc[i]
+        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
+        list_out.append(zonal_stats[0][stat_func])
+
+    print('...DONE' + os.linesep)
+
+    return list_out
diff --git a/data/run_setting.cfg b/data/run_setting.cfg
@@ -4,7 +4,7 @@ output_dir=C:\Users\hoch0001\Documents\_code\conflict_model\data\OUT
 
 [settings]
 y_start=2000
-y_end=2011
+y_end=2015
 
 [extent]
 shp=waterProvinces/waterProvinces_Africa.shp
@@ -21,5 +21,6 @@ zones=BWh,BSh
 code2class=KoeppenGeiger/classification_codes.txt
 
 [env_vars]
-GDP_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc
-evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
+# each variable name (key) here needs to be identical to the name of the variable in the nc-file
+GDP_per_capita_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc
+total_evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc