JannisHoch · JannisHoch · Jul 16, 2020 · Jul 16, 2020 · Jul 16, 2020 · Jul 16, 2020
diff --git a/.gitignore b/.gitignore
@@ -128,8 +128,5 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
-# run settings
-*/run_setting.cfg
-
 #output folders
 OUT*/
diff --git a/README.rst b/README.rst
@@ -19,7 +19,7 @@ To install the conflict model, first clone the code from GitHub. It is advised t
 
     $ git clone https://github.com/JannisHoch/conflict_model.git
     $ cd path/to/conflict_model
-    $ conda-env create -f=environment.yml
+    $ conda env create -f environment.yml
     $ conda activate conflict_model
     $ python setup.py develop
 
@@ -38,6 +38,8 @@ To run the example jupyter notebook, follow these instructions
 
 This automatically executes the notebook and converts it to a html-file, also stored in the example folder.
 
+.. note:: It is of course also possible to execute the notebook cell by cell using Jupyter Notebook.
+
 with runner script
 ^^^^^^^^^^^^^^^^^^
 
@@ -48,14 +50,15 @@ To run the model from command line, a command line script is provided. In the mo
     $ cd path/to/conflict_model/scripts
     $ python runner.py path/to/conflict_model/data/run_setting.cfg
 
-.. note:: by default, no output is stored in the current version of the model!
-
-If output is to be stored in an output map, this currently needs to be specified in the runner scipt explictely (-s option).
-By default, output is stored to the output directory specified in the settings-file. Alternatively, this can be provided via command line too (-o option)
+If output is to be stored in an output folder, this currently needs to be specified in the runner script explicitly (-so option).
+By default, output is stored to the output directory specified in the settings-file. Alternatively, this can be provided via command line too (-o option).
+To get some minimal verbose output, use the -v option.
 
 .. code-block:: console
 
-    $ python runner.py -s True -o path/to/output/folder path/to/conflict_model/data/run_setting.cfg
+    $ python runner.py -o path/to/output/folder path/to/conflict_model/data/run_setting.cfg -so -v
+
+.. note:: for convenience, there is a 'run_script.sh' file executing just this command.
 
 For help, try this if you are in the scripts folder:
 

diff --git a/conflict_model/__init__.py b/conflict_model/__init__.py
@@ -4,6 +4,7 @@
 from . import utils
 from . import get_boolean_conflict
 from . import get_var_from_nc
+from . import machine_learning
 
 __author__ = """Jannis M. Hoch"""
 __email__ = '[email protected]'

diff --git a/conflict_model/get_boolean_conflict.py b/conflict_model/get_boolean_conflict.py
@@ -43,7 +43,5 @@ def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year):
 
     if not len(extent_gdf) == len(list_out):
         raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))
-
-    print('...DONE' + os.linesep)
 
     return list_out
diff --git a/conflict_model/get_var_from_nc.py b/conflict_model/get_var_from_nc.py
@@ -62,8 +62,6 @@ def nc_with_float_timestamp(extent_gdf, config, var_name, sim_year, stat_func='m
         zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
         list_out.append(zonal_stats[0][stat_func])
 
-    print('...DONE' + os.linesep)
-
     return list_out
 
 def nc_with_continous_datetime_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'):
@@ -92,7 +90,7 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, var_name, sim_year,
         list: list containing statistical value per polygon, i.e. with same length as extent_gdf
     """   
     # get path to netCDF-file.
-    nc_fo = os.path.join(config.get('general', 'input_dir'), 
+    nc_fo = os.path.join(os.path.abspath(config.get('general', 'input_dir')), 
                          config.get('env_vars', var_name))
 
     print('calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year))
@@ -125,6 +123,4 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, var_name, sim_year,
         zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
         list_out.append(zonal_stats[0][stat_func])
 
-    print('...DONE' + os.linesep)
-
     return list_out
diff --git a/conflict_model/machine_learning.py b/conflict_model/machine_learning.py
@@ -0,0 +1,66 @@
+import os
+from sklearn import svm, neighbors, preprocessing
+
+def define_scaling(config):
+    """Select the scaling algorithm(s) to be used, based on the settings in config.
+
+    Args:
+        config: parsed settings object offering getboolean() and get(); the options general/sensitivity_analysis and machine_learning/scaler are read.
+
+    Raises:
+        ValueError: if sensitivity_analysis is False and scaler is not one of MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer.
+
+    Returns:
+        list: all four supported scalers if sensitivity_analysis is True, otherwise a one-element list holding the selected scaler.
+    """    
+
+    if config.getboolean('general', 'sensitivity_analysis'):
+        scalers = [preprocessing.MinMaxScaler(),
+                   preprocessing.StandardScaler(),
+                   preprocessing.RobustScaler(),
+                   preprocessing.QuantileTransformer(random_state=42)]
+
+    elif not config.getboolean('general', 'sensitivity_analysis'):  # exact complement of the branch above
+        if config.get('machine_learning', 'scaler') == 'MinMaxScaler':
+            scalers = [preprocessing.MinMaxScaler()]
+        elif config.get('machine_learning', 'scaler') == 'StandardScaler':
+            scalers = [preprocessing.StandardScaler()]
+        elif config.get('machine_learning', 'scaler') == 'RobustScaler':
+            scalers = [preprocessing.RobustScaler()]
+        elif config.get('machine_learning', 'scaler') == 'QuantileTransformer':
+            scalers = [preprocessing.QuantileTransformer(random_state=42)]
+        else:
+            raise ValueError('no supported scaling-algorithm selected - choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer')
+
+    print('chosen scaling method is {}'.format(scalers[0]))  # NOTE: only the first list entry is reported, also when all four scalers are returned
+
+    return scalers
+
+def define_model(config):
+    """Select the machine-learning classifier(s) to be used, based on the settings in config. Hyperparameters are hard-coded here.
+
+    Args:
+        config: parsed settings object offering getboolean() and get(); the options general/sensitivity_analysis and machine_learning/model are read.
+
+    Raises:
+        ValueError: if sensitivity_analysis is False and model is neither NuSVC nor KNeighborsClassifier.
+
+    Returns:
+        list: both supported classifiers if sensitivity_analysis is True, otherwise a one-element list holding the selected classifier.
+    """    
+
+    if config.getboolean('general', 'sensitivity_analysis'):
+        clfs = [svm.NuSVC(nu=0.1, kernel='rbf', class_weight={1: 100}, random_state=42, probability=True, degree=10, gamma=10),
+                neighbors.KNeighborsClassifier(n_neighbors=10, weights='distance')]
+
+    elif not config.getboolean('general', 'sensitivity_analysis'):  # exact complement of the branch above
+        if config.get('machine_learning', 'model') == 'NuSVC':
+            clfs = [svm.NuSVC(nu=0.1, kernel='rbf', class_weight={1: 100}, random_state=42, probability=True, degree=10, gamma=10)]
+        elif config.get('machine_learning', 'model') == 'KNeighborsClassifier':
+            clfs = [neighbors.KNeighborsClassifier(n_neighbors=10, weights='distance')]
+        else:
+            raise ValueError('no supported ML model selected - choose between NuSVC or KNeighborsClassifier')
+
+    print('chosen ML model is {}'.format(clfs[0]))  # NOTE: only the first list entry is reported, also when both classifiers are returned
+
+    return clfs
diff --git a/data/run_setting.cfg b/data/run_setting.cfg
@@ -1,6 +1,9 @@
 [general]
 input_dir=../data
 output_dir=../data\OUT
+# if True, all possible scaler and model combinations (see 'machine_learning' section) are run
+# if False, only the selected scaler and model combination is run
+sensitivity_analysis=False
 
 [settings]
 y_start=2000
@@ -21,7 +24,7 @@ zones=BWh,BSh
 code2class=KoeppenGeiger/classification_codes.txt
 
 [env_vars]
-#variable name here needs to be identical with variable name in nc-file
+# variable name here needs to be identical with variable name in nc-file
 GDP_per_capita_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc
 total_evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
 precipitation=PCRGLOBWB/precip/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
@@ -33,4 +36,8 @@ int_grazing=IMAGE/intensityGrazing.nc
 ext_grazing=IMAGE/extensiveGrazing.nc
 
 [machine_learning]
+# choose from: MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
+scaler=QuantileTransformer
+# choose from: NuSVC, KNeighborsClassifier
+model= KNeighborsClassifier
 train_fraction=0.9