From 2425c336f97b8f123703c6232c044782d6d893aa Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Tue, 28 May 2024 13:43:19 +0200 Subject: [PATCH 1/3] first steps --- copro/selection.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/copro/selection.py b/copro/selection.py index 8a66025..71da5c6 100644 --- a/copro/selection.py +++ b/copro/selection.py @@ -68,26 +68,30 @@ def _filter_conflict_properties( gpd.GeoDataFrame: geo-dataframe containing filtered entries. """ - # create dictionary with all selection criteria - selection_criteria = { - "best": config.getint("conflict", "min_nr_casualties"), - "type_of_violence": (config.get("conflict", "type_of_violence")).rsplit(","), - } + conflict_items = dict(config.items("conflict")) + + # # create dictionary with all selection criteria + # selection_criteria = { + # "best": config.getint("conflict", "min_nr_casualties"), + # "type_of_violence": (config.get("conflict", "type_of_violence")).rsplit(","), + # } click.echo("Filtering based on conflict properties.") # go through all criteria - for key, value in selection_criteria.items(): - - # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold - if key == "best" and value != "": - click.echo(f"Filtering key {key} with lower value {value}.") - gdf = gdf[gdf["best"] >= value] - # for other criteria, select all entries matching the specified value(s) per criterion - if key == "type_of_violence" and value != "": - click.echo(f"Filtering key {key} with value(s) {value}.") - # NOTE: check if this works like this - values = [literal_eval(i) for i in value] - gdf = gdf[gdf[key].isin(values)] + for key, value in conflict_items.items(): + + if key not in ["conflict_file", "predicted_var"]: + + # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold + if key == "best" and value != "": + click.echo(f"Filtering key {key} with lower value {value}.") + gdf = gdf[gdf["best"] >= value] + # for other criteria, select all entries matching the specified value(s) per criterion + if key == "type_of_violence" and value != "": + click.echo(f"Filtering key {key} with value(s) {value}.") + # NOTE: check if this works like this + values = [literal_eval(i) for i in value] + gdf = gdf[gdf[key].isin(values)] return gdf From de683e6b88fe1a0d691aa598648df43251b81da9 Mon Sep 17 00:00:00 2001 From: Jannis Hoch <10956703+JannisHoch@users.noreply.github.com> Date: Thu, 30 May 2024 19:18:45 +0200 Subject: [PATCH 2/3] Switch from cfg to yaml (#194) * parse YAML file * parsing settings from YAML-file works * improved conflict property selection based on YAML-file input * fix _clip_to_extent * reading indicator data works, also with options * adding poly_id and conflict_id * added todos * fixed problem with perm. 
importance and debugged workflow to be compatible with yaml input --- copro/conflict.py | 31 +++--- copro/io.py | 9 +- copro/machine_learning.py | 36 ++++--- copro/models.py | 24 +++-- copro/scripts/copro_runner.py | 31 +++--- copro/selection.py | 82 ++++++++-------- copro/settings.py | 40 ++++---- copro/utils.py | 11 +-- copro/variables.py | 171 +++++++++++----------------------- copro/xydata.py | 139 +++++++++++++-------------- 10 files changed, 252 insertions(+), 322 deletions(-) diff --git a/copro/conflict.py b/copro/conflict.py index a85bb5b..bb80283 100644 --- a/copro/conflict.py +++ b/copro/conflict.py @@ -16,7 +16,8 @@ def conflict_in_year_bool( extent_gdf: gpd.GeoDataFrame, sim_year: int, out_dir: click.Path, - identifier="watprovID", + poly_identifier="watprovID", + conflict_identifier="event_id_cnty", ) -> list: """Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not. @@ -46,17 +47,17 @@ def conflict_in_year_bool( # determine the aggregated amount of fatalities in one region (e.g. water province) fatalities_per_poly = ( - data_merged["best"] - .groupby(data_merged[identifier]) - .sum() + data_merged[conflict_identifier] + .groupby(data_merged[poly_identifier]) + .count() .to_frame() - .rename(columns={"best": "total_fatalities"}) + .rename(columns={conflict_identifier: "total_fatalities"}) ) out_dir = os.path.join(out_dir, "files") Path.mkdir(Path(out_dir), exist_ok=True) - if sim_year == config.getint("settings", "y_end"): + if sim_year == config["general"]["y_end"]: _store_boolean_conflict_data_to_csv( fatalities_per_poly, extent_gdf, sim_year, out_dir ) @@ -65,7 +66,7 @@ def conflict_in_year_bool( # if so, this means that there was conflict and thus assign value 1 list_out = [] for i, _ in extent_gdf.iterrows(): - i_poly = extent_gdf.iloc[i][identifier] + i_poly = extent_gdf.iloc[i][poly_identifier] if i_poly in fatalities_per_poly.index.values: list_out.append(1) else: @@ -80,7 +81,8 @@ def conflict_in_previous_year_bool( sim_year: int, check_neighbors: bool = False, neighboring_matrix: Union[None, pd.DataFrame] = None, - identifier="watprovID", + poly_identifier="watprovID", # TODO: no kwarg, should come from config + conflict_identifier="event_id_cnty", # TODO: no kwarg, should come from config ) -> list: """Creates a list for each timestep with boolean information whether a conflict took place in the previous year in a polygon or not. @@ -104,27 +106,28 @@ def conflict_in_previous_year_bool( else: click.echo("Checking for conflict event in polygon at t-1") + # TODO: screening whether there is any conflict data in sim_year should be done earlier # get conflicts at t-1 - temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year - 1] + temp_sel_year = conflict_gdf[conflict_gdf.year == sim_year - 1] if temp_sel_year.empty: warnings.warn( f"No conflicts were found in sampled conflict data set for year {sim_year - 1}." 
) - # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions data_merged = gpd.sjoin(temp_sel_year, extent_gdf) - conflicts_per_poly = ( - data_merged.id.groupby(data_merged[identifier]) + data_merged[conflict_identifier] + .groupby(data_merged[poly_identifier]) .count() .to_frame() - .rename(columns={"id": "conflict_count"}) + .rename(columns={conflict_identifier: "conflict_count"}) ) + # NOTE: WORKS UNTIL HERE # loop through all polygons list_out = [] for i in range(len(extent_gdf)): - i_poly = extent_gdf[identifier].iloc[i] + i_poly = extent_gdf[poly_identifier].iloc[i] # check if polygon is in list with conflict polygons if i_poly in conflicts_per_poly.index.values: # if so, check if neighboring polygons contain conflict and assign boolean value diff --git a/copro/io.py b/copro/io.py index 719699c..1a9e9e3 100644 --- a/copro/io.py +++ b/copro/io.py @@ -1,20 +1,19 @@ import pandas as pd import numpy as np from typing import Union -from configparser import RawConfigParser from pathlib import Path import os import click def make_and_collect_output_dirs( - config: RawConfigParser, root_dir: click.Path, config_dict: dict + config: dict, root_dir: click.Path, config_dict: dict ) -> dict: - """Creates the output folder at location specfied in cfg-file + """Creates the output folder at location specfied in YAML-file and returns dictionary with config-objects and out-dir per run. Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): dictionary containing the parsed configuration-settings of the model. root_dir (Path): absolute path to location of configurations-file config_dict (dict): dictionary containing config-objects for reference run and all projection. @@ -23,7 +22,7 @@ def make_and_collect_output_dirs( """ # get path to main output directory as specified in cfg-file - out_dir = os.path.join(root_dir, config.get("general", "output_dir")) + out_dir = os.path.join(root_dir, config["general"]["output_dir"]) click.echo(f"Saving output to main output folder {out_dir}.") # initalize list for all out-dirs diff --git a/copro/machine_learning.py b/copro/machine_learning.py index 48e6a0c..8dc5f21 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -2,7 +2,6 @@ import pickle import pandas as pd import numpy as np -from configparser import RawConfigParser from sklearn import ensemble, preprocessing, model_selection, inspection from typing import Union, Tuple import click @@ -12,7 +11,7 @@ class MachineLearning: - def __init__(self, config: RawConfigParser) -> None: + def __init__(self, config: dict) -> None: self.config = config self.scaler = define_scaling(config) self.clf = ensemble.RandomForestClassifier(random_state=42) @@ -49,7 +48,7 @@ def split_scale_train_test_split( X_train, X_test, y_train, y_test = model_selection.train_test_split( X_cs, Y, - test_size=1 - self.config.getfloat("machine_learning", "train_fraction"), + test_size=1 - self.config["machine_learning"]["train_fraction"], ) # for training-set and test-set, split in ID, geometry, and values @@ -77,7 +76,7 @@ def fit_predict( tune_hyperparameters=False, n_jobs=2, verbose=0, - ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """Fits classifier based on training-data and makes predictions. The fitted classifier is dumped to file with pickle to be used again during projections. Makes prediction with test-data including probabilities of those predictions. 
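[Editor's note, not part of the patch] The next hunk returns the permutation importances as a plain array instead of a DataFrame. A minimal sketch (toy dataset, illustrative only) of the scikit-learn shape convention that the transposed array relies on:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance

    X, y = make_classification(n_samples=200, n_features=5, random_state=42)
    clf = RandomForestClassifier(random_state=42).fit(X, y)

    result = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
    # result["importances"] has shape (n_features, n_repeats); transposing gives
    # one row per repeat and one column per feature, so rows from several runs
    # can later be stacked with numpy.vstack and labelled with variable names.
    perm_importances_arr = result["importances"].T
    print(perm_importances_arr.shape)  # (10, 5)
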
@@ -96,7 +95,7 @@ def fit_predict( Returns: np.ndarray: array with the predictions made. np.ndarray: array with probabilities of the predictions made. - pd.DataFrame: dataframe containing permutation importances of variables. + np.ndarray: dataframe containing permutation importances of variables. """ if tune_hyperparameters: @@ -117,11 +116,8 @@ def fit_predict( random_state=42, n_jobs=n_jobs, ) - sorted_importances_idx = perm_importances.importances_mean.argsort() - perm_importances_df = pd.DataFrame( - perm_importances.importances[sorted_importances_idx].T, - # columns=X_train.columns[sorted_importances_idx], - ) + # transpose because by default features are in rows + perm_importances_arr = perm_importances["importances"].T # create folder to store all classifiers with pickle clf_pickle_rep = os.path.join(out_dir, "clfs") @@ -137,16 +133,16 @@ def fit_predict( # make prediction of probability y_prob = fitted_estimator.predict_proba(X_test) - return y_pred, y_prob, perm_importances_df + return y_pred, y_prob, perm_importances_arr -def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]: +def load_clfs(config: dict, out_dir: str) -> list[str]: """Loads the paths to all previously fitted classifiers to a list. Classifiers were saved to file in fit_predict(). With this list, the classifiers can be loaded again during projections. Args: - config (ConfigParser-object): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. out_dir (path): path to output folder. Returns: @@ -155,7 +151,7 @@ def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]: clfs = os.listdir(os.path.join(out_dir, "clfs")) - if len(clfs) != config.getint("machine_learning", "n_runs"): + if len(clfs) != config["machine_learning"]["n_runs"]: raise ValueError( "Number of loaded classifiers does not match the specified number of runs in cfg-file!" ) @@ -188,7 +184,7 @@ def _split_conflict_geom_data( def define_scaling( - config: RawConfigParser, + config: dict, ) -> Union[ preprocessing.MinMaxScaler, preprocessing.StandardScaler, @@ -198,19 +194,19 @@ def define_scaling( """Defines scaling method based on model configurations. Args: - config (ConfigParser-object): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. Returns: scaler: the specified scaling method instance. """ - if config.get("machine_learning", "scaler") == "MinMaxScaler": + if config["machine_learning"]["scaler"] == "MinMaxScaler": scaler = preprocessing.MinMaxScaler() - elif config.get("machine_learning", "scaler") == "StandardScaler": + elif config["machine_learning"]["scaler"] == "StandardScaler": scaler = preprocessing.StandardScaler() - elif config.get("machine_learning", "scaler") == "RobustScaler": + elif config["machine_learning"]["scaler"] == "RobustScaler": scaler = preprocessing.RobustScaler() - elif config.get("machine_learning", "scaler") == "QuantileTransformer": + elif config["machine_learning"]["scaler"] == "QuantileTransformer": scaler = preprocessing.QuantileTransformer(random_state=42) else: raise ValueError( diff --git a/copro/models.py b/copro/models.py index 1ee5766..9cbcb80 100644 --- a/copro/models.py +++ b/copro/models.py @@ -58,7 +58,7 @@ def run( Returns: pd.DataFrame: Prediction dataframes. pd.DataFrame: model output on polygon-basis. - pd.DataFrame: containing permutation importances for all runs. + np.ndarray: containing permutation importances for all runs. 
dict: evaluation dictionary. """ @@ -67,7 +67,7 @@ def run( # - initializing output variables out_X_df = pd.DataFrame() out_y_df = pd.DataFrame() - out_perm_importances_df = pd.DataFrame() + out_perm_importances_arr = np.array([]).reshape(0, self.X.shape[1] - 2) out_dict = evaluation.init_out_dict() click.echo("Training and testing machine learning model") @@ -75,25 +75,23 @@ def run( click.echo(f"Run {n+1} of {number_runs}.") # - run machine learning model and return outputs - X_df, y_df, eval_dict, perm_importances_df_n = self._n_run( + X_df, y_df, eval_dict, perm_importances_arr_n = self._n_run( run_nr=n, tune_hyperparameters=tune_hyperparameters ) # - append per model execution out_X_df = pd.concat([out_X_df, X_df], axis=0, ignore_index=True) out_y_df = pd.concat([out_y_df, y_df], axis=0, ignore_index=True) - out_perm_importances_df = pd.concat( - [out_perm_importances_df, perm_importances_df_n], - axis=0, - ignore_index=True, + out_perm_importances_arr = np.vstack( + [out_perm_importances_arr, perm_importances_arr_n] ) out_dict = evaluation.fill_out_dict(out_dict, eval_dict) - return out_X_df, out_y_df, out_perm_importances_df, out_dict + return out_X_df, out_y_df, out_perm_importances_arr, out_dict def _n_run( self, run_nr: int, tune_hyperparameters=False - ) -> tuple[pd.DataFrame, pd.DataFrame, dict, pd.DataFrame]: + ) -> tuple[pd.DataFrame, pd.DataFrame, dict, np.ndarray]: """Runs workflow per specified number of runs. The model workflow is executed for each classifier. @@ -105,7 +103,7 @@ def _n_run( pd.DataFrame: containing the test-data X-array values. pd.DataFrame: containing model output on polygon-basis. dict: dictionary containing evaluation metrics per simulation. - pd.DataFrame: containing permutation importances for run n. + np.ndarray: containing permutation importances for run n. 
""" MLmodel = machine_learning.MachineLearning( @@ -128,7 +126,7 @@ def _n_run( X_df = pd.DataFrame(X_test) # fit classifier and make prediction with test-set - y_pred, y_prob, perm_importances_df_n = MLmodel.fit_predict( + y_pred, y_prob, perm_importances_arr_n = MLmodel.fit_predict( X_train, y_train, X_test, @@ -149,7 +147,7 @@ def _n_run( X_test_ID, X_test_geom, y_test, y_pred, y_prob_0, y_prob_1 ) - return X_df, y_df, eval_dict, perm_importances_df_n + return X_df, y_df, eval_dict, perm_importances_arr_n def run_prediction( self, @@ -180,7 +178,7 @@ def run_prediction( clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF) # going through each projection specified - for each_key, _ in config_REF.items("PROJ_files"): + for each_key, _ in config_REF.items(): # get config-object and out-dir per projection click.echo(f"Loading config-object for projection run: {each_key}.") diff --git a/copro/scripts/copro_runner.py b/copro/scripts/copro_runner.py index 389bffc..dc64662 100644 --- a/copro/scripts/copro_runner.py +++ b/copro/scripts/copro_runner.py @@ -2,6 +2,7 @@ import click import numpy as np +import pandas as pd import os import warnings @@ -62,17 +63,19 @@ def cli(cfg: click.Path, cores: int, verbose: int): ) # - fit-transform on scaler to be used later during projections - - _, out_y_df, out_perm_importances_df, out_dict = ModelWorkflow.run( - config_REF.getint("machine_learning", "n_runs"), tune_hyperparameters=True + _, out_y_df, out_perm_importances_arr, out_dict = ModelWorkflow.run( + config_REF["machine_learning"]["n_runs"], tune_hyperparameters=True ) # - save output to files - out_perm_importances_df.columns = [ - key - for key in XY_class.XY_dict - if key not in ["poly_ID", "poly_geometry", "conflict"] - ] + out_perm_importances_df = pd.DataFrame( + data=out_perm_importances_arr, + columns=[ + key + for key in XY_class.XY_dict + if key not in ["poly_ID", "poly_geometry", "conflict"] + ], + ) out_perm_importances_df.to_parquet( os.path.join(out_dir_REF, "perm_importances.parquet") ) @@ -84,7 +87,7 @@ def cli(cfg: click.Path, cores: int, verbose: int): click.echo( "Average {} of run with {} repetitions is {:0.3f}".format( key, - config_REF.getint("machine_learning", "n_runs"), + config_REF["machine_learning"]["n_runs"], np.mean(value), ) ) @@ -97,10 +100,10 @@ def cli(cfg: click.Path, cores: int, verbose: int): click.echo(click.style("\nINFO: reference run succesfully finished\n", fg="cyan")) - click.echo(click.style("INFO: starting projections\n", fg="cyan")) - - # - running prediction runs - # TODO: scaler_fitted is now not part of the class - ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf) + if "projections" in config_REF.keys(): + click.echo(click.style("INFO: starting projections\n", fg="cyan")) + # - running prediction runs + # TODO: scaler_fitted is now not part of the class + ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf) click.echo(click.style("\nINFO: all projections succesfully finished\n", fg="cyan")) diff --git a/copro/selection.py b/copro/selection.py index 71da5c6..3a84896 100644 --- a/copro/selection.py +++ b/copro/selection.py @@ -2,14 +2,13 @@ import pandas as pd import os from copro import utils -from configparser import RawConfigParser import click from typing import Tuple -from ast import literal_eval +import warnings def select( - config: RawConfigParser, out_dir: click.Path, root_dir: click.Path + config: dict, out_dir: click.Path, root_dir: click.Path ) -> Tuple[gpd.GeoDataFrame, 
gpd.GeoDataFrame, pd.DataFrame]: """Main function performing the selection procedure. First, selects only conflicts matching specified properties. @@ -17,7 +16,7 @@ def select( Third, retrieves the geometry of all polygons in the spatial extent and assigns IDs. Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. out_dir (Path): path to output folder. root_dir (Path): path to location of cfg-file for reference run. @@ -55,66 +54,71 @@ def select( def _filter_conflict_properties( - gdf: gpd.GeoDataFrame, config: RawConfigParser + gdf: gpd.GeoDataFrame, config: dict ) -> gpd.GeoDataFrame: - """Filters conflict database according to certain conflict properties - such as number of casualties, type of violence or country. + """Filters conflict database according to certain treshold options. + These options are 'values', 'vmin' and 'vmax'. + These options and the conflict properties to which they are applied + need to be specified in the YAML-file. Args: - gdf (gpd.GeoDataFrame): geo-dataframe containing entries with conflicts. - config (RawConfigParser): object containing the parsed configuration-settings of the model. + gdf (gpd.GeoDataFrame): Geodataframe containing entries with conflicts. + config (dict): Parsed configuration-settings of the model. Returns: gpd.GeoDataFrame: geo-dataframe containing filtered entries. """ - conflict_items = dict(config.items("conflict")) + if "thresholds" not in config["data"]["conflict"]: + click.echo("No thresholding options found in configuration file.") + return gdf - # # create dictionary with all selection criteria - # selection_criteria = { - # "best": config.getint("conflict", "min_nr_casualties"), - # "type_of_violence": (config.get("conflict", "type_of_violence")).rsplit(","), - # } - - click.echo("Filtering based on conflict properties.") # go through all criteria - for key, value in conflict_items.items(): - - if key not in ["conflict_file", "predicted_var"]: - - # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold - if key == "best" and value != "": - click.echo(f"Filtering key {key} with lower value {value}.") - gdf = gdf[gdf["best"] >= value] - # for other criteria, select all entries matching the specified value(s) per criterion - if key == "type_of_violence" and value != "": - click.echo(f"Filtering key {key} with value(s) {value}.") - # NOTE: check if this works like this - values = [literal_eval(i) for i in value] - gdf = gdf[gdf[key].isin(values)] + for key, value in config["data"]["conflict"]["thresholds"].items(): + + if key not in gdf.columns: + warnings.warn( + f"{key} is not found in geodataframe columns, will be skipped." + ) + else: + click.echo(f"Tresholding conflict data on {key}.") + for v, k in value.items(): + if v == "values": + click.echo(f"Selecting datapoints with values {k}.") + gdf = gdf[gdf[key].isin(k)] + elif v == "vmin": + click.echo(f"Selecting datapoints greater or equal to {k}.") + gdf = gdf[gdf[key] >= k] + elif v == "vmax": + click.echo(f"Selecting datapoints less or equal to {k}.") + gdf = gdf[gdf[key] <= k] + else: + raise ValueError( + f"{v} is not a recognized tresholding option - use 'values', 'vmin' or 'vmax'." 
+ ) return gdf def _clip_to_extent( - conflict_gdf: gpd.GeoDataFrame, config: RawConfigParser, root_dir: click.Path + conflict_gdf: gpd.GeoDataFrame, config: dict, root_dir: click.Path ) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: """As the original conflict data has global extent, this function clips the database to those entries which have occured on a specified continent. Args: - conflict_gdf (geo-dataframe): geo-dataframe containing entries with conflicts. - config (ConfigParser-object): object containing the parsed configuration-settings of the model. - root_dir (str): path to location of cfg-file. + conflict_gdf (gpd.GeoDataFrame): Geodataframe containing entries with conflicts. + config (dict): Parsed configuration-settings of the model. + root_dir (str): Path to location of cfg-file. Returns: - geo-dataframe: geo-dataframe containing filtered entries. - geo-dataframe: geo-dataframe containing country polygons of selected continent. + gpd.GeoDataFrame: Geodataframe containing filtered entries. + gpd.GeoDataFrame: Geodataframe containing country polygons of selected continent. """ # get path to file with polygons for which analysis is carried out shp_fo = os.path.join( - root_dir, config.get("general", "input_dir"), config.get("extent", "shp") + root_dir, config["general"]["input_dir"], config["data"]["extent"]["file"] ) # read file @@ -126,7 +130,7 @@ def _clip_to_extent( extent_gdf.geometry = extent_gdf.buffer(0) # clip the conflict dataframe to the specified polygons - click.echo("Clipping clipping conflict dataset to extent.") + click.echo("Clipping conflict dataset to extent.") conflict_gdf = gpd.clip(conflict_gdf, extent_gdf) return conflict_gdf, extent_gdf diff --git a/copro/settings.py b/copro/settings.py index f22c3d7..b53d695 100644 --- a/copro/settings.py +++ b/copro/settings.py @@ -6,6 +6,8 @@ from typing import Tuple from copro import utils, io +import yaml + def initiate_setup(settings_file: click.Path) -> Tuple[dict, str]: """Initiates the model set-up. @@ -53,25 +55,25 @@ def initiate_setup(settings_file: click.Path) -> Tuple[dict, str]: return main_dict, root_dir -def _parse_settings(settings_file: click.Path) -> RawConfigParser: - """Reads the model configuration file. +def _parse_settings(settings_file: click.Path) -> dict: + """Reads the model configuration YAML-file and returns contant as dictionary. Args: settings_file (Path): path to settings-file (cfg-file). Returns: - RawConfigParser: parsed model configuration. + dict: parsed model configuration. """ click.echo(f"Parsing settings from file {settings_file}.") - config = RawConfigParser(allow_no_value=True, inline_comment_prefixes="#") - config.optionxform = lambda option: option - config.read(settings_file) + + with open(settings_file, "r") as stream: + config = yaml.safe_load(stream) return config -def _collect_simulation_settings(config: RawConfigParser, root_dir: click.Path) -> dict: +def _collect_simulation_settings(config: dict, root_dir: click.Path) -> dict: """Collects the configuration settings for the reference run and all projection runs. These cfg-files need to be specified one by one in the PROJ_files section of the cfg-file for the reference run. The function returns then a dictionary with the name of the run and the associated config-object. 
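[Editor's note, not part of the patch] For reference, a hypothetical YAML settings file consistent with the keys read throughout this patch could look as follows; the indicator name, file names and all values are placeholders, not taken from the repository:

    import yaml

    example_settings = """
    general:
      input_dir: ./data
      output_dir: ./output
      y_start: 2000
      y_end: 2015
    data:
      conflict:
        file: conflicts.csv
        thresholds:
          best:
            vmin: 1
          type_of_violence:
            values: [1, 2]
      extent:
        file: water_provinces.shp
      indicators:
        precipitation:
          file: precipitation.nc
          log: true
          stat: mean
    machine_learning:
      scaler: MinMaxScaler
      train_fraction: 0.7
      n_runs: 10
    """

    config = yaml.safe_load(example_settings)
    print(config["machine_learning"]["n_runs"])              # 10
    print(config["data"]["conflict"]["thresholds"]["best"])  # {'vmin': 1}
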
@@ -86,7 +88,7 @@ def _collect_simulation_settings(config: RawConfigParser, root_dir: click.Path) config_dict = {'_REF': [config_REF], 'run1': [config_run1], 'run2': [config_run2]} Args: - config (ConfigParser-object): object containing the parsed configuration-settings \ + config (dict): dictionariy containing the parsed configuration-settings \ of the model for the reference run. root_dir (Path): path to location of the cfg-file for the reference run. @@ -99,17 +101,19 @@ def _collect_simulation_settings(config: RawConfigParser, root_dir: click.Path) # first entry is config-object for reference run config_dict["_REF"] = config - # loop through all keys and values in PROJ_files section of reference config-object - for (each_key, each_val) in config.items("PROJ_files"): + if "PROJ_files" in config["general"].keys(): + + # loop through all keys and values in PROJ_files section of reference config-object + for (each_key, each_val) in config.items("PROJ_files"): - # for each value (here representing the cfg-files of the projections), get the absolute path - each_val = os.path.abspath(os.path.join(root_dir, each_val)) + # for each value (here representing the cfg-files of the projections), get the absolute path + each_val = os.path.abspath(os.path.join(root_dir, each_val)) - # parse each config-file specified - each_config = _parse_settings(each_val) + # parse each config-file specified + each_config = _parse_settings(each_val) - # update the output dictionary with key and config-object - config_dict[each_key] = [each_config] + # update the output dictionary with key and config-object + config_dict[each_key] = [each_config] return config_dict @@ -131,8 +135,8 @@ def determine_projection_period( # get all years of projection period projection_period = np.arange( - config_REF.getint("settings", "y_end") + 1, - config_PROJ.getint("settings", "y_proj") + 1, + config_REF["general"]["y_end"] + 1, + config_PROJ["general"]["y_proj"] + 1, 1, ) # convert to list diff --git a/copro/utils.py b/copro/utils.py index fa83f3a..e505579 100644 --- a/copro/utils.py +++ b/copro/utils.py @@ -2,14 +2,13 @@ import pandas as pd import numpy as np import os -from configparser import RawConfigParser from datetime import date import click from copro import __version__, __author__, __email__ def get_conflict_geodataframe( - config: RawConfigParser, + config: dict, root_dir: click.Path, longitude="longitude", latitude="latitude", @@ -18,21 +17,21 @@ def get_conflict_geodataframe( """Converts a csv-file containing geo-referenced conflict data to a geodataframe. Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. root_dir (Path): path to location of cfg-file. longitude (str, optional): column name with longitude coordinates. Defaults to 'longitude'. latitude (str, optional): column name with latitude coordinates. Defaults to 'latitude'. crs (str, optional): coordinate system to be used for georeferencing. Defaults to 'EPSG:4326'. Returns: - geo-dataframe: geo-referenced conflict data. + gpd.GeoDataFrame: geo-referenced conflict data. 
""" # get path to file containing data conflict_fo = os.path.join( root_dir, - config.get("general", "input_dir"), - config.get("conflict", "conflict_file"), + config["general"]["input_dir"], + config["data"]["conflict"]["file"], ) # read file to pandas dataframe diff --git a/copro/variables.py b/copro/variables.py index 0db563e..7dd0c63 100644 --- a/copro/variables.py +++ b/copro/variables.py @@ -7,7 +7,6 @@ import os import math import click -from configparser import RawConfigParser import warnings @@ -16,85 +15,63 @@ def nc_with_float_timestamp( extent_gdf: gpd.GeoDataFrame, - config: RawConfigParser, + config: dict, root_dir: str, var_name: str, sim_year: int, ) -> list: - """This function extracts a value from a netCDF-file (specified in the cfg-file) + """This function extracts a value from a netCDF-file (specified in the yaml-file) for each polygon specified in extent_gdf for a given year. - In the cfg-file, it must also be specified whether the value is log-transformed or not, + In the yaml-file, it must also be specified whether the value is log-transformed or not, and which statistical method is applied. .. note:: - The key in the cfg-file must be identical to variable name in netCDF-file. + The key in the yaml-file must be identical to variable name in netCDF-file. .. note:: Works only with nc-files with annual data. Args: extent_gdf (gpd.GeoDataFrame): One or more polygons with geometry information for which values are extracted. - config (RawConfigParser): parsed configuration settings of run. - root_dir (str): path to location of cfg-file. - var_name (str): name of variable in nc-file. Must be the same as is specified in cfg-file. - sim_year (int): year for which data is extracted. + config (dict): Parsed configuration settings of run. + root_dir (str): Path to location of yaml-file. + var_name (str): Name of variable in nc-file. Must be the same as is specified in yaml-file. + sim_year (int): Year for which data is extracted. Returns: - list: list containing statistical value per polygon, i.e. with same length as extent_gdf. + list: List containing statistical value per polygon, i.e. with same length as extent_gdf. 
""" - # get the filename, True/False whether log-transform shall be applied, and statistical method from cfg-file as list - data_fo = os.path.join( - root_dir, config.get("general", "input_dir"), config.get("data", var_name) - ).rsplit(",") - - # if not all of these three aspects are provided, raise error - if len(data_fo) != 3: - raise ValueError( - "Not all settings for input data set {} provided - \ - it must contain of path, False/True, and statistical method".format( - os.path.join( - root_dir, - config.get("general", "input_dir"), - config.get("data", var_name), - ) - ) - ) - - # if not, split the list into separate variables - nc_fo = data_fo[0] - ln_flag = bool(data_fo[1]) - stat_method = str(data_fo[2]) + nc_fo = os.path.join( + root_dir, + config["general"]["input_dir"], + config["data"]["indicators"][var_name]["file"], + ) + if "log" not in config["data"]["indicators"][var_name].keys(): + ln_flag = False + else: + ln_flag = config["data"]["indicators"][var_name]["log"] + if "stat" not in config["data"]["indicators"][var_name].keys(): + stat_method = "mean" + else: + stat_method = config["data"]["indicators"][var_name]["stat"] LAG_TIME = 1 - click.echo(f"Applying {LAG_TIME} year lag time.") - sim_year = sim_year - LAG_TIME + click.echo(f"\tuse log: {ln_flag}.") + click.echo(f"\tstatistical method: {stat_method}.") + click.echo(f"\tlag time: {LAG_TIME} year(s).") - if ln_flag: - click.echo( - "Calculating log-transformed {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) - else: - click.echo( - "Calculating {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) + sim_year = sim_year - LAG_TIME # open nc-file with xarray as dataset nc_ds = xr.open_dataset(nc_fo) # get xarray data-array for specified variable nc_var = nc_ds[var_name] - if ln_flag: - nc_var = np.log(nc_var) # open nc-file with rasterio to get affine information affine = rio.open(nc_fo).transform # get values from data-array for specified year - nc_arr = nc_var.sel(time=sim_year) - nc_arr_vals = nc_arr.values + nc_arr_vals = nc_var.sel({"time": sim_year}).values if nc_arr_vals.size == 0: raise ValueError( f"No data was found for this year in the nc-file {nc_fo}, check if all is correct." @@ -142,106 +119,66 @@ def nc_with_float_timestamp( def nc_with_continous_datetime_timestamp( extent_gdf: gpd.GeoDataFrame, - config: RawConfigParser, + config: dict, root_dir: str, var_name: str, sim_year: int, ) -> list: - """This function extracts a value from a netCDF-file (specified in the cfg-file) + """This function extracts a value from a netCDF-file (specified in the yaml-file) for each polygon specified in extent_gdf for a given year. - In the cfg-file, it must also be specified whether the value is log-transformed or not, + In the yaml-file, it must also be specified whether the value is log-transformed or not, and which statistical method is applied. .. note:: - The key in the cfg-file must be identical to variable name in netCDF-file. + The key in the yaml-file must be identical to variable name in netCDF-file. .. note:: Works only with nc-files with annual data. Args: extent_gdf (gpd.GeoDataFrame): One or more polygons with geometry information for which values are extracted. - config (RawConfigParser): parsed configuration settings of run. - root_dir (str): path to location of cfg-file. - var_name (str): name of variable in nc-file. Must be the same as in the cfg-file. 
- sim_year (int): year for which data is extracted. + config (config): Parsed configuration settings of run. + root_dir (str): Path to location of yaml-file. + var_name (str): Name of variable in nc-file. Must be the same as in the yaml-file. + sim_year (int): Year for which data is extracted. Returns: - list: list containing statistical value per polygon, i.e. with same length as extent_gdf. + list: List containing statistical value per polygon, i.e. with same length as extent_gdf. """ - # get the filename, True/False whether log-transform shall be applied, and statistical method from cfg-file as list - data_fo = os.path.join( - root_dir, config.get("general", "input_dir"), config.get("data", var_name) - ).rsplit(",") - - # if not all of these three aspects are provided, raise error - if len(data_fo) != 3: - raise ValueError( - "Not all settings for input data set {} provided - \ - it must contain of path, False/True, and statistical method".format( - os.path.join( - root_dir, - config.get("general", "input_dir"), - config.get("data", var_name), - ) - ) - ) - - # if not, split the list into separate variables - nc_fo = data_fo[0] - ln_flag = bool(data_fo[1]) - stat_method = str(data_fo[2]) - - LAG_TIME = 1 - click.echo(f"Applying {LAG_TIME} year lag time for variable {var_name}.") - sim_year = sim_year - LAG_TIME + nc_fo = os.path.join( + root_dir, + config["general"]["input_dir"], + config["data"]["indicators"][var_name]["file"], + ) - if ln_flag: - click.echo( - "Calculating log-transformed {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) + if "log" not in config["data"]["indicators"][var_name].keys(): + ln_flag = False else: - click.echo( - "Calculating {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) + ln_flag = config["data"]["indicators"][var_name]["log"] + if "stat" not in config["data"]["indicators"][var_name].keys(): + stat_method = "mean" + else: + stat_method = config["data"]["indicators"][var_name]["stat"] + LAG_TIME = 1 + click.echo(f"\tuse log: {ln_flag}.") + click.echo(f"\tstatistical method: {stat_method}.") + click.echo(f"\tlag time: {LAG_TIME} year(s).") + sim_year = sim_year - LAG_TIME # open nc-file with xarray as dataset nc_ds = xr.open_dataset(nc_fo) # get xarray data-array for specified variable nc_var = nc_ds[var_name] - # get years contained in nc-file as integer array to be compatible with sim_year - years = ( - pd.to_datetime(nc_ds.time.values) - .to_period(freq="Y") - .strftime("%Y") - .to_numpy(dtype=int) - ) - if sim_year not in years: - warnings.warn( - f"The simulation year {sim_year} can not be found in file {nc_fo}." 
- ) - warnings.warn( - "Using the next following year instead (yes that is an ugly solution...)" - ) - sim_year = sim_year + 1 - # raise ValueError('ERROR: the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo)) - # get index which corresponds with sim_year in years in nc-file - sim_year_idx = int(np.where(years == sim_year)[0]) # get values from data-array for specified year based on index - nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx]) - nc_arr_vals = nc_arr.values + nc_arr_vals = nc_var.sel({"time": pd.to_datetime(sim_year, format="%Y")}).values if nc_arr_vals.size == 0: raise ValueError( "No data was found for this year in the nc-file {}, check if all is correct".format( nc_fo ) ) - # open nc-file with rasterio to get affine information affine = rio.open(nc_fo).transform diff --git a/copro/xydata.py b/copro/xydata.py index 330adc5..a35327a 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -1,5 +1,4 @@ from copro import conflict, variables, nb, utils -from configparser import RawConfigParser from typing import Tuple import click import numpy as np @@ -10,7 +9,7 @@ class XYData: - def __init__(self, config: RawConfigParser): + def __init__(self, config: dict): self.XY_dict = {} self.__XY_dict_initiated__ = False self.config = config @@ -26,8 +25,8 @@ def _initiate_XY_data(self): # some entries are set by default, besides the ones corresponding to input data variables self.XY_dict["poly_ID"] = pd.Series() self.XY_dict["poly_geometry"] = pd.Series() - for key in self.config.items("data"): - self.XY_dict[str(key[0])] = pd.Series(dtype=float) + for key in self.config["data"]["indicators"]: + self.XY_dict[key] = pd.Series(dtype=float) self.XY_dict["conflict_t_min_1"] = pd.Series(dtype=bool) self.XY_dict["conflict_t_min_1_nb"] = pd.Series(dtype=float) self.XY_dict["conflict"] = pd.Series(dtype=bool) @@ -44,45 +43,33 @@ def create_XY( root_dir: click.Path, polygon_gdf: gpd.GeoDataFrame, conflict_gdf: gpd.GeoDataFrame, - ) -> Tuple[np.array, np.array]: + ) -> Tuple[np.ndarray, np.ndarray]: """Top-level function to create the X-array and Y-array. If the XY-data was pre-computed and specified in cfg-file, the data is loaded. If not, variable values and conflict data are read from file and stored in array. The resulting array is by default saved as npy-format to file. Args: - config (ConfigParser-object): object containing the parsed configuration-settings of the model. out_dir (str): path to output folder. root_dir (str): path to location of cfg-file. polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons. conflict_gdf (geo-dataframe): geo-dataframe containing the selected conflicts. Returns: - array: X-array containing variable values. - array: Y-array containing conflict data. + np.ndarray: X-array containing variable values. + np.ndarray: Y-array containing conflict data. """ - # if nothing is specified in cfg-file, then initiate and fill XY data from scratch - if self.config.get("pre_calc", "XY") != " ": - self._initiate_XY_data() - # fill the dictionary and get array - XY_arr = _fill_XY( - self.XY_dict, self.config, root_dir, conflict_gdf, polygon_gdf, out_dir - ) - # save array to XY.npy out_dir - click.echo( - f"Saving XY data by default to file {os.path.join(out_dir, 'XY.npy')}." - ) - np.save(os.path.join(out_dir, "XY"), XY_arr) - # if path to XY.npy is specified, read the data intead - else: - click.echo( - f"Loading XY data from file {os.path.join(root_dir, self.config.get('pre_calc', 'XY'))}." 
- ) - XY_arr = np.load( - os.path.join(root_dir, self.config.get("pre_calc", "XY")), - allow_pickle=True, - ) + self._initiate_XY_data() + # fill the dictionary and get array + XY_arr = _fill_XY( + self.XY_dict, self.config, root_dir, conflict_gdf, polygon_gdf, out_dir + ) + # save array to XY.npy out_dir + click.echo( + f"Saving XY data by default to file {os.path.join(out_dir, 'XY.npy')}." + ) + np.save(os.path.join(out_dir, "XY"), XY_arr) # split the XY data into sample data X and target values Y X, Y = _split_XY_data(XY_arr) @@ -90,41 +77,41 @@ def create_XY( return X, Y -def initiate_X_data(config: RawConfigParser) -> dict: - """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data. - This is needed for each time step of each projection run. - By default, the first column is for the polygon ID and the second for polygon geometry. - The penultimate column is for boolean information about conflict at t-1 - while the last column is for boolean information about conflict at t-1 in neighboring polygons. - All remaining columns correspond to the variables provided in the cfg-file. +# def initiate_X_data(config: RawConfigParser) -> dict: +# """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data. +# This is needed for each time step of each projection run. +# By default, the first column is for the polygon ID and the second for polygon geometry. +# The penultimate column is for boolean information about conflict at t-1 +# while the last column is for boolean information about conflict at t-1 in neighboring polygons. +# All remaining columns correspond to the variables provided in the cfg-file. - Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. +# Args: +# config (RawConfigParser): object containing the parsed configuration-settings of the model. - Returns: - dict: emtpy dictionary to be filled, containing keys for each variable (X) plus meta-data. - """ +# Returns: +# dict: emtpy dictionary to be filled, containing keys for each variable (X) plus meta-data. +# """ - # Initialize dictionary - # some entries are set by default, besides the ones corresponding to input data variables - X = {} - X["poly_ID"] = pd.Series() - X["poly_geometry"] = pd.Series() - for key in config.items("data"): - X[str(key[0])] = pd.Series(dtype=float) - X["conflict_t_min_1"] = pd.Series(dtype=bool) - X["conflict_t_min_1_nb"] = pd.Series(dtype=float) +# # Initialize dictionary +# # some entries are set by default, besides the ones corresponding to input data variables +# X = {} +# X["poly_ID"] = pd.Series() +# X["poly_geometry"] = pd.Series() +# for key in config.items("data"): +# X[str(key[0])] = pd.Series(dtype=float) +# X["conflict_t_min_1"] = pd.Series(dtype=bool) +# X["conflict_t_min_1_nb"] = pd.Series(dtype=float) - click.echo("The columns in the sample matrix used are:") - for key in X: - click.echo(f"...{key}") +# click.echo("The columns in the sample matrix used are:") +# for key in X: +# click.echo(f"...{key}") - return X +# return X def fill_X_sample( X: dict, - config: RawConfigParser, + config: dict, root_dir: str, polygon_gdf: gpd.GeoDataFrame, proj_year: int, @@ -136,7 +123,7 @@ def fill_X_sample( Args: X (dict): dictionary containing keys to be sampled. - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. root_dir (str): path to location of cfg-file of reference run. 
polygon_gdf (gpd.GeoDataFrame): geo-dataframe containing the selected polygons. proj_year (int): year for which projection is made. @@ -173,9 +160,9 @@ def fill_X_sample( nc_ds = xr.open_dataset( os.path.join( root_dir, - config.get("general", "input_dir"), - config.get("data", key), - ).rsplit(",")[0] + config["general"]["input_dir"], + config["data"]["indicators"][key]["file"], + ) ) if (np.dtype(nc_ds.time) == np.float32) or ( @@ -205,8 +192,8 @@ def fill_X_sample( "This file has an unsupported dtype for the time variable: {}".format( os.path.join( root_dir, - config.get("general", "input_dir"), - config.get("data", key), + config["general"]["input_dir"], + config["data"]["indicators"][key]["file"], ) ) ) @@ -264,7 +251,7 @@ def fill_X_conflict( def _fill_XY( # noqa: R0912 XY: dict, - config: RawConfigParser, + config: dict, root_dir: click.Path, conflict_data: gpd.GeoDataFrame, polygon_gdf: gpd.GeoDataFrame, @@ -276,19 +263,19 @@ def _fill_XY( # noqa: R0912 Args: XY (dict): initiated, i.e. empty, XY-dictionary - config (ConfigParser-object): object containing the parsed configuration-settings of the model. - root_dir (str): path to location of cfg-file. - conflict_data (geo-dataframe): geo-dataframe containing the selected conflicts. - polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons. - out_dir (path): path to output folder. + config (dict): Parsed configuration-settings of the model. + root_dir (str): Path to location of cfg-file. + conflict_data (gpd.GeoDataFrame): Geodataframe containing the selected conflicts. + polygon_gdf (gpd.GeoDataFrame): Geodataframe containing the selected polygons. + out_dir (path): Path to output folder. Returns: - array: filled array containing the variable values (X) and binary conflict data (Y) plus meta-data. + np.ndarray: Filled array containing the variable values (X) and binary conflict data (Y) plus meta-data. 
""" # go through all simulation years as specified in config-file model_period = np.arange( - config.getint("settings", "y_start"), config.getint("settings", "y_end") + 1, 1 + config["general"]["y_start"], config["general"]["y_end"] + 1, 1 ) click.echo(f"Reading data for period from {model_period[0]} to {model_period[-1]}.") @@ -297,7 +284,7 @@ def _fill_XY( # noqa: R0912 for (sim_year, i) in zip(model_period, range(len(model_period))): if i == 0: - click.echo(f"Skipping first year {sim_year} to start up model") + click.echo(f"Skipping first year {sim_year} to start up model.") else: click.echo(f"Entering year {sim_year}.") # go through all keys in dictionary @@ -360,13 +347,13 @@ def _fill_XY( # noqa: R0912 else: - nc_ds = xr.open_dataset( - os.path.join( - root_dir, - config.get("general", "input_dir"), - config.get("data", key), - ).rsplit(",")[0] + nc_fo = os.path.join( + root_dir, + config["general"]["input_dir"], + config["data"]["indicators"][key]["file"], ) + click.echo(f"Reading data for indicator {key} from {nc_fo}.") + nc_ds = xr.open_dataset(nc_fo) if (np.dtype(nc_ds.time) == np.float32) or ( np.dtype(nc_ds.time) == np.float64 From 8219d766b08e0649dc38df3993aa12208ff5493a Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Sat, 24 Aug 2024 21:03:47 +0200 Subject: [PATCH 3/3] add KFold to GridSearchCV --- copro/machine_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/copro/machine_learning.py b/copro/machine_learning.py index 8dc5f21..6bf8641 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -6,7 +6,7 @@ from typing import Union, Tuple import click from pathlib import Path -from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import GridSearchCV, KFold from sklearn.ensemble import RandomForestClassifier @@ -304,7 +304,7 @@ def apply_gridsearchCV( grid_search = GridSearchCV( estimator=estimator, param_grid=param_grid, - cv=5, + cv=KFold(n_splits=5, shuffle=True, random_state=42), n_jobs=n_jobs, verbose=verbose, scoring="roc_auc",