From cb8349cd383a11bea5757f047dfc9b91b3ae864b Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Fri, 2 Feb 2024 16:04:35 +0100
Subject: [PATCH 01/13] cleaning up evaluation.py

---
 copro/evaluation.py | 377 ++++++++++++++++++++++++--------------------
 1 file changed, 202 insertions(+), 175 deletions(-)

diff --git a/copro/evaluation.py b/copro/evaluation.py
index a30d843..8da3db0 100644
--- a/copro/evaluation.py
+++ b/copro/evaluation.py
@@ -1,68 +1,51 @@
-import os, sys
+import os
 import click
-from sklearn import metrics, inspection
+from sklearn import metrics, inspection, ensemble
+from configparser import RawConfigParser
 import pandas as pd
 import geopandas as gpd
 import numpy as np
+from typing import Union, Tuple
 
-def init_out_dict():
-    """Initiates the main model evaluatoin dictionary for a range of model metric scores. 
+
+def init_out_dict(scores: Union[list[str], None] = None) -> dict:
+    """Initiates the main model evaluatoin dictionary for a range of model metric scores.
     The scores should match the scores used in the dictioary created in 'evaluation.evaluate_prediction()'.
 
+    Args:
+        scores (list, None): list containing metric scores. If 'None', a default list is used. Defaults to 'None'.
+
     Returns:
         dict: empty dictionary with metrics as keys.
-    """    
-
-    scores = ['Accuracy', 'Precision', 'Recall', 'F1 score', 'Cohen-Kappa score', 'Brier loss score', 'ROC AUC score', 'AP score']
+    """
+
+    if scores is None:
+        scores = [
+            "Accuracy",
+            "Precision",
+            "Recall",
+            "F1 score",
+            "Cohen-Kappa score",
+            "Brier loss score",
+            "ROC AUC score",
+            "AP score",
+        ]
 
     # initialize empty dictionary with one emtpy list per score
     out_dict = {}
     for score in scores:
-        out_dict[score] = list()
+        out_dict[score] = []
 
     return out_dict
 
-def evaluate_prediction(y_test, y_pred, y_prob, X_test, clf, config):
-    """Computes a range of model evaluation metrics and appends the resulting scores to a dictionary.
-    This is done for each model execution separately.
-    Output will be stored to stderr if possible.
-
-    Args:
-        y_test (list): list containing test-sample conflict data.
-        y_pred (list): list containing predictions.
-        y_prob (array): array resulting probabilties of predictions.
-        X_test (array): array containing test-sample variable values.
-        clf (classifier): sklearn-classifier used in the simulation.
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-
-    Returns:
-        dict: dictionary with scores for each simulation
-    """  
-
-    if config.getboolean('general', 'verbose'):
-        click.echo("... Accuracy: {0:0.3f}".format(metrics.accuracy_score(y_test, y_pred)), err=True)
-        click.echo("... Precision: {0:0.3f}".format(metrics.precision_score(y_test, y_pred)), err=True)
-        click.echo("... Recall: {0:0.3f}".format(metrics.recall_score(y_test, y_pred)), err=True)
-        click.echo('... F1 score: {0:0.3f}'.format(metrics.f1_score(y_test, y_pred)), err=True)
-        click.echo('... Brier loss score: {0:0.3f}'.format(metrics.brier_score_loss(y_test, y_prob[:, 1])), err=True)
-        click.echo('... Cohen-Kappa score: {0:0.3f}'.format(metrics.cohen_kappa_score(y_test, y_pred)), err=True)
-        click.echo('... ROC AUC score {0:0.3f}'.format(metrics.roc_auc_score(y_test, y_prob[:, 1])), err=True)
-        click.echo('... AP score {0:0.3f}'.format(metrics.average_precision_score(y_test, y_prob[:, 1])), err=True)
-
-    # compute value per evaluation metric and assign to list
-    eval_dict = {'Accuracy': metrics.accuracy_score(y_test, y_pred),
-                 'Precision': metrics.precision_score(y_test, y_pred),
-                 'Recall': metrics.recall_score(y_test, y_pred),
-                 'F1 score': metrics.f1_score(y_test, y_pred),
-                 'Cohen-Kappa score': metrics.cohen_kappa_score(y_test, y_pred),
-                 'Brier loss score': metrics.brier_score_loss(y_test, y_prob[:, 1]),
-                 'ROC AUC score': metrics.roc_auc_score(y_test, y_prob[:, 1]),
-                 'AP score': metrics.average_precision_score(y_test, y_prob[:, 1]),
-                }
 
-    return eval_dict
-
-def fill_out_dict(out_dict, eval_dict):
+def fill_out_dict(
+    out_dict: dict,
+    y_test: list[int],
+    y_pred: list[int],
+    y_prob: list[float],
+    config: RawConfigParser,
+):
     """Appends the computed metric score per run to the main output dictionary.
     All metrics are initialized in init_out_dict().
 
@@ -72,147 +55,110 @@ def fill_out_dict(out_dict, eval_dict):
 
     Returns:
         dict: dictionary with collected scores for each simulation
-    """    
+    """
+
+    eval_dict = _evaluate_prediction(y_test, y_pred, y_prob, config)
 
     for key in out_dict:
         out_dict[key].append(eval_dict[key])
 
     return out_dict
 
-def init_out_df():
-    """Initiates and empty main output dataframe.
 
-    Returns:
-        dataframe: empty dataframe.
-    """    
-
-    return pd.DataFrame()
-
-def fill_out_df(out_df, y_df):
+def fill_out_df(out_df: pd.DataFrame, y_df: pd.DataFrame) -> pd.DataFrame:
     """Appends output dataframe of each simulation to main output dataframe.
 
+    ..note::
+        Is a separate function needed for this?
+
     Args:
         out_df (dataframe): main output dataframe.
         y_df (dataframe): output dataframe of each simulation.
 
     Returns:
         dataframe: main output dataframe containing results of all simulations.
-    """    
+    """
 
     out_df = out_df.append(y_df, ignore_index=True)
 
     return out_df
 
-def polygon_model_accuracy(df, global_df, make_proj=False):
+
+def polygon_model_accuracy(
+    df: pd.DataFrame, global_df: pd.DataFrame, make_proj=False
+) -> Tuple[pd.DataFrame, gpd.GeoDataFrame]:
     """Determines a range of model accuracy values for each polygon.
     Reduces dataframe with results from each simulation to values per unique polygon identifier.
-    Determines the total number of predictions made per polygon as well as fraction of correct predictions made for overall and conflict-only data.
+    Determines the total number of predictions made per polygon 
+    as well as fraction of correct predictions made for overall and conflict-only data.
 
     Args:
         df (dataframe): output dataframe containing results of all simulations.
         global_df (dataframe): global look-up dataframe to associate unique identifier with geometry.
-        make_proj (bool, optional): whether or not this function is used to make a projection. If True, a couple of calculations are skipped as no observed data is available for projections. Defaults to 'False'.
+        make_proj (bool, optional): whether or not this function is used to make a projection. \
+            If True, a couple of calculations are skipped as no observed data is available for projections. \
+                Defaults to 'False'.
 
     Returns:
         (geo-)dataframe: dataframe and geo-dataframe with data per polygon.
-    """    
+    """
 
-    #- create a dataframe containing the number of occurence per ID
-    ID_count = df.ID.value_counts().to_frame().rename(columns={'ID':'nr_predictions'})
-    #- add column containing the IDs
-    ID_count['ID'] = ID_count.index.values
-    #- set index with index named ID now
+    # - create a dataframe containing the number of occurence per ID
+    ID_count = df.ID.value_counts().to_frame().rename(columns={"ID": "nr_predictions"})
+    # - add column containing the IDs
+    ID_count["ID"] = ID_count.index.values
+    # - set index with index named ID now
     ID_count.set_index(ID_count.ID, inplace=True)
-    #- remove column ID
-    ID_count = ID_count.drop('ID', axis=1)
+    # - remove column ID
+    ID_count = ID_count.drop("ID", axis=1)
 
     df_count = pd.DataFrame()
-    
-    #- per polygon ID, compute sum of overall correct predictions and rename column name
-    if not make_proj: df_count['nr_correct_predictions'] = df.correct_pred.groupby(df.ID).sum()
 
-    #- per polygon ID, compute sum of all conflict data points and add to dataframe
-    if not make_proj: df_count['nr_observed_conflicts'] = df.y_test.groupby(df.ID).sum()
+    # - per polygon ID, compute sum of overall correct predictions and rename column name
+    if not make_proj:
+        df_count["nr_correct_predictions"] = df.correct_pred.groupby(df.ID).sum()
 
-    #- per polygon ID, compute sum of all conflict data points and add to dataframe
-    df_count['nr_predicted_conflicts'] = df.y_pred.groupby(df.ID).sum()
+    # - per polygon ID, compute sum of all conflict data points and add to dataframe
+    if not make_proj:
+        df_count["nr_observed_conflicts"] = df.y_test.groupby(df.ID).sum()
 
-    #- per polygon ID, compute average probability that conflict occurs
-    df_count['min_prob_1'] = pd.to_numeric(df.y_prob_1).groupby(df.ID).min()
-    df_count['probability_of_conflict'] = pd.to_numeric(df.y_prob_1).groupby(df.ID).mean()
-    df_count['max_prob_1'] = pd.to_numeric(df.y_prob_1).groupby(df.ID).max()
+    # - per polygon ID, compute sum of all conflict data points and add to dataframe
+    df_count["nr_predicted_conflicts"] = df.y_pred.groupby(df.ID).sum()
 
-    #- merge the two dataframes with ID as key
-    df_temp = pd.merge(ID_count, df_count, on='ID')
+    # - per polygon ID, compute average probability that conflict occurs
+    df_count["min_prob_1"] = pd.to_numeric(df.y_prob_1).groupby(df.ID).min()
+    df_count["probability_of_conflict"] = (
+        pd.to_numeric(df.y_prob_1).groupby(df.ID).mean()
+    )
+    df_count["max_prob_1"] = pd.to_numeric(df.y_prob_1).groupby(df.ID).max()
 
-    #- compute average correct prediction rate by dividing sum of correct predictions with number of all predicionts
-    if not make_proj: df_temp['fraction_correct_predictions'] = df_temp.nr_correct_predictions / df_temp.nr_predictions
+    # - merge the two dataframes with ID as key
+    df_temp = pd.merge(ID_count, df_count, on="ID")
 
-    #- compute average correct prediction rate by dividing sum of correct predictions with number of all predicionts
-    df_temp['chance_of_conflict'] = df_temp.nr_predicted_conflicts / df_temp.nr_predictions
+    # - compute average correct prediction rate by dividing sum of correct predictions with number of all predicionts
+    if not make_proj:
+        df_temp["fraction_correct_predictions"] = (
+            df_temp.nr_correct_predictions / df_temp.nr_predictions
+        )
 
-    #- merge with global dataframe containing IDs and geometry, and keep only those polygons occuring in test sample
-    df_hit = pd.merge(df_temp, global_df, on='ID', how='left')
+    # - compute average correct prediction rate by dividing sum of correct predictions with number of all predicionts
+    df_temp["chance_of_conflict"] = (
+        df_temp.nr_predicted_conflicts / df_temp.nr_predictions
+    )
 
+    # - merge with global dataframe containing IDs and geometry, and keep only those polygons occuring in test sample
+    df_hit = pd.merge(df_temp, global_df, on="ID", how="left")
     # #- convert to geodataframe
     gdf_hit = gpd.GeoDataFrame(df_hit, geometry=df_hit.geometry)
 
     return df_hit, gdf_hit
 
-def init_out_ROC_curve():
-    """Initiates empty lists for range of variables needed to plot ROC-curve per simulation.
-
-    Returns:
-        lists: empty lists for variables.
-    """    
-
-    tprs = []
-    aucs = []
-    mean_fpr = np.linspace(0, 1, 100)
-
-    return tprs, aucs, mean_fpr
-
-def save_out_ROC_curve(tprs, aucs, out_dir):
-    """Saves data needed to plot mean ROC and standard deviation to csv-files. 
-    They can be loaded again with pandas in a post-processing step.
-
-    Args:
-        tprs (list): list with false positive rates.
-        aucs (list): list with area-under-curve values.
-        out_dir (str):  path to output folder. If 'None', no output is stored.
-    """    
-
-    tprs = pd.DataFrame(tprs)
-    aucs = pd.DataFrame(aucs)
-
-    tprs.to_csv(os.path.join(out_dir, 'ROC_data_tprs.csv'), index=False, header=False)
-    aucs.to_csv(os.path.join(out_dir, 'ROC_data_aucs.csv'), index=False, header=False)
-
-    print('INFO: saving ROC data to {}'.format(os.path.join(out_dir, 'ROC_data.csv')))
-
-    return
-
-def calc_correlation_matrix(df, out_dir=None):
-    """Computes the correlation matrix for a dataframe. 
-    The dataframe should only contain numeric values.
-
-    Args:
-        df (dataframe): dataframe with analysed output per polygon.
-        out_dir (str):  path to output folder. If 'None', no output is stored. Default to 'None'.
-
-    Returns:
-        dataframe: dataframe containig correlation matrix.
-    """    
 
-    # determine correlation matrix
-    df_corr = df.corr()
-    
-    if (out_dir != None) and isinstance(out_dir, str):
-        df_corr.to_csv(os.path.join(out_dir, 'corr_matrix.csv'))
-
-    return df_corr
-
-def get_feature_importance(clf, config, out_dir):
+def get_feature_importance(
+    clf: ensemble.RandomForestClassifier,
+    config: RawConfigParser,
+    out_dir: Union[str, None] = None,
+) -> pd.DataFrame:
     """Determines relative importance of each feature (i.e. variable) used. Must be used after model/classifier is fit.
     Returns dataframe and saves it to csv too.
 
@@ -221,41 +167,41 @@ def get_feature_importance(clf, config, out_dir):
         config (ConfigParser-object): object containing the parsed configuration-settings of the model.
         out_dir (str): path to output folder. If 'None', no output is stored.
 
-    Raises:
-        Warning: raised if the chosen ML model has not built-in feature importances.
-
     Returns:
         dataframe: dataframe containing feature importance.
-    """ 
-
-    if config.get('machine_learning', 'model') == 'RFClassifier':
+    """
 
-        # get feature importances
-        arr = clf.feature_importances_
+    if not os.path.exists(out_dir):
+        raise FileNotFoundError("Output directory does not exist.")
 
-        # initialize dictionary and add importance value per indicator
-        dict_out = dict()
-        for key, x in zip(config.items('data'), range(len(arr))):
-            dict_out[key[0]] = arr[x]
-        dict_out['conflict_t_min_1'] = arr[-2]
-        dict_out['conflict_t_min_1_nb'] = arr[-1]
+    # get feature importances
+    arr = clf.feature_importances_
 
-        # convert to dataframe
-        df = pd.DataFrame.from_dict(dict_out, orient='index', columns=['feature_importance'])
+    # initialize dictionary and add importance value per indicator
+    dict_out = {}
+    for key, x in zip(config.items("data"), range(len(arr))):
+        dict_out[key[0]] = arr[x]
+    dict_out["conflict_t_min_1"] = arr[-2]
+    dict_out["conflict_t_min_1_nb"] = arr[-1]
 
-        # save to file if specified
-        if (out_dir != None) and isinstance(out_dir, str):
-            df.to_csv(os.path.join(out_dir, 'feature_importances.csv'))
-
-    else:
-
-        raise Warning('WARNING: feature importance not supported for {}'.format(config.get('machine_learning', 'model')))
-
-        df = pd.DataFrame()
+    # convert to dataframe
+    df = pd.DataFrame.from_dict(
+        dict_out, orient="index", columns=["feature_importance"]
+    )
+    if out_dir is not None:
+        df.to_csv(os.path.join(out_dir, "feature_importances.csv"))
 
     return df
 
-def get_permutation_importance(clf, X_ft, Y, df_feat_imp, out_dir):
+
+def get_permutation_importance(
+    clf: ensemble.RandomForestClassifier,
+    X_ft: np.ndarray,
+    Y: np.ndarray,
+    df_feat_imp: pd.DataFrame,
+    out_dir: Union[str, None] = None,
+    n_repeats=10,
+) -> pd.DataFrame:
     """Returns a dataframe with the mean permutation importance of the features used to train a RF tree model.
     Dataframe is stored to output directory as csv-file.
 
@@ -264,17 +210,98 @@ def get_permutation_importance(clf, X_ft, Y, df_feat_imp, out_dir):
         X_ft (array): X-array containing variable values after scaling.
         Y (array): Y-array containing conflict data.
         df_feat_imp (dataframe): dataframe containing feature importances to align names across outputs.
-        out_dir (str): path to output folder. If 'None', no output is stored.
+        out_dir (str, None): path to output folder. If 'None', no output is stored. Default to 'None'.
+        n_repeats (int): number of repetitions for permutation importance. Default to 10.
 
     Returns:
         dataframe: contains mean permutation importance for each feature.
-    """    
+    """
+
+    if not os.path.exists(out_dir):
+        raise FileNotFoundError("Output directory does not exist.")
+
+    result = inspection.permutation_importance(
+        clf, X_ft, Y, n_repeats=n_repeats, random_state=42
+    )
+    df = pd.DataFrame(
+        result.importances_mean,
+        columns=["permutation_importance"],
+        index=df_feat_imp.index.values,
+    )
+    if out_dir is not None:
+        df.to_csv(os.path.join(out_dir, "permutation_importances.csv"))
 
-    result = inspection.permutation_importance(clf, X_ft, Y, n_repeats=10, random_state=42)   
+    return df
 
-    df = pd.DataFrame(result.importances_mean, columns=['permutation_importance'], index=df_feat_imp.index.values)
 
-    if (out_dir != None) and isinstance(out_dir, str):
-        df.to_csv(os.path.join(out_dir, 'permutation_importances.csv'))
+def _evaluate_prediction(
+    y_test: list, y_pred: list, y_prob: list, config: RawConfigParser
+) -> dict:
+    """Computes a range of model evaluation metrics and appends the resulting scores to a dictionary.
+    This is done for each model execution separately.
+    Output will be stored to stderr if possible.
 
-    return df 
\ No newline at end of file
+    Args:
+        y_test (list): list containing test-sample conflict data.
+        y_pred (list): list containing predictions.
+        y_prob (array): array resulting probabilties of predictions.
+        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+
+    Returns:
+        dict: dictionary with scores for each simulation
+    """
+
+    if config.getboolean("general", "verbose"):
+        click.echo(
+            "... Accuracy: {0:0.3f}".format(metrics.accuracy_score(y_test, y_pred)),
+            err=True,
+        )
+        click.echo(
+            "... Precision: {0:0.3f}".format(metrics.precision_score(y_test, y_pred)),
+            err=True,
+        )
+        click.echo(
+            "... Recall: {0:0.3f}".format(metrics.recall_score(y_test, y_pred)),
+            err=True,
+        )
+        click.echo(
+            "... F1 score: {0:0.3f}".format(metrics.f1_score(y_test, y_pred)), err=True
+        )
+        click.echo(
+            "... Brier loss score: {0:0.3f}".format(
+                metrics.brier_score_loss(y_test, y_prob[:, 1])
+            ),
+            err=True,
+        )
+        click.echo(
+            "... Cohen-Kappa score: {0:0.3f}".format(
+                metrics.cohen_kappa_score(y_test, y_pred)
+            ),
+            err=True,
+        )
+        click.echo(
+            "... ROC AUC score {0:0.3f}".format(
+                metrics.roc_auc_score(y_test, y_prob[:, 1])
+            ),
+            err=True,
+        )
+        click.echo(
+            "... AP score {0:0.3f}".format(
+                metrics.average_precision_score(y_test, y_prob[:, 1])
+            ),
+            err=True,
+        )
+
+    # compute value per evaluation metric and assign to list
+    eval_dict = {
+        "Accuracy": metrics.accuracy_score(y_test, y_pred),
+        "Precision": metrics.precision_score(y_test, y_pred),
+        "Recall": metrics.recall_score(y_test, y_pred),
+        "F1 score": metrics.f1_score(y_test, y_pred),
+        "Cohen-Kappa score": metrics.cohen_kappa_score(y_test, y_pred),
+        "Brier loss score": metrics.brier_score_loss(y_test, y_prob[:, 1]),
+        "ROC AUC score": metrics.roc_auc_score(y_test, y_prob[:, 1]),
+        "AP score": metrics.average_precision_score(y_test, y_prob[:, 1]),
+    }
+
+    return eval_dict

From 0616d0043ae332439af21584a68f2d41cffc9f94 Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Wed, 14 Feb 2024 10:06:58 +0100
Subject: [PATCH 02/13] parsing and collecting of settings/configurations in
 separate file

---
 copro/settings.py | 114 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 copro/settings.py

diff --git a/copro/settings.py b/copro/settings.py
new file mode 100644
index 0000000..2c0f11f
--- /dev/null
+++ b/copro/settings.py
@@ -0,0 +1,114 @@
+import click
+import os
+from configparser import RawConfigParser
+from shutil import copyfile
+from typing import Tuple
+from copro import utils, io
+
+
+def initiate_setup(settings_file: click.Path) -> Tuple[dict, str]:
+    """Initiates the model set-up.
+    It parses the cfg-file, creates an output folder, copies the cfg-file to the output folder.
+
+    .. example::
+        # How the dict should look
+        config_dict = {'_REF': [config_REF, out_dir_REF], \
+            'run1': [config_run1, out_dir_run1], \
+                'run2': [config_run2, out_dir_run2]}
+
+    Args:
+        settings_file (Path): path to settings-file (cfg-file).
+
+    Returns:
+        dict: dictionary with config-objects and output directories for reference run \
+            and all projection runs.
+        Path: path to location of the cfg-file for the reference run, serving as main/base directory.
+    """
+
+    # print model info, i.e. author names, license info etc.
+    utils.print_model_info()
+
+    # get name of directory where cfg-file for reference run is stored
+    # all other paths should be relative to this one
+    root_dir = os.path.dirname(os.path.abspath(settings_file))
+
+    # parse cfg-file and get config-object for reference run
+    config = _parse_settings(settings_file)
+    # get dictionary with all config-objects, also for projection runs
+    config_dict = _collect_simulation_settings(config, root_dir)
+
+    # get dictionary with all config-objects and all out-dirs
+    main_dict = io.make_output_dir(config, root_dir, config_dict)
+
+    # copy cfg-file of reference run to out-dir of reference run
+    # for documentation and debugging purposes
+    copyfile(
+        os.path.abspath(settings_file),
+        os.path.join(
+            main_dict["_REF"][1], "copy_of_{}".format(os.path.basename(settings_file))
+        ),
+    )
+
+    return main_dict, root_dir
+
+
+def _parse_settings(settings_file: click.Path) -> RawConfigParser:
+    """Reads the model configuration file.
+
+    Args:
+        settings_file (Path): path to settings-file (cfg-file).
+
+    Returns:
+        RawConfigParser: parsed model configuration.
+    """
+
+    click.echo(f"Parsing settings from file {settings_file}.")
+    config = RawConfigParser(allow_no_value=True, inline_comment_prefixes="#")
+    config.optionxform = lambda option: option
+    config.read(settings_file)
+
+    return config
+
+
+def _collect_simulation_settings(config: RawConfigParser, root_dir: click.Path) -> dict:
+    """Collects the configuration settings for the reference run and all projection runs.
+    These cfg-files need to be specified one by one in the PROJ_files section of the cfg-file for the reference run.
+    The function returns then a dictionary with the name of the run and the associated config-object.
+
+    .. example::
+        # How it should look in the cfg-file
+        [PROJ_files]
+        run1 = cfg_file1
+        run2 = cfg_file2
+
+        # How the dict should look
+        config_dict = {'_REF': [config_REF], 'run1': [config_run1], 'run2': [config_run2]}
+
+    Args:
+        config (ConfigParser-object): object containing the parsed configuration-settings \
+            of the model for the reference run.
+        root_dir (Path): path to location of the cfg-file for the reference run.
+
+    Returns:
+        dict: dictionary with name and config-object for reference run and all specified projection runs.
+    """
+
+    # initiate output dictionary
+    config_dict = {}
+    # first entry is config-object for reference run
+    config_dict["_REF"] = config
+
+    # loop through all keys and values in PROJ_files section of reference config-object
+    for (each_key, each_val) in config.items("PROJ_files"):
+
+        # for each value (here representing the cfg-files of the projections), get the absolute path
+        each_val = os.path.abspath(os.path.join(root_dir, each_val))
+
+        # parse each config-file specified
+        click.echo(f"Parsing settings from file {each_val}.")
+        each_config = _parse_settings(each_val)
+
+        # update the output dictionary with key and config-object
+        config_dict[each_key] = [each_config]
+
+    return config_dict

From 152a1404d8af2734de5137624a6b8b037c1b9ebd Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Wed, 14 Feb 2024 10:22:50 +0100
Subject: [PATCH 03/13] separate file for I/O functions

---
 copro/io.py       | 100 ++++++++++++++++++++++++++++++++++++++++++++++
 copro/settings.py |   2 +-
 2 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 copro/io.py

diff --git a/copro/io.py b/copro/io.py
new file mode 100644
index 0000000..35475c0
--- /dev/null
+++ b/copro/io.py
@@ -0,0 +1,100 @@
+import pandas as pd
+import numpy as np
+from typing import Union
+from configparser import RawConfigParser
+from pathlib import Path
+import os
+import click
+
+
+def make_and_collect_output_dirs(
+    config: RawConfigParser, root_dir: click.Path, config_dict: dict
+) -> dict:
+    """Creates the output folder at location specfied in cfg-file
+    and returns dictionary with config-objects and out-dir per run.
+
+    Args:
+        config (RawConfigParser): object containing the parsed configuration-settings of the model.
+        root_dir (Path): absolute path to location of configurations-file
+        config_dict (dict): dictionary containing config-objects for reference run and all projection.
+
+    Returns:
+        dict: dictionary containing config-objects and output directories for reference run and all projection runs.
+    """
+
+    # get path to main output directory as specified in cfg-file
+    out_dir = os.path.join(root_dir, config.get("general", "output_dir"))
+    click.echo(f"Saving output to main output folder {out_dir}.")
+
+    # initalize list for all out-dirs
+    all_out_dirs = []
+    # create reference output dir '_REF' under main output dir
+    all_out_dirs.append(os.path.join(out_dir, "_REF"))
+
+    # create reference output dir '_PROJ' under main output dir
+    out_dir_proj = os.path.join(out_dir, "_PROJ")
+    # create sub-dirs under '_PROJ' for all projection runs
+    for key, i in zip(config_dict, range(len(config_dict))):
+        # skip the first entry, as this is the reference run which does not need a sub-directory
+        if i > 0:
+            all_out_dirs.append(os.path.join(out_dir_proj, str(key)))
+
+    # initiate dictionary for config-objects and out-dir per un
+    config_outdir_dict = {}
+    # for all keys (i.e. run names), assign config-object (i.e. the values) as well as out-dir
+    for key, value, i in zip(
+        config_dict.keys(), config_dict.values(), range(len(config_dict))
+    ):
+        config_outdir_dict[key] = [value, all_out_dirs[i]]
+
+    # check if out-dir exists and if not, create it
+    for key, value in config_outdir_dict.items():
+        # value [0] would be the config-object
+        click.echo(f"Creating output-folder {value[1]}.")
+        Path.mkdir(Path(value[1]), exist_ok=True)
+
+    return config_outdir_dict
+
+
+def save_to_csv(arg: dict, out_dir: click.Path, fname: str):
+    """Saves an dictionary to csv-file.
+
+    Args:
+        arg (dict): dictionary or dataframe to be saved.
+        out_dir (str): path to output folder.
+        fname (str): name of stored item.
+    """
+
+    # check if arg is actuall a dict
+    if not isinstance(arg, dict):
+        raise TypeError("argument is not a dictionary.")
+    try:
+        arg = pd.DataFrame().from_dict(arg)
+    except ValueError:
+        arg = pd.DataFrame().from_dict(arg, orient="index")
+
+    # save dataframe as csv
+    arg.to_csv(os.path.join(out_dir, fname + ".csv"))
+
+
+def save_to_npy(arg: Union[dict, pd.DataFrame], out_dir: click.Path, fname: str):
+    """Saves an argument (either dictionary or dataframe) to npy-file.
+
+    Args:
+        arg (dict or dataframe): dictionary or dataframe to be saved.
+        out_dir (str): path to output folder.
+        fname (str): name of stored item.
+    """
+
+    # if arg is dict, then first create dataframe, then np-array
+    if isinstance(arg, dict):
+        arg = pd.DataFrame().from_dict(arg)
+        arg = arg.to_numpy()
+    # if arg is dataframe, directly create np-array
+    elif isinstance(arg, pd.DataFrame):
+        arg = arg.to_numpy()
+    else:
+        raise TypeError("argument is not a dictionary or dataframe.")
+
+    # save np-array as npy-file
+    np.save(os.path.join(out_dir, fname + ".npy"), arg)
diff --git a/copro/settings.py b/copro/settings.py
index 2c0f11f..949a07b 100644
--- a/copro/settings.py
+++ b/copro/settings.py
@@ -38,7 +38,7 @@ def initiate_setup(settings_file: click.Path) -> Tuple[dict, str]:
     config_dict = _collect_simulation_settings(config, root_dir)
 
     # get dictionary with all config-objects and all out-dirs
-    main_dict = io.make_output_dir(config, root_dir, config_dict)
+    main_dict = io.make_and_collect_output_dirs(config, root_dir, config_dict)
 
     # copy cfg-file of reference run to out-dir of reference run
     # for documentation and debugging purposes

From e5fd63c07ab19ae1da8320b93cf3109a4a89ef2e Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Wed, 14 Feb 2024 10:41:35 +0100
Subject: [PATCH 04/13] clean up

---
 copro/utils.py | 384 ++++---------------------------------------------
 1 file changed, 29 insertions(+), 355 deletions(-)

diff --git a/copro/utils.py b/copro/utils.py
index 5e651e3..8c1a5f4 100644
--- a/copro/utils.py
+++ b/copro/utils.py
@@ -1,23 +1,24 @@
 import geopandas as gpd
 import pandas as pd
-import numpy as np
 import os
 from configparser import RawConfigParser
-from shutil import copyfile
-from sklearn import utils
 from datetime import date
 import click
-import copro
+from copro import __version__, __author__, __email__
 
 
-def get_geodataframe(
-    config, root_dir, longitude="longitude", latitude="latitude", crs="EPSG:4326"
-):
-    """Georeferences a pandas dataframe using longitude and latitude columns of that dataframe.
+def get_conflict_geodataframe(
+    config: RawConfigParser,
+    root_dir: click.Path,
+    longitude="longitude",
+    latitude="latitude",
+    crs="EPSG:4326",
+) -> gpd.GeoDataFrame:
+    """Converts a csv-file containing geo-referenced conflict data to a geodataframe.
 
     Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        root_dir (str): path to location of cfg-file.
+        config (RawConfigParser): object containing the parsed configuration-settings of the model.
+        root_dir (Path): path to location of cfg-file.
         longitude (str, optional): column name with longitude coordinates. Defaults to 'longitude'.
         latitude (str, optional): column name with latitude coordinates. Defaults to 'latitude'.
         crs (str, optional): coordinate system to be used for georeferencing. Defaults to 'EPSG:4326'.
@@ -34,12 +35,10 @@ def get_geodataframe(
     )
 
     # read file to pandas dataframe
-    click.echo("INFO: reading csv file to dataframe {}".format(conflict_fo))
+    click.echo(f"Reading {conflict_fo} file and converting to geodataframe.")
     df = pd.read_csv(conflict_fo)
 
     # convert dataframe to geo-dataframe
-    if config.getboolean("general", "verbose"):
-        click.echo("DEBUG: translating to geopandas dataframe")
     gdf = gpd.GeoDataFrame(
         df, geometry=gpd.points_from_xy(df[longitude], df[latitude]), crs=crs
     )
@@ -47,131 +46,28 @@ def get_geodataframe(
     return gdf
 
 
-def parse_settings(settings_file: str) -> RawConfigParser:
-    """Reads the model configuration file.
-
-    Args:
-        settings_file (str): path to settings-file (cfg-file).
-
-    Returns:
-        ConfigParser-object: parsed model configuration.
-    """
-
-    config = RawConfigParser(allow_no_value=True, inline_comment_prefixes="#")
-    config.optionxform = lambda option: option
-    config.read(settings_file)
-
-    return config
-
-
-def parse_projection_settings(config, root_dir):
-    """This function parses the (various) cfg-files for projections.
-    These cfg-files need to be specified one by one in the PROJ_files section of the cfg-file for the reference run.
-    The function returns then a dictionary with the name of the run and the associated config-object.
-
-    Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings \
-            of the model for the reference run.
-
-    Returns:
-        dict: dictionary with name and config-object per specified projection run.
-    """
-
-    # initiate output dictionary
-    config_dict = {}
-
-    # first entry is config-object for reference run
-    config_dict["_REF"] = config
-
-    # loop through all keys and values in PROJ_files section of reference config-object
-    for (each_key, each_val) in config.items("PROJ_files"):
-
-        # for each value (here representing the cfg-files of the projections), get the absolute path
-        each_val = os.path.abspath(os.path.join(root_dir, each_val))
-
-        # parse each config-file specified
-        if config.getboolean("general", "verbose"):
-            click.echo("DEBUG: parsing settings from file {}".format(each_val))
-        each_config = parse_settings(each_val)
-
-        # update the output dictionary with key and config-object
-        config_dict[each_key] = [each_config]
-
-    return config_dict
-
-
-def make_output_dir(config, root_dir, config_dict):
-    """Creates the output folder at location specfied in cfg-file
-    and returns dictionary with config-objects and out-dir per run.
+def get_ID_geometry_lookup(
+    gdf: gpd.GeoDataFrame,
+) -> pd.DataFrame:  # get_ID_geometry_lookup
+    """Retrieves unique ID and geometry information from geo-dataframe for a global look-up dataframe.
+    The IDs currently supported are 'name' or 'watprovID'.
 
     Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        root_dir (str): absolute path to location of configurations-file
-        config_dict (dict): dictionary containing config-objects per projection.
+        gdf (gpd.GeoDataFrame): containing all polygons used in the model.
 
     Returns:
-        dict: dictionary containing config-objects and output directories for reference run and all projection runs.
+        pd.DataFrame: look-up dataframe associated ID with geometry
     """
 
-    # get path to main output directory as specified in cfg-file
-    out_dir = os.path.join(root_dir, config.get("general", "output_dir"))
-    click.echo("INFO: saving output to main folder {}".format(out_dir))
-
-    # initalize list for all out-dirs
-    all_out_dirs = []
-
-    # append the path to the output folder for the reference run
-    # note that this is hardcoded, i.e. each output folder will have a sub-folder '_REF'
-    all_out_dirs.append(os.path.join(out_dir, "_REF"))
-
-    # for all specified projections, create individual sub-folder under the folder '_PROJ'
-    # and append those to list as well
-    out_dir_proj = os.path.join(out_dir, "_PROJ")
-    for key, i in zip(config_dict, range(len(config_dict))):
-        if i > 0:
-            all_out_dirs.append(os.path.join(out_dir_proj, str(key)))
-
-    assert len(all_out_dirs) == len(config_dict), AssertionError(
-        "ERROR: number of output folders and config-objects do not match!"
+    # stack identifier and geometry of all polygons
+    # NOTE: columnn 'watprovID' is hardcoded here
+    df = pd.DataFrame(
+        index=gdf["watprovID"].to_list(),
+        data=gdf["geometry"].to_list(),
+        columns=["geometry"],
     )
 
-    # initiate dictionary for config-objects and out-dir per un
-    main_dict = {}
-
-    # for all keys (i.e. run names), assign config-object (i.e. the values) as well as out-dir
-    for key, value, i in zip(
-        config_dict.keys(), config_dict.values(), range(len(config_dict))
-    ):
-        main_dict[key] = [value, all_out_dirs[i]]
-
-    # create all the specified output folders if they do not exist yet
-    # if they exist, remove all files there besides the npy-files
-    for key, value in main_dict.items():
-
-        # get entry corresponding to out-dir
-        # value [0] would be the config-object
-        d = value[1]
-
-        # check if out-dir exists and if not, create it
-        if not os.path.isdir(d):
-            click.echo("INFO: creating output-folder {}".format(d))
-            os.makedirs(d)
-
-        # else, remove all files with a few exceptions
-        else:
-            for root, _, files in os.walk(d):
-                if (config.getboolean("general", "verbose")) and (len(files) > 0):
-                    click.echo(
-                        "DEBUG: remove files in {}".format(os.path.abspath(root))
-                    )
-                for fo in files:
-                    if fo in ["XY.npy", "X.npy"]:
-                        if config.getboolean("general", "verbose"):
-                            click.echo("DEBUG: sparing {}".format(fo))
-                    else:
-                        os.remove(os.path.join(root, fo))
-
-    return main_dict
+    return df
 
 
 def print_model_info():
@@ -179,7 +75,7 @@ def print_model_info():
 
     click.echo("")
     click.echo(
-        click.style("#### CoPro version {} ####".format(copro.__version__), fg="yellow")
+        click.style("#### CoPro version {} ####".format(__version__), fg="yellow")
     )
     click.echo(
         click.style(
@@ -189,15 +85,11 @@ def print_model_info():
     )
     click.echo(
         click.style(
-            "#### Copyright (2020-{}): {} ####".format(
-                date.today().year, copro.__author__
-            ),
+            "#### Copyright (2020-{}): {} ####".format(date.today().year, __author__),
             fg="yellow",
         )
     )
-    click.echo(
-        click.style("#### Contact via: {} ####".format(copro.__email__), fg="yellow")
-    )
+    click.echo(click.style("#### Contact via: {} ####".format(__email__), fg="yellow"))
     click.echo(
         click.style(
             "#### The model can be used and shared under the MIT license ####"
@@ -205,221 +97,3 @@ def print_model_info():
             fg="yellow",
         )
     )
-
-
-def initiate_setup(settings_file, verbose=None):
-    """Initiates the model set-up.
-    It parses the cfg-file, creates an output folder, copies the cfg-file to the output folder, 
-    and, if specified, downloads conflict data.
-
-    Args:
-        settings_file (str): path to settings-file (cfg-file).
-        verbose (bool, optional): whether model is verbose or not, e.g. click.echos DEBUG output or not. \
-            If None, then the setting in cfg-file counts. \
-                Otherwise verbose can be set directly to function which superseded the cfg-file. Defaults to None.
-
-    Returns:
-        ConfigParser-object: parsed model configuration.
-        out_dir_list: list with paths to output folders; \
-            first main output folder, then reference run folder, then (multiple) folders for projection runs.
-        root_dir: path to location of cfg-file.
-    """
-
-    # print model info, i.e. author names, license info etc.
-    print_model_info()
-
-    # get name of directory where cfg-file is stored
-    root_dir = os.path.dirname(os.path.abspath(settings_file))
-
-    # parse cfg-file and get config-object for reference run
-    config = parse_settings(settings_file)
-    click.echo("INFO: reading model properties from {}".format(settings_file))
-
-    if verbose is not None:
-        config.set("general", "verbose", str(verbose))
-
-    click.echo(
-        "INFO: verbose mode on: {}".format(config.getboolean("general", "verbose"))
-    )
-
-    # get dictionary with all config-objects, also for projection runs
-    config_dict = parse_projection_settings(config, root_dir)
-
-    # get dictionary with all config-objects and all out-dirs
-    main_dict = make_output_dir(config, root_dir, config_dict)
-
-    # copy cfg-file of reference run to out-dir of reference run
-    if config.getboolean("general", "verbose"):
-        click.echo(
-            "DEBUG: copying cfg-file {} to folder {}".format(
-                os.path.abspath(settings_file), main_dict["_REF"][1]
-            )
-        )
-    copyfile(
-        os.path.abspath(settings_file),
-        os.path.join(
-            main_dict["_REF"][1], "copy_of_{}".format(os.path.basename(settings_file))
-        ),
-    )
-
-    # if specfied, download UCDP/PRIO data directly
-    if config["conflict"]["conflict_file"] == "download":
-        raise NotImplementedError(
-            "Downloading conflict data is not implemented anymore."
-        )
-
-    # if any other model than all_data is specified, set number of runs to 1
-    if (config.getint("general", "model") == 2) or (
-        config.getint("general", "model") == 3
-    ):
-        config.set("machine_learning", "n_runs", str(1))
-        click.echo(
-            "INFO: changed nr of runs to {}".format(
-                config.getint("machine_learning", "n_runs")
-            )
-        )
-
-    return main_dict, root_dir
-
-
-def create_artificial_Y(Y):
-    """Creates an array with identical percentage of conflict points as input array.
-
-    Args:
-        Y (array): original array containing binary conflict classifier data.
-
-    Returns:
-        array: array with reshuffled conflict classifier data.
-    """
-
-    arr_1 = np.ones(len(np.where(Y != 0)[0]))
-    arr_0 = np.zeros(int(len(Y) - len(np.where(Y != 0)[0])))
-    Y_r_1 = np.append(arr_1, arr_0)
-
-    Y_r = utils.shuffle(Y_r_1, random_state=42)
-
-    return Y_r
-
-
-def global_ID_geom_info(gdf):
-    """Retrieves unique ID and geometry information from geo-dataframe for a global look-up dataframe.
-    The IDs currently supported are 'name' or 'watprovID'.
-
-    Args:
-        gdf (geo-dataframe): containing all polygons used in the model.
-
-    Returns:
-        dataframe: look-up dataframe associated ID with geometry
-    """
-
-    # stack identifier and geometry of all polygons
-    # test if gdf has column 'name', otherwise use column 'watprovID'
-    arr = np.column_stack((gdf.watprovID.to_numpy(), gdf.geometry.to_numpy()))
-
-    # convert to dataframe
-    df = pd.DataFrame(data=arr, columns=["ID", "geometry"])
-
-    # use column ID as index
-    df.set_index(df.ID, inplace=True)
-    df = df.drop("ID", axis=1)
-
-    return df
-
-
-def get_conflict_datapoints_only(X_df, y_df):
-    """Filters out only those polygons where conflict was actually observed in the test-sample.
-
-    Args:
-        X_df (dataframe): variable values per polygon.
-        y_df (dataframe): conflict data per polygon.
-
-    Returns:
-        dataframe: variable values for polyons where conflict was observed.
-        dataframe: conflict data for polyons where conflict was observed.
-    """
-
-    # concatenate dataframes of sample data and target values
-    df = pd.concat([X_df, y_df], axis=1)
-    # keep only those entries where conflict was observed
-    df = df.loc[df.y_test == 1]
-
-    # split again into X and Y
-    X1_df = df[df.columns[: len(X_df.columns)]]
-    y1_df = df[df.columns[len(X_df.columns) :]]
-
-    return X1_df, y1_df
-
-
-def save_to_csv(arg, out_dir, fname):
-    """Saves an dictionary to csv-file.
-
-    Args:
-        arg (dict): dictionary or dataframe to be saved.
-        out_dir (str): path to output folder.
-        fname (str): name of stored item.
-    """
-
-    # check if arg is actuall a dict
-    if isinstance(arg, dict):
-        # create dataframe from dict
-        try:
-            arg = pd.DataFrame().from_dict(arg)
-        except ValueError:
-            arg = pd.DataFrame().from_dict(arg, orient="index")
-
-    # save dataframe as csv
-    arg.to_csv(os.path.join(out_dir, fname + ".csv"))
-
-
-def save_to_npy(arg, out_dir, fname):
-    """Saves an argument (either dictionary or dataframe) to npy-file.
-
-    Args:
-        arg (dict or dataframe): dictionary or dataframe to be saved.
-        out_dir (str): path to output folder.
-        fname (str): name of stored item.
-    """
-
-    # if arg is dict, then first create dataframe, then np-array
-    if isinstance(arg, dict):
-        arg = pd.DataFrame().from_dict(arg)
-        arg = arg.to_numpy()
-
-    # if arg is dataframe, directly create np-array
-    elif isinstance(arg, pd.DataFrame):
-        arg = arg.to_numpy()
-
-    # save np-array as npy-file
-    np.save(os.path.join(out_dir, fname + ".npy"), arg)
-
-
-def determine_projection_period(config_REF, config_PROJ):
-    """Determines the period for which projections need to be made.
-    This is defined as the period between the end year of the reference run 
-    and the specified projection year for each projection.
-
-    Args:
-        config_REF (ConfigParser-object): object containing the parsed \
-            configuration-settings of the model for the reference run.
-        config_PROJ (ConfigParser-object): object containing the parsed \
-            configuration-settings of the model for a projection run..
-
-    Returns:
-        list: list containing all years of the projection period.
-    """
-
-    # get all years of projection period
-    projection_period = np.arange(
-        config_REF.getint("settings", "y_end") + 1,
-        config_PROJ.getint("settings", "y_proj") + 1,
-        1,
-    )
-    # convert to list
-    projection_period = projection_period.tolist()
-    print(
-        "INFO: the projection period is {} to {}".format(
-            projection_period[0], projection_period[-1]
-        )
-    )
-
-    return projection_period

From d51e8a7c47f3eadc243bad8af182ad2c5ecc413e Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Wed, 14 Feb 2024 10:46:10 +0100
Subject: [PATCH 05/13] removed outdated tests

---
 tests/test_machine_learning.py | 33 ---------------------------------
 tests/test_utils.py            | 26 --------------------------
 2 files changed, 59 deletions(-)
 delete mode 100644 tests/test_machine_learning.py
 delete mode 100644 tests/test_utils.py

diff --git a/tests/test_machine_learning.py b/tests/test_machine_learning.py
deleted file mode 100644
index 2d2e8bb..0000000
--- a/tests/test_machine_learning.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import pytest
-import configparser
-import numpy as np
-import pandas as pd
-import geopandas as gpd
-from sklearn import preprocessing, model_selection
-from copro import conflict, machine_learning
-
-def create_fake_config():
-
-    config = configparser.ConfigParser()
-
-    config.add_section('general')
-    config.set('general', 'verbose', str(False))
-    config.add_section('machine_learning')
-    config.set('machine_learning', 'train_fraction', str(0.7))
-
-    return config
-
-def test_split_scale_train_test_split():
-
-    X1 = [1, 2, 3, 4]
-    X2 = [1, 2, 3, 4]
-    X3 = [[1, 2], [3, 4], [1, 2], [5, 6]]
-
-    X = np.column_stack((X1, X2, X3))
-    Y = [1, 0, 0, 1]
-    config = create_fake_config()
-    scaler = preprocessing.QuantileTransformer()
-
-    X_train, X_test, y_train, y_test, X_train_geom, X_test_geom, X_train_ID, X_test_ID = machine_learning.split_scale_train_test_split(X, Y, config, scaler)
-
-    assert (len(X_train) + len(X_test)) == len(X)
\ No newline at end of file
diff --git a/tests/test_utils.py b/tests/test_utils.py
deleted file mode 100644
index c4989b7..0000000
--- a/tests/test_utils.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import pytest
-import numpy as np
-import pandas as pd
-from copro import utils
-
-def test_create_artificial_Y():
-
-    Y = [1, 0, 0, 0, 0, 1]
-    
-    Y_r = utils.create_artificial_Y(Y)
-
-    assert len(np.where(Y_r != 0)[0]) == len(np.where(Y != 0)[0])
-
-def test_get_conflict_datapoints_only():
-
-    X_arr = [[1, 2], [3, 4], [1, 2], [5, 6]]
-    y_arr = [1, 0, 0, 1]
-
-    X_in = pd.DataFrame(data=X_arr, columns=['var1', 'var2'])
-    y_in = pd.DataFrame(data=y_arr, columns=['y_test'])
-
-    X_out, y_out = utils.get_conflict_datapoints_only(X_in, y_in)
-
-    test_arr = np.where(y_out.y_test.values == 0)[0]
-
-    assert test_arr.size == 0
\ No newline at end of file

From 78ef79dd0898bae3589474b3706eed8eebec9371 Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Wed, 14 Feb 2024 11:31:31 +0100
Subject: [PATCH 06/13] improving selection.py

---
 copro/selection.py | 247 +++++++++++++++++----------------------------
 1 file changed, 92 insertions(+), 155 deletions(-)

diff --git a/copro/selection.py b/copro/selection.py
index e7018fc..8a66025 100644
--- a/copro/selection.py
+++ b/copro/selection.py
@@ -1,191 +1,128 @@
-import pandas as pd
 import geopandas as gpd
+import pandas as pd
 import os
 from copro import utils
+from configparser import RawConfigParser
+import click
+from typing import Tuple
+from ast import literal_eval
 
-def filter_conflict_properties(gdf, config):
-    """Filters conflict database according to certain conflict properties such as number of casualties, type of violence or country.
+
+def select(
+    config: RawConfigParser, out_dir: click.Path, root_dir: click.Path
+) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame, pd.DataFrame]:
+    """Main function performing the selection procedure.
+    First, selects only conflicts matching specified properties.
+    Second, clips the conflict data to a specified spatial extent.
+    Third, retrieves the geometry of all polygons in the spatial extent and assigns IDs.
 
     Args:
-        gdf (geo-dataframe): geo-dataframe containing entries with conflicts.
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+        config (RawConfigParser): object containing the parsed configuration-settings of the model.
+        out_dir (Path): path to output folder.
+        root_dir (Path): path to location of cfg-file for reference run.
 
     Returns:
-        geo-dataframe: geo-dataframe containing filtered entries.
-    """    
-    
-    # create dictionary with all selection criteria
-    selection_criteria = {'best': config.getint('conflict', 'min_nr_casualties'),
-                          'type_of_violence': (config.get('conflict', 'type_of_violence')).rsplit(',')}
-    
-    print('INFO: filtering based on conflict properties.')
-    
-    # go through all criteria
-    for key in selection_criteria:
+        gpd.GeoDataFrame: remaining conflict data after selection process.
+        gpd.GeoDataFrame: remaining polygons after selection process.
+        pd.DataFrame: global look-up dataframe linking polygon ID with geometry information.
+    """
 
-        # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold
-        if key == 'best':
-            if selection_criteria[key] == '':
-                pass
-            else:
-                if config.getboolean('general', 'verbose'): print('DEBUG: filtering key', key, 'with lower value', selection_criteria[key])
-                gdf = gdf[gdf['best'] >= selection_criteria['best']]
+    # get the conflict data
+    conflict_gdf = utils.get_conflict_geodataframe(config, root_dir)
 
-        # for other criteria, select all entries matching the specified value(s) per criterion
-        if key == 'type_of_violence':
-            if selection_criteria[key] == '':
-                pass
-            else:
-                if config.getboolean('general', 'verbose'): print('DEBUG: filtering key', key, 'with value(s)', selection_criteria[key])
-                selection_criteria[key] = [eval(i) for i in selection_criteria[key]]
-                gdf = gdf[gdf[key].isin(selection_criteria[key])]
+    # filter based on conflict properties
+    conflict_gdf = _filter_conflict_properties(conflict_gdf, config)
 
-    return gdf
+    # clip conflicts to a spatial extent defined as polygons
+    conflict_gdf, extent_gdf = _clip_to_extent(conflict_gdf, config, root_dir)
 
-def select_period(gdf, config):
-    """Reducing the geo-dataframe to those entries falling into a specified time period.
+    # get a dataframe containing the ID and geometry of all polygons
+    global_df = utils.get_ID_geometry_lookup(extent_gdf)
+
+    # save conflict data and polygon to shp-file
+    conflict_gdf.to_file(
+        os.path.join(out_dir, "selected_conflicts.geojson"),
+        driver="GeoJSON",
+        crs="EPSG:4326",
+    )
+    extent_gdf.to_file(
+        os.path.join(out_dir, "selected_polygons.geojson"),
+        driver="GeoJSON",
+        crs="EPSG:4326",
+    )
+
+    return conflict_gdf, extent_gdf, global_df
+
+
+def _filter_conflict_properties(
+    gdf: gpd.GeoDataFrame, config: RawConfigParser
+) -> gpd.GeoDataFrame:
+    """Filters conflict database according to certain conflict properties
+    such as number of casualties, type of violence or country.
 
     Args:
-        gdf (geo-dataframe): geo-dataframe containing entries with conflicts.
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+        gdf (gpd.GeoDataFrame): geo-dataframe containing entries with conflicts.
+        config (RawConfigParser): object containing the parsed configuration-settings of the model.
 
     Returns:
-        geo-dataframe: geo-dataframe containing filtered entries.
-    """    
-
-    # get start and end year of model period
-    t0 = config.getint('settings', 'y_start')
-    t1 = config.getint('settings', 'y_end')
-    
-    # select those entries meeting the requirements
-    if config.getboolean('general', 'verbose'): print('DEBUG: focussing on period between {} and {}'.format(t0, t1))
-    gdf = gdf.loc[(gdf.year >= t0) & (gdf.year <= t1)]
-    
+        gpd.GeoDataFrame: geo-dataframe containing filtered entries.
+    """
+
+    # create dictionary with all selection criteria
+    selection_criteria = {
+        "best": config.getint("conflict", "min_nr_casualties"),
+        "type_of_violence": (config.get("conflict", "type_of_violence")).rsplit(","),
+    }
+
+    click.echo("Filtering based on conflict properties.")
+    # go through all criteria
+    for key, value in selection_criteria.items():
+
+        # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold
+        if key == "best" and value != "":
+            click.echo(f"Filtering key {key} with lower value {value}.")
+            gdf = gdf[gdf["best"] >= value]
+        # for other criteria, select all entries matching the specified value(s) per criterion
+        if key == "type_of_violence" and value != "":
+            click.echo(f"Filtering key {key} with value(s) {value}.")
+            # NOTE: check if this works like this
+            values = [literal_eval(i) for i in value]
+            gdf = gdf[gdf[key].isin(values)]
+
     return gdf
 
-def clip_to_extent(gdf, config, root_dir):
-    """As the original conflict data has global extent, this function clips the database to those entries which have occured on a specified continent.
+
+def _clip_to_extent(
+    conflict_gdf: gpd.GeoDataFrame, config: RawConfigParser, root_dir: click.Path
+) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
+    """As the original conflict data has global extent, this function clips the database
+    to those entries which have occured on a specified continent.
 
     Args:
-        gdf (geo-dataframe): geo-dataframe containing entries with conflicts.
+        conflict_gdf (geo-dataframe): geo-dataframe containing entries with conflicts.
         config (ConfigParser-object): object containing the parsed configuration-settings of the model.
         root_dir (str): path to location of cfg-file.
 
     Returns:
         geo-dataframe: geo-dataframe containing filtered entries.
         geo-dataframe: geo-dataframe containing country polygons of selected continent.
-    """    
+    """
 
     # get path to file with polygons for which analysis is carried out
-    shp_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('extent', 'shp'))
-    
+    shp_fo = os.path.join(
+        root_dir, config.get("general", "input_dir"), config.get("extent", "shp")
+    )
+
     # read file
-    if config.getboolean('general', 'verbose'): print('DEBUG: reading extent and spatial aggregation level from file {}'.format(shp_fo))
+    click.echo(f"Reading extent and spatial aggregation level from file {shp_fo}.")
     extent_gdf = gpd.read_file(shp_fo)
 
     # fixing invalid geometries
-    if config.getboolean('general', 'verbose'): print('DEBUG: fixing invalid geometries')
+    click.echo("Fixing invalid geometries")
     extent_gdf.geometry = extent_gdf.buffer(0)
 
     # clip the conflict dataframe to the specified polygons
-    if config.getboolean('general', 'verbose'): print('DEBUG: clipping clipping conflict dataset to extent')    
-    gdf = gpd.clip(gdf, extent_gdf)
-    
-    return gdf, extent_gdf
-
-def climate_zoning(gdf, extent_gdf, config, root_dir):
-    """This function allows for selecting only those conflicts and polygons falling in specified climate zones.
-    Also, a global dataframe is returned containing the IDs and geometry of all polygons after selection procedure.
-    This can be used to add geometry information to model output based on common ID.
-
-    Args:
-        gdf (geo-dataframe): geo-dataframe containing conflict data.
-        extent_gdf (geo-dataframe): all polygons of study area.
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        root_dir (str): path to location of cfg-file.
-
-    Returns:
-        geo-dataframe: conflict data clipped to climate zones.
-        geo-dataframe: polygons of study area clipped to climate zones.
-    """
-
-    # load file with extents of climate zones
-    Koeppen_Geiger_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('climate', 'shp'))
-    KG_gdf = gpd.read_file(Koeppen_Geiger_fo)
-    # load file to look-up climate zone names with codes in shp-file
-    code2class_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('climate', 'code2class'))
-    code2class = pd.read_csv(code2class_fo, sep='\t')
-    
-    # if climate zones are specified...
-    if config.get('climate', 'zones') != '':
-
-        # get all classes specified
-        look_up_classes = config.get('climate', 'zones').rsplit(',')
-
-        # get the corresponding code per class
-        code_nrs = []
-        for entry in look_up_classes:
-            code_nr = int(code2class['code'].loc[code2class['class'] == entry])
-            code_nrs.append(code_nr)
-    
-        # get only those entries with retrieved codes
-        KG_gdf = KG_gdf.loc[KG_gdf['GRIDCODE'].isin(code_nrs)]
-        
-        # make sure EPSG:4236 is used
-        if KG_gdf.crs != 'EPSG:4326':
-            KG_gdf = KG_gdf.to_crs('EPSG:4326')
-
-        # clip the conflict dataframe to the specified climate zones
-        if config.getboolean('general', 'verbose'): print('DEBUG: clipping conflicts to climate zones {}'.format(look_up_classes))
-        gdf = gpd.clip(gdf, KG_gdf.buffer(0))
-
-        # clip the studied polygons to the specified climate zones
-        if config.getboolean('general', 'verbose'): print('DEBUG: clipping polygons to climate zones {}'.format(look_up_classes))
-        polygon_gdf = gpd.clip(extent_gdf, KG_gdf.buffer(0))
-
-    # if not, nothing needs to be done besides aligning names
-    else:
-
-        polygon_gdf = extent_gdf.copy()
-
-    return gdf, polygon_gdf
-
-def select(config, out_dir, root_dir):
-    """Main function performing the selection procedure.
-    Also stores the selected conflicts and polygons to output directory.
-
-    Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        out_dir (str): path to output folder.
-        root_dir (str): path to location of cfg-file.
-
-    Returns:
-        geo-dataframe: remaining conflict data after selection process.
-        geo-dataframe: all polygons of the study area.
-        geo-dataframe: remaining polygons after selection process.
-        dataframe: global look-up dataframe linking polygon ID with geometry information.
-    """  
-
-    # get the conflict data
-    gdf = utils.get_geodataframe(config, root_dir)
-
-    # filter based on conflict properties
-    gdf = filter_conflict_properties(gdf, config)
-
-    # selected conflicts falling in a specified time period
-    gdf = select_period(gdf, config)
-
-    # clip conflicts to a spatial extent defined as polygons
-    gdf, extent_gdf = clip_to_extent(gdf, config, root_dir)
-
-    # clip conflicts and polygons to specified climate zones
-    gdf, polygon_gdf = climate_zoning(gdf, extent_gdf, config, root_dir)
-
-    # get a dataframe containing the ID and geometry of all polygons after selecting for climate zones
-    global_df = utils.global_ID_geom_info(polygon_gdf)
-
-    # save conflict data and polygon to shp-file
-    gdf.to_file(os.path.join(out_dir, 'selected_conflicts.geojson'), driver='GeoJSON', crs='EPSG:4326')
-    polygon_gdf.to_file(os.path.join(out_dir, 'selected_polygons.geojson'), driver='GeoJSON', crs='EPSG:4326')
+    click.echo("Clipping clipping conflict dataset to extent.")
+    conflict_gdf = gpd.clip(conflict_gdf, extent_gdf)
 
-    return gdf, polygon_gdf, global_df
\ No newline at end of file
+    return conflict_gdf, extent_gdf

From 2e8c869ea1f100dbf03d0f806aaaac02a43f38c5 Mon Sep 17 00:00:00 2001
From: Jannis Hoch <10956703+JannisHoch@users.noreply.github.com>
Date: Mon, 26 Feb 2024 10:49:27 +0100
Subject: [PATCH 07/13] Update CODE_OF_CONDUCT.md

---
 CODE_OF_CONDUCT.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index b690921..65cf5ff 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -55,7 +55,7 @@ further defined and clarified by project maintainers.
 ## Enforcement
 
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported by contacting the project team at j.m.hoch@uu.nl. All
+reported by contacting the project team. All
 complaints will be reviewed and investigated and will result in a response that
 is deemed necessary and appropriate to the circumstances. The project team is
 obligated to maintain confidentiality with regard to the reporter of an incident.

From 74d735bff7e1f6c38d340230ed44c02bc9cd7685 Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Mon, 26 Feb 2024 12:26:26 +0100
Subject: [PATCH 08/13] updated and moved functions in conflicts.py

---
 copro/conflict.py | 339 +++++++++++++++++-----------------------------
 copro/utils.py    |  46 ++++++-
 2 files changed, 167 insertions(+), 218 deletions(-)

diff --git a/copro/conflict.py b/copro/conflict.py
index caaceb5..32c505c 100644
--- a/copro/conflict.py
+++ b/copro/conflict.py
@@ -1,12 +1,22 @@
 from copro import utils
-import geopandas as gpd
+from configparser import RawConfigParser
+from pathlib import Path
+from typing import Union, Literal
 import pandas as pd
+import geopandas as gpd
 import numpy as np
 import os
 import click
+import warnings
 
 
-def conflict_in_year_bool(config, conflict_gdf, extent_gdf, sim_year, out_dir):
+def conflict_in_year_bool(
+    config: RawConfigParser,
+    conflict_gdf: gpd.GeoDataFrame,
+    extent_gdf: gpd.GeoDataFrame,
+    sim_year: int,
+    out_dir: click.Path,
+) -> list:
     """Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not.
 
     Args:
@@ -17,31 +27,22 @@ def conflict_in_year_bool(config, conflict_gdf, extent_gdf, sim_year, out_dir):
         sim_year (int): year for which data is extracted.
         out_dir (str): path to output folder. If 'None', no output is stored.
 
-    Raises:
-        AssertionError: raised if the length of output list does not match length of input geo-dataframe.
-
     Returns:
         list: list containing 0/1 per polygon depending on conflict occurence.
     """
 
-    if config.getboolean("general", "verbose"):
-        print("DEBUG: checking for conflict event in polygon at t")
-
+    click.echo(f"Checking for conflict events which occured in {sim_year}.")
     # select the entries which occured in this year
     temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]
-
-    if len(temp_sel_year) == 0:
-        click.echo(
-            "WARNING: no conflicts were found in sampled conflict data set for year {}".format(
-                sim_year
-            )
+    if temp_sel_year.empty:
+        warnings.warn(
+            f"No conflicts were found in sampled conflict data set for year {sim_year}."
         )
 
     # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
     data_merged = gpd.sjoin(temp_sel_year, extent_gdf)
 
     # determine the aggregated amount of fatalities in one region (e.g. water province)
-
     fatalities_per_poly = (
         data_merged["best"]
         .groupby(data_merged["watprovID"])
@@ -51,104 +52,57 @@ def conflict_in_year_bool(config, conflict_gdf, extent_gdf, sim_year, out_dir):
     )
 
     out_dir = os.path.join(out_dir, "files")
-    if not os.path.isdir(out_dir):
-        os.makedirs(out_dir)
+    Path.mkdir(Path(out_dir), exist_ok=True)
 
     if sim_year == config.getint("settings", "y_end"):
-        # get a 1 for each polygon where there was conflict
-        bool_per_poly = fatalities_per_poly / fatalities_per_poly
-        # change column name and dtype
-        bool_per_poly = bool_per_poly.rename(
-            columns={"total_fatalities": "bool_conflict"}
-        ).astype(int)
-        # change index name to fit global_df
-        bool_per_poly.index = bool_per_poly.index.rename("ID")
-        # get list of all polygon IDs with their geometry information
-        global_df = utils.global_ID_geom_info(extent_gdf)
-        # merge the boolean info with geometry
-        # for all polygons without conflict, set a 0
-        if config.getboolean("general", "verbose"):
-            print(
-                "DEBUG: storing boolean conflict map of year {} to file {}".format(
-                    sim_year,
-                    os.path.join(out_dir, "conflicts_in_{}.csv".format(sim_year)),
-                )
-            )
-        # data_stored = pd.merge(bool_per_poly, global_df, on='ID', how='right').fillna(0)
-        data_stored = pd.merge(bool_per_poly, global_df, on="ID", how="right").dropna()
-        data_stored.index = data_stored.index.rename("watprovID")
-        data_stored = data_stored.drop("geometry", axis=1)
-        data_stored = data_stored.astype(int)
-        data_stored.to_csv(
-            os.path.join(out_dir, "conflicts_in_{}.csv".format(sim_year))
+        _store_boolean_conflict_data_to_csv(
+            fatalities_per_poly, config, extent_gdf, sim_year, out_dir
         )
 
     # loop through all regions and check if exists in sub-set
     # if so, this means that there was conflict and thus assign value 1
     list_out = []
-    for i in range(len(extent_gdf)):
+    for i, _ in extent_gdf.iterrows():
         i_poly = extent_gdf.iloc[i]["watprovID"]
         if i_poly in fatalities_per_poly.index.values:
             list_out.append(1)
         else:
             list_out.append(0)
 
-    assert len(extent_gdf) == len(list_out), AssertionError(
-        "ERROR: the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}".format(
-            len(extent_gdf), len(list_out)
-        )
-    )
-
     return list_out
 
 
-def conflict_in_previous_year(
-    config,
-    conflict_gdf,
-    extent_gdf,
-    sim_year,
-    check_neighbors=False,
-    neighboring_matrix=None,
-):
-    """Creates a list for each timestep with boolean information whether a conflict took place 
-    in a polygon at the previous timestep or not.
-    If the current time step is the first (t=0), then this year is skipped and 
-    the model continues at the next time step.
+def conflict_in_previous_year_bool(
+    conflict_gdf: gpd.GeoDataFrame,
+    extent_gdf: gpd.GeoDataFrame,
+    sim_year: int,
+    check_neighbors: bool = False,
+    neighboring_matrix: Union[None, pd.DataFrame] = None,
+) -> list:
+    """_summary_
 
     Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict.
-        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with \
-            geometry information for which values are extracted.
-        sim_year (int): year for which data is extracted.
-        check_neighbors (bool): whether to check conflict events in neighboring polygons. \
-            Defaults to False.
-        neighboring_matrix (dataframe): lookup-dataframe indicating which polygons are mutual neighbors. \
-            Defaults to None.
-
-    Raises:
-        ValueError: raised if check_neighbors is True, but no matrix is provided.
-        AssertionError: raised if the length of output list does not match length of input geo-dataframe.
+        conflict_gdf (gpd.GeoDataFrame): _description_
+        extent_gdf (gpd.GeoDataFrame): _description_
+        sim_year (int): _description_
+        check_neighbors (bool, optional): _description_. Defaults to False.
+        neighboring_matrix (Union[None, pd.DataFrame], optional): _description_. Defaults to None.
 
     Returns:
-        list: list containing 0/1 per polygon depending on conflict occurence if checking for conflict at t-1, \
-            and containing log-transformed number of conflict events in neighboring polygons if specified.
+        list: _description_
     """
 
-    if config.getboolean("general", "verbose"):
-        if check_neighbors:
-            print("DEBUG: checking for conflicts in neighboring polygons at t-1")
-        else:
-            print("DEBUG: checking for conflict event in polygon at t-1")
+    if check_neighbors:
+        click.echo("Checking for conflicts in neighboring polygons at t-1")
+    else:
+        click.echo("Checking for conflict event in polygon at t-1")
 
     # get conflicts at t-1
     temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year - 1]
-
-    assert len(temp_sel_year) != 0, AssertionError(
-        "ERROR: no conflicts were found in sampled conflict data set for year {}".format(
-            sim_year - 1
+    if temp_sel_year.empty:
+        warnings.warn(
+            f"No conflicts were found in sampled conflict data set for year {sim_year - 1}."
         )
-    )
 
     # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
     data_merged = gpd.sjoin(temp_sel_year, extent_gdf)
@@ -160,107 +114,85 @@ def conflict_in_previous_year(
         .rename(columns={"id": "conflict_count"})
     )
 
-    # loop through all polygons and check if exists in sub-set
+    # loop through all polygons
     list_out = []
     for i in range(len(extent_gdf)):
-
         i_poly = extent_gdf.watprovID.iloc[i]
-
+        # check if polygon is in list with conflict polygons
         if i_poly in conflicts_per_poly.index.values:
-
+            # if so, check if neighboring polygons contain conflict and assign boolean value
             if check_neighbors:
-
-                # determine log-scaled number of conflict events in neighboring polygons
                 val = calc_conflicts_nb(i_poly, neighboring_matrix, conflicts_per_poly)
                 # append resulting value
                 list_out.append(val)
-
+            # if not, assign 1 directly
             else:
-
                 list_out.append(1)
-
         else:
-
             # if polygon not in list with conflict polygons, assign 0
             list_out.append(0)
 
-    assert len(extent_gdf) == len(list_out), AssertionError(
-        "ERROR: the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}".format(
-            len(extent_gdf), len(list_out)
-        )
-    )
-
     return list_out
 
 
 def read_projected_conflict(
-    extent_gdf, bool_conflict, check_neighbors=False, neighboring_matrix=None
-):
+    extent_gdf: gpd.GeoDataFrame,
+    bool_conflict: pd.DataFrame,
+    check_neighbors=False,
+    neighboring_matrix=None,
+) -> list:
     """Creates a list for each timestep with boolean information 
     whether a conflict took place in a polygon or not.
-    Input conflict data (bool_conflict) must contain an index with IDs 
-    corresponding with the 'watprovID' values of extent_gdf.
+    Input conflict data (`bool_conflict`) must contain an index with IDs 
+    corresponding with the `watprovID` values of extent_gdf.
     Optionally, the algorithm can be extended to the neighboring polygons.
 
     Args:
-        extent_gdf (geodataframe): geo-dataframe containing one or more polygons \
+        extent_gdf (gpd.GeoDataFrame): geo-dataframe containing one or more polygons \
             with geometry information for which values are extracted.
-        bool_conflict (dataframe): dataframe with boolean values (1) for each polygon with conflict.
+        bool_conflict (pd.DataFrame): dataframe with boolean values (1) for each polygon with conflict.
         check_neighbors (bool, optional): whether or not to check for conflict in neighboring polygons. \
-            Defaults to False.
-        neighboring_matrix (dataframe, optional): look-up dataframe listing all neighboring polygons. \
-            Defaults to None.
+            Defaults to `False`.
+        neighboring_matrix (pd.DataFrame, optional): look-up dataframe listing all neighboring polygons. \
+            Defaults to `None`.
 
     Returns:
-        list: containing 1 and 0 values for each polygon with conflict respectively without conflict. \
-            If check_neighbors=True, then 1 if neighboring polygon contains conflict and 0 is not.
+        list: 1 and 0 values for each polygon with conflict respectively without conflict. \
+            If `check_neighbors=True`, then 1 if neighboring polygon contains conflict and 0 is not.
     """
 
-    # assert that there are actually conflicts reported
-    # assert len(bool_conflict) != 0, AssertionError(
-    #     "ERROR: no conflicts were found in sampled conflict data set for year {}".format(
-    #         sim_year - 1
-    #     )
-    # )
-
     # loop through all polygons and check if exists in sub-set
     list_out = []
     for i in range(len(extent_gdf)):
-
         i_poly = extent_gdf.watprovID.iloc[i]
-
         if i_poly in bool_conflict.index.values:
-
             if check_neighbors:
-
                 # determine log-scaled number of conflict events in neighboring polygons
                 val = calc_conflicts_nb(i_poly, neighboring_matrix, bool_conflict)
                 # append resulting value
                 list_out.append(val)
-
             else:
-
                 list_out.append(1)
-
         else:
-
             # if polygon not in list with conflict polygons, assign 0
             list_out.append(0)
 
     return list_out
 
 
-def calc_conflicts_nb(i_poly, neighboring_matrix, conflicts_per_poly):
+def calc_conflicts_nb(
+    i_poly: int, neighboring_matrix: pd.DataFrame, conflicts_per_poly: pd.DataFrame
+) -> Literal[0, 1]:
     """Determines whether in the neighbouring polygons of a polygon i_poly conflict took place.
     If so, a value 1 is returned, otherwise 0.
 
     Args:
         i_poly (int): ID number of polygon under consideration.
-        neighboring_matrix (dataframe): look-up dataframe listing all neighboring polygons.
-        conflicts_per_poly (dataframe): dataframe with conflict informatoin per polygon.
+        neighboring_matrix (pd.DataFrame): look-up dataframe listing all neighboring polygons.
+        conflicts_per_poly (pd.DataFrame): dataframe with conflict data per polygon.
 
     Returns:
-        int: 1 is conflict took place in neighboring polygon, 0 if not.
+        Literal: 1 if conflict took place in neighboring polygon, 0 if not.
     """
 
     # find neighbors of this polygon
@@ -268,15 +200,11 @@ def calc_conflicts_nb(i_poly, neighboring_matrix, conflicts_per_poly):
 
     # initiate list
     nb_count = []
-
     # loop through neighbors
     for k in nb:
-
         # check if there was conflict at t-1
         if k in conflicts_per_poly.index.values:
-
             nb_count.append(1)
-
     # if more than one neighboring polygon has conflict, return 0
     if np.sum(nb_count) > 0:
         val = 1
@@ -287,99 +215,36 @@ def calc_conflicts_nb(i_poly, neighboring_matrix, conflicts_per_poly):
     return val
 
 
-def get_poly_ID(extent_gdf):
-    """Extracts and returns a list with unique identifiers for each polygon used in the model.
-    The identifiers are currently limited to 'watprovID'.
-
-    Args:
-        extent_gdf (geo-dataframe): geo-dataframe containing one or more polygons.
-
-    Raises:
-        AssertionError: error raised if length of output list does not match length of input geo-dataframe.
-
-    Returns:
-        list: list containing a unique identifier extracted from geo-dataframe for each polygon used in the model.
-    """
-
-    # initiatie empty list
-    list_ID = []
-
-    # loop through all polygons
-    for i in range(len(extent_gdf)):
-        # append geometry of each polygon to list
-        list_ID.append(extent_gdf.iloc[i]["watprovID"])
-
-    # in the end, the same number of polygons should be in geodataframe and list
-    assert len(extent_gdf) == len(list_ID), AssertionError(
-        "ERROR: the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}".format(
-            len(extent_gdf), len(list_ID)
-        )
-    )
-
-    return list_ID
-
-
-def get_poly_geometry(extent_gdf, config):
-    """Extracts geometry information for each polygon from geodataframe and saves to list.
-    The geometry column in geodataframe must be named 'geometry'.
-
-    Args:
-        extent_gdf (geo-dataframe): geo-dataframe containing one or more polygons with geometry information.
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-
-    Raises:
-        AssertionError: error raised if length of output list does not match length of input geo-dataframe.
-
-    Returns:
-        list: list containing the geometry information extracted from geo-dataframe for each polygon used in the model.
-    """
-
-    if config.getboolean("general", "verbose"):
-        print("DEBUG: getting the geometry of all geographical units")
-
-    # initiatie empty list
-    list_geometry = []
-
-    # loop through all polygons
-    for i in range(len(extent_gdf)):
-        # append geometry of each polygon to list
-        list_geometry.append(extent_gdf.iloc[i]["geometry"])
-
-    # in the end, the same number of polygons should be in geodataframe and list
-    assert len(extent_gdf) == len(list_geometry), AssertionError(
-        "ERROR: the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}".format(
-            len(extent_gdf), len(list_geometry)
-        )
-    )
-
-    return list_geometry
-
-
-def get_pred_conflict_geometry(
-    X_test_ID, X_test_geom, y_test, y_pred, y_prob_0, y_prob_1
-):
+def check_for_correct_prediction(
+    X_test_ID: np.ndarray,
+    X_test_geom: np.ndarray,
+    y_test: np.ndarray,
+    y_pred: np.ndarray,
+    y_prob_0: np.ndarray,
+    y_prob_1: np.ndarray,
+) -> pd.DataFrame:
     """Stacks together the arrays with unique identifier, geometry, test data, and predicted data into a dataframe.
     Contains therefore only the data points used in the test-sample, not in the training-sample.
     Additionally computes whether a correct prediction was made.
 
     Args:
-        X_test_ID (list): list containing the unique identifier per data point.
-        X_test_geom (list): list containing the geometry per data point.
-        y_test (list): list containing test-data.
-        y_pred (list): list containing predictions.
+        X_test_ID (np.ndarray): _description_
+        X_test_geom (np.ndarray): _description_
+        y_test (np.ndarray): _description_
+        y_pred (np.ndarray): _description_
+        y_prob_0 (np.ndarray): _description_
+        y_prob_1 (np.ndarray): _description_
 
     Returns:
-        dataframe: dataframe with each input list as column plus computed 'correct_pred'.
+        pd.DataFrame: _description_
     """
 
     # stack separate columns horizontally
     arr = np.column_stack((X_test_ID, X_test_geom, y_test, y_pred, y_prob_0, y_prob_1))
-
     # convert array to dataframe
     df = pd.DataFrame(
         arr, columns=["ID", "geometry", "y_test", "y_pred", "y_prob_0", "y_prob_1"]
     )
-
     # compute whether a prediction is correct
     # if so, assign 1; otherwise, assign 0
     df["correct_pred"] = np.where(df["y_test"] == df["y_pred"], 1, 0)
@@ -387,15 +252,55 @@ def get_pred_conflict_geometry(
     return df
 
 
+def _store_boolean_conflict_data_to_csv(
+    fatalities_per_poly: pd.DataFrame,
+    extent_gdf: gpd.GeoDataFrame,
+    sim_year: int,
+    out_dir: click.Path,
+):
+    """Stores boolean conflict data to csv-file at the end of reference period.
+    Used as initial conditions for projections from there.
+
+    Args:
+        fatalities_per_poly (pd.DataFrame): Fatalities per polygon in `sim_year`.
+        extent_gdf (gpd.GeoDataFrame): All polygons considered in analysis, also those w/o conflict.
+        sim_year (int): Simulation year for which data is stored.
+        out_dir (click.Path): Path to output folder.
+    """
+
+    # get a 1 for each polygon where there was conflict
+    bool_per_poly = fatalities_per_poly / fatalities_per_poly
+    # change column name and dtype
+    bool_per_poly = bool_per_poly.rename(
+        columns={"total_fatalities": "bool_conflict"}
+    ).astype(int)
+    # change index name to fit global_df
+    bool_per_poly.index = bool_per_poly.index.rename("ID")
+    # get list of all polygon IDs with their geometry information
+    global_df = utils.get_ID_geometry_lookup(extent_gdf)
+    # merge the boolean info with geometry
+    # for all polygons without conflict, set a 0
+    click.echo(
+        f"Storing boolean conflict map of year {sim_year} \
+            to file {os.path.join(out_dir, f'conflicts_in_{sim_year}.csv')}"
+    )
+
+    data_stored = pd.merge(bool_per_poly, global_df, on="ID", how="right").dropna()
+    data_stored.index = data_stored.index.rename("watprovID")
+    data_stored = data_stored.drop("geometry", axis=1)
+    data_stored = data_stored.astype(int)
+    data_stored.to_csv(os.path.join(out_dir, f"conflicts_in_{sim_year}.csv"))
+
+
 def _find_neighbors(ID: int, neighboring_matrix: pd.DataFrame) -> np.ndarray:
     """Filters all polygons which are actually neighbors to given polygon.
 
     Args:
         ID (int): ID of specific polygon under consideration.
-        neighboring_matrix (dataframe): output from neighboring_polys().
+        neighboring_matrix (pd.DataFrame): output from neighboring_polys().
 
     Returns:
-        dataframe: dataframe containig IDs of all polygons that are actual neighbors.
+        np.ndarray: IDs of all polygons that are actual neighbors.
     """
 
     # locaties entry for polygon under consideration
diff --git a/copro/utils.py b/copro/utils.py
index 8c1a5f4..861dddb 100644
--- a/copro/utils.py
+++ b/copro/utils.py
@@ -46,9 +46,53 @@ def get_conflict_geodataframe(
     return gdf
 
 
+def get_poly_ID(extent_gdf: gpd.GeoDataFrame) -> list:
+    """Extracts and returns a list with unique identifiers for each polygon used in the model.
+
+    .. note::
+        The identifiers are currently limited to `watprovID`.
+
+    Args:
+        extent_gdf (gpd.GeoDataFrame): all polygons considered in model.
+
+    Returns:
+        list: list with ID of each polygons.
+    """
+
+    # initiatie empty list
+    list_ID = []
+    # loop through all polygons
+    for i in range(len(extent_gdf)):
+        # append geometry of each polygon to list
+        list_ID.append(extent_gdf.iloc[i]["watprovID"])
+
+    return list_ID
+
+
+def get_poly_geometry(extent_gdf: gpd.GeoDataFrame) -> list:
+    """Extracts geometry information for each polygon from geodataframe and saves to list.
+    The geometry column in geodataframe must be named `geometry`.
+
+    Args:
+        extent_gdf (gpd.GeoDataFrame): all polygons considered in model.
+
+    Returns:
+        list: list with geometry of each polygons.
+    """
+
+    # initiatie empty list
+    list_geometry = []
+    # loop through all polygons
+    for i in range(len(extent_gdf)):
+        # append geometry of each polygon to list
+        list_geometry.append(extent_gdf.iloc[i]["geometry"])
+
+    return list_geometry
+
+
 def get_ID_geometry_lookup(
     gdf: gpd.GeoDataFrame,
-) -> pd.DataFrame:  # get_ID_geometry_lookup
+) -> pd.DataFrame:
     """Retrieves unique ID and geometry information from geo-dataframe for a global look-up dataframe.
     The IDs currently supported are 'name' or 'watprovID'.
 

From d2c23b6f4ea52f4fce3c8d27720cd275210fed08 Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Mon, 26 Feb 2024 14:19:38 +0100
Subject: [PATCH 09/13] cosmetic changes to machine_learning.py

---
 copro/machine_learning.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/copro/machine_learning.py b/copro/machine_learning.py
index 37cfa2c..045165f 100644
--- a/copro/machine_learning.py
+++ b/copro/machine_learning.py
@@ -5,6 +5,8 @@
 from configparser import RawConfigParser
 from sklearn import ensemble, preprocessing, model_selection
 from typing import Union, Tuple
+import click
+from pathlib import Path
 
 
 class MachineLearning:
@@ -36,16 +38,14 @@ def split_scale_train_test_split(
         X_ID, X_geom, X_data = _split_conflict_geom_data(X)
 
         ##- scaling only the variable values
-        if self.config.getboolean("general", "verbose"):
-            print("DEBUG: fitting and transforming X")
+        click.echo("Fitting and transforming X")
         X_ft = self.scaler.fit_transform(X_data)
 
         ##- combining ID, geometry and scaled sample values per polygon
         X_cs = np.column_stack((X_ID, X_geom, X_ft))
 
         ##- splitting in train and test samples based on user-specified fraction
-        if self.config.getboolean("general", "verbose"):
-            print("DEBUG: splitting both X and Y in train and test data")
+        click.echo("Splitting both X and Y in train and test data")
         X_train, X_test, y_train, y_test = model_selection.train_test_split(
             X_cs,
             Y,
@@ -95,12 +95,10 @@ def fit_predict(
 
         # create folder to store all classifiers with pickle
         clf_pickle_rep = os.path.join(out_dir, "clfs")
-        if not os.path.isdir(clf_pickle_rep):
-            os.makedirs(clf_pickle_rep)
+        Path.mkdir(Path(clf_pickle_rep), parents=True, exist_ok=True)
 
         # save the fitted classifier to file via pickle.dump()
-        if self.config.getboolean("general", "verbose"):
-            print("DEBUG: dumping classifier to {}".format(clf_pickle_rep))
+        click.echo(f"Dumping classifier to {clf_pickle_rep}.")
         with open(os.path.join(clf_pickle_rep, "clf_{}.pkl".format(run_nr)), "wb") as f:
             pickle.dump(self.clf, f)
 
@@ -127,9 +125,10 @@ def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]:
 
     clfs = os.listdir(os.path.join(out_dir, "clfs"))
 
-    assert len(clfs) == config.getint("machine_learning", "n_runs"), AssertionError(
-        "ERROR: number of loaded classifiers does not match the specified number of runs in cfg-file!"
-    )
+    if len(clfs) != config.getint("machine_learning", "n_runs"):
+        raise ValueError(
+            "Number of loaded classifiers does not match the specified number of runs in cfg-file!"
+        )
 
     return clfs
 
@@ -192,7 +191,6 @@ def _define_scaling(
                 choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer"
         )
 
-    if config.getboolean("general", "verbose"):
-        print("DEBUG: chosen scaling method is {}".format(scaler))
+    click.echo(f"Chosen scaling method is {scaler}.")
 
     return scaler

From 4119986d4772b8cf6b7fa233ee8539bff62c77bd Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Mon, 26 Feb 2024 14:55:39 +0100
Subject: [PATCH 10/13] put all neighbours related functions in one file

---
 copro/nb.py | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 copro/nb.py

diff --git a/copro/nb.py b/copro/nb.py
new file mode 100644
index 0000000..3da7e97
--- /dev/null
+++ b/copro/nb.py
@@ -0,0 +1,64 @@
+import click
+import pandas as pd
+import numpy as np
+import geopandas as gpd
+
+
+def neighboring_polys(
+    extent_gdf: gpd.GeoDataFrame, identifier="watprovID"
+) -> pd.DataFrame:
+    """For each polygon, determines its neighboring polygons.
+    As result, a (n x n) look-up dataframe is obtained containing, where n is number of polygons in extent_gdf.
+
+    Args:
+        extent_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
+        identifier (str, optional): column name in extent_gdf to be used to identify neighbors. Defaults to 'watprovID'.
+
+    Returns:
+        dataframe: look-up dataframe containing True/False statement per polygon for all other polygons.
+    """
+
+    click.echo("Determining matrix with neighboring polygons.")
+    # initialise empty dataframe
+    df = pd.DataFrame()
+    # go through each polygon aka water province
+    for i in range(len(extent_gdf)):
+        # get geometry of current polygon
+        wp = extent_gdf.geometry.iloc[i]
+        # check which polygons in geodataframe (i.e. all water provinces) touch the current polygon
+        # also create a dataframe from result (boolean)
+        # the transpose is needed to easier append
+        df_temp = pd.DataFrame(
+            extent_gdf.geometry.touches(wp), columns=[extent_gdf[identifier].iloc[i]]
+        ).T
+        # append the dataframe
+        df = df.append(df_temp)
+
+    # replace generic indices with actual water province IDs
+    df.set_index(extent_gdf[identifier], inplace=True)
+    # replace generic columns with actual water province IDs
+    df.columns = extent_gdf[identifier].values
+
+    return df
+
+
+def find_neighbors(ID: int, neighboring_matrix: pd.DataFrame) -> np.ndarray:
+    """Filters all polygons which are actually neighbors to given polygon.
+
+    Args:
+        ID (int): ID of specific polygon under consideration.
+        neighboring_matrix (pd.DataFrame): output from neighboring_polys().
+
+    Returns:
+        np.ndarray: IDs of all polygons that are actual neighbors.
+    """
+
+    # locaties entry for polygon under consideration
+    neighbours = neighboring_matrix.loc[neighboring_matrix.index == ID].T
+
+    # filters all actual neighbors defined as neighboring polygons with True statement
+    actual_neighbours = neighbours.loc[  # noqa: C0121
+        neighbours[ID] == True  # noqa: C0121
+    ].index.values  # noqa: C0121
+
+    return actual_neighbours

From 282adcff07def0799e8e3711caa8adfbe02ca415 Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Mon, 26 Feb 2024 16:21:05 +0100
Subject: [PATCH 11/13] updating data.py, resp. what is now called xydata.py

---
 copro/conflict.py            |  28 +--
 copro/pipeline.py            | 270 ++++++++++-----------
 copro/{data.py => xydata.py} | 440 +++++++++++++++++------------------
 tests/test_data.py           |  12 +-
 4 files changed, 349 insertions(+), 401 deletions(-)
 rename copro/{data.py => xydata.py} (79%)

diff --git a/copro/conflict.py b/copro/conflict.py
index 32c505c..281e6d1 100644
--- a/copro/conflict.py
+++ b/copro/conflict.py
@@ -1,4 +1,4 @@
-from copro import utils
+from copro import utils, nb
 from configparser import RawConfigParser
 from pathlib import Path
 from typing import Union, Literal
@@ -196,12 +196,12 @@ def calc_conflicts_nb(
     """
 
     # find neighbors of this polygon
-    nb = _find_neighbors(i_poly, neighboring_matrix)
+    nbs = nb.find_neighbors(i_poly, neighboring_matrix)
 
     # initiate list
     nb_count = []
     # loop through neighbors
-    for k in nb:
+    for k in nbs:
         # check if there was conflict at t-1
         if k in conflicts_per_poly.index.values:
             nb_count.append(1)
@@ -290,25 +290,3 @@ def _store_boolean_conflict_data_to_csv(
     data_stored = data_stored.drop("geometry", axis=1)
     data_stored = data_stored.astype(int)
     data_stored.to_csv(os.path.join(out_dir, f"conflicts_in_{sim_year}.csv"))
-
-
-def _find_neighbors(ID: int, neighboring_matrix: pd.DataFrame) -> np.ndarray:
-    """Filters all polygons which are actually neighbors to given polygon.
-
-    Args:
-        ID (int): ID of specific polygon under consideration.
-        neighboring_matrix (pd.DataFrame): output from neighboring_polys().
-
-    Returns:
-        np.ndarray: IDs of all polygons that are actual neighbors.
-    """
-
-    # locaties entry for polygon under consideration
-    neighbours = neighboring_matrix.loc[neighboring_matrix.index == ID].T
-
-    # filters all actual neighbors defined as neighboring polygons with True statement
-    actual_neighbours = neighbours.loc[  # noqa: C0121
-        neighbours[ID] == True  # noqa: C0121
-    ].index.values  # noqa: C0121
-
-    return actual_neighbours
diff --git a/copro/pipeline.py b/copro/pipeline.py
index d3e56c1..f7573d2 100644
--- a/copro/pipeline.py
+++ b/copro/pipeline.py
@@ -1,215 +1,181 @@
-from copro import models, data, machine_learning, evaluation, utils
+from copro import models, machine_learning, evaluation, utils, xydata
 import pandas as pd
-import numpy as np
 import pickle
 import click
 import os
 
 
-def create_XY(config, out_dir, root_dir, polygon_gdf, conflict_gdf):
-    """Top-level function to create the X-array and Y-array.
-    If the XY-data was pre-computed and specified in cfg-file, the data is loaded.
-    If not, variable values and conflict data are read from file and stored in array. 
-    The resulting array is by default saved as npy-format to file.
-
-    Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        out_dir (str): path to output folder.
-        root_dir (str): path to location of cfg-file.
-        polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
-        conflict_gdf (geo-dataframe): geo-dataframe containing the selected conflicts.
-
-    Returns:
-        array: X-array containing variable values.
-        array: Y-array containing conflict data.
-    """    
-
-    # if nothing is specified in cfg-file, then initiate and fill XY data from scratch
-    if config.get('pre_calc', 'XY') is '':
-
-        # initiate (empty) dictionary with all keys
-        XY = data.initiate_XY_data(config)
-
-        # fill the dictionary and get array
-        XY = data.fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf, out_dir)
-
-        # save array to XY.npy out_dir
-        if config.getboolean('general', 'verbose'): click.echo('DEBUG: saving XY data by default to file {}'.format(os.path.join(out_dir, 'XY.npy')))
-        np.save(os.path.join(out_dir,'XY'), XY)
-
-    # if path to XY.npy is specified, read the data intead
-    else:
-
-        click.echo('INFO: loading XY data from file {}'.format(os.path.join(root_dir, config.get('pre_calc', 'XY'))))
-        XY = np.load(os.path.join(root_dir, config.get('pre_calc', 'XY')), allow_pickle=True)
-        
-    # split the XY data into sample data X and target values Y
-    X, Y = data.split_XY_data(XY, config)    
-
-    return X, Y
-
-def prepare_ML(config):
-    """Top-level function to instantiate the scaler and model as specified in model configurations.
-
-    Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-
-    Returns:
-        scaler: the specified scaler instance.
-        classifier: the specified model instance.
-    """    
-
-    scaler = machine_learning.define_scaling(config)
-
-    clf = machine_learning.define_model(config)
-
-    return scaler, clf
-
-def run_reference(X, Y, config, scaler, clf, out_dir, run_nr):
-    """Top-level function to run one of the four supported models.
-
-    Args:
-        X (array): X-array containing variable values.
-        Y (array): Y-array containing conflict data.
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        scaler (scaler): the specified scaler instance.
-        clf (classifier): the specified model instance.
-        out_dir (str): path to output folder.
-
-    Raises:
-        ValueError: raised if unsupported model is specified.
-
-    Returns:
-        dataframe: containing the test-data X-array values.
-        datatrame: containing model output on polygon-basis.
-        dict: dictionary containing evaluation metrics per simulation.
-    """    
-
-    # depending on selection, run corresponding model with data
-    if config.getint('general', 'model') == 1:
-        X_df, y_df, eval_dict = models.all_data(X, Y, config, scaler, clf, out_dir, run_nr)
-    elif config.getint('general', 'model') == 2:
-        X_df, y_df, eval_dict = models.leave_one_out(X, Y, config, scaler, clf, out_dir)
-    elif config.getint('general', 'model') == 3:
-        X_df, y_df, eval_dict = models.single_variables(X, Y, config, scaler, clf, out_dir)
-    elif config.getint('general', 'model') == 4:
-        X_df, y_df, eval_dict = models.dubbelsteen(X, Y, config, scaler, clf, out_dir)
-    else:
-        raise ValueError('the specified model type in the cfg-file is invalid - specify either 1, 2, 3 or 4.')
-
-    return X_df, y_df, eval_dict
-
 def run_prediction(scaler, main_dict, root_dir, selected_polygons_gdf):
     """Top-level function to execute the projections.
     Per specified projection, conflict is projected forwards in time per time step until the projection year is reached.
-    Pear time step, the sample data and conflict data are read individually since different conflict projections are made per classifier used.
+    Pear time step, the sample data and conflict data are read individually since different 
+    conflict projections are made per classifier used.
     At the end of each time step, the projections of all classifiers are combined and output metrics determined.
 
     Args:
         scaler (scaler): the specified scaler instance.
-        main_dict (dict): dictionary containing config-objects and output directories for reference run and all projection runs.
+        main_dict (dict): dictionary containing config-objects and output directories \
+            for reference run and all projection runs.
         root_dir (str): path to location of cfg-file.
-        selected_polygons_gdf (geo-dataframe): 
-
-    Raises:
-        ValueError: raised if another model type than the one using all data is specified in cfg-file.
+        selected_polygons_gdf (geo-dataframe):
 
     Returns:
         dataframe: containing model output on polygon-basis.
-    """    
+    """
 
-    config_REF = main_dict['_REF'][0]
-    out_dir_REF = main_dict['_REF'][1]
+    config_REF = main_dict["_REF"][0]
+    out_dir_REF = main_dict["_REF"][1]
 
     clfs = machine_learning.load_clfs(config_REF, out_dir_REF)
 
-    if config_REF.getint('general', 'model') != 1:
-        raise ValueError('ERROR: making a prediction is only possible with model type 1, i.e. using all data')
-
     # initiate output dataframe
-    all_y_df = pd.DataFrame(columns=['ID', 'geometry', 'y_pred'])
+    all_y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
 
     # going through each projection specified
-    for (each_key, each_val) in config_REF.items('PROJ_files'):
+    for each_key, _ in config_REF.items("PROJ_files"):
 
         # get config-object and out-dir per projection
-        click.echo('INFO: loading config-object for projection run: {}'.format(each_key))
+        click.echo(f"Loading config-object for projection run: {each_key}.")
         config_PROJ = main_dict[str(each_key)][0][0]
         out_dir_PROJ = main_dict[str(each_key)][1]
 
-        # aligning verbosity settings across config-objects
-        config_PROJ.set('general', 'verbose', str(config_REF.getboolean('general', 'verbose')))
-
-        if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: storing output for this projections to folder {}'.format(out_dir_PROJ))
+        click.echo(f"Storing output for this projections to folder {out_dir_PROJ}.")
 
         # if not os.path.isdir(os.path.join(out_dir_PROJ, 'files')):
         #     os.makedirs(os.path.join(out_dir_PROJ, 'files'))
-        if not os.path.isdir(os.path.join(out_dir_PROJ, 'clfs')):
-            os.makedirs(os.path.join(out_dir_PROJ, 'clfs'))
+        if not os.path.isdir(os.path.join(out_dir_PROJ, "clfs")):
+            os.makedirs(os.path.join(out_dir_PROJ, "clfs"))
 
         # get projection period for this projection
         # defined as all years starting from end of reference run until specified end of projections
         projection_period = utils.determine_projection_period(config_REF, config_PROJ)
 
         # for this projection, go through all years
-        for i in range(len(projection_period)):
+        for i, proj_year in enumerate(projection_period):
 
-            proj_year = projection_period[i]
-            click.echo('INFO: making projection for year {}'.format(proj_year))
+            click.echo(f"Making projection for year {proj_year}.")
 
-            X = data.initiate_X_data(config_PROJ)
-            X = data.fill_X_sample(X, config_PROJ, root_dir, selected_polygons_gdf, proj_year)
+            X = xydata.initiate_X_data(config_PROJ)
+            X = xydata.fill_X_sample(
+                X, config_PROJ, root_dir, selected_polygons_gdf, proj_year
+            )
 
-            # for the first projection year, we need to fall back on the observed conflict at the last time step of the reference run
+            # for the first projection year, we need to fall back on the observed conflict
+            # at the last time step of the reference run
             if i == 0:
-                if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: reading previous conflicts from file {}'.format(os.path.join(out_dir_REF, 'files', 'conflicts_in_{}.csv'.format(config_REF.getint('settings', 'y_end')))))
-                conflict_data = pd.read_csv(os.path.join(out_dir_REF, 'files', 'conflicts_in_{}.csv'.format(config_REF.getint('settings', 'y_end'))), index_col=0)
-
-                if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: combining sample data with conflict data from previous year')
-                X = data.fill_X_conflict(X, config_PROJ, conflict_data, selected_polygons_gdf)
+                click.echo(
+                    "Reading previous conflicts from file {}".format(
+                        os.path.join(
+                            out_dir_REF,
+                            "files",
+                            "conflicts_in_{}.csv".format(
+                                config_REF.getint("settings", "y_end")
+                            ),
+                        )
+                    )
+                )
+                conflict_data = pd.read_csv(
+                    os.path.join(
+                        out_dir_REF,
+                        "files",
+                        "conflicts_in_{}.csv".format(
+                            config_REF.getint("settings", "y_end")
+                        ),
+                    ),
+                    index_col=0,
+                )
+
+                X = xydata.fill_X_conflict(
+                    X, config_PROJ, conflict_data, selected_polygons_gdf
+                )
                 X = pd.DataFrame.from_dict(X).to_numpy()
 
             # initiating dataframe containing all projections from all classifiers for this timestep
-            y_df = pd.DataFrame(columns=['ID', 'geometry', 'y_pred'])
+            y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
 
             # now load all classifiers created in the reference run
             for clf in clfs:
 
                 # creating an individual output folder per classifier
-                if not os.path.isdir(os.path.join(os.path.join(out_dir_PROJ, 'clfs', str(clf).rsplit('.')[0]))):
-                    os.makedirs(os.path.join(out_dir_PROJ, 'clfs', str(clf).rsplit('.')[0]))
-                
+                if not os.path.isdir(
+                    os.path.join(
+                        os.path.join(
+                            out_dir_PROJ, "clfs", str(clf).rsplit(".", maxsplit=1)[0]
+                        )
+                    )
+                ):
+                    os.makedirs(
+                        os.path.join(
+                            out_dir_PROJ, "clfs", str(clf).rsplit(".", maxsplit=1)[0]
+                        )
+                    )
+
                 # load the pickled objects
                 # TODO: keep them in memory, i.e. after reading the clfs-folder above
-                with open(os.path.join(out_dir_REF, 'clfs', clf), 'rb') as f:
-                    if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: loading classifier {} from {}'.format(clf, os.path.join(out_dir_REF, 'clfs')))
+                with open(os.path.join(out_dir_REF, "clfs", clf), "rb") as f:
+                    click.echo(
+                        "Loading classifier {} from {}".format(
+                            clf, os.path.join(out_dir_REF, "clfs")
+                        )
+                    )
                     clf_obj = pickle.load(f)
 
-                # for all other projection years than the first one, we need to read projected conflict from the previous projection year
+                # for all other projection years than the first one,
+                # we need to read projected conflict from the previous projection year
                 if i > 0:
-                    if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: reading previous conflicts from file {}'.format(os.path.join(out_dir_PROJ, 'clfs', str(clf), 'projection_for_{}.csv'.format(proj_year-1))))
-                    conflict_data = pd.read_csv(os.path.join(out_dir_PROJ, 'clfs', str(clf).rsplit('.')[0], 'projection_for_{}.csv'.format(proj_year-1)), index_col=0)
-
-                    if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: combining sample data with conflict data for {}'.format(clf.rsplit('.')[0]))
-                    X = data.fill_X_conflict(X, config_PROJ, conflict_data, selected_polygons_gdf)
+                    click.echo(
+                        "Reading previous conflicts from file {}".format(
+                            os.path.join(
+                                out_dir_PROJ,
+                                "clfs",
+                                str(clf),
+                                "projection_for_{}.csv".format(proj_year - 1),
+                            )
+                        )
+                    )
+                    conflict_data = pd.read_csv(
+                        os.path.join(
+                            out_dir_PROJ,
+                            "clfs",
+                            str(clf).rsplit(".", maxsplit=1)[0],
+                            "projection_for_{}.csv".format(proj_year - 1),
+                        ),
+                        index_col=0,
+                    )
+
+                    X = xydata.fill_X_conflict(
+                        X, config_PROJ, conflict_data, selected_polygons_gdf
+                    )
                     X = pd.DataFrame.from_dict(X).to_numpy()
 
                 X = pd.DataFrame(X)
-                if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: number of data points including missing values: {}'.format(len(X)))
-
                 X = X.fillna(0)
-                
+
                 # put all the data into the machine learning algo
                 # here the data will be used to make projections with various classifiers
                 # returns the prediction based on one individual classifier
                 y_df_clf = models.predictive(X, clf_obj, scaler, config_PROJ)
 
                 # storing the projection per clf to be used in the following timestep
-                y_df_clf.to_csv(os.path.join(out_dir_PROJ, 'clfs', str(clf).rsplit('.')[0], 'projection_for_{}.csv'.format(proj_year)))
+                y_df_clf.to_csv(
+                    os.path.join(
+                        out_dir_PROJ,
+                        "clfs",
+                        str(clf).rsplit(".", maxsplit=1)[0],
+                        "projection_for_{}.csv".format(proj_year),
+                    )
+                )
 
                 # removing projection of previous time step as not needed anymore
                 if i > 0:
-                    os.remove(os.path.join(out_dir_PROJ, 'clfs', str(clf).rsplit('.')[0], 'projection_for_{}.csv'.format(proj_year-1)))
+                    os.remove(
+                        os.path.join(
+                            out_dir_PROJ,
+                            "clfs",
+                            str(clf).rsplit(".", maxsplit=1)[0],
+                            "projection_for_{}.csv".format(proj_year - 1),
+                        )
+                    )
 
                 # append to all classifiers dataframe
                 y_df = y_df.append(y_df_clf, ignore_index=True)
@@ -217,12 +183,18 @@ def run_prediction(scaler, main_dict, root_dir, selected_polygons_gdf):
             # get look-up dataframe to assign geometry to polygons via unique ID
             global_df = utils.global_ID_geom_info(selected_polygons_gdf)
 
-            if config_REF.getboolean('general', 'verbose'): click.echo('DEBUG: storing model output for year {} to output folder'.format(proj_year))
-            df_hit, gdf_hit = evaluation.polygon_model_accuracy(y_df, global_df, make_proj=True)
-            # df_hit.to_csv(os.path.join(out_dir_PROJ, 'output_in_{}.csv'.format(proj_year)))
-            gdf_hit.to_file(os.path.join(out_dir_PROJ, 'output_in_{}.geojson'.format(proj_year)), driver='GeoJSON')
+            click.echo(
+                f"Storing model output for year {proj_year} to output folder".format
+            )
+            _, gdf_hit = evaluation.polygon_model_accuracy(
+                y_df, global_df, make_proj=True
+            )
+            gdf_hit.to_file(
+                os.path.join(out_dir_PROJ, f"output_in_{proj_year}.geojson"),
+                driver="GeoJSON",
+            )
 
         # create one major output dataframe containing all output for all projections with all classifiers
         all_y_df = all_y_df.append(y_df, ignore_index=True)
 
-    return all_y_df
\ No newline at end of file
+    return all_y_df
diff --git a/copro/data.py b/copro/xydata.py
similarity index 79%
rename from copro/data.py
rename to copro/xydata.py
index 2300c1a..33fe7cf 100644
--- a/copro/data.py
+++ b/copro/xydata.py
@@ -1,19 +1,22 @@
-from copro import conflict, variables
+from copro import conflict, variables, nb, utils
 from configparser import RawConfigParser
+from typing import Tuple
 import click
 import numpy as np
 import xarray as xr
 import pandas as pd
+import geopandas as gpd
 import os
+import warnings
 
 
-class XY:
+class XYData:
     def __init__(self, config: RawConfigParser):
         self.XY_dict = {}
         self.__XY_dict_initiated__ = False
         self.config = config
 
-    def initiate_XY_data(self) -> None:
+    def _initiate_XY_data(self):
 
         if self.__XY_dict_initiated__:
             raise ValueError(
@@ -30,13 +33,63 @@ def initiate_XY_data(self) -> None:
         self.XY_dict["conflict_t_min_1_nb"] = pd.Series(dtype=float)
         self.XY_dict["conflict"] = pd.Series(dtype=bool)
 
-        if self.config.getboolean("general", "verbose"):
-            click.echo("DEBUG: the columns in the sample matrix used are:")
-            for key in self.XY_dict:
-                click.echo("...{}".format(key))
+        click.echo("The columns in the sample matrix used are:")
+        for key in self.XY_dict:
+            click.echo(f"\t{key}.")
 
         self.__XY_dict_initiated__ = True
 
+    def create_XY(
+        self,
+        out_dir: click.Path,
+        root_dir: click.Path,
+        polygon_gdf: gpd.GeoDataFrame,
+        conflict_gdf: gpd.GeoDataFrame,
+    ) -> Tuple[np.array, np.array]:
+        """Top-level function to create the X-array and Y-array.
+        If the XY-data was pre-computed and specified in cfg-file, the data is loaded.
+        If not, variable values and conflict data are read from file and stored in array.
+        The resulting array is by default saved as npy-format to file.
+
+        Args:
+            config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+            out_dir (str): path to output folder.
+            root_dir (str): path to location of cfg-file.
+            polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
+            conflict_gdf (geo-dataframe): geo-dataframe containing the selected conflicts.
+
+        Returns:
+            array: X-array containing variable values.
+            array: Y-array containing conflict data.
+        """
+
+        # if nothing is specified in cfg-file, then initiate and fill XY data from scratch
+        if self.config.get("pre_calc", "XY") != "":
+            self._initiate_XY_data()
+            # fill the dictionary and get array
+            XY_arr = _fill_XY(
+                self.XY_dict, self.config, root_dir, conflict_gdf, polygon_gdf, out_dir
+            )
+            # save array to XY.npy out_dir
+            click.echo(
+                f"Saving XY data by default to file {os.path.join(out_dir, 'XY.npy')}."
+            )
+            np.save(os.path.join(out_dir, "XY"), XY_arr)
+        # if path to XY.npy is specified, read the data intead
+        else:
+            click.echo(
+                f"Loading XY data from file {os.path.join(root_dir, self.config.get('pre_calc', 'XY'))}."
+            )
+            XY_arr = np.load(
+                os.path.join(root_dir, self.config.get("pre_calc", "XY")),
+                allow_pickle=True,
+            )
+
+        # split the XY data into sample data X and target values Y
+        X, Y = _split_XY_data(XY_arr)
+
+        return X, Y
+
 
 def initiate_X_data(config):
     """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data.
@@ -71,160 +124,6 @@ def initiate_X_data(config):
     return X
 
 
-def fill_XY(XY, config, root_dir, conflict_data, polygon_gdf, out_dir):  # noqa: R0912
-    """Fills the (XY-)dictionary with data for each variable and conflict for each polygon for each simulation year.
-    The number of rows should therefore equal to number simulation years times number of polygons.
-    At end of last simulation year, the dictionary is converted to a numpy-array.
-
-    Args:
-        XY (dict): initiated, i.e. empty, XY-dictionary
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        root_dir (str): path to location of cfg-file.
-        conflict_data (geo-dataframe): geo-dataframe containing the selected conflicts.
-        polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
-        out_dir (path): path to output folder.
-
-    Raises:
-        Warning: raised if the datetime-format of the netCDF-file does not match conventions and/or supported formats.
-
-    Returns:
-        array: filled array containing the variable values (X) and binary conflict data (Y) plus meta-data.
-    """
-
-    # go through all simulation years as specified in config-file
-    model_period = np.arange(
-        config.getint("settings", "y_start"), config.getint("settings", "y_end") + 1, 1
-    )
-    click.echo(
-        "INFO: reading data for period from {} to {}".format(
-            model_period[0], model_period[-1]
-        )
-    )
-
-    neighboring_matrix = neighboring_polys(config, polygon_gdf)
-
-    for (sim_year, i) in zip(model_period, range(len(model_period))):
-
-        if i == 0:
-
-            click.echo(
-                "INFO: skipping first year {} to start up model".format(sim_year)
-            )
-
-        else:
-
-            click.echo("INFO: entering year {}".format(sim_year))
-
-            # go through all keys in dictionary
-            for key, value in XY.items():
-
-                if key == "conflict":
-
-                    data_series = value
-                    data_list = conflict.conflict_in_year_bool(
-                        config, conflict_data, polygon_gdf, sim_year, out_dir
-                    )
-                    data_series = data_series.append(
-                        pd.Series(data_list), ignore_index=True
-                    )
-                    XY[key] = data_series
-
-                elif key == "conflict_t_min_1":
-
-                    data_series = value
-                    data_list = conflict.conflict_in_previous_year(
-                        config, conflict_data, polygon_gdf, sim_year
-                    )
-                    data_series = data_series.append(
-                        pd.Series(data_list), ignore_index=True
-                    )
-                    XY[key] = data_series
-
-                elif key == "conflict_t_min_1_nb":
-
-                    data_series = value
-                    data_list = conflict.conflict_in_previous_year(
-                        config,
-                        conflict_data,
-                        polygon_gdf,
-                        sim_year,
-                        check_neighbors=True,
-                        neighboring_matrix=neighboring_matrix,
-                    )
-                    data_series = data_series.append(
-                        pd.Series(data_list), ignore_index=True
-                    )
-                    XY[key] = data_series
-
-                elif key == "poly_ID":
-
-                    data_series = value
-                    data_list = conflict.get_poly_ID(polygon_gdf)
-                    data_series = data_series.append(
-                        pd.Series(data_list), ignore_index=True
-                    )
-                    XY[key] = data_series
-
-                elif key == "poly_geometry":
-
-                    data_series = value
-                    data_list = conflict.get_poly_geometry(polygon_gdf, config)
-                    data_series = data_series.append(
-                        pd.Series(data_list), ignore_index=True
-                    )
-                    XY[key] = data_series
-
-                else:
-
-                    nc_ds = xr.open_dataset(
-                        os.path.join(
-                            root_dir,
-                            config.get("general", "input_dir"),
-                            config.get("data", key),
-                        ).rsplit(",")[0]
-                    )
-
-                    if (np.dtype(nc_ds.time) == np.float32) or (
-                        np.dtype(nc_ds.time) == np.float64
-                    ):
-                        data_series = value
-                        data_list = variables.nc_with_float_timestamp(
-                            polygon_gdf, config, root_dir, key, sim_year
-                        )
-                        data_series = data_series.append(
-                            pd.Series(data_list), ignore_index=True
-                        )
-                        XY[key] = data_series
-
-                    elif np.dtype(nc_ds.time) == "datetime64[ns]":
-                        data_series = value
-                        data_list = variables.nc_with_continous_datetime_timestamp(
-                            polygon_gdf, config, root_dir, key, sim_year
-                        )
-                        data_series = data_series.append(
-                            pd.Series(data_list), ignore_index=True
-                        )
-                        XY[key] = data_series
-
-                    else:
-                        raise Warning(
-                            "This file has an unsupported dtype for the time variable: {}".format(
-                                os.path.join(
-                                    root_dir,
-                                    config.get("general", "input_dir"),
-                                    config.get("data", key),
-                                )
-                            )
-                        )
-
-            if config.getboolean("general", "verbose"):
-                click.echo("DEBUG: all data read")
-
-    df_out = pd.DataFrame.from_dict(XY)
-
-    return df_out.to_numpy()
-
-
 def fill_X_sample(X, config, root_dir, polygon_gdf, proj_year):
     """Fills the X-dictionary with the data sample data besides
     any conflict-related data for each polygon and each year.
@@ -325,7 +224,7 @@ def fill_X_conflict(X, config, conflict_data, polygon_gdf):
     """
 
     # determine all neighbours for each polygon
-    neighboring_matrix = neighboring_polys(config, polygon_gdf)
+    neighboring_matrix = nb.neighboring_polys(config, polygon_gdf)
 
     # go through all keys in dictionary
     for key, value in X.items():
@@ -359,7 +258,154 @@ def fill_X_conflict(X, config, conflict_data, polygon_gdf):
     return X
 
 
-def split_XY_data(XY, config):
+def _fill_XY(  # noqa: R0912
+    XY: dict,
+    config: RawConfigParser,
+    root_dir: click.Path,
+    conflict_data: gpd.GeoDataFrame,
+    polygon_gdf: gpd.GeoDataFrame,
+    out_dir: click.Path,
+) -> np.ndarray:
+    """Fills the (XY-)dictionary with data for each variable and conflict for each polygon for each simulation year.
+    The number of rows should therefore equal to number simulation years times number of polygons.
+    At end of last simulation year, the dictionary is converted to a numpy-array.
+
+    Args:
+        XY (dict): initiated, i.e. empty, XY-dictionary
+        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+        root_dir (str): path to location of cfg-file.
+        conflict_data (geo-dataframe): geo-dataframe containing the selected conflicts.
+        polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
+        out_dir (path): path to output folder.
+
+    Returns:
+        array: filled array containing the variable values (X) and binary conflict data (Y) plus meta-data.
+    """
+
+    # go through all simulation years as specified in config-file
+    model_period = np.arange(
+        config.getint("settings", "y_start"), config.getint("settings", "y_end") + 1, 1
+    )
+    click.echo(f"Reading data for period from {model_period[0]} to {model_period[-1]}.")
+
+    neighboring_matrix = nb.neighboring_polys(polygon_gdf)
+
+    for (sim_year, i) in zip(model_period, range(len(model_period))):
+
+        if i == 0:
+            click.echo(f"Skipping first year {sim_year} to start up model")
+        else:
+            click.echo(f"Entering year {sim_year}.")
+            # go through all keys in dictionary
+            for key, value in XY.items():
+
+                if key == "conflict":
+
+                    data_series = value
+                    data_list = conflict.conflict_in_year_bool(
+                        config, conflict_data, polygon_gdf, sim_year, out_dir
+                    )
+                    data_series = data_series.append(
+                        pd.Series(data_list), ignore_index=True
+                    )
+                    XY[key] = data_series
+
+                elif key == "conflict_t_min_1":
+
+                    data_series = value
+                    data_list = conflict.conflict_in_previous_year(
+                        config, conflict_data, polygon_gdf, sim_year
+                    )
+                    data_series = data_series.append(
+                        pd.Series(data_list), ignore_index=True
+                    )
+                    XY[key] = data_series
+
+                elif key == "conflict_t_min_1_nb":
+
+                    data_series = value
+                    data_list = conflict.conflict_in_previous_year(
+                        config,
+                        conflict_data,
+                        polygon_gdf,
+                        sim_year,
+                        check_neighbors=True,
+                        neighboring_matrix=neighboring_matrix,
+                    )
+                    data_series = data_series.append(
+                        pd.Series(data_list), ignore_index=True
+                    )
+                    XY[key] = data_series
+
+                elif key == "poly_ID":
+
+                    data_series = value
+                    data_list = utils.get_poly_ID(polygon_gdf)
+                    data_series = data_series.append(
+                        pd.Series(data_list), ignore_index=True
+                    )
+                    XY[key] = data_series
+
+                elif key == "poly_geometry":
+
+                    data_series = value
+                    data_list = utils.get_poly_geometry(polygon_gdf)
+                    data_series = data_series.append(
+                        pd.Series(data_list), ignore_index=True
+                    )
+                    XY[key] = data_series
+
+                else:
+
+                    nc_ds = xr.open_dataset(
+                        os.path.join(
+                            root_dir,
+                            config.get("general", "input_dir"),
+                            config.get("data", key),
+                        ).rsplit(",")[0]
+                    )
+
+                    if (np.dtype(nc_ds.time) == np.float32) or (
+                        np.dtype(nc_ds.time) == np.float64
+                    ):
+                        data_series = value
+                        data_list = variables.nc_with_float_timestamp(
+                            polygon_gdf, config, root_dir, key, sim_year
+                        )
+                        data_series = data_series.append(
+                            pd.Series(data_list), ignore_index=True
+                        )
+                        XY[key] = data_series
+
+                    elif np.dtype(nc_ds.time) == "datetime64[ns]":
+                        data_series = value
+                        data_list = variables.nc_with_continous_datetime_timestamp(
+                            polygon_gdf, config, root_dir, key, sim_year
+                        )
+                        data_series = data_series.append(
+                            pd.Series(data_list), ignore_index=True
+                        )
+                        XY[key] = data_series
+
+                    else:
+                        warnings.warn(
+                            "This file has an unsupported dtype for the time variable: {}".format(
+                                os.path.join(
+                                    root_dir,
+                                    config.get("general", "input_dir"),
+                                    config.get("data", key),
+                                )
+                            )
+                        )
+
+            click.echo("All data read.")
+
+    df_out = pd.DataFrame.from_dict(XY)
+
+    return df_out.to_numpy()
+
+
+def _split_XY_data(XY_arr: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     """Separates the XY-array into array containing information about
     variable values (X-array or sample data) and conflict data (Y-array or target data).
     Thereby, the X-array also contains the information about
@@ -374,72 +420,22 @@ def split_XY_data(XY, config):
     """
 
     # convert array to dataframe for easier handling
-    XY = pd.DataFrame(XY)
-    if config.getboolean("general", "verbose"):
-        click.echo(
-            "DEBUG: number of data points including missing values: {}".format(len(XY))
-        )
-
+    XY_df = pd.DataFrame(XY_arr)
     # fill all missing values with 0
-    XY = XY.fillna(0)
-
+    XY_df = XY_df.fillna(0)
     # convert dataframe back to array
-    XY = XY.to_numpy()
+    XY_df = XY_df.to_numpy()
 
     # get X data
     # since conflict is the last column, we know that all previous columns must be variable values
-    X = XY[:, :-1]
+    X = XY_df[:, :-1]
     # get Y data and convert to integer values
-    Y = XY[:, -1]
+    Y = XY_df[:, -1]
     Y = Y.astype(int)
 
-    if config.getboolean("general", "verbose"):
-        fraction_Y_1 = 100 * len(np.where(Y != 0)[0]) / len(Y)
-        click.echo(
-            "DEBUG: a fraction of {} percent in the data corresponds to conflicts.".format(
-                round(fraction_Y_1, 2)
-            )
-        )
+    fraction_Y_1 = 100 * len(np.where(Y != 0)[0]) / len(Y)
+    click.echo(
+        f"{round(fraction_Y_1, 2)} percent in the data corresponds to conflicts."
+    )
 
     return X, Y
-
-
-def neighboring_polys(config, extent_gdf, identifier="watprovID"):
-    """For each polygon, determines its neighboring polygons.
-    As result, a (n x n) look-up dataframe is obtained containing, where n is number of polygons in extent_gdf.
-
-    Args:
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        extent_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
-        identifier (str, optional): column name in extent_gdf to be used to identify neighbors. Defaults to 'watprovID'.
-
-    Returns:
-        dataframe: look-up dataframe containing True/False statement per polygon for all other polygons.
-    """
-
-    if config.getboolean("general", "verbose"):
-        click.echo("DEBUG: determining matrix with neighboring polygons")
-
-    # initialise empty dataframe
-    df = pd.DataFrame()
-
-    # go through each polygon aka water province
-    for i in range(len(extent_gdf)):
-        # get geometry of current polygon
-        wp = extent_gdf.geometry.iloc[i]
-        # check which polygons in geodataframe (i.e. all water provinces) touch the current polygon
-        # also create a dataframe from result (boolean)
-        # the transpose is needed to easier append
-        df_temp = pd.DataFrame(
-            extent_gdf.geometry.touches(wp), columns=[extent_gdf[identifier].iloc[i]]
-        ).T
-        # append the dataframe
-        df = df.append(df_temp)
-
-    # replace generic indices with actual water province IDs
-    df.set_index(extent_gdf[identifier], inplace=True)
-
-    # replace generic columns with actual water province IDs
-    df.columns = extent_gdf[identifier].values
-
-    return df
diff --git a/tests/test_data.py b/tests/test_data.py
index b94561b..27b4d7c 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -2,17 +2,19 @@
 import configparser
 import numpy as np
 import pandas as pd
-from copro import data
+from copro import xydata
+
 
 def create_fake_config():
 
     config = configparser.ConfigParser()
 
-    config.add_section('general')
-    config.set('general', 'verbose', str(False))
+    config.add_section("general")
+    config.set("general", "verbose", str(False))
 
     return config
 
+
 def test_split_XY_data():
 
     config = create_fake_config()
@@ -21,8 +23,8 @@ def test_split_XY_data():
     y_arr = [1, 0, 0, 1]
 
     XY_in = np.column_stack((X_arr, y_arr))
-    
-    X, Y = data.split_XY_data(XY_in, config)
+
+    X, Y = xydata.split_XY_data(XY_in, config)
 
     XY_out = np.column_stack((X, Y))
 

From 3abe27560353536b6c01ea73268539cc3e3a041f Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Mon, 26 Feb 2024 20:50:11 +0100
Subject: [PATCH 12/13] moved run_prediction func from ppeline.py to models.py

---
 copro/models.py   | 215 +++++++++++++++++++++++++++++++++++++++++++++-
 copro/pipeline.py | 200 ------------------------------------------
 2 files changed, 214 insertions(+), 201 deletions(-)
 delete mode 100644 copro/pipeline.py

diff --git a/copro/models.py b/copro/models.py
index 9be86ab..7448e9b 100644
--- a/copro/models.py
+++ b/copro/models.py
@@ -1,9 +1,13 @@
-from copro import machine_learning, conflict, evaluation
+from copro import machine_learning, conflict, evaluation, utils, xydata
 from configparser import RawConfigParser
 from sklearn import preprocessing, ensemble
 import pandas as pd
 import numpy as np
 from typing import Union
+import geopandas as gpd
+import click
+import os
+import pickle
 
 
 class MainModel:
@@ -90,3 +94,212 @@ def run(self) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
         )
 
         return X_df, y_df, eval_dict
+
+    def run_prediction(
+        self,
+        main_dict: dict,
+        root_dir: click.Path,
+        selected_polygons_gdf: gpd.GeoDataFrame,
+    ) -> pd.DataFrame:
+        """Top-level function to execute the projections.
+        Per specified projection, conflict is projected forwards in time per time step 
+        until the projection year is reached.
+        Pear time step, the sample data and conflict data are read individually since different 
+        conflict projections are made per classifier used.
+        At the end of each time step, the projections of all classifiers are combined and output metrics determined.
+
+        Args:
+            main_dict (dict): dictionary containing config-objects and output directories \
+                for reference run and all projection runs.
+            root_dir (str): path to location of cfg-file.
+            selected_polygons_gdf (geo-dataframe):
+
+        Returns:
+            dataframe: containing model output on polygon-basis.
+        """
+
+        config_REF = main_dict["_REF"][0]
+        out_dir_REF = main_dict["_REF"][1]
+
+        clfs = machine_learning.load_clfs(config_REF, out_dir_REF)
+
+        # initiate output dataframe
+        all_y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
+
+        # going through each projection specified
+        for each_key, _ in config_REF.items("PROJ_files"):
+
+            # get config-object and out-dir per projection
+            click.echo(f"Loading config-object for projection run: {each_key}.")
+            config_PROJ = main_dict[str(each_key)][0][0]
+            out_dir_PROJ = main_dict[str(each_key)][1]
+
+            click.echo(f"Storing output for this projections to folder {out_dir_PROJ}.")
+
+            # if not os.path.isdir(os.path.join(out_dir_PROJ, 'files')):
+            #     os.makedirs(os.path.join(out_dir_PROJ, 'files'))
+            if not os.path.isdir(os.path.join(out_dir_PROJ, "clfs")):
+                os.makedirs(os.path.join(out_dir_PROJ, "clfs"))
+
+            # get projection period for this projection
+            # defined as all years starting from end of reference run until specified end of projections
+            projection_period = utils.determine_projection_period(
+                config_REF, config_PROJ
+            )
+
+            # for this projection, go through all years
+            for i, proj_year in enumerate(projection_period):
+
+                click.echo(f"Making projection for year {proj_year}.")
+
+                X = xydata.initiate_X_data(config_PROJ)
+                X = xydata.fill_X_sample(
+                    X, config_PROJ, root_dir, selected_polygons_gdf, proj_year
+                )
+
+                # for the first projection year, we need to fall back on the observed conflict
+                # at the last time step of the reference run
+                if i == 0:
+                    click.echo(
+                        "Reading previous conflicts from file {}".format(
+                            os.path.join(
+                                out_dir_REF,
+                                "files",
+                                "conflicts_in_{}.csv".format(
+                                    config_REF.getint("settings", "y_end")
+                                ),
+                            )
+                        )
+                    )
+                    conflict_data = pd.read_csv(
+                        os.path.join(
+                            out_dir_REF,
+                            "files",
+                            "conflicts_in_{}.csv".format(
+                                config_REF.getint("settings", "y_end")
+                            ),
+                        ),
+                        index_col=0,
+                    )
+
+                    X = xydata.fill_X_conflict(
+                        X, config_PROJ, conflict_data, selected_polygons_gdf
+                    )
+                    X = pd.DataFrame.from_dict(X).to_numpy()
+
+                # initiating dataframe containing all projections from all classifiers for this timestep
+                y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
+
+                # now load all classifiers created in the reference run
+                for clf in clfs:
+
+                    # creating an individual output folder per classifier
+                    if not os.path.isdir(
+                        os.path.join(
+                            os.path.join(
+                                out_dir_PROJ,
+                                "clfs",
+                                str(clf).rsplit(".", maxsplit=1)[0],
+                            )
+                        )
+                    ):
+                        os.makedirs(
+                            os.path.join(
+                                out_dir_PROJ,
+                                "clfs",
+                                str(clf).rsplit(".", maxsplit=1)[0],
+                            )
+                        )
+
+                    # load the pickled objects
+                    # TODO: keep them in memory, i.e. after reading the clfs-folder above
+                    with open(os.path.join(out_dir_REF, "clfs", clf), "rb") as f:
+                        click.echo(
+                            "Loading classifier {} from {}".format(
+                                clf, os.path.join(out_dir_REF, "clfs")
+                            )
+                        )
+                        clf_obj = pickle.load(f)
+
+                    # for all other projection years than the first one,
+                    # we need to read projected conflict from the previous projection year
+                    if i > 0:
+                        click.echo(
+                            "Reading previous conflicts from file {}".format(
+                                os.path.join(
+                                    out_dir_PROJ,
+                                    "clfs",
+                                    str(clf),
+                                    "projection_for_{}.csv".format(proj_year - 1),
+                                )
+                            )
+                        )
+                        conflict_data = pd.read_csv(
+                            os.path.join(
+                                out_dir_PROJ,
+                                "clfs",
+                                str(clf).rsplit(".", maxsplit=1)[0],
+                                "projection_for_{}.csv".format(proj_year - 1),
+                            ),
+                            index_col=0,
+                        )
+
+                        X = xydata.fill_X_conflict(
+                            X, config_PROJ, conflict_data, selected_polygons_gdf
+                        )
+                        X = pd.DataFrame.from_dict(X).to_numpy()
+
+                    X = pd.DataFrame(X)
+                    X = X.fillna(0)
+
+                    # put all the data into the machine learning algo
+                    # here the data will be used to make projections with various classifiers
+                    # returns the prediction based on one individual classifier
+
+                    # TODO: check what this function should contain
+                    # TODO: and where it is now in the upcated copro code
+                    # y_df_clf = models.predictive(X, clf_obj, self.scaler, config_PROJ)
+                    y_df_clf = pd.DataFrame(clf_obj)
+
+                    # storing the projection per clf to be used in the following timestep
+                    y_df_clf.to_csv(
+                        os.path.join(
+                            out_dir_PROJ,
+                            "clfs",
+                            str(clf).rsplit(".", maxsplit=1)[0],
+                            "projection_for_{}.csv".format(proj_year),
+                        )
+                    )
+
+                    # removing projection of previous time step as not needed anymore
+                    if i > 0:
+                        os.remove(
+                            os.path.join(
+                                out_dir_PROJ,
+                                "clfs",
+                                str(clf).rsplit(".", maxsplit=1)[0],
+                                "projection_for_{}.csv".format(proj_year - 1),
+                            )
+                        )
+
+                    # append to all classifiers dataframe
+                    y_df = y_df.append(y_df_clf, ignore_index=True)
+
+                # get look-up dataframe to assign geometry to polygons via unique ID
+                global_df = utils.global_ID_geom_info(selected_polygons_gdf)
+
+                click.echo(
+                    f"Storing model output for year {proj_year} to output folder".format
+                )
+                _, gdf_hit = evaluation.polygon_model_accuracy(
+                    y_df, global_df, make_proj=True
+                )
+                gdf_hit.to_file(
+                    os.path.join(out_dir_PROJ, f"output_in_{proj_year}.geojson"),
+                    driver="GeoJSON",
+                )
+
+            # create one major output dataframe containing all output for all projections with all classifiers
+            all_y_df = all_y_df.append(y_df, ignore_index=True)
+
+        return all_y_df
diff --git a/copro/pipeline.py b/copro/pipeline.py
deleted file mode 100644
index f7573d2..0000000
--- a/copro/pipeline.py
+++ /dev/null
@@ -1,200 +0,0 @@
-from copro import models, machine_learning, evaluation, utils, xydata
-import pandas as pd
-import pickle
-import click
-import os
-
-
-def run_prediction(scaler, main_dict, root_dir, selected_polygons_gdf):
-    """Top-level function to execute the projections.
-    Per specified projection, conflict is projected forwards in time per time step until the projection year is reached.
-    Pear time step, the sample data and conflict data are read individually since different 
-    conflict projections are made per classifier used.
-    At the end of each time step, the projections of all classifiers are combined and output metrics determined.
-
-    Args:
-        scaler (scaler): the specified scaler instance.
-        main_dict (dict): dictionary containing config-objects and output directories \
-            for reference run and all projection runs.
-        root_dir (str): path to location of cfg-file.
-        selected_polygons_gdf (geo-dataframe):
-
-    Returns:
-        dataframe: containing model output on polygon-basis.
-    """
-
-    config_REF = main_dict["_REF"][0]
-    out_dir_REF = main_dict["_REF"][1]
-
-    clfs = machine_learning.load_clfs(config_REF, out_dir_REF)
-
-    # initiate output dataframe
-    all_y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
-
-    # going through each projection specified
-    for each_key, _ in config_REF.items("PROJ_files"):
-
-        # get config-object and out-dir per projection
-        click.echo(f"Loading config-object for projection run: {each_key}.")
-        config_PROJ = main_dict[str(each_key)][0][0]
-        out_dir_PROJ = main_dict[str(each_key)][1]
-
-        click.echo(f"Storing output for this projections to folder {out_dir_PROJ}.")
-
-        # if not os.path.isdir(os.path.join(out_dir_PROJ, 'files')):
-        #     os.makedirs(os.path.join(out_dir_PROJ, 'files'))
-        if not os.path.isdir(os.path.join(out_dir_PROJ, "clfs")):
-            os.makedirs(os.path.join(out_dir_PROJ, "clfs"))
-
-        # get projection period for this projection
-        # defined as all years starting from end of reference run until specified end of projections
-        projection_period = utils.determine_projection_period(config_REF, config_PROJ)
-
-        # for this projection, go through all years
-        for i, proj_year in enumerate(projection_period):
-
-            click.echo(f"Making projection for year {proj_year}.")
-
-            X = xydata.initiate_X_data(config_PROJ)
-            X = xydata.fill_X_sample(
-                X, config_PROJ, root_dir, selected_polygons_gdf, proj_year
-            )
-
-            # for the first projection year, we need to fall back on the observed conflict
-            # at the last time step of the reference run
-            if i == 0:
-                click.echo(
-                    "Reading previous conflicts from file {}".format(
-                        os.path.join(
-                            out_dir_REF,
-                            "files",
-                            "conflicts_in_{}.csv".format(
-                                config_REF.getint("settings", "y_end")
-                            ),
-                        )
-                    )
-                )
-                conflict_data = pd.read_csv(
-                    os.path.join(
-                        out_dir_REF,
-                        "files",
-                        "conflicts_in_{}.csv".format(
-                            config_REF.getint("settings", "y_end")
-                        ),
-                    ),
-                    index_col=0,
-                )
-
-                X = xydata.fill_X_conflict(
-                    X, config_PROJ, conflict_data, selected_polygons_gdf
-                )
-                X = pd.DataFrame.from_dict(X).to_numpy()
-
-            # initiating dataframe containing all projections from all classifiers for this timestep
-            y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
-
-            # now load all classifiers created in the reference run
-            for clf in clfs:
-
-                # creating an individual output folder per classifier
-                if not os.path.isdir(
-                    os.path.join(
-                        os.path.join(
-                            out_dir_PROJ, "clfs", str(clf).rsplit(".", maxsplit=1)[0]
-                        )
-                    )
-                ):
-                    os.makedirs(
-                        os.path.join(
-                            out_dir_PROJ, "clfs", str(clf).rsplit(".", maxsplit=1)[0]
-                        )
-                    )
-
-                # load the pickled objects
-                # TODO: keep them in memory, i.e. after reading the clfs-folder above
-                with open(os.path.join(out_dir_REF, "clfs", clf), "rb") as f:
-                    click.echo(
-                        "Loading classifier {} from {}".format(
-                            clf, os.path.join(out_dir_REF, "clfs")
-                        )
-                    )
-                    clf_obj = pickle.load(f)
-
-                # for all other projection years than the first one,
-                # we need to read projected conflict from the previous projection year
-                if i > 0:
-                    click.echo(
-                        "Reading previous conflicts from file {}".format(
-                            os.path.join(
-                                out_dir_PROJ,
-                                "clfs",
-                                str(clf),
-                                "projection_for_{}.csv".format(proj_year - 1),
-                            )
-                        )
-                    )
-                    conflict_data = pd.read_csv(
-                        os.path.join(
-                            out_dir_PROJ,
-                            "clfs",
-                            str(clf).rsplit(".", maxsplit=1)[0],
-                            "projection_for_{}.csv".format(proj_year - 1),
-                        ),
-                        index_col=0,
-                    )
-
-                    X = xydata.fill_X_conflict(
-                        X, config_PROJ, conflict_data, selected_polygons_gdf
-                    )
-                    X = pd.DataFrame.from_dict(X).to_numpy()
-
-                X = pd.DataFrame(X)
-                X = X.fillna(0)
-
-                # put all the data into the machine learning algo
-                # here the data will be used to make projections with various classifiers
-                # returns the prediction based on one individual classifier
-                y_df_clf = models.predictive(X, clf_obj, scaler, config_PROJ)
-
-                # storing the projection per clf to be used in the following timestep
-                y_df_clf.to_csv(
-                    os.path.join(
-                        out_dir_PROJ,
-                        "clfs",
-                        str(clf).rsplit(".", maxsplit=1)[0],
-                        "projection_for_{}.csv".format(proj_year),
-                    )
-                )
-
-                # removing projection of previous time step as not needed anymore
-                if i > 0:
-                    os.remove(
-                        os.path.join(
-                            out_dir_PROJ,
-                            "clfs",
-                            str(clf).rsplit(".", maxsplit=1)[0],
-                            "projection_for_{}.csv".format(proj_year - 1),
-                        )
-                    )
-
-                # append to all classifiers dataframe
-                y_df = y_df.append(y_df_clf, ignore_index=True)
-
-            # get look-up dataframe to assign geometry to polygons via unique ID
-            global_df = utils.global_ID_geom_info(selected_polygons_gdf)
-
-            click.echo(
-                f"Storing model output for year {proj_year} to output folder".format
-            )
-            _, gdf_hit = evaluation.polygon_model_accuracy(
-                y_df, global_df, make_proj=True
-            )
-            gdf_hit.to_file(
-                os.path.join(out_dir_PROJ, f"output_in_{proj_year}.geojson"),
-                driver="GeoJSON",
-            )
-
-        # create one major output dataframe containing all output for all projections with all classifiers
-        all_y_df = all_y_df.append(y_df, ignore_index=True)
-
-    return all_y_df

From 6fd5c2b972a078f1b7bbd2a95c61dd9b4df63866 Mon Sep 17 00:00:00 2001
From: JannisHoch <j.hoch@fathom.global>
Date: Tue, 27 Feb 2024 09:18:10 +0100
Subject: [PATCH 13/13] updated cli script

---
 copro/machine_learning.py     |   4 +-
 copro/models.py               |  22 ++--
 copro/scripts/copro_runner.py | 239 +++++++++++++++++-----------------
 3 files changed, 128 insertions(+), 137 deletions(-)

diff --git a/copro/machine_learning.py b/copro/machine_learning.py
index 045165f..a01ab73 100644
--- a/copro/machine_learning.py
+++ b/copro/machine_learning.py
@@ -12,7 +12,7 @@
 class MachineLearning:
     def __init__(self, config: RawConfigParser) -> None:
         self.config = config
-        self.scaler = _define_scaling(config)
+        self.scaler = define_scaling(config)
         self.clf = ensemble.RandomForestClassifier(
             n_estimators=1000, class_weight={1: 100}, random_state=42
         )
@@ -157,7 +157,7 @@ def _split_conflict_geom_data(
     return X_ID, X_geom, X_data
 
 
-def _define_scaling(
+def define_scaling(
     config: RawConfigParser,
 ) -> Union[
     preprocessing.MinMaxScaler,
diff --git a/copro/models.py b/copro/models.py
index 7448e9b..14c8817 100644
--- a/copro/models.py
+++ b/copro/models.py
@@ -1,6 +1,6 @@
 from copro import machine_learning, conflict, evaluation, utils, xydata
 from configparser import RawConfigParser
-from sklearn import preprocessing, ensemble
+from sklearn import ensemble
 import pandas as pd
 import numpy as np
 from typing import Union
@@ -16,15 +16,7 @@ def __init__(
         X: Union[np.ndarray, pd.DataFrame],
         Y: np.ndarray,
         config: RawConfigParser,
-        scaler: Union[
-            preprocessing.MinMaxScaler,
-            preprocessing.StandardScaler,
-            preprocessing.RobustScaler,
-            preprocessing.QuantileTransformer,
-        ],
-        clf: ensemble.RandomForestClassifier,
         out_dir: str,
-        run_nr: int,
     ):
         """Constructor for the MainModel class.
 
@@ -40,12 +32,14 @@ def __init__(
         self.X = X
         self.Y = Y
         self.config = config
-        self.scaler = scaler
-        self.clf = clf
+        # TODO: scaler and clf settings need to be aligned with machine_learning.py class
+        self.scaler = machine_learning.define_scaling(config)
+        self.clf = ensemble.RandomForestClassifier(
+            n_estimators=1000, class_weight={1: 100}, random_state=42
+        )
         self.out_dir = out_dir
-        self.run_nr = run_nr
 
-    def run(self) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
+    def run(self, run_nr) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
         """Main model workflow when all XY-data is used.
         The model workflow is executed for each classifier.
 
@@ -78,7 +72,7 @@ def run(self) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
 
         # fit classifier and make prediction with test-set
         y_pred, y_prob = MLmodel.fit_predict(
-            X_train, y_train, X_test, self.out_dir, self.run_nr
+            X_train, y_train, X_test, self.out_dir, run_nr
         )
         y_prob_0 = y_prob[:, 0]  # probability to predict 0
         y_prob_1 = y_prob[:, 1]  # probability to predict 1
diff --git a/copro/scripts/copro_runner.py b/copro/scripts/copro_runner.py
index 559482e..39acd21 100644
--- a/copro/scripts/copro_runner.py
+++ b/copro/scripts/copro_runner.py
@@ -1,137 +1,134 @@
-import copro 
+from copro import settings, selection, evaluation, io, models, xydata
 
 import click
-import pandas as pd
 import numpy as np
-import os, sys
-import pickle
-import matplotlib.pyplot as plt
+import os
 
 import warnings
+
 warnings.filterwarnings("ignore")
 
-@click.command()
-@click.argument('cfg', type=click.Path())
-@click.option('--make_plots', '-plt', help='add additional output plots', type=bool)
-@click.option('--verbose', '-v', help='command line switch to turn on verbose mode', is_flag=True)
 
-def cli(cfg, make_plots=True, verbose=False):   
-    """Main command line script to execute the model. 
+@click.command()
+@click.argument("cfg", type=click.Path())
+def cli(cfg):
+    """Main command line script to execute the model.
     All settings are read from cfg-file.
     One cfg-file is required argument to train, test, and evaluate the model.
     Multiple classifiers are trained based on different train-test data combinations.
-    Additional cfg-files for multiple projections can be provided as optional arguments, whereby each file corresponds to one projection to be made.
+    Additional cfg-files for multiple projections can be provided as optional arguments,
+    whereby each file corresponds to one projection to be made.
     Per projection, each classifiers is used to create separate projection outcomes per time step (year).
     All outcomes are combined after each time step to obtain the common projection outcome.
 
     Args:
         CFG (str): (relative) path to cfg-file
-    """ 
-  
-    #- parsing settings-file
-    #- returns dictionary with config-objects and output directories of reference run and all projections
-    #- also returns root_dir which is the path to the cfg-file
-    main_dict, root_dir = copro.utils.initiate_setup(cfg)
-
-    #- get config-objct and out_dir for reference run
-    config_REF = main_dict['_REF'][0]
-    out_dir_REF = main_dict['_REF'][1]
-
-    #- is specified, set verbose-settings
-    if verbose:
-        config_REF.set('general', 'verbose', str(verbose))
-
-    click.echo(click.style('\nINFO: reference run started\n', fg='cyan'))
-
-    #- selecting conflicts and getting area-of-interest and aggregation level
-    conflict_gdf, extent_active_polys_gdf, global_df = copro.selection.select(config_REF, out_dir_REF, root_dir)
-
-    #- plot selected polygons and conflicts
-    if make_plots:
-        fig, ax = plt.subplots(1, 1)
-        copro.plots.selected_polygons(extent_active_polys_gdf, figsize=(20, 10), ax=ax)
-        copro.plots.selected_conflicts(conflict_gdf, ax=ax)
-        plt.savefig(os.path.join(out_dir_REF, 'selected_polygons_and_conflicts.png'), dpi=300, bbox_inches='tight')
-
-    #- create X and Y arrays by reading conflict and variable files for reference run
-    #- or by loading a pre-computed array (npy-file) if specified in cfg-file
-    X, Y = copro.pipeline.create_XY(config_REF, out_dir_REF, root_dir, extent_active_polys_gdf, conflict_gdf)
-
-    #- defining scaling and model algorithms
-    scaler, clf = copro.pipeline.prepare_ML(config_REF)
-
-    #- fit-transform on scaler to be used later during projections
-    click.echo('INFO: fitting scaler to sample data')
-    scaler_fitted = scaler.fit(X[: , 2:])
-
-    #- initializing output variables
-    #TODO: put all this into one function
-    out_X_df = copro.evaluation.init_out_df()
-    out_y_df = copro.evaluation.init_out_df()
-    out_dict = copro.evaluation.init_out_dict()
-    trps, aucs, mean_fpr = copro.evaluation.init_out_ROC_curve()
-
-    #- create plot instance for ROC plots
-    fig, ax1 = plt.subplots(1, 1, figsize=(20,10))
-
-    #- go through all n model executions
-    #- that is, create different classifiers based on different train-test data combinations
-    click.echo('INFO: training and testing machine learning model')
-    for n in range(config_REF.getint('machine_learning', 'n_runs')):
-        
-        click.echo('INFO: run {} of {}'.format(n+1, config_REF.getint('machine_learning', 'n_runs')))
-
-        #- run machine learning model and return outputs
-        X_df, y_df, eval_dict = copro.pipeline.run_reference(X, Y, config_REF, scaler, clf, out_dir_REF, run_nr=n+1)
-        
-        #- append per model execution
-        #TODO: put all this into one function
-        out_X_df = copro.evaluation.fill_out_df(out_X_df, X_df)
-        out_y_df = copro.evaluation.fill_out_df(out_y_df, y_df)
-        out_dict = copro.evaluation.fill_out_dict(out_dict, eval_dict)
-
-        ## NOTE 15-Mar-2023: ROC plotting has been changed in sklearn, needs updating
-        #- plot ROC curve per model execution
-        # tprs, aucs = copro.plots.plot_ROC_curve_n_times(ax1, clf, X_df.to_numpy(), y_df.y_test.to_list(),
-        #                                                             trps, aucs, mean_fpr)
-
-    ## NOTE 15-Mar-2023: ROC plotting has been changed in sklearn, needs updating
-    # #- plot mean ROC curve
-    # copro.plots.plot_ROC_curve_n_mean(ax1, tprs, aucs, mean_fpr)
-    # #- save plot
-    # plt.savefig(os.path.join(out_dir_REF, 'ROC_curve_per_run.png'), dpi=300, bbox_inches='tight')
-    # #- save data for plot
-    # copro.evaluation.save_out_ROC_curve(tprs, aucs, out_dir_REF)
-
-    #- save output dictionary to csv-file
-    copro.utils.save_to_csv(out_dict, out_dir_REF, 'evaluation_metrics')
-    copro.utils.save_to_npy(out_y_df, out_dir_REF, 'raw_output_data')
-    
-    #- print mean values of all evaluation metrics
-    if config_REF.getboolean('general', 'verbose'):
-        for key in out_dict:
-            click.echo('DEBUG: average {0} of run with {1} repetitions is {2:0.3f}'.format(key, config_REF.getint('machine_learning', 'n_runs'), np.mean(out_dict[key])))
-
-    #- create accuracy values per polygon and save to output folder
-    #- note only the dataframe is stored, not the geo-dataframe
-    df_hit, gdf_hit = copro.evaluation.polygon_model_accuracy(out_y_df, global_df)
-
-    gdf_hit.to_file(os.path.join(out_dir_REF, 'output_for_REF.geojson'), driver='GeoJSON')
-
-    #- plot distribution of all evaluation metrics
-    if make_plots:
-        fig, ax = plt.subplots(1, 1)
-        copro.plots.metrics_distribution(out_dict, figsize=(20, 10))
-        plt.savefig(os.path.join(out_dir_REF, 'metrics_distribution.png'), dpi=300, bbox_inches='tight')
-
-    df_feat_imp = copro.evaluation.get_feature_importance(clf, config_REF, out_dir_REF) 
-    df_perm_imp = copro.evaluation.get_permutation_importance(clf, scaler.fit_transform(X[:,2:]), Y, df_feat_imp, out_dir_REF)
-
-    click.echo(click.style('\nINFO: reference run succesfully finished\n', fg='cyan'))
-
-    click.echo(click.style('INFO: starting projections\n', fg='cyan'))
-
-    #- running prediction runs
-    copro.pipeline.run_prediction(scaler_fitted, main_dict, root_dir, extent_active_polys_gdf)
-
-    click.echo(click.style('\nINFO: all projections succesfully finished\n', fg='cyan'))
\ No newline at end of file
+    """
+
+    # - parsing settings-file
+    # - returns dictionary with config-objects and output directories of reference run and all projections
+    # - also returns root_dir which is the path to the cfg-file
+    main_dict, root_dir = settings.initiate_setup(cfg)
+
+    # - get config-objct and out_dir for reference run
+    config_REF = main_dict["_REF"][0]
+    out_dir_REF = main_dict["_REF"][1]
+
+    click.echo(click.style("\nINFO: reference run started\n", fg="cyan"))
+
+    # - selecting conflicts and getting area-of-interest and aggregation level
+    conflict_gdf, extent_active_polys_gdf, global_df = selection.select(
+        config_REF, out_dir_REF, root_dir
+    )
+
+    # - create X and Y arrays by reading conflict and variable files for reference run
+    # - or by loading a pre-computed array (npy-file) if specified in cfg-file
+    X, Y = xydata.create_XY(
+        config_REF, out_dir_REF, root_dir, extent_active_polys_gdf, conflict_gdf
+    )
+
+    # - defining scaling and model algorithms
+    MachineLearning = models.MainModel(
+        config=config_REF,
+        X=X,
+        Y=Y,
+        out_dir=out_dir_REF,
+    )
+
+    # - fit-transform on scaler to be used later during projections
+    click.echo("INFO: fitting scaler to sample data")
+    # TODO: scaler_fitted needs to be part of the class
+    _ = MachineLearning.scaler.fit(X[:, 2:])  # returns scaler_fitted
+
+    # - initializing output variables
+    # TODO: put all this into one function
+    out_X_df = evaluation.init_out_df()
+    out_y_df = evaluation.init_out_df()
+    out_dict = evaluation.init_out_dict()
+
+    # - go through all n model executions
+    # - that is, create different classifiers based on different train-test data combinations
+    click.echo("INFO: training and testing machine learning model")
+    for n in range(config_REF.getint("machine_learning", "n_runs")):
+
+        click.echo(
+            "INFO: run {} of {}".format(
+                n + 1, config_REF.getint("machine_learning", "n_runs")
+            )
+        )
+
+        # - run machine learning model and return outputs
+        # TODO: check if both function calls are doing the same
+        # copro.pipeline.run_reference(X, Y, config_REF, scaler, clf, out_dir_REF, run_nr=n+1)
+        X_df, y_df, _ = MachineLearning.run(run_nr=n)
+
+        # - append per model execution
+        # TODO: put all this into one function
+        out_X_df = evaluation.fill_out_df(out_X_df, X_df)
+        out_y_df = evaluation.fill_out_df(out_y_df, y_df)
+        # TODO: the arguments available and required do not seem to match
+        out_dict = evaluation.fill_out_dict(
+            out_dict=out_dict, y_pred=None, y_prob=None, y_test=None, config=config_REF
+        )
+
+    # - save output dictionary to csv-file
+    io.save_to_csv(out_dict, out_dir_REF, "evaluation_metrics")
+    io.save_to_npy(out_y_df, out_dir_REF, "raw_output_data")
+
+    # - print mean values of all evaluation metrics
+    for key, value in out_dict.items():
+        click.echo(
+            "Average {} of run with {} repetitions is {:0.3f}".format(
+                key,
+                config_REF.getint("machine_learning", "n_runs"),
+                np.mean(value),
+            )
+        )
+
+    # - create accuracy values per polygon and save to output folder
+    # - note only the dataframe is stored, not the geo-dataframe
+    _, gdf_hit = evaluation.polygon_model_accuracy(out_y_df, global_df)
+    gdf_hit.to_file(
+        os.path.join(out_dir_REF, "output_for_REF.geojson"), driver="GeoJSON"
+    )
+
+    df_feat_imp = evaluation.get_feature_importance(
+        MachineLearning.clf, config_REF, out_dir_REF
+    )
+    _ = evaluation.get_permutation_importance(
+        MachineLearning.clf,
+        MachineLearning.scaler.fit_transform(X[:, 2:]),
+        Y,
+        df_feat_imp,
+        out_dir_REF,
+    )
+
+    click.echo(click.style("\nINFO: reference run succesfully finished\n", fg="cyan"))
+
+    click.echo(click.style("INFO: starting projections\n", fg="cyan"))
+
+    # - running prediction runs
+    # TODO: scaler_fitted is now not part of the class
+    MachineLearning.run_prediction(main_dict, root_dir, extent_active_polys_gdf)
+
+    click.echo(click.style("\nINFO: all projections succesfully finished\n", fg="cyan"))