From 9dee8ab0b0f75789fdfebbc5dfc8e417f2aa98b8 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 09:42:20 +0200 Subject: [PATCH 01/15] option for either Regression or Classification model --- copro/machine_learning.py | 94 +++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/copro/machine_learning.py b/copro/machine_learning.py index 6bf8641..e58232e 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -7,14 +7,14 @@ import click from pathlib import Path from sklearn.model_selection import GridSearchCV, KFold -from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor class MachineLearning: def __init__(self, config: dict) -> None: self.config = config self.scaler = define_scaling(config) - self.clf = ensemble.RandomForestClassifier(random_state=42) + self.estimator = define_model(config) def split_scale_train_test_split( self, X: Union[np.ndarray, pd.DataFrame], Y: np.ndarray @@ -100,11 +100,11 @@ def fit_predict( if tune_hyperparameters: fitted_estimator = apply_gridsearchCV( - self.clf, X_train, y_train, n_jobs=n_jobs, verbose=verbose + self.estimator, X_train, y_train, n_jobs=n_jobs, verbose=verbose ) else: # fit the classifier with training data - fitted_estimator = self.clf.fit(X_train, y_train) + fitted_estimator = self.estimator.fit(X_train, y_train) # compute permutation importance click.echo("Computing permutation importance.") @@ -120,12 +120,14 @@ def fit_predict( perm_importances_arr = perm_importances["importances"].T # create folder to store all classifiers with pickle - clf_pickle_rep = os.path.join(out_dir, "clfs") - Path.mkdir(Path(clf_pickle_rep), parents=True, exist_ok=True) + estimator_pickle_rep = os.path.join(out_dir, "estimators") + Path.mkdir(Path(estimator_pickle_rep), parents=True, exist_ok=True) # save the fitted classifier to file via pickle.dump() - click.echo(f"Dumping classifier to {clf_pickle_rep}.") - with open(os.path.join(clf_pickle_rep, "clf_{}.pkl".format(run_nr)), "wb") as f: + click.echo(f"Dumping classifier to {estimator_pickle_rep}.") + with open( + os.path.join(estimator_pickle_rep, "estimator_{}.pkl".format(run_nr)), "wb" + ) as f: pickle.dump(fitted_estimator, f) # make prediction @@ -136,7 +138,7 @@ def fit_predict( return y_pred, y_prob, perm_importances_arr -def load_clfs(config: dict, out_dir: str) -> list[str]: +def load_estimators(config: dict, out_dir: str) -> list[str]: """Loads the paths to all previously fitted classifiers to a list. Classifiers were saved to file in fit_predict(). With this list, the classifiers can be loaded again during projections. @@ -149,14 +151,14 @@ def load_clfs(config: dict, out_dir: str) -> list[str]: list: list with file names of classifiers. """ - clfs = os.listdir(os.path.join(out_dir, "clfs")) + estimators = os.listdir(os.path.join(out_dir, "estimators")) - if len(clfs) != config["machine_learning"]["n_runs"]: + if len(estimators) != config["machine_learning"]["n_runs"]: raise ValueError( "Number of loaded classifiers does not match the specified number of runs in cfg-file!" ) - return clfs + return estimators def _split_conflict_geom_data( @@ -219,9 +221,31 @@ def define_scaling( return scaler +def define_model( + config: dict, +) -> Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]: + """Defines model based on model configurations. + + Args: + config (dict): Parsed configuration-settings of the model. 
+ + Returns: + model: the specified model instance. + """ + + if config["machine_learning"]["model"] in ["Classification", "C"]: + return ensemble.RandomForestClassifier(random_state=42) + if config["machine_learning"]["model"] in ["Regression", "R"]: + return ensemble.RandomForestRegressor(random_state=42) + raise ValueError( + "no supported model selected - \ + choose between Classification or Regression" + ) + + def predictive( X: np.ndarray, - clf: ensemble.RandomForestClassifier, + estimator: ensemble.RandomForestClassifier, scaler: Union[ preprocessing.MinMaxScaler, preprocessing.StandardScaler, @@ -236,7 +260,7 @@ def predictive( Args: X (np.ndarray): array containing the variable values plus unique identifer and geometry information. - clf (RandomForestClassifier): the fitted RandomForestClassifier. + estimator (RandomForestClassifier): the fitted RandomForestClassifier. scaler (scaler): the fitted specified scaling method instance. Returns: @@ -251,10 +275,10 @@ def predictive( X_ft = scaler.transform(X_data) # make projection with transformed data - y_pred = clf.predict(X_ft) + y_pred = estimator.predict(X_ft) # predict probabilites of outcomes - y_prob = clf.predict_proba(X_ft) + y_prob = estimator.predict_proba(X_ft) y_prob_0 = y_prob[:, 0] # probability to predict 0 y_prob_1 = y_prob[:, 1] # probability to predict 1 @@ -268,7 +292,7 @@ def predictive( def apply_gridsearchCV( - estimator: RandomForestClassifier, + estimator: Union[RandomForestClassifier, RandomForestRegressor], X_train: np.ndarray, y_train: np.ndarray, n_jobs=2, @@ -277,7 +301,7 @@ def apply_gridsearchCV( """Applies grid search to find the best hyperparameters for the RandomForestClassifier. Args: - estimator (RandomForestClassifier): Estimator to be used in the grid search. + estimator (Union[RandomForestClassifier, RandomForestRegressor]): Estimator to be used in the grid search. X_train (np.ndarray): Feature matrix. y_train (np.ndarray): Target vector. n_jobs (int, optional): Number of cores to be used. Defaults to 2. 
@@ -289,16 +313,28 @@
     click.echo("Tuning hyperparameters with GridSearchCV.")
 
     # Define the parameter grid
-    param_grid = {
-        "n_estimators": [50, 100, 200],
-        "criterion": ["gini", "entropy"],
-        "min_impurity_decrease": [0, 0.5, 1],
-        "max_features": ("sqrt", "log2"),
-        "min_samples_split": [2, 5, 10],
-        "min_samples_leaf": [1, 2, 4],
-        "class_weight": [{1: 75}, {1: 100}, {1: 150}],
-        # 'bootstrap': [True, False]
-    }
+    if isinstance(estimator, RandomForestClassifier):
+        param_grid = {
+            "n_estimators": [50, 100, 200],
+            "criterion": ["gini", "entropy"],
+            "min_impurity_decrease": [0, 0.5, 1],
+            "max_features": ("sqrt", "log2"),
+            "min_samples_split": [2, 5, 10],
+            "min_samples_leaf": [1, 2, 4],
+            "class_weight": [{1: 75}, {1: 100}, {1: 150}],
+            # 'bootstrap': [True, False]
+        }
+        scoring = "roc_auc"
+    else:
+        param_grid = {
+            "n_estimators": [10, 50, 100],
+            "criterion": ("squared_error", "absolute_error", "friedman_mse"),
+            "max_features": ("sqrt", "log2"),
+            "min_samples_split": [2, 5, 20],
+            "min_impurity_decrease": [0, 0.5, 1],
+            "min_samples_leaf": [1, 5, 10],
+        }
+        scoring = "r2"
 
     # Instantiate the grid search model
     grid_search = GridSearchCV(
@@ -307,7 +343,7 @@
         cv=KFold(n_splits=5, shuffle=True, random_state=42),
         n_jobs=n_jobs,
         verbose=verbose,
-        scoring="roc_auc",
+        scoring=scoring,
     )
 
     # Fit the grid search to the data

From 70a4e9f1fae9cc32184801a8b0a0d77391ccca58 Mon Sep 17 00:00:00 2001
From: JannisHoch
Date: Mon, 26 Aug 2024 10:42:16 +0200
Subject: [PATCH 02/15] fine-tuned selection of conflict data points

---
 copro/selection.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/copro/selection.py b/copro/selection.py
index 3a84896..8a4d9b9 100644
--- a/copro/selection.py
+++ b/copro/selection.py
@@ -69,17 +69,20 @@ def _filter_conflict_properties(
         gpd.GeoDataFrame: geo-dataframe containing filtered entries.
     """
 
+    # if no thresholding options are found, return the original dataframe
     if "thresholds" not in config["data"]["conflict"]:
         click.echo("No thresholding options found in configuration file.")
         return gdf
 
-    # go through all criteria
+    # otherwise, go through all variables for which thresholding is specified
     for key, value in config["data"]["conflict"]["thresholds"].items():
 
+        # if variable is not found in the dataframe, skip it
         if key not in gdf.columns:
             warnings.warn(
-                f"{key} is not found in geodataframe columns, will be skipped."
+                f"{key} is not found in geodataframe columns, thresholding will be skipped."
             )
+        # otherwise, check which option is specified and apply it
         else:
             click.echo(f"Tresholding conflict data on {key}.")
             for v, k in value.items():
@@ -93,7 +96,7 @@
                     click.echo(f"Selecting datapoints less or equal to {k}.")
                     gdf = gdf[gdf[key] <= k]
                 else:
-                    raise ValueError(
+                    warnings.warn(
                         f"{v} is not a recognized tresholding option - use 'values', 'vmin' or 'vmax'."
) From 34770b8108fda1415bb9408c9d65cc170bf33ce3 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 10:55:18 +0200 Subject: [PATCH 03/15] option to provide ML target variable --- copro/machine_learning.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/copro/machine_learning.py b/copro/machine_learning.py index e58232e..e861041 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -16,6 +16,16 @@ def __init__(self, config: dict) -> None: self.scaler = define_scaling(config) self.estimator = define_model(config) + if "target_var" in config["machine_learning"].keys(): + self.target_var = config["machine_learning"]["target_var"] + else: + if isinstance(self.estimator, RandomForestRegressor): + raise ValueError("No target variable specified for regression model.") + click.echo( + "No targe variable specified, using default classification approach." + ) + self.target_var = None + def split_scale_train_test_split( self, X: Union[np.ndarray, pd.DataFrame], Y: np.ndarray ): @@ -243,6 +253,31 @@ def define_model( ) +def define_target_var( + config: dict, + estimator: Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor], +) -> Union[str, None]: + """Defines target variable of ML model. + A target variable needs to be specified for regression models. + For classification models, it can be provided in the configuration file. + If not, the default classification approach is used. + + Args: + config (dict): Parsed configuration-settings of the model. + estimator (Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]): ML estimator. + + Returns: + Union[str, None]: Either the target variable or `None`. + """ + + if "target_var" in config["machine_learning"].keys(): + return config["machine_learning"]["target_var"] + if isinstance(estimator, RandomForestRegressor): + raise ValueError("No target variable specified for regression model.") + click.echo("No targe variable specified, using default classification approach.") + return None + + def predictive( X: np.ndarray, estimator: ensemble.RandomForestClassifier, From f258220aab1ce248987f672fdef9fae5b15a9a24 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 11:11:02 +0200 Subject: [PATCH 04/15] use simulation_name as output dir --- copro/io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/copro/io.py b/copro/io.py index 1a9e9e3..d1c2213 100644 --- a/copro/io.py +++ b/copro/io.py @@ -22,7 +22,9 @@ def make_and_collect_output_dirs( """ # get path to main output directory as specified in cfg-file - out_dir = os.path.join(root_dir, config["general"]["output_dir"]) + out_dir = os.path.join( + root_dir, config["general"]["output_dir"], config["general"]["simulation_name"] + ) click.echo(f"Saving output to main output folder {out_dir}.") # initalize list for all out-dirs From e53e33581a409b68f2284120731da1e9055773cf Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 11:29:46 +0200 Subject: [PATCH 05/15] data extraction from netCDF files in a separate function --- copro/xydata.py | 119 +++++++++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 46 deletions(-) diff --git a/copro/xydata.py b/copro/xydata.py index a35327a..1b57f16 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -45,8 +45,7 @@ def create_XY( conflict_gdf: gpd.GeoDataFrame, ) -> Tuple[np.ndarray, np.ndarray]: """Top-level function to create the X-array and Y-array. 
- If the XY-data was pre-computed and specified in cfg-file, the data is loaded. - If not, variable values and conflict data are read from file and stored in array. + Variable values and conflict data are read from file and stored in array. The resulting array is by default saved as npy-format to file. Args: @@ -264,7 +263,7 @@ def _fill_XY( # noqa: R0912 Args: XY (dict): initiated, i.e. empty, XY-dictionary config (dict): Parsed configuration-settings of the model. - root_dir (str): Path to location of cfg-file. + root_dir (str): Path to location of yaml-file. conflict_data (gpd.GeoDataFrame): Geodataframe containing the selected conflicts. polygon_gdf (gpd.GeoDataFrame): Geodataframe containing the selected polygons. out_dir (path): Path to output folder. @@ -347,50 +346,9 @@ def _fill_XY( # noqa: R0912 else: - nc_fo = os.path.join( - root_dir, - config["general"]["input_dir"], - config["data"]["indicators"][key]["file"], + XY[key] = _read_data_from_netCDF( + root_dir, config, key, value, polygon_gdf, sim_year ) - click.echo(f"Reading data for indicator {key} from {nc_fo}.") - nc_ds = xr.open_dataset(nc_fo) - - if (np.dtype(nc_ds.time) == np.float32) or ( - np.dtype(nc_ds.time) == np.float64 - ): - data_series = value - data_list = variables.nc_with_float_timestamp( - polygon_gdf, config, root_dir, key, sim_year - ) - data_series = pd.concat( - [data_series, pd.Series(data_list)], - axis=0, - ignore_index=True, - ) - XY[key] = data_series - - elif np.dtype(nc_ds.time) == "datetime64[ns]": - data_series = value - data_list = variables.nc_with_continous_datetime_timestamp( - polygon_gdf, config, root_dir, key, sim_year - ) - data_series = pd.concat( - [data_series, pd.Series(data_list)], - axis=0, - ignore_index=True, - ) - XY[key] = data_series - - else: - raise ValueError( - "This file has an unsupported dtype for the time variable: {}".format( - os.path.join( - root_dir, - config.get("general", "input_dir"), - config.get("data", key), - ) - ) - ) click.echo("All data read.") @@ -399,6 +357,75 @@ def _fill_XY( # noqa: R0912 return df_out.to_numpy() +def _read_data_from_netCDF( + root_dir: str, + config: dict, + key: str, + value: pd.Series, + polygon_gdf: gpd.GeoDataFrame, + sim_year: int, +) -> pd.Series: + """Reads data from netCDF-file and appends it to the series of the XY-dictionary. + This happens per variable and simulation year. + Appends the extracted data to the series of the XY-dictionary. + + .. todo:: + Is the check for different time-dtypes necessary? + + Args: + root_dir (str): Path to location of yaml-file. + config (dict): Parsed configuration-settings of the model. + key (str): Variable name of feature for which data to be extracted. + value (pd.Series): Extracted feature values from previous years. + polygon_gdf (gpd.GeoDataFrame): Geodataframe containing the selected polygons. + sim_year (int): Simulation year. + + Returns: + pd.Series: Appended series containing the extracted feature values up to the current simulation year. 
+ """ + + nc_fo = os.path.join( + root_dir, + config["general"]["input_dir"], + config["data"]["indicators"][key]["file"], + ) + click.echo(f"Reading data for indicator {key} from {nc_fo}.") + nc_ds = xr.open_dataset(nc_fo) + + if (np.dtype(nc_ds.time) == np.float32) or (np.dtype(nc_ds.time) == np.float64): + data_series = value + data_list = variables.nc_with_float_timestamp( + polygon_gdf, config, root_dir, key, sim_year + ) + data_series = pd.concat( + [data_series, pd.Series(data_list)], + axis=0, + ignore_index=True, + ) + elif np.dtype(nc_ds.time) == "datetime64[ns]": + data_series = value + data_list = variables.nc_with_continous_datetime_timestamp( + polygon_gdf, config, root_dir, key, sim_year + ) + data_series = pd.concat( + [data_series, pd.Series(data_list)], + axis=0, + ignore_index=True, + ) + else: + raise ValueError( + "This file has an unsupported dtype for the time variable: {}".format( + os.path.join( + root_dir, + config.get("general", "input_dir"), + config.get("data", key), + ) + ) + ) + + return data_series + + def _split_XY_data(XY_arr: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Separates the XY-array into array containing information about variable values (X-array or sample data) and conflict data (Y-array or target data). From 998f209685e1f7fdaa640c473f004c9035d35a1a Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 12:58:54 +0200 Subject: [PATCH 06/15] target variable and estimator used determined in main script --- copro/models.py | 13 +++++---- copro/scripts/copro_runner.py | 13 +++++++-- copro/settings.py | 54 ++++++++++++++++++++++++++++++++++- 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/copro/models.py b/copro/models.py index 9cbcb80..51e8e15 100644 --- a/copro/models.py +++ b/copro/models.py @@ -17,7 +17,10 @@ def __init__( self, X: Union[np.ndarray, pd.DataFrame], Y: np.ndarray, - config: RawConfigParser, + estimator: Union[ + ensemble.RandomForestClassifier, ensemble.RandomForestRegressor + ], + config: dict, out_dir: str, n_jobs=2, verbose=0, @@ -27,7 +30,8 @@ def __init__( Args: X (np.ndarray, pd.DataFrame): array containing the variable values plus IDs and geometry information. Y (np.ndarray): array containing merely the binary conflict classifier data. - config (RawConfigParser): object containing the parsed configuration-settings of the model. + estimator (Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]): ML model. + config (dict): object containing the parsed configuration-settings of the model. out_dir (str): path to output folder. n_jobs (int, optional): Number of jobs to run in parallel. Defaults to 2. verbose (int, optional): Verbosity level. Defaults to 0. 
@@ -39,9 +43,7 @@ def __init__( self.scaler_all_data = self.scaler.fit( X[:, 2:] ) # NOTE: supposed to be used in projections - self.clf = ensemble.RandomForestClassifier( - n_estimators=1000, class_weight={1: 100}, random_state=42 - ) + self.estimator = estimator self.out_dir = out_dir self.n_jobs = n_jobs self.verbose = verbose @@ -108,6 +110,7 @@ def _n_run( MLmodel = machine_learning.MachineLearning( self.config, + self.estimator, ) # split X into training-set and test-set, scale training-set data diff --git a/copro/scripts/copro_runner.py b/copro/scripts/copro_runner.py index dc64662..cbf8393 100644 --- a/copro/scripts/copro_runner.py +++ b/copro/scripts/copro_runner.py @@ -44,12 +44,15 @@ def cli(cfg: click.Path, cores: int, verbose: int): click.echo(click.style("\nINFO: reference run started\n", fg="cyan")) + estimator = settings.define_model(config_REF) + target_var = settings.define_target_var(config_REF, estimator) + # - selecting conflicts and getting area-of-interest and aggregation level conflict_gdf, extent_active_polys_gdf, global_df = selection.select( config_REF, out_dir_REF, root_dir ) - XY_class = xydata.XYData(config_REF) + XY_class = xydata.XYData(config_REF, target_var) X, Y = XY_class.create_XY( out_dir=out_dir_REF, root_dir=root_dir, @@ -59,7 +62,13 @@ def cli(cfg: click.Path, cores: int, verbose: int): # - defining scaling and model algorithms ModelWorkflow = models.MainModel( - config=config_REF, X=X, Y=Y, out_dir=out_dir_REF, n_jobs=cores, verbose=verbose + config=config_REF, + X=X, + Y=Y, + estimator=estimator, + out_dir=out_dir_REF, + n_jobs=cores, + verbose=verbose, ) # - fit-transform on scaler to be used later during projections diff --git a/copro/settings.py b/copro/settings.py index b53d695..4155b2b 100644 --- a/copro/settings.py +++ b/copro/settings.py @@ -3,8 +3,9 @@ import numpy as np from configparser import RawConfigParser from shutil import copyfile -from typing import Tuple +from typing import Tuple, Union from copro import utils, io +from sklearn import ensemble import yaml @@ -118,6 +119,57 @@ def _collect_simulation_settings(config: dict, root_dir: click.Path) -> dict: return config_dict +def define_model( + config: dict, +) -> Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]: + """Defines model based on model configurations. + + Args: + config (dict): Parsed configuration-settings of the model. + + Returns: + model: the specified model instance. + """ + + if config["machine_learning"]["model"] in ["Classification", "C"]: + return ensemble.RandomForestClassifier(random_state=42) + if config["machine_learning"]["model"] in ["Regression", "R"]: + return ensemble.RandomForestRegressor(random_state=42) + raise ValueError( + "no supported model selected - \ + choose between Classification or Regression" + ) + + +def define_target_var( + config: dict, + estimator: Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor], +) -> Union[str, None]: + """Defines target variable of ML model. + A target variable needs to be specified for regression models. + For classification models, it can be provided in the configuration file. + If not, the default classification approach is used. + + Args: + config (dict): Parsed configuration-settings of the model. + estimator (Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]): ML estimator. + + Returns: + Union[str, None]: Either the target variable or `None`. 
+ """ + + # if target variable is specified, return it + if "target_var" in config["machine_learning"].keys(): + click.echo(f"Target variable is {config['machine_learning']['target_var']}.") + return config["machine_learning"]["target_var"] + # if not, but model is regression, raise error + if isinstance(estimator, ensemble.RandomForestRegressor): + raise ValueError("No target variable specified for regression model.") + # if not, but model is classification, return None and use default classification approach + click.echo("No target variable specified, using default classification approach.") + return None + + def determine_projection_period( config_REF: RawConfigParser, config_PROJ: RawConfigParser ) -> list: From 737445bb6409b6f3fd0226b1e119410da2467cb0 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 13:02:17 +0200 Subject: [PATCH 07/15] should belong to previous commit --- copro/machine_learning.py | 98 +++++++++------------------------------ 1 file changed, 21 insertions(+), 77 deletions(-) diff --git a/copro/machine_learning.py b/copro/machine_learning.py index e861041..682b282 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -7,24 +7,19 @@ import click from pathlib import Path from sklearn.model_selection import GridSearchCV, KFold -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor class MachineLearning: - def __init__(self, config: dict) -> None: + def __init__( + self, + config: dict, + estimator: Union[ + ensemble.RandomForestClassifier, ensemble.RandomForestRegressor + ], + ) -> None: self.config = config self.scaler = define_scaling(config) - self.estimator = define_model(config) - - if "target_var" in config["machine_learning"].keys(): - self.target_var = config["machine_learning"]["target_var"] - else: - if isinstance(self.estimator, RandomForestRegressor): - raise ValueError("No target variable specified for regression model.") - click.echo( - "No targe variable specified, using default classification approach." - ) - self.target_var = None + self.estimator = estimator def split_scale_train_test_split( self, X: Union[np.ndarray, pd.DataFrame], Y: np.ndarray @@ -213,71 +208,20 @@ def define_scaling( """ if config["machine_learning"]["scaler"] == "MinMaxScaler": - scaler = preprocessing.MinMaxScaler() - elif config["machine_learning"]["scaler"] == "StandardScaler": - scaler = preprocessing.StandardScaler() - elif config["machine_learning"]["scaler"] == "RobustScaler": - scaler = preprocessing.RobustScaler() - elif config["machine_learning"]["scaler"] == "QuantileTransformer": - scaler = preprocessing.QuantileTransformer(random_state=42) - else: - raise ValueError( - "no supported scaling-algorithm selected - \ - choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer" - ) - - click.echo(f"Chosen scaling method is {scaler}.") - - return scaler - - -def define_model( - config: dict, -) -> Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]: - """Defines model based on model configurations. - - Args: - config (dict): Parsed configuration-settings of the model. + return preprocessing.MinMaxScaler() + if config["machine_learning"]["scaler"] == "StandardScaler": + return preprocessing.StandardScaler() + if config["machine_learning"]["scaler"] == "RobustScaler": + return preprocessing.RobustScaler() + if config["machine_learning"]["scaler"] == "QuantileTransformer": + return preprocessing.QuantileTransformer(random_state=42) - Returns: - model: the specified model instance. 
- """ - - if config["machine_learning"]["model"] in ["Classification", "C"]: - return ensemble.RandomForestClassifier(random_state=42) - if config["machine_learning"]["model"] in ["Regression", "R"]: - return ensemble.RandomForestRegressor(random_state=42) raise ValueError( - "no supported model selected - \ - choose between Classification or Regression" + "no supported scaling-algorithm selected - \ + choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer" ) -def define_target_var( - config: dict, - estimator: Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor], -) -> Union[str, None]: - """Defines target variable of ML model. - A target variable needs to be specified for regression models. - For classification models, it can be provided in the configuration file. - If not, the default classification approach is used. - - Args: - config (dict): Parsed configuration-settings of the model. - estimator (Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]): ML estimator. - - Returns: - Union[str, None]: Either the target variable or `None`. - """ - - if "target_var" in config["machine_learning"].keys(): - return config["machine_learning"]["target_var"] - if isinstance(estimator, RandomForestRegressor): - raise ValueError("No target variable specified for regression model.") - click.echo("No targe variable specified, using default classification approach.") - return None - - def predictive( X: np.ndarray, estimator: ensemble.RandomForestClassifier, @@ -327,12 +271,12 @@ def predictive( def apply_gridsearchCV( - estimator: Union[RandomForestClassifier, RandomForestRegressor], + estimator: Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor], X_train: np.ndarray, y_train: np.ndarray, n_jobs=2, verbose=0, -) -> RandomForestClassifier: +) -> Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]: """Applies grid search to find the best hyperparameters for the RandomForestClassifier. Args: @@ -343,12 +287,12 @@ def apply_gridsearchCV( verbose (int, optional): Verbosity level. Defaults to 0. Returns: - RandomForestClassifier: Best estimator of the grid search. + Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]: Best estimator of the grid search. 
""" click.echo("Tuning hyperparameters with GridSearchCV.") # Define the parameter grid - if isinstance(estimator, RandomForestClassifier): + if isinstance(estimator, ensemble.RandomForestClassifier): param_grid = { "n_estimators": [50, 100, 200], "criterion": ["gini", "entropy"], From dd513117a3082c58bef390da5ce2031c668fcef2 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 13:03:18 +0200 Subject: [PATCH 08/15] first step towards flexible target_vars when extracting conflict (Y) data --- copro/xydata.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/copro/xydata.py b/copro/xydata.py index 1b57f16..0e54296 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -1,5 +1,5 @@ from copro import conflict, variables, nb, utils -from typing import Tuple +from typing import Tuple, Union import click import numpy as np import xarray as xr @@ -9,10 +9,11 @@ class XYData: - def __init__(self, config: dict): + def __init__(self, config: dict, target_var: str): self.XY_dict = {} self.__XY_dict_initiated__ = False self.config = config + self.target_var = target_var def _initiate_XY_data(self): @@ -29,7 +30,10 @@ def _initiate_XY_data(self): self.XY_dict[key] = pd.Series(dtype=float) self.XY_dict["conflict_t_min_1"] = pd.Series(dtype=bool) self.XY_dict["conflict_t_min_1_nb"] = pd.Series(dtype=float) - self.XY_dict["conflict"] = pd.Series(dtype=bool) + # TODO: somewhere a function needs to be added to cater different types of target variables + # dict key can remain "conflict" but the dtype should be adjusted as it may not be 0/1 anymore + # could be multi-label classification or regression + self.XY_dict["conflict"] = pd.Series() click.echo("The columns in the sample matrix used are:") for key in self.XY_dict: @@ -62,7 +66,13 @@ def create_XY( self._initiate_XY_data() # fill the dictionary and get array XY_arr = _fill_XY( - self.XY_dict, self.config, root_dir, conflict_gdf, polygon_gdf, out_dir + self.XY_dict, + self.config, + root_dir, + conflict_gdf, + self.target_var, + polygon_gdf, + out_dir, ) # save array to XY.npy out_dir click.echo( @@ -253,6 +263,7 @@ def _fill_XY( # noqa: R0912 config: dict, root_dir: click.Path, conflict_data: gpd.GeoDataFrame, + target_var: Union[str, None], polygon_gdf: gpd.GeoDataFrame, out_dir: click.Path, ) -> np.ndarray: @@ -265,6 +276,8 @@ def _fill_XY( # noqa: R0912 config (dict): Parsed configuration-settings of the model. root_dir (str): Path to location of yaml-file. conflict_data (gpd.GeoDataFrame): Geodataframe containing the selected conflicts. + target_var (str): Target variable of the ML model. Either a string or None. \ + Depending on target_var, the conflict data is read differently. polygon_gdf (gpd.GeoDataFrame): Geodataframe containing the selected polygons. out_dir (path): Path to output folder. @@ -292,6 +305,15 @@ def _fill_XY( # noqa: R0912 if key == "conflict": data_series = value + # TODO: guess for target_vars others than None, a dedicasted function is needed + if target_var is None: + data_list = conflict.conflict_in_year_bool( + config, conflict_data, polygon_gdf, sim_year, out_dir + ) + else: + raise NotImplementedError( + "Implementation of target_var did not happen yet." 
+ ) data_list = conflict.conflict_in_year_bool( config, conflict_data, polygon_gdf, sim_year, out_dir ) From 633175617eeeb64d07927ace6cbd2d236d4fe19d Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 13:10:15 +0200 Subject: [PATCH 09/15] added class docstrings --- copro/machine_learning.py | 7 +++++++ copro/models.py | 1 + copro/xydata.py | 9 ++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/copro/machine_learning.py b/copro/machine_learning.py index 682b282..805a1e9 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -17,6 +17,13 @@ def __init__( ensemble.RandomForestClassifier, ensemble.RandomForestRegressor ], ) -> None: + """Class for all ML related stuff. + Embedded in more top-level `models.MainModel()` class. + + Args: + config (dict): Parsed configuration-settings of the model. + estimator (Union[ ensemble.RandomForestClassifier, ensemble.RandomForestRegressor ]): ML model. + """ self.config = config self.scaler = define_scaling(config) self.estimator = estimator diff --git a/copro/models.py b/copro/models.py index 51e8e15..734e66c 100644 --- a/copro/models.py +++ b/copro/models.py @@ -26,6 +26,7 @@ def __init__( verbose=0, ): """Constructor for the MainModel class. + Under the hood, the class uses the `machine_learning.MachineLearning()` class to run the computations. Args: X (np.ndarray, pd.DataFrame): array containing the variable values plus IDs and geometry information. diff --git a/copro/xydata.py b/copro/xydata.py index 0e54296..d71c15d 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -9,7 +9,14 @@ class XYData: - def __init__(self, config: dict, target_var: str): + def __init__(self, config: dict, target_var: Union[str, None]): + """Collects feature (X) and target (Y) data for the model. + + Args: + config (dict): Parsed configuration-settings of the model. + target_var (Union[str, None]): Target variable of the ML model. Either a string or None. \ + Can be `None` for classification models, but needs to be specified for regression models. + """ self.XY_dict = {} self.__XY_dict_initiated__ = False self.config = config From 75e316ed78d0da3ccc60559095da71127cef4e07 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 13:36:23 +0200 Subject: [PATCH 10/15] constructing X and Y data as dataframes instead of arrays --- copro/xydata.py | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/copro/xydata.py b/copro/xydata.py index d71c15d..9c949ae 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -72,7 +72,7 @@ def create_XY( self._initiate_XY_data() # fill the dictionary and get array - XY_arr = _fill_XY( + XY_df = _fill_XY( self.XY_dict, self.config, root_dir, @@ -81,14 +81,16 @@ def create_XY( polygon_gdf, out_dir, ) - # save array to XY.npy out_dir + + # save dataframe as geodataframe to GeoPackage in out_dir click.echo( - f"Saving XY data by default to file {os.path.join(out_dir, 'XY.npy')}." + f"Saving XY data by default to file {os.path.join(out_dir, 'XY.gpkg')}." 
) - np.save(os.path.join(out_dir, "XY"), XY_arr) + XY_gdf = gpd.GeoDataFrame(XY_df, geometry="poly_geometry") + XY_gdf.to_file(os.path.join(out_dir, "XY.gpkg"), driver="GPKG") # split the XY data into sample data X and target values Y - X, Y = _split_XY_data(XY_arr) + X, Y = _split_XY_data(XY_df) return X, Y @@ -273,7 +275,7 @@ def _fill_XY( # noqa: R0912 target_var: Union[str, None], polygon_gdf: gpd.GeoDataFrame, out_dir: click.Path, -) -> np.ndarray: +) -> pd.DataFrame: """Fills the (XY-)dictionary with data for each variable and conflict for each polygon for each simulation year. The number of rows should therefore equal to number simulation years times number of polygons. At end of last simulation year, the dictionary is converted to a numpy-array. @@ -289,7 +291,7 @@ def _fill_XY( # noqa: R0912 out_dir (path): Path to output folder. Returns: - np.ndarray: Filled array containing the variable values (X) and binary conflict data (Y) plus meta-data. + pd.DataFrame: Dataframe containing the variable values (X) and binary conflict data (Y) plus meta-data. """ # go through all simulation years as specified in config-file @@ -381,9 +383,7 @@ def _fill_XY( # noqa: R0912 click.echo("All data read.") - df_out = pd.DataFrame.from_dict(XY) - - return df_out.to_numpy() + return pd.DataFrame.from_dict(XY) # .to_numpy() def _read_data_from_netCDF( @@ -455,37 +455,36 @@ def _read_data_from_netCDF( return data_series -def _split_XY_data(XY_arr: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def _split_XY_data(XY_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: """Separates the XY-array into array containing information about variable values (X-array or sample data) and conflict data (Y-array or target data). Thereby, the X-array also contains the information about unique identifier and polygon geometry. Args: - XY (array): array containing variable values and conflict data. - config (ConfigParser-object): object containing the parsed configuration-settings of the model. + XY_df (pd.DataFrame): array containing variable values and conflict data. Returns: - arrays: two separate arrays, the X-array and Y-array. + pd.DataFrame: X-array, i.e. array containing feature values. + pd.DataFrame: Y-array, i.e. array containing target values. """ - # convert array to dataframe for easier handling - XY_df = pd.DataFrame(XY_arr) - # fill all missing values with 0 - XY_df = XY_df.fillna(0) - # convert dataframe back to array - XY_df = XY_df.to_numpy() + # drop missing values + XY_df_noNaNs = XY_df.dropna() + click.echo( + f"Dropped missing values, which made up {100 * len(XY_df.isna() / len(XY_df))} percent of the data." + ) # get X data # since conflict is the last column, we know that all previous columns must be variable values - X = XY_df[:, :-1] + X_df = XY_df_noNaNs.iloc[:, :-1] # get Y data and convert to integer values - Y = XY_df[:, -1] - Y = Y.astype(int) + Y_df = XY_df_noNaNs.iloc[:, -1] + Y_df = Y_df.astype(int) - fraction_Y_1 = 100 * len(np.where(Y != 0)[0]) / len(Y) + fraction_Y_1 = 100 * len(Y_df[Y_df == 1]) / len(Y_df) click.echo( f"{round(fraction_Y_1, 2)} percent in the data corresponds to conflicts." 
) - return X, Y + return X_df, Y_df From 11329fbcd7fde9df32ed55ee184bf6ded4b89ba9 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 15:34:25 +0200 Subject: [PATCH 11/15] no log-scale support for now, more consistent treatment of polygons w/o feature data --- copro/scripts/copro_runner.py | 4 --- copro/variables.py | 60 +++++++++++++++-------------------- 2 files changed, 25 insertions(+), 39 deletions(-) diff --git a/copro/scripts/copro_runner.py b/copro/scripts/copro_runner.py index cbf8393..4a58150 100644 --- a/copro/scripts/copro_runner.py +++ b/copro/scripts/copro_runner.py @@ -5,10 +5,6 @@ import pandas as pd import os -import warnings - -warnings.filterwarnings("ignore") - @click.command() @click.argument("cfg", type=click.Path()) diff --git a/copro/variables.py b/copro/variables.py index 7dd0c63..51b4688 100644 --- a/copro/variables.py +++ b/copro/variables.py @@ -5,12 +5,8 @@ import rasterstats as rstats import numpy as np import os -import math -import click - import warnings - -warnings.filterwarnings("once") +import click def nc_with_float_timestamp( @@ -80,17 +76,14 @@ def nc_with_float_timestamp( # initialize output list list_out = [] # loop through all polygons in geo-dataframe and compute statistics, then append to output file - for i in range(len(extent_gdf)): - - # province i - prov = extent_gdf.iloc[i] + for _, row in extent_gdf.iterrows(): # compute zonal stats for this province # computes a value per polygon for all raster cells that are touched by polygon (all_touched=True) # if all_touched=False, only for raster cells with centre point in polygon are considered, # but this is problematic for very small polygons zonal_stats = rstats.zonal_stats( - prov.geometry, + row.geometry, nc_arr_vals, affine=affine, stats=stat_method, @@ -98,19 +91,19 @@ def nc_with_float_timestamp( ) val = zonal_stats[0][stat_method] - # # if specified, log-transform value + # if specified, log-transform value if ln_flag: - # works only if zonal stats is not None, i.e. if it's None it stays None - val_ln = np.log(val) - # in case log-transformed value results in -inf, replace with None - if val_ln == -math.inf: - val = np.log(val + 1) - else: - val = val_ln + raise NotImplementedError( + "Log-transformation for continuous datetime timestamp not yet implemented." + ) # warn if result is NaN - if val is math.nan: - warnings.warn("NaN computed!") + if val is None: + warnings.warn( + f"`None` computed for {config['data']['extent']['id']} \ + {row[config['data']['extent']['id']]}, setting to `np.nan`!" + ) + val = np.nan list_out.append(val) @@ -185,17 +178,14 @@ def nc_with_continous_datetime_timestamp( # initialize output list list_out = [] # loop through all polygons in geo-dataframe and compute statistics, then append to output file - for i in range(len(extent_gdf)): - - # province i - prov = extent_gdf.iloc[i] + for _, row in extent_gdf.iterrows(): # compute zonal stats for this province # computes a value per polygon for all raster cells that are touched by polygon (all_touched=True) # if all_touched=False, only for raster cells with centre point in polygon are considered, # but this is problematic for very small polygons zonal_stats = rstats.zonal_stats( - prov.geometry, + row.geometry, nc_arr_vals, affine=affine, stats=stat_method, @@ -203,19 +193,19 @@ def nc_with_continous_datetime_timestamp( ) val = zonal_stats[0][stat_method] - # # if specified, log-transform value + # if specified, log-transform value if ln_flag: - # works only if zonal stats is not None, i.e. 
if it's None it stays None - val_ln = np.log(val) - # in case log-transformed value results in -inf, replace with None - if val_ln == -math.inf: - val = np.log(val + 1) - else: - val = val_ln + raise NotImplementedError( + "Log-transformation for continuous datetime timestamp not yet implemented." + ) # warn if result is NaN - if val is math.nan: - warnings.warn("NaN computed!") + if val is None: + warnings.warn( + f"`None` computed for {config['data']['extent']['id']} \ + {row[config['data']['extent']['id']]}, setting to `np.nan`!" + ) + val = np.nan list_out.append(val) From 5e8fd9577992fed195d52875702607532315c2b2 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 16:19:09 +0200 Subject: [PATCH 12/15] removing all polygons with 1 or more NaNs --- copro/variables.py | 9 +++++---- copro/xydata.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/copro/variables.py b/copro/variables.py index 51b4688..d718a88 100644 --- a/copro/variables.py +++ b/copro/variables.py @@ -100,8 +100,8 @@ def nc_with_float_timestamp( # warn if result is NaN if val is None: warnings.warn( - f"`None` computed for {config['data']['extent']['id']} \ - {row[config['data']['extent']['id']]}, setting to `np.nan`!" + f"`None` computed for {config['data']['extent']['id']}" + f"{row[config['data']['extent']['id']]}, setting to `np.nan`!" ) val = np.nan @@ -202,9 +202,10 @@ def nc_with_continous_datetime_timestamp( # warn if result is NaN if val is None: warnings.warn( - f"`None` computed for {config['data']['extent']['id']} \ - {row[config['data']['extent']['id']]}, setting to `np.nan`!" + f"`None` computed for {config['data']['extent']['id']}" + f"{row[config['data']['extent']['id']]}, setting to `np.nan`!" ) + val = np.nan list_out.append(val) diff --git a/copro/xydata.py b/copro/xydata.py index 9c949ae..a5d962b 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -472,7 +472,7 @@ def _split_XY_data(XY_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: # drop missing values XY_df_noNaNs = XY_df.dropna() click.echo( - f"Dropped missing values, which made up {100 * len(XY_df.isna() / len(XY_df))} percent of the data." + f"Dropped missing values, which leaves {100 * len(XY_df_noNaNs) / (len(XY_df))} percent of the polygons." 
) # get X data From fa6224d057b45573342031a1c74b22821dffbb76 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 16:43:03 +0200 Subject: [PATCH 13/15] fully pd.dataframe support implemented --- copro/models.py | 4 ++-- copro/xydata.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/copro/models.py b/copro/models.py index 734e66c..e851656 100644 --- a/copro/models.py +++ b/copro/models.py @@ -16,7 +16,7 @@ class MainModel: def __init__( self, X: Union[np.ndarray, pd.DataFrame], - Y: np.ndarray, + Y: Union[np.ndarray, pd.DataFrame], estimator: Union[ ensemble.RandomForestClassifier, ensemble.RandomForestRegressor ], @@ -42,7 +42,7 @@ def __init__( self.config = config self.scaler = machine_learning.define_scaling(config) self.scaler_all_data = self.scaler.fit( - X[:, 2:] + X.iloc[:, 2:] ) # NOTE: supposed to be used in projections self.estimator = estimator self.out_dir = out_dir diff --git a/copro/xydata.py b/copro/xydata.py index a5d962b..337da72 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -54,7 +54,7 @@ def create_XY( root_dir: click.Path, polygon_gdf: gpd.GeoDataFrame, conflict_gdf: gpd.GeoDataFrame, - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Top-level function to create the X-array and Y-array. Variable values and conflict data are read from file and stored in array. The resulting array is by default saved as npy-format to file. @@ -66,8 +66,8 @@ def create_XY( conflict_gdf (geo-dataframe): geo-dataframe containing the selected conflicts. Returns: - np.ndarray: X-array containing variable values. - np.ndarray: Y-array containing conflict data. + pd.DataFrame: dataframe containing feature (X) values. + pd.DataFrame: dataframe containing conflict (Y) data. """ self._initiate_XY_data() From e07f5f3527fbfd98a34d5a60cb877d62da05b8a1 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 16:48:47 +0200 Subject: [PATCH 14/15] save only selected conflicts which fall in simulation period --- copro/selection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/copro/selection.py b/copro/selection.py index 8a4d9b9..ece3928 100644 --- a/copro/selection.py +++ b/copro/selection.py @@ -69,6 +69,11 @@ def _filter_conflict_properties( gpd.GeoDataFrame: geo-dataframe containing filtered entries. 
""" + gdf = gdf[ + (gdf.year >= config["general"]["y_start"]) + & (gdf.year <= config["general"]["y_end"]) + ] + # if not thresholding options are found, return the original dataframe if "thresholds" not in config["data"]["conflict"]: click.echo("No thresholding options found in configuration file.") From 3a053fe17c46d5ccc9c3386397de4e287fecaba4 Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Mon, 26 Aug 2024 20:08:00 +0200 Subject: [PATCH 15/15] finetuning print output --- copro/evaluation.py | 5 ++--- copro/xydata.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/copro/evaluation.py b/copro/evaluation.py index 49956aa..c4469ca 100644 --- a/copro/evaluation.py +++ b/copro/evaluation.py @@ -88,11 +88,10 @@ def polygon_model_accuracy( # - per polygon ID, compute sum of all conflict data points and add to dataframe if not make_proj: - df_count["nr_observed_conflicts"] = df.y_test.groupby(df.ID).sum() + df_count["nr_observed_conflicts"] = df.y_test.groupby(df.ID).sum().astype(float) # - per polygon ID, compute sum of all conflict data points and add to dataframe - df_count["nr_predicted_conflicts"] = df.y_pred.groupby(df.ID).sum() - + df_count["nr_predicted_conflicts"] = df.y_pred.groupby(df.ID).sum().astype(float) # - per polygon ID, compute average probability that conflict occurs df_count["min_prob_1"] = pd.to_numeric(df.y_prob_1).groupby(df.ID).min() df_count["probability_of_conflict"] = ( diff --git a/copro/xydata.py b/copro/xydata.py index 337da72..10bde9c 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -472,7 +472,8 @@ def _split_XY_data(XY_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: # drop missing values XY_df_noNaNs = XY_df.dropna() click.echo( - f"Dropped missing values, which leaves {100 * len(XY_df_noNaNs) / (len(XY_df))} percent of the polygons." + f"Dropped missing values, which leaves {len(XY_df_noNaNs)} out of {(len(XY_df))} data points." + f"Number of polygons with data is now {XY_df_noNaNs.poly_ID.nunique()} out of {XY_df.poly_ID.nunique()}." ) # get X data