Better NaN handling and dataframe support #200

Merged · 15 commits · Aug 27, 2024

Changes from all commits

5 changes: 2 additions & 3 deletions copro/evaluation.py
@@ -88,11 +88,10 @@ def polygon_model_accuracy(

# - per polygon ID, compute sum of all conflict data points and add to dataframe
if not make_proj:
df_count["nr_observed_conflicts"] = df.y_test.groupby(df.ID).sum()
df_count["nr_observed_conflicts"] = df.y_test.groupby(df.ID).sum().astype(float)

# - per polygon ID, compute sum of all predicted conflict data points and add to dataframe
df_count["nr_predicted_conflicts"] = df.y_pred.groupby(df.ID).sum()

df_count["nr_predicted_conflicts"] = df.y_pred.groupby(df.ID).sum().astype(float)
# - per polygon ID, compute average probability that conflict occurs
df_count["min_prob_1"] = pd.to_numeric(df.y_prob_1).groupby(df.ID).min()
df_count["probability_of_conflict"] = (
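A note on the `.astype(float)` casts above: a groupby-sum on an integer column yields an int64 series, which cannot hold NaN, so later steps that have to mark polygons with missing data run into silent upcasts. The cast makes the dtype explicit up front. A minimal pandas sketch with invented IDs (not data from this project):

```python
import pandas as pd

# toy stand-in for the per-polygon dataframe handled in polygon_model_accuracy()
df = pd.DataFrame({"ID": [1, 1, 2, 2], "y_test": [0, 1, 1, 1]})

counts_int = df.y_test.groupby(df.ID).sum()                 # dtype: int64
counts_flt = df.y_test.groupby(df.ID).sum().astype(float)   # dtype: float64

# a float series represents polygons without observations directly as NaN
counts_flt = counts_flt.reindex([1, 2, 3])  # polygon 3 has no data -> NaN
print(counts_flt)
```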
4 changes: 3 additions & 1 deletion copro/io.py
@@ -22,7 +22,9 @@ def make_and_collect_output_dirs(
"""

# get path to main output directory as specified in cfg-file
out_dir = os.path.join(root_dir, config["general"]["output_dir"])
out_dir = os.path.join(
root_dir, config["general"]["output_dir"], config["general"]["simulation_name"]
)
click.echo(f"Saving output to main output folder {out_dir}.")

# initialize list for all out-dirs
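With this change the simulation name becomes part of the output path, underneath the main output directory. A small sketch of the resulting layout, using an invented root path and configuration (only the two `general` keys shown are read by this function):

```python
import os

root_dir = "/data/copro_runs"  # hypothetical project root
config = {"general": {"output_dir": "OUT", "simulation_name": "reference_run"}}

out_dir = os.path.join(
    root_dir, config["general"]["output_dir"], config["general"]["simulation_name"]
)
print(out_dir)  # /data/copro_runs/OUT/reference_run
```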
118 changes: 70 additions & 48 deletions copro/machine_learning.py
@@ -7,14 +7,26 @@
import click
from pathlib import Path
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier


class MachineLearning:
def __init__(self, config: dict) -> None:
def __init__(
self,
config: dict,
estimator: Union[
ensemble.RandomForestClassifier, ensemble.RandomForestRegressor
],
) -> None:
"""Class for all ML related stuff.
Embedded in more top-level `models.MainModel()` class.

Args:
config (dict): Parsed configuration-settings of the model.
estimator (Union[ ensemble.RandomForestClassifier, ensemble.RandomForestRegressor ]): ML model.
"""
self.config = config
self.scaler = define_scaling(config)
self.clf = ensemble.RandomForestClassifier(random_state=42)
self.estimator = estimator

def split_scale_train_test_split(
self, X: Union[np.ndarray, pd.DataFrame], Y: np.ndarray
@@ -100,11 +112,11 @@ def fit_predict(

if tune_hyperparameters:
fitted_estimator = apply_gridsearchCV(
self.clf, X_train, y_train, n_jobs=n_jobs, verbose=verbose
self.estimator, X_train, y_train, n_jobs=n_jobs, verbose=verbose
)
else:
# fit the classifier with training data
fitted_estimator = self.clf.fit(X_train, y_train)
fitted_estimator = self.estimator.fit(X_train, y_train)

# compute permutation importance
click.echo("Computing permutation importance.")
@@ -120,12 +132,14 @@ def fit_predict(
perm_importances_arr = perm_importances["importances"].T

# create folder to store all classifiers with pickle
clf_pickle_rep = os.path.join(out_dir, "clfs")
Path.mkdir(Path(clf_pickle_rep), parents=True, exist_ok=True)
estimator_pickle_rep = os.path.join(out_dir, "estimators")
Path.mkdir(Path(estimator_pickle_rep), parents=True, exist_ok=True)

# save the fitted classifier to file via pickle.dump()
click.echo(f"Dumping classifier to {clf_pickle_rep}.")
with open(os.path.join(clf_pickle_rep, "clf_{}.pkl".format(run_nr)), "wb") as f:
click.echo(f"Dumping classifier to {estimator_pickle_rep}.")
with open(
os.path.join(estimator_pickle_rep, "estimator_{}.pkl".format(run_nr)), "wb"
) as f:
pickle.dump(fitted_estimator, f)

# make prediction
@@ -136,7 +150,7 @@ def fit_predict(
return y_pred, y_prob, perm_importances_arr


def load_clfs(config: dict, out_dir: str) -> list[str]:
def load_estimators(config: dict, out_dir: str) -> list[str]:
"""Loads the paths to all previously fitted classifiers to a list.
Classifiers were saved to file in fit_predict().
With this list, the classifiers can be loaded again during projections.
@@ -149,14 +163,14 @@ def load_clfs(config: dict, out_dir: str) -> list[str]:
list: list with file names of classifiers.
"""

clfs = os.listdir(os.path.join(out_dir, "clfs"))
estimators = os.listdir(os.path.join(out_dir, "estimators"))

if len(clfs) != config["machine_learning"]["n_runs"]:
if len(estimators) != config["machine_learning"]["n_runs"]:
raise ValueError(
"Number of loaded classifiers does not match the specified number of runs in cfg-file!"
)

return clfs
return estimators


def _split_conflict_geom_data(
@@ -201,27 +215,23 @@ def define_scaling(
"""

if config["machine_learning"]["scaler"] == "MinMaxScaler":
scaler = preprocessing.MinMaxScaler()
elif config["machine_learning"]["scaler"] == "StandardScaler":
scaler = preprocessing.StandardScaler()
elif config["machine_learning"]["scaler"] == "RobustScaler":
scaler = preprocessing.RobustScaler()
elif config["machine_learning"]["scaler"] == "QuantileTransformer":
scaler = preprocessing.QuantileTransformer(random_state=42)
else:
raise ValueError(
"no supported scaling-algorithm selected - \
choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer"
)

click.echo(f"Chosen scaling method is {scaler}.")

return scaler
return preprocessing.MinMaxScaler()
if config["machine_learning"]["scaler"] == "StandardScaler":
return preprocessing.StandardScaler()
if config["machine_learning"]["scaler"] == "RobustScaler":
return preprocessing.RobustScaler()
if config["machine_learning"]["scaler"] == "QuantileTransformer":
return preprocessing.QuantileTransformer(random_state=42)

raise ValueError(
"no supported scaling-algorithm selected - \
choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer"
)


def predictive(
X: np.ndarray,
clf: ensemble.RandomForestClassifier,
estimator: ensemble.RandomForestClassifier,
scaler: Union[
preprocessing.MinMaxScaler,
preprocessing.StandardScaler,
@@ -236,7 +246,7 @@ def predictive(

Args:
X (np.ndarray): array containing the variable values plus unique identifier and geometry information.
clf (RandomForestClassifier): the fitted RandomForestClassifier.
estimator (RandomForestClassifier): the fitted RandomForestClassifier.
scaler (scaler): the fitted specified scaling method instance.

Returns:
@@ -251,10 +261,10 @@ def predictive(
X_ft = scaler.transform(X_data)

# make projection with transformed data
y_pred = clf.predict(X_ft)
y_pred = estimator.predict(X_ft)

# predict probabilities of outcomes
y_prob = clf.predict_proba(X_ft)
y_prob = estimator.predict_proba(X_ft)
y_prob_0 = y_prob[:, 0] # probability to predict 0
y_prob_1 = y_prob[:, 1] # probability to predict 1

@@ -268,37 +278,49 @@


def apply_gridsearchCV(
estimator: RandomForestClassifier,
estimator: Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor],
X_train: np.ndarray,
y_train: np.ndarray,
n_jobs=2,
verbose=0,
) -> RandomForestClassifier:
) -> Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]:
"""Applies grid search to find the best hyperparameters for the RandomForestClassifier.

Args:
estimator (RandomForestClassifier): Estimator to be used in the grid search.
estimator (Union[RandomForestClassifier, RandomForestRegressor]): Estimator to be used in the grid search.
X_train (np.ndarray): Feature matrix.
y_train (np.ndarray): Target vector.
n_jobs (int, optional): Number of cores to be used. Defaults to 2.
verbose (int, optional): Verbosity level. Defaults to 0.

Returns:
RandomForestClassifier: Best estimator of the grid search.
Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]: Best estimator of the grid search.
"""

click.echo("Tuning hyperparameters with GridSearchCV.")
# Define the parameter grid
param_grid = {
"n_estimators": [50, 100, 200],
"criterion": ["gini", "entropy"],
"min_impurity_decrease": [0, 0.5, 1],
"max_features": ("sqrt", "log2"),
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
"class_weight": [{1: 75}, {1: 100}, {1: 150}],
# 'bootstrap': [True, False]
}
if isinstance(estimator, ensemble.RandomForestClassifier):
param_grid = {
"n_estimators": [50, 100, 200],
"criterion": ["gini", "entropy"],
"min_impurity_decrease": [0, 0.5, 1],
"max_features": ("sqrt", "log2"),
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
"class_weight": [{1: 75}, {1: 100}, {1: 150}],
# 'bootstrap': [True, False]
}
scoring = "roc_auc"
else:
param_grid = {
"n_estimators": [10, 50, 100],
"criterion": ("squared_error", "absolute_error", "friedman_mse"),
"max_features": ("sqrt", "log2"),
"min_samples_split": [2, 5, 20],
"min_impurity_decrease": [0, 0.5, 1],
"min_samples_leaf": [1, 5, 10],
}
scoring = "r2"

# Instantiate the grid search model
grid_search = GridSearchCV(
@@ -307,7 +329,7 @@ def apply_gridsearchCV(
cv=KFold(n_splits=5, shuffle=True, random_state=42),
n_jobs=n_jobs,
verbose=verbose,
scoring="roc_auc",
scoring=scoring,
)

# Fit the grid search to the data
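Because the estimator is now injected rather than hard-coded, the same `MachineLearning` class, and the tuning helper, which switches its parameter grid and scoring (roc_auc vs r2) on the estimator type, can be driven by a regressor. A rough usage sketch, assuming only the config keys referenced in this module; this is not code from the repository's tests:

```python
import numpy as np
from sklearn import ensemble
from copro import machine_learning

config = {"machine_learning": {"scaler": "MinMaxScaler", "n_runs": 1}}

# classification, as before ...
ml_clf = machine_learning.MachineLearning(
    config, estimator=ensemble.RandomForestClassifier(random_state=42)
)

# ... or regression, now handled by the same code path
ml_reg = machine_learning.MachineLearning(
    config, estimator=ensemble.RandomForestRegressor(random_state=42)
)

# grid search picks the regressor grid and r2 scoring automatically
# (toy data; exploring the full grid still takes a while)
X_train, y_train = np.random.rand(100, 4), np.random.rand(100)
best_estimator = machine_learning.apply_gridsearchCV(
    ml_reg.estimator, X_train, y_train, n_jobs=2, verbose=0
)
```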
18 changes: 11 additions & 7 deletions copro/models.py
@@ -16,18 +16,23 @@ class MainModel:
def __init__(
self,
X: Union[np.ndarray, pd.DataFrame],
Y: np.ndarray,
config: RawConfigParser,
Y: Union[np.ndarray, pd.DataFrame],
estimator: Union[
ensemble.RandomForestClassifier, ensemble.RandomForestRegressor
],
config: dict,
out_dir: str,
n_jobs=2,
verbose=0,
):
"""Constructor for the MainModel class.
Under the hood, the class uses the `machine_learning.MachineLearning()` class to run the computations.

Args:
X (np.ndarray, pd.DataFrame): array containing the variable values plus IDs and geometry information.
Y (np.ndarray): array containing merely the binary conflict classifier data.
config (RawConfigParser): object containing the parsed configuration-settings of the model.
estimator (Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]): ML model.
config (dict): object containing the parsed configuration-settings of the model.
out_dir (str): path to output folder.
n_jobs (int, optional): Number of jobs to run in parallel. Defaults to 2.
verbose (int, optional): Verbosity level. Defaults to 0.
Expand All @@ -37,11 +42,9 @@ def __init__(
self.config = config
self.scaler = machine_learning.define_scaling(config)
self.scaler_all_data = self.scaler.fit(
X[:, 2:]
X.iloc[:, 2:]
) # NOTE: supposed to be used in projections
self.clf = ensemble.RandomForestClassifier(
n_estimators=1000, class_weight={1: 100}, random_state=42
)
self.estimator = estimator
self.out_dir = out_dir
self.n_jobs = n_jobs
self.verbose = verbose
@@ -108,6 +111,7 @@ def _n_run(

MLmodel = machine_learning.MachineLearning(
self.config,
self.estimator,
)

# split X into training-set and test-set, scale training-set data
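`MainModel` likewise receives the estimator from the caller, so nothing inside the class decides between classification and regression anymore; note also that `X` is now treated as a dataframe (the scaler is fitted on `X.iloc[:, 2:]`, i.e. everything after the ID and geometry columns). A minimal construction sketch with placeholder data and a hypothetical output folder:

```python
import numpy as np
import pandas as pd
from sklearn import ensemble
from copro import models

# placeholder features: first two columns are ID and geometry, the rest are variables
X = pd.DataFrame({
    "ID": [1, 2, 3, 4],
    "geometry": [None, None, None, None],
    "var_a": [0.1, 0.4, 0.2, 0.9],
    "var_b": [1.0, 0.3, 0.8, 0.5],
})
Y = np.array([0, 1, 0, 1])

config = {"machine_learning": {"scaler": "MinMaxScaler", "n_runs": 1}}

workflow = models.MainModel(
    X=X,
    Y=Y,
    estimator=ensemble.RandomForestClassifier(random_state=42),
    config=config,
    out_dir="./OUT/example_run",  # hypothetical output folder
    n_jobs=2,
    verbose=0,
)
```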
17 changes: 11 additions & 6 deletions copro/scripts/copro_runner.py
@@ -5,10 +5,6 @@
import pandas as pd
import os

import warnings

warnings.filterwarnings("ignore")


@click.command()
@click.argument("cfg", type=click.Path())
@@ -44,12 +40,15 @@ def cli(cfg: click.Path, cores: int, verbose: int):

click.echo(click.style("\nINFO: reference run started\n", fg="cyan"))

estimator = settings.define_model(config_REF)
target_var = settings.define_target_var(config_REF, estimator)

# - selecting conflicts and getting area-of-interest and aggregation level
conflict_gdf, extent_active_polys_gdf, global_df = selection.select(
config_REF, out_dir_REF, root_dir
)

XY_class = xydata.XYData(config_REF)
XY_class = xydata.XYData(config_REF, target_var)
X, Y = XY_class.create_XY(
out_dir=out_dir_REF,
root_dir=root_dir,
@@ -59,7 +58,13 @@

# - defining scaling and model algorithms
ModelWorkflow = models.MainModel(
config=config_REF, X=X, Y=Y, out_dir=out_dir_REF, n_jobs=cores, verbose=verbose
config=config_REF,
X=X,
Y=Y,
estimator=estimator,
out_dir=out_dir_REF,
n_jobs=cores,
verbose=verbose,
)

# - fit-transform on scaler to be used later during projections
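The runner now obtains the estimator and the target variable via `settings.define_model` and `settings.define_target_var` (their implementations are not part of this diff) and passes a plain dict as configuration. Collecting the keys this pull request reads from that dict gives roughly the following shape; the values, and the assumption that the cfg-file is parsed into nested sections like this, are illustrative only:

```python
# Illustrative sketch of the parsed configuration dict -- key names are the ones
# accessed in this diff, values are invented.
config_REF = {
    "general": {
        "output_dir": "OUT",             # io.make_and_collect_output_dirs
        "simulation_name": "reference",  # io.make_and_collect_output_dirs
        "y_start": 2000,                 # selection._filter_conflict_properties
        "y_end": 2015,
    },
    "machine_learning": {
        "scaler": "MinMaxScaler",        # machine_learning.define_scaling
        "n_runs": 10,                    # machine_learning.load_estimators
    },
    "data": {
        "conflict": {
            "thresholds": {              # selection._filter_conflict_properties
                "best": {"vmin": 25},    # hypothetical column name and limit
            }
        }
    },
}
```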
14 changes: 11 additions & 3 deletions copro/selection.py
@@ -69,17 +69,25 @@ def _filter_conflict_properties(
gpd.GeoDataFrame: geo-dataframe containing filtered entries.
"""

gdf = gdf[
(gdf.year >= config["general"]["y_start"])
& (gdf.year <= config["general"]["y_end"])
]

# if no thresholding options are found, return the original dataframe
if "thresholds" not in config["data"]["conflict"]:
click.echo("No thresholding options found in configuration file.")
return gdf

# go through all criteria
# otherwise, go through all variables for which thresholding is specified
for key, value in config["data"]["conflict"]["thresholds"].items():

# if variable is not found in the dataframe, skip it
if key not in gdf.columns:
warnings.warn(
f"{key} is not found in geodataframe columns, will be skipped."
f"{key} is not found in geodataframe columns, thresholding be skipped."
)
# otherwise, check which option is specified and apply it
else:
click.echo(f"Tresholding conflict data on {key}.")
for v, k in value.items():
@@ -93,7 +101,7 @@ def _filter_conflict_properties(
click.echo(f"Selecting datapoints less or equal to {k}.")
gdf = gdf[gdf[key] <= k]
else:
raise ValueError(
warnings.warn(
f"{v} is not a recognized tresholding option - use 'values', 'vmin' or 'vmax'."
)

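To make the thresholding loop concrete: each entry under `thresholds` maps a conflict-data column to one or more of `values`, `vmin` or `vmax`. A toy pandas sketch of the intended effect (column names, limits, and the exact semantics of `values`/`vmin` are assumptions based on the option names, not taken from the project's documentation):

```python
import pandas as pd

# toy conflict records (plain DataFrame instead of a GeoDataFrame for brevity)
conflicts = pd.DataFrame({
    "year": [2009, 2012, 2015, 2021],
    "type_of_violence": [1, 2, 3, 1],
    "best": [5, 60, 300, 40],
})

thresholds = {
    "type_of_violence": {"values": [1, 2]},  # keep selected categories (assumed meaning)
    "best": {"vmin": 10},                    # keep entries >= 10 (assumed meaning)
}

# year window first, as in _filter_conflict_properties()
gdf = conflicts[(conflicts.year >= 2010) & (conflicts.year <= 2020)]

# then the per-column thresholds
for key, options in thresholds.items():
    for option, limit in options.items():
        if option == "values":
            gdf = gdf[gdf[key].isin(limit)]
        elif option == "vmin":
            gdf = gdf[gdf[key] >= limit]
        elif option == "vmax":
            gdf = gdf[gdf[key] <= limit]

print(gdf)  # only the 2012 record passes the year window and both thresholds
```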