Add ACLED support and use "yaml" file type for model settings #199

Merged · 3 commits · Aug 26, 2024
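
For context on the "yaml" half of this PR: the model settings move from an INI-style cfg-file parsed with RawConfigParser to a YAML file parsed into nested dicts. Below is a minimal sketch of such a settings file; the key names are taken from the lookups in the diffs (general.output_dir, general.y_end, machine_learning.scaler, machine_learning.train_fraction, machine_learning.n_runs, and the optional projections section), while the values and the projections entry are invented for illustration.

import yaml  # PyYAML

# Hypothetical settings; key names mirror the config lookups in this PR.
SETTINGS = """
general:
  output_dir: ./OUT
  y_end: 2015
machine_learning:
  scaler: MinMaxScaler
  train_fraction: 0.7
  n_runs: 10
projections:              # optional; copro_runner.py skips projections when absent
  proj_2050: ./proj_2050.yaml
"""

config = yaml.safe_load(SETTINGS)
assert config["machine_learning"]["scaler"] == "MinMaxScaler"
print(config["general"]["y_end"])  # 2015, already an int

One nicety of the switch: YAML preserves scalar types, so the getint()/getfloat() calls required by RawConfigParser become plain dict lookups throughout.
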
31 changes: 17 additions & 14 deletions copro/conflict.py
@@ -16,7 +16,8 @@ def conflict_in_year_bool(
extent_gdf: gpd.GeoDataFrame,
sim_year: int,
out_dir: click.Path,
identifier="watprovID",
poly_identifier="watprovID",
conflict_identifier="event_id_cnty",
) -> list:
"""Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not.

@@ -46,17 +47,17 @@

# determine the aggregated amount of fatalities in one region (e.g. water province)
fatalities_per_poly = (
data_merged["best"]
.groupby(data_merged[identifier])
.sum()
data_merged[conflict_identifier]
.groupby(data_merged[poly_identifier])
.count()
.to_frame()
.rename(columns={"best": "total_fatalities"})
.rename(columns={conflict_identifier: "total_fatalities"})
)

out_dir = os.path.join(out_dir, "files")
Path.mkdir(Path(out_dir), exist_ok=True)

- if sim_year == config.getint("settings", "y_end"):
+ if sim_year == config["general"]["y_end"]:
_store_boolean_conflict_data_to_csv(
fatalities_per_poly, extent_gdf, sim_year, out_dir
)
@@ -65,7 +66,7 @@
# if so, this means that there was conflict and thus assign value 1
list_out = []
for i, _ in extent_gdf.iterrows():
- i_poly = extent_gdf.iloc[i][identifier]
+ i_poly = extent_gdf.iloc[i][poly_identifier]
if i_poly in fatalities_per_poly.index.values:
list_out.append(1)
else:
@@ -80,7 +81,8 @@ def conflict_in_previous_year_bool(
sim_year: int,
check_neighbors: bool = False,
neighboring_matrix: Union[None, pd.DataFrame] = None,
identifier="watprovID",
poly_identifier="watprovID", # TODO: no kwarg, should come from config
conflict_identifier="event_id_cnty", # TODO: no kwarg, should come from config
) -> list:
"""Creates a list for each timestep with boolean information whether
a conflict took place in the previous year in a polygon or not.
@@ -104,27 +106,28 @@
else:
click.echo("Checking for conflict event in polygon at t-1")

# TODO: screening whether there is any conflict data in sim_year should be done earlier
# get conflicts at t-1
- temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year - 1]
+ temp_sel_year = conflict_gdf[conflict_gdf.year == sim_year - 1]
if temp_sel_year.empty:
warnings.warn(
f"No conflicts were found in sampled conflict data set for year {sim_year - 1}."
)

# merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
data_merged = gpd.sjoin(temp_sel_year, extent_gdf)

conflicts_per_poly = (
- data_merged.id.groupby(data_merged[identifier])
+ data_merged[conflict_identifier]
+ .groupby(data_merged[poly_identifier])
.count()
.to_frame()
.rename(columns={"id": "conflict_count"})
.rename(columns={conflict_identifier: "conflict_count"})
)
# NOTE: WORKS UNTIL HERE

# loop through all polygons
list_out = []
for i in range(len(extent_gdf)):
- i_poly = extent_gdf[identifier].iloc[i]
+ i_poly = extent_gdf[poly_identifier].iloc[i]
# check if polygon is in list with conflict polygons
if i_poly in conflicts_per_poly.index.values:
# if so, check if neighboring polygons contain conflict and assign boolean value
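
The functional change in this file: per polygon, conflict is now aggregated by counting ACLED event IDs (event_id_cnty) rather than summing the UCDP "best" fatality estimate. A toy pandas sketch of the before/after aggregation, with invented values and made-up event IDs:

import pandas as pd

data_merged = pd.DataFrame({
    "watprovID": [1, 1, 2],                               # polygon identifier
    "event_id_cnty": ["NIG1001", "NIG1002", "NIG2001"],   # ACLED event IDs (made up)
    "best": [10, 5, 3],                                   # UCDP best fatality estimate
})

# old behaviour: total fatalities per polygon
old = data_merged["best"].groupby(data_merged["watprovID"]).sum()
print(old.to_dict())  # {1: 15, 2: 3}

# new behaviour: number of conflict events per polygon
new = (
    data_merged["event_id_cnty"]
    .groupby(data_merged["watprovID"])
    .count()
    .to_frame()
    .rename(columns={"event_id_cnty": "total_fatalities"})
)
print(new["total_fatalities"].to_dict())  # {1: 2, 2: 1}

Note that the count column keeps the old total_fatalities name, so downstream code now reads event counts under a fatalities label.
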
9 changes: 4 additions & 5 deletions copro/io.py
@@ -1,20 +1,19 @@
import pandas as pd
import numpy as np
from typing import Union
- from configparser import RawConfigParser
from pathlib import Path
import os
import click


def make_and_collect_output_dirs(
- config: RawConfigParser, root_dir: click.Path, config_dict: dict
+ config: dict, root_dir: click.Path, config_dict: dict
) -> dict:
"""Creates the output folder at location specfied in cfg-file
"""Creates the output folder at location specfied in YAML-file
and returns dictionary with config-objects and out-dir per run.

Args:
- config (RawConfigParser): object containing the parsed configuration-settings of the model.
+ config (dict): dictionary containing the parsed configuration-settings of the model.
root_dir (Path): absolute path to location of configurations-file
config_dict (dict): dictionary containing config-objects for reference run and all projection.

@@ -23,7 +22,7 @@ def make_and_collect_output_dirs(
"""

# get path to main output directory as specified in cfg-file
- out_dir = os.path.join(root_dir, config.get("general", "output_dir"))
+ out_dir = os.path.join(root_dir, config["general"]["output_dir"])
click.echo(f"Saving output to main output folder {out_dir}.")

# initalize list for all out-dirs
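
The mechanical pattern in this file (and in the files below) is the cfg-to-YAML access migration. A small sketch contrasting the two, with an invented output_dir value:

from configparser import RawConfigParser

# old: INI-style cfg-file, typed getters on a parser object
ini = RawConfigParser()
ini.read_string("[general]\noutput_dir = ./OUT\n")
out_old = ini.get("general", "output_dir")

# new: YAML parses straight into nested dicts
config = {"general": {"output_dir": "./OUT"}}
out_new = config["general"]["output_dir"]

assert out_old == out_new == "./OUT"
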
40 changes: 18 additions & 22 deletions copro/machine_learning.py
@@ -2,17 +2,16 @@
import pickle
import pandas as pd
import numpy as np
- from configparser import RawConfigParser
from sklearn import ensemble, preprocessing, model_selection, inspection
from typing import Union, Tuple
import click
from pathlib import Path
- from sklearn.model_selection import GridSearchCV
+ from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier


class MachineLearning:
- def __init__(self, config: RawConfigParser) -> None:
+ def __init__(self, config: dict) -> None:
self.config = config
self.scaler = define_scaling(config)
self.clf = ensemble.RandomForestClassifier(random_state=42)
@@ -49,7 +48,7 @@ def split_scale_train_test_split(
X_train, X_test, y_train, y_test = model_selection.train_test_split(
X_cs,
Y,
- test_size=1 - self.config.getfloat("machine_learning", "train_fraction"),
+ test_size=1 - self.config["machine_learning"]["train_fraction"],
)

# for training-set and test-set, split in ID, geometry, and values
@@ -77,7 +76,7 @@ def fit_predict(
tune_hyperparameters=False,
n_jobs=2,
verbose=0,
- ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Fits classifier based on training-data and makes predictions.
The fitted classifier is dumped to file with pickle to be used again during projections.
Makes prediction with test-data including probabilities of those predictions.
@@ -96,7 +95,7 @@
Returns:
np.ndarray: array with the predictions made.
np.ndarray: array with probabilities of the predictions made.
- pd.DataFrame: dataframe containing permutation importances of variables.
+ np.ndarray: dataframe containing permutation importances of variables.
"""

if tune_hyperparameters:
@@ -117,11 +116,8 @@
random_state=42,
n_jobs=n_jobs,
)
- sorted_importances_idx = perm_importances.importances_mean.argsort()
- perm_importances_df = pd.DataFrame(
-     perm_importances.importances[sorted_importances_idx].T,
-     # columns=X_train.columns[sorted_importances_idx],
- )
+ # transpose because by default features are in rows
+ perm_importances_arr = perm_importances["importances"].T

# create folder to store all classifiers with pickle
clf_pickle_rep = os.path.join(out_dir, "clfs")
@@ -137,16 +133,16 @@
# make prediction of probability
y_prob = fitted_estimator.predict_proba(X_test)

- return y_pred, y_prob, perm_importances_df
+ return y_pred, y_prob, perm_importances_arr


- def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]:
+ def load_clfs(config: dict, out_dir: str) -> list[str]:
"""Loads the paths to all previously fitted classifiers to a list.
Classifiers were saved to file in fit_predict().
With this list, the classifiers can be loaded again during projections.

Args:
- config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+ config (dict): Parsed configuration-settings of the model.
out_dir (path): path to output folder.

Returns:
Expand All @@ -155,7 +151,7 @@ def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]:

clfs = os.listdir(os.path.join(out_dir, "clfs"))

- if len(clfs) != config.getint("machine_learning", "n_runs"):
+ if len(clfs) != config["machine_learning"]["n_runs"]:
raise ValueError(
"Number of loaded classifiers does not match the specified number of runs in cfg-file!"
)
@@ -188,7 +184,7 @@ def _split_conflict_geom_data(


def define_scaling(
- config: RawConfigParser,
+ config: dict,
) -> Union[
preprocessing.MinMaxScaler,
preprocessing.StandardScaler,
@@ -198,19 +194,19 @@
"""Defines scaling method based on model configurations.

Args:
- config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+ config (dict): Parsed configuration-settings of the model.

Returns:
scaler: the specified scaling method instance.
"""

if config.get("machine_learning", "scaler") == "MinMaxScaler":
if config["machine_learning"]["scaler"] == "MinMaxScaler":
scaler = preprocessing.MinMaxScaler()
elif config.get("machine_learning", "scaler") == "StandardScaler":
elif config["machine_learning"]["scaler"] == "StandardScaler":
scaler = preprocessing.StandardScaler()
elif config.get("machine_learning", "scaler") == "RobustScaler":
elif config["machine_learning"]["scaler"] == "RobustScaler":
scaler = preprocessing.RobustScaler()
elif config.get("machine_learning", "scaler") == "QuantileTransformer":
elif config["machine_learning"]["scaler"] == "QuantileTransformer":
scaler = preprocessing.QuantileTransformer(random_state=42)
else:
raise ValueError(
@@ -308,7 +304,7 @@ def apply_gridsearchCV(
grid_search = GridSearchCV(
estimator=estimator,
param_grid=param_grid,
- cv=5,
+ cv=KFold(n_splits=5, shuffle=True, random_state=42),
n_jobs=n_jobs,
verbose=verbose,
scoring="roc_auc",
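
Two sklearn details in this file are worth spelling out. permutation_importance returns a dict-like Bunch whose "importances" entry has shape (n_features, n_repeats), hence the transpose before runs are stacked; and replacing the integer cv=5 (unshuffled folds, stratified for classifiers) with KFold(shuffle=True, random_state=42) randomizes fold membership reproducibly, at the cost of dropping the stratification the integer form implies. A self-contained sketch on toy data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV, KFold

X, y = make_classification(n_samples=120, n_features=4, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(X, y)

# Bunch supports dict-style access; importances are (n_features, n_repeats)
perm = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
print(perm["importances"].shape)    # (4, 10): features in rows
print(perm["importances"].T.shape)  # (10, 4): one row per repeat, ready to stack

# shuffled, reproducible cross-validation folds, as in apply_gridsearchCV
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={"n_estimators": [10, 50]},  # toy grid
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring="roc_auc",
)
grid.fit(X, y)
print(grid.best_params_)
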
24 changes: 11 additions & 13 deletions copro/models.py
@@ -58,7 +58,7 @@ def run(
Returns:
pd.DataFrame: Prediction dataframes.
pd.DataFrame: model output on polygon-basis.
- pd.DataFrame: containing permutation importances for all runs.
+ np.ndarray: containing permutation importances for all runs.
dict: evaluation dictionary.
"""

Expand All @@ -67,33 +67,31 @@ def run(
# - initializing output variables
out_X_df = pd.DataFrame()
out_y_df = pd.DataFrame()
- out_perm_importances_df = pd.DataFrame()
+ out_perm_importances_arr = np.array([]).reshape(0, self.X.shape[1] - 2)
out_dict = evaluation.init_out_dict()

click.echo("Training and testing machine learning model")
for n in range(number_runs):
click.echo(f"Run {n+1} of {number_runs}.")

# - run machine learning model and return outputs
- X_df, y_df, eval_dict, perm_importances_df_n = self._n_run(
+ X_df, y_df, eval_dict, perm_importances_arr_n = self._n_run(
run_nr=n, tune_hyperparameters=tune_hyperparameters
)

# - append per model execution
out_X_df = pd.concat([out_X_df, X_df], axis=0, ignore_index=True)
out_y_df = pd.concat([out_y_df, y_df], axis=0, ignore_index=True)
- out_perm_importances_df = pd.concat(
-     [out_perm_importances_df, perm_importances_df_n],
-     axis=0,
-     ignore_index=True,
+ out_perm_importances_arr = np.vstack(
+     [out_perm_importances_arr, perm_importances_arr_n]
)
out_dict = evaluation.fill_out_dict(out_dict, eval_dict)

- return out_X_df, out_y_df, out_perm_importances_df, out_dict
+ return out_X_df, out_y_df, out_perm_importances_arr, out_dict

def _n_run(
self, run_nr: int, tune_hyperparameters=False
- ) -> tuple[pd.DataFrame, pd.DataFrame, dict, pd.DataFrame]:
+ ) -> tuple[pd.DataFrame, pd.DataFrame, dict, np.ndarray]:
"""Runs workflow per specified number of runs.
The model workflow is executed for each classifier.

@@ -105,7 +103,7 @@ def _n_run(
pd.DataFrame: containing the test-data X-array values.
pd.DataFrame: containing model output on polygon-basis.
dict: dictionary containing evaluation metrics per simulation.
- pd.DataFrame: containing permutation importances for run n.
+ np.ndarray: containing permutation importances for run n.
"""

MLmodel = machine_learning.MachineLearning(
@@ -128,7 +126,7 @@ def _n_run(
X_df = pd.DataFrame(X_test)

# fit classifier and make prediction with test-set
- y_pred, y_prob, perm_importances_df_n = MLmodel.fit_predict(
+ y_pred, y_prob, perm_importances_arr_n = MLmodel.fit_predict(
X_train,
y_train,
X_test,
@@ -149,7 +147,7 @@ def _n_run(
X_test_ID, X_test_geom, y_test, y_pred, y_prob_0, y_prob_1
)

- return X_df, y_df, eval_dict, perm_importances_df_n
+ return X_df, y_df, eval_dict, perm_importances_arr_n

def run_prediction(
self,
@@ -180,7 +178,7 @@ def run_prediction(
clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF)

# going through each projection specified
for each_key, _ in config_REF.items("PROJ_files"):
for each_key, _ in config_REF.items():

# get config-object and out-dir per projection
click.echo(f"Loading config-object for projection run: {each_key}.")
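
The accumulation in run() now seeds an empty array with the right number of columns (self.X.shape[1] - 2, presumably dropping the ID and geometry columns) so that each run's (n_repeats, n_features) importances can be stacked with np.vstack. A shape-only sketch with invented sizes:

import numpy as np

n_features = 3  # stands in for self.X.shape[1] - 2
out = np.array([]).reshape(0, n_features)  # empty seed with matching width

for run in range(2):  # two toy "runs"
    run_importances = np.random.default_rng(run).random((10, n_features))
    out = np.vstack([out, run_importances])

print(out.shape)  # (20, 3): n_runs * n_repeats rows, one column per feature
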
31 changes: 17 additions & 14 deletions copro/scripts/copro_runner.py
@@ -2,6 +2,7 @@

import click
import numpy as np
+ import pandas as pd
import os

import warnings
@@ -62,17 +63,19 @@ def cli(cfg: click.Path, cores: int, verbose: int):
)

# - fit-transform on scaler to be used later during projections

- _, out_y_df, out_perm_importances_df, out_dict = ModelWorkflow.run(
-     config_REF.getint("machine_learning", "n_runs"), tune_hyperparameters=True
+ _, out_y_df, out_perm_importances_arr, out_dict = ModelWorkflow.run(
+     config_REF["machine_learning"]["n_runs"], tune_hyperparameters=True
)

# - save output to files
- out_perm_importances_df.columns = [
-     key
-     for key in XY_class.XY_dict
-     if key not in ["poly_ID", "poly_geometry", "conflict"]
- ]
+ out_perm_importances_df = pd.DataFrame(
+     data=out_perm_importances_arr,
+     columns=[
+         key
+         for key in XY_class.XY_dict
+         if key not in ["poly_ID", "poly_geometry", "conflict"]
+     ],
+ )
out_perm_importances_df.to_parquet(
os.path.join(out_dir_REF, "perm_importances.parquet")
)
@@ -84,7 +87,7 @@ def cli(cfg: click.Path, cores: int, verbose: int):
click.echo(
"Average {} of run with {} repetitions is {:0.3f}".format(
key,
config_REF.getint("machine_learning", "n_runs"),
config_REF["machine_learning"]["n_runs"],
np.mean(value),
)
)
@@ -97,10 +100,10 @@ def cli(cfg: click.Path, cores: int, verbose: int):

click.echo(click.style("\nINFO: reference run succesfully finished\n", fg="cyan"))

click.echo(click.style("INFO: starting projections\n", fg="cyan"))

# - running prediction runs
# TODO: scaler_fitted is now not part of the class
ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf)
if "projections" in config_REF.keys():
click.echo(click.style("INFO: starting projections\n", fg="cyan"))
# - running prediction runs
# TODO: scaler_fitted is now not part of the class
ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf)

click.echo(click.style("\nINFO: all projections succesfully finished\n", fg="cyan"))