Fix/projections (#201)

* option for either Regression or Classification model * fine-tuned selection of conflict data points * option to provide ML target variable * use simulation_name as output dir * data extraction from netCDF files in a separate function * target variable and estimator used determined in main script * should belong to previous commit * first step towards flexible target_vars when extracting conflict (Y) data * added class docstrings * constructing X and Y data as dataframes instead of arrays * no log-scale support for now, more consistent treatment of polygons w/o feature data * removing all polygons with 1 or more NaNs * fully pd.dataframe support implemented * save only selected conflicts which fall in simulation period * fine-tuning print output * no random state set for KFold in GridSearchCV to ensure all n models are fitted on different data * better handling of cores via command line * remove content of output dir to avoid conflicts with expected files * settings parsed for projections with new yaml file structure * updated docstring for load_estimators * fixed definition of projection period * reinitiated initiate_X_data function * saving files as GPKG instead of GeoJSON * no output for None output from rasterstats * make run_prediction() work * apply isort * no ML target var for now * fix Geopandas driver * corrected number of function arguments
JannisHoch · Dec 11, 2024 · ecdc385 · ecdc385
1 parent 338021e
commit ecdc385
Show file tree

Hide file tree

Showing 17 changed files with 463 additions and 387 deletions.
diff --git a/copro/conflict.py b/copro/conflict.py
@@ -1,13 +1,15 @@
-from copro import utils, nb
+import os
+import warnings
 from configparser import RawConfigParser
 from pathlib import Path
-from typing import Union, Literal
-import pandas as pd
+from typing import Literal, Union
+
+import click
 import geopandas as gpd
 import numpy as np
-import os
-import click
-import warnings
+import pandas as pd
+
+from copro import nb, utils
 
 
 def conflict_in_year_bool(

diff --git a/copro/evaluation.py b/copro/evaluation.py
@@ -1,8 +1,9 @@
-from sklearn import metrics
-import pandas as pd
-import geopandas as gpd
 from typing import Union
 
+import geopandas as gpd
+import pandas as pd
+from sklearn import metrics
+
 
 def init_out_dict(scores: Union[list[str], None] = None) -> dict:
     """Initiates the main model evaluation dictionary for a range of model metric scores.

diff --git a/copro/io.py b/copro/io.py
@@ -1,9 +1,11 @@
-import pandas as pd
-import numpy as np
-from typing import Union
-from pathlib import Path
 import os
+import shutil
+from pathlib import Path
+from typing import Union
+
 import click
+import numpy as np
+import pandas as pd
 
 
 def make_and_collect_output_dirs(
@@ -27,6 +29,10 @@ def make_and_collect_output_dirs(
     )
     click.echo(f"Saving output to main output folder {out_dir}.")
 
+    # If out_dir already exists, remove it entirely (directory and contents) so stale files cannot clash with new output
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
     # initialize list for all out-dirs
     all_out_dirs = []
     # create reference output dir '_REF' under main output dir

diff --git a/copro/machine_learning.py b/copro/machine_learning.py
@@ -1,11 +1,12 @@
 import os
 import pickle
-import pandas as pd
-import numpy as np
-from sklearn import ensemble, preprocessing, model_selection, inspection
-from typing import Union, Tuple
-import click
 from pathlib import Path
+from typing import Tuple, Union
+
+import click
+import numpy as np
+import pandas as pd
+from sklearn import ensemble, inspection, model_selection, preprocessing
 from sklearn.model_selection import GridSearchCV, KFold
 
 
@@ -151,23 +152,23 @@ def fit_predict(
 
 
 def load_estimators(config: dict, out_dir: str) -> list[str]:
-    """Loads the paths to all previously fitted classifiers to a list.
-    Classifiers were saved to file in fit_predict().
+    """Loads the paths to all previously fitted estimators to a list.
+    Estimators were saved to file in `fit_predict()`.
     With this list, the estimators can be loaded again during projections.
 
     Args:
         config (dict): Parsed configuration-settings of the model.
         out_dir (path): path to output folder.
 
     Returns:
-        list: list with file names of classifiers.
+        list: list with file names of estimators.
     """
 
     estimators = os.listdir(os.path.join(out_dir, "estimators"))
 
     if len(estimators) != config["machine_learning"]["n_runs"]:
         raise ValueError(
-            "Number of loaded classifiers does not match the specified number of runs in cfg-file!"
+            "Number of loaded estimators does not match the specified number of runs in reference yaml-file!"
         )
 
     return estimators
@@ -326,7 +327,7 @@ def apply_gridsearchCV(
     grid_search = GridSearchCV(
         estimator=estimator,
         param_grid=param_grid,
-        cv=KFold(n_splits=5, shuffle=True, random_state=42),
+        cv=KFold(n_splits=5, shuffle=True),
         n_jobs=n_jobs,
         verbose=verbose,
         scoring=scoring,

diff --git a/copro/models.py b/copro/models.py
@@ -1,15 +1,17 @@
-from copro import machine_learning, conflict, evaluation, utils, xydata, settings
+import os
+import pickle
 from configparser import RawConfigParser
-from sklearn import ensemble
-from sklearn.utils.validation import check_is_fitted
 from pathlib import Path
-import pandas as pd
-import numpy as np
-from typing import Union, Tuple
-import geopandas as gpd
+from typing import Tuple, Union
+
 import click
-import os
-import pickle
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+from sklearn import ensemble
+from sklearn.utils.validation import check_is_fitted
+
+from copro import conflict, evaluation, machine_learning, settings, utils, xydata
 
 
 class MainModel:
@@ -47,6 +49,7 @@ def __init__(
         self.estimator = estimator
         self.out_dir = out_dir
         self.n_jobs = n_jobs
+        click.echo(f"Number of jobs to run in parallel: {n_jobs}.")
         self.verbose = verbose
 
     def run(
@@ -179,26 +182,26 @@ def run_prediction(
         config_REF = main_dict["_REF"][0]
         out_dir_REF = main_dict["_REF"][1]
 
-        clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF)
+        estimators, all_y_df = _init_prediction_run(config_REF, out_dir_REF)
 
         # going through each projection specified
-        for each_key, _ in config_REF.items():
+        for projection, _ in config_REF["projections"].items():
 
             # get config-object and out-dir per projection
-            click.echo(f"Loading config-object for projection run: {each_key}.")
-            config_PROJ = main_dict[str(each_key)][0][0]
-            out_dir_PROJ = main_dict[str(each_key)][1]
+            click.echo(f"Loading config-object for projection run: {projection}.")
+            config_PROJ = main_dict[str(projection)][0][0]
+            out_dir_PROJ = main_dict[str(projection)][1]
 
-            click.echo(f"Storing output for this projections to folder {out_dir_PROJ}.")
+            click.echo(f"Storing output for this projection to folder {out_dir_PROJ}.")
             Path.mkdir(
-                Path(os.path.join(out_dir_PROJ, "clfs")), parents=True, exist_ok=True
+                Path(os.path.join(out_dir_PROJ, "estimators")),
+                parents=True,
+                exist_ok=True,
             )
 
             # get projection period for this projection
             # defined as all years starting from end of reference run until specified end of projections
-            projection_period = settings.determine_projection_period(
-                config_REF, config_PROJ
-            )
+            projection_period = settings.determine_projection_period(config_REF)
 
             # for this projection, go through all years
             for i, proj_year in enumerate(projection_period):
@@ -219,7 +222,7 @@ def run_prediction(
                                 out_dir_REF,
                                 "files",
                                 "conflicts_in_{}.csv".format(
-                                    config_REF.getint("settings", "y_end")
+                                    config_REF["general"]["y_end"]
                                 ),
                             )
                         )
@@ -229,7 +232,7 @@ def run_prediction(
                             out_dir_REF,
                             "files",
                             "conflicts_in_{}.csv".format(
-                                config_REF.getint("settings", "y_end")
+                                config_REF["general"]["y_end"]
                             ),
                         ),
                         index_col=0,
@@ -242,35 +245,37 @@ def run_prediction(
                 y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
 
                 # now load all classifiers created in the reference run
-                for clf in clfs:
+                for estimator in estimators:
 
                     # creating an individual output folder per classifier
                     if not os.path.isdir(
                         os.path.join(
                             os.path.join(
                                 out_dir_PROJ,
-                                "clfs",
-                                str(clf).rsplit(".", maxsplit=1)[0],
+                                "estimators",
+                                str(estimator).rsplit(".", maxsplit=1)[0],
                             )
                         )
                     ):
                         os.makedirs(
                             os.path.join(
                                 out_dir_PROJ,
-                                "clfs",
-                                str(clf).rsplit(".", maxsplit=1)[0],
+                                "estimators",
+                                str(estimator).rsplit(".", maxsplit=1)[0],
                             )
                         )
 
                     # load the pickled objects
                     # TODO: keep them in memory, i.e. after reading the clfs-folder above
-                    with open(os.path.join(out_dir_REF, "clfs", clf), "rb") as f:
+                    with open(
+                        os.path.join(out_dir_REF, "estimators", estimator), "rb"
+                    ) as f:
                         click.echo(
-                            "Loading classifier {} from {}".format(
-                                clf, os.path.join(out_dir_REF, "clfs")
+                            "Loading estimator {} from {}".format(
+                                estimator, os.path.join(out_dir_REF, "estimators")
                             )
                         )
-                        clf_obj = pickle.load(f)
+                        estimator_obj = pickle.load(f)
 
                     # for all other projection years than the first one,
                     # we need to read projected conflict from the previous projection year
@@ -279,17 +284,17 @@ def run_prediction(
                             "Reading previous conflicts from file {}".format(
                                 os.path.join(
                                     out_dir_PROJ,
-                                    "clfs",
-                                    str(clf),
+                                    "estimators",
+                                    str(estimator),
                                     "projection_for_{}.csv".format(proj_year - 1),
                                 )
                             )
                         )
                         conflict_data = pd.read_csv(
                             os.path.join(
                                 out_dir_PROJ,
-                                "clfs",
-                                str(clf).rsplit(".", maxsplit=1)[0],
+                                "estimators",
+                                str(estimator).rsplit(".", maxsplit=1)[0],
                                 "projection_for_{}.csv".format(proj_year - 1),
                             ),
                             index_col=0,
@@ -307,15 +312,15 @@ def run_prediction(
                     # here the data will be used to make projections with various classifiers
                     # returns the prediction based on one individual classifier
                     y_df_clf = machine_learning.predictive(
-                        X, clf_obj, self.scaler_all_data
+                        X, estimator_obj, self.scaler_all_data
                     )
 
                     # storing the projection per clf to be used in the following timestep
                     y_df_clf.to_csv(
                         os.path.join(
                             out_dir_PROJ,
-                            "clfs",
-                            str(clf).rsplit(".", maxsplit=1)[0],
+                            "estimators",
+                            str(estimator).rsplit(".", maxsplit=1)[0],
                             "projection_for_{}.csv".format(proj_year),
                         )
                     )
@@ -333,8 +338,9 @@ def run_prediction(
                     y_df, global_df, make_proj=True
                 )
                 gdf_hit.to_file(
-                    os.path.join(out_dir_PROJ, f"output_in_{proj_year}.geojson"),
-                    driver="GeoJSON",
+                    os.path.join(out_dir_PROJ, f"output_in_{proj_year}.gpkg"),
+                    driver="GPKG",
+                    crs="EPSG:4326",
                 )
 
             # create one major output dataframe containing all output for all projections with all classifiers
@@ -346,20 +352,21 @@ def run_prediction(
 def _init_prediction_run(
     config_REF: RawConfigParser, out_dir_REF: str
 ) -> Tuple[list, pd.DataFrame]:
-    """Initializes the prediction run by loading all classifiers created in the reference run.
+    """Initializes the prediction run by loading all estimators created in the reference run.
     Also initiates an empty dataframe to store the predictions.
 
     Args:
         config_REF (RawConfigParser): Reference configuration object.
         out_dir_REF (str): Output directory for reference run.
 
     Returns:
-        Tuple[list, pd.DataFrame]: List with classifiers and initiated empty dataframe for predictions.
+        list: List with estimators.
+        pd.DataFrame: Initiated empty dataframe for predictions.
     """
 
-    clfs = machine_learning.load_clfs(config_REF, out_dir_REF)
+    estimators = machine_learning.load_estimators(config_REF, out_dir_REF)
 
     # initiate output dataframe
     all_y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])
 
-    return clfs, all_y_df
+    return estimators, all_y_df
diff --git a/copro/nb.py b/copro/nb.py
@@ -1,7 +1,7 @@
 import click
-import pandas as pd
-import numpy as np
 import geopandas as gpd
+import numpy as np
+import pandas as pd
 
 
 def neighboring_polys(