Skip to content

Commit

Permalink
Fix/projections (#201)
Browse files Browse the repository at this point in the history
* option for either Regression or Classification model

* fine tuned selection of conflict data points

* option to provide ML target variable

* use simulation_name as output dir

* data extraction from netCDF files in a separate function

* target variable and estimator used determined in main script

* should belong to previous commit

* first step towards flexible target_vars when extracting conflict (Y) data

* added class docstrings

* constructing X and Y data as dataframes instead of arrays

* no log-scale support for now, more consistent treatment of polygons w/o feature data

* removing all polygons with 1 or more NaNs

* fully pd.dataframe support implemented

* save only selected conflicts which fall in simulation period

* finetuning print output

* no random state set for KFold in GridSearchCV to ensure all n models are fitted on different data

* better handling of cores via command line

* remove content of output dir to avoid conflicts with expected files

* settings parsed for projections with new yaml file structure

* updated docstring for load_estimators

* fixed definition of projection period

* reinstated initiate_X_data function

* saving files as GPKG instead of GeoJSON

* no output for None output from rasterstats

* make run_prediction() work

* apply isort

* no ML target var for now

* fix Geopandas driver

* corrected number of function arguments
  • Loading branch information
JannisHoch authored Dec 11, 2024
1 parent 338021e commit ecdc385
Show file tree
Hide file tree
Showing 17 changed files with 463 additions and 387 deletions.
14 changes: 8 additions & 6 deletions copro/conflict.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from copro import utils, nb
import os
import warnings
from configparser import RawConfigParser
from pathlib import Path
from typing import Union, Literal
import pandas as pd
from typing import Literal, Union

import click
import geopandas as gpd
import numpy as np
import os
import click
import warnings
import pandas as pd

from copro import nb, utils


def conflict_in_year_bool(
Expand Down
7 changes: 4 additions & 3 deletions copro/evaluation.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from sklearn import metrics
import pandas as pd
import geopandas as gpd
from typing import Union

import geopandas as gpd
import pandas as pd
from sklearn import metrics


def init_out_dict(scores: Union[list[str], None] = None) -> dict:
"""Initiates the main model evaluatoin dictionary for a range of model metric scores.
Expand Down
14 changes: 10 additions & 4 deletions copro/io.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import pandas as pd
import numpy as np
from typing import Union
from pathlib import Path
import os
import shutil
from pathlib import Path
from typing import Union

import click
import numpy as np
import pandas as pd


def make_and_collect_output_dirs(
Expand All @@ -27,6 +29,10 @@ def make_and_collect_output_dirs(
)
click.echo(f"Saving output to main output folder {out_dir}.")

# Check if out_dir exists and delete its contents if it does
if os.path.exists(out_dir):
shutil.rmtree(out_dir)

# initialize list for all out-dirs
all_out_dirs = []
# create reference output dir '_REF' under main output dir
Expand Down
21 changes: 11 additions & 10 deletions copro/machine_learning.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
import pickle
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing, model_selection, inspection
from typing import Union, Tuple
import click
from pathlib import Path
from typing import Tuple, Union

import click
import numpy as np
import pandas as pd
from sklearn import ensemble, inspection, model_selection, preprocessing
from sklearn.model_selection import GridSearchCV, KFold


Expand Down Expand Up @@ -151,23 +152,23 @@ def fit_predict(


def load_estimators(config: dict, out_dir: str) -> list[str]:
"""Loads the paths to all previously fitted classifiers to a list.
Classifiers were saved to file in fit_predict().
"""Loads the paths to all previously fitted estimators to a list.
Estimators were saved to file in `fit_predict()`.
With this list, the classifiers can be loaded again during projections.
Args:
config (dict): Parsed configuration-settings of the model.
out_dir (path): path to output folder.
Returns:
list: list with file names of classifiers.
list: list with file names of estimators.
"""

estimators = os.listdir(os.path.join(out_dir, "estimators"))

if len(estimators) != config["machine_learning"]["n_runs"]:
raise ValueError(
"Number of loaded classifiers does not match the specified number of runs in cfg-file!"
"Number of loaded estimators does not match the specified number of runs in reference yaml-file!"
)

return estimators
Expand Down Expand Up @@ -326,7 +327,7 @@ def apply_gridsearchCV(
grid_search = GridSearchCV(
estimator=estimator,
param_grid=param_grid,
cv=KFold(n_splits=5, shuffle=True, random_state=42),
cv=KFold(n_splits=5, shuffle=True),
n_jobs=n_jobs,
verbose=verbose,
scoring=scoring,
Expand Down
93 changes: 50 additions & 43 deletions copro/models.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
from copro import machine_learning, conflict, evaluation, utils, xydata, settings
import os
import pickle
from configparser import RawConfigParser
from sklearn import ensemble
from sklearn.utils.validation import check_is_fitted
from pathlib import Path
import pandas as pd
import numpy as np
from typing import Union, Tuple
import geopandas as gpd
from typing import Tuple, Union

import click
import os
import pickle
import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.utils.validation import check_is_fitted

from copro import conflict, evaluation, machine_learning, settings, utils, xydata


class MainModel:
Expand Down Expand Up @@ -47,6 +49,7 @@ def __init__(
self.estimator = estimator
self.out_dir = out_dir
self.n_jobs = n_jobs
click.echo(f"Number of jobs to run in parallel: {n_jobs}.")
self.verbose = verbose

def run(
Expand Down Expand Up @@ -179,26 +182,26 @@ def run_prediction(
config_REF = main_dict["_REF"][0]
out_dir_REF = main_dict["_REF"][1]

clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF)
estimators, all_y_df = _init_prediction_run(config_REF, out_dir_REF)

# going through each projection specified
for each_key, _ in config_REF.items():
for projection, _ in config_REF["projections"].items():

# get config-object and out-dir per projection
click.echo(f"Loading config-object for projection run: {each_key}.")
config_PROJ = main_dict[str(each_key)][0][0]
out_dir_PROJ = main_dict[str(each_key)][1]
click.echo(f"Loading config-object for projection run: {projection}.")
config_PROJ = main_dict[str(projection)][0][0]
out_dir_PROJ = main_dict[str(projection)][1]

click.echo(f"Storing output for this projections to folder {out_dir_PROJ}.")
click.echo(f"Storing output for this projection to folder {out_dir_PROJ}.")
Path.mkdir(
Path(os.path.join(out_dir_PROJ, "clfs")), parents=True, exist_ok=True
Path(os.path.join(out_dir_PROJ, "estimators")),
parents=True,
exist_ok=True,
)

# get projection period for this projection
# defined as all years starting from end of reference run until specified end of projections
projection_period = settings.determine_projection_period(
config_REF, config_PROJ
)
projection_period = settings.determine_projection_period(config_REF)

# for this projection, go through all years
for i, proj_year in enumerate(projection_period):
Expand All @@ -219,7 +222,7 @@ def run_prediction(
out_dir_REF,
"files",
"conflicts_in_{}.csv".format(
config_REF.getint("settings", "y_end")
config_REF["general"]["y_end"]
),
)
)
Expand All @@ -229,7 +232,7 @@ def run_prediction(
out_dir_REF,
"files",
"conflicts_in_{}.csv".format(
config_REF.getint("settings", "y_end")
config_REF["general"]["y_end"]
),
),
index_col=0,
Expand All @@ -242,35 +245,37 @@ def run_prediction(
y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])

# now load all classifiers created in the reference run
for clf in clfs:
for estimator in estimators:

# creating an individual output folder per classifier
if not os.path.isdir(
os.path.join(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
)
)
):
os.makedirs(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
)
)

# load the pickled objects
# TODO: keep them in memory, i.e. after reading the clfs-folder above
with open(os.path.join(out_dir_REF, "clfs", clf), "rb") as f:
with open(
os.path.join(out_dir_REF, "estimators", estimator), "rb"
) as f:
click.echo(
"Loading classifier {} from {}".format(
clf, os.path.join(out_dir_REF, "clfs")
"Loading estimator {} from {}".format(
estimator, os.path.join(out_dir_REF, "estimators")
)
)
clf_obj = pickle.load(f)
estimator_obj = pickle.load(f)

# for all other projection years than the first one,
# we need to read projected conflict from the previous projection year
Expand All @@ -279,17 +284,17 @@ def run_prediction(
"Reading previous conflicts from file {}".format(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf),
"estimators",
str(estimator),
"projection_for_{}.csv".format(proj_year - 1),
)
)
)
conflict_data = pd.read_csv(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
"projection_for_{}.csv".format(proj_year - 1),
),
index_col=0,
Expand All @@ -307,15 +312,15 @@ def run_prediction(
# here the data will be used to make projections with various classifiers
# returns the prediction based on one individual classifier
y_df_clf = machine_learning.predictive(
X, clf_obj, self.scaler_all_data
X, estimator_obj, self.scaler_all_data
)

# storing the projection per clf to be used in the following timestep
y_df_clf.to_csv(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
"projection_for_{}.csv".format(proj_year),
)
)
Expand All @@ -333,8 +338,9 @@ def run_prediction(
y_df, global_df, make_proj=True
)
gdf_hit.to_file(
os.path.join(out_dir_PROJ, f"output_in_{proj_year}.geojson"),
driver="GeoJSON",
os.path.join(out_dir_PROJ, f"output_in_{proj_year}.gpkg"),
driver="GPKG",
crs="EPSG:4326",
)

# create one major output dataframe containing all output for all projections with all classifiers
Expand All @@ -346,20 +352,21 @@ def run_prediction(
def _init_prediction_run(
config_REF: RawConfigParser, out_dir_REF: str
) -> Tuple[list, pd.DataFrame]:
"""Initializes the prediction run by loading all classifiers created in the reference run.
"""Initializes the prediction run by loading all estimators created in the reference run.
Also initiates an empty dataframe to store the predictions.
Args:
config_REF (RawConfigParser): Reference configuration object.
out_dir_REF (str): Output directory for reference run.
Returns:
Tuple[list, pd.DataFrame]: List with classifiers and initiated empty dataframe for predictions.
list: List with estimators.
pd.DataFrame: Initiated empty dataframe for predictions.
"""

clfs = machine_learning.load_clfs(config_REF, out_dir_REF)
estimators = machine_learning.load_estimators(config_REF, out_dir_REF)

# initiate output dataframe
all_y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])

return clfs, all_y_df
return estimators, all_y_df
4 changes: 2 additions & 2 deletions copro/nb.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import click
import pandas as pd
import numpy as np
import geopandas as gpd
import numpy as np
import pandas as pd


def neighboring_polys(
Expand Down
Loading

0 comments on commit ecdc385

Please sign in to comment.