Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/projections #201

Merged
merged 30 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9dee8ab
option for either Regression or Classification model
JannisHoch Aug 26, 2024
70a4e9f
fine tuned selection of conflict data points
JannisHoch Aug 26, 2024
34770b8
option to provide ML target variable
JannisHoch Aug 26, 2024
f258220
use simulation_name as output dir
JannisHoch Aug 26, 2024
e53e335
data extraction from netCDF files in a separate function
JannisHoch Aug 26, 2024
998f209
target variable and estimator used determined in main script
JannisHoch Aug 26, 2024
737445b
should belong to previous commit
JannisHoch Aug 26, 2024
dd51311
first step towards flexible target_vars when extracting conflict (Y) …
JannisHoch Aug 26, 2024
6331756
added class docstrings
JannisHoch Aug 26, 2024
75e316e
constructing X and Y data as dataframes instead of arrays
JannisHoch Aug 26, 2024
11329fb
no log-scale support for now, more consistent treatment of polygons w…
JannisHoch Aug 26, 2024
5e8fd95
removing all polygons with 1 or more NaNs
JannisHoch Aug 26, 2024
fa6224d
fully pd.dataframe support implemented
JannisHoch Aug 26, 2024
e07f5f3
save only selected conflicts which fall in simulation period
JannisHoch Aug 26, 2024
3a053fe
finetuning print output
JannisHoch Aug 26, 2024
e77ae62
no random state set for Kfold in GridSearchCV to ensure all n models …
JannisHoch Aug 27, 2024
85b5d3a
better handling of cores via command line
JannisHoch Sep 3, 2024
dec66b3
remove content of output dir to avoid conflicts with expected files
JannisHoch Sep 9, 2024
b7fb849
settings parsed for projections with new yaml file structure
JannisHoch Sep 9, 2024
89650ac
updated docstring for load_estimators
JannisHoch Sep 9, 2024
00d93a3
fixed definition of projection period
JannisHoch Sep 11, 2024
a5804e2
reinitated initiate_X_data function
JannisHoch Sep 11, 2024
8626b76
saving files as GPKG instead of GeoJSON
JannisHoch Sep 11, 2024
2b64d57
no output for None output from rasterstats
JannisHoch Sep 11, 2024
6facc8b
make run_prediction() work
JannisHoch Sep 11, 2024
daa5e76
apply isort
JannisHoch Sep 11, 2024
c56d403
no ML target var for now
JannisHoch Sep 16, 2024
c5ce6fd
fix Geopandas driver
JannisHoch Sep 16, 2024
f07c07c
corrected number of function arguments
JannisHoch Sep 16, 2024
3505358
Merge branch 'dev' into fix/projections
JannisHoch Dec 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions copro/conflict.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from copro import utils, nb
import os
import warnings
from configparser import RawConfigParser
from pathlib import Path
from typing import Union, Literal
import pandas as pd
from typing import Literal, Union

import click
import geopandas as gpd
import numpy as np
import os
import click
import warnings
import pandas as pd

from copro import nb, utils


def conflict_in_year_bool(
Expand Down
7 changes: 4 additions & 3 deletions copro/evaluation.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from sklearn import metrics
import pandas as pd
import geopandas as gpd
from typing import Union

import geopandas as gpd
import pandas as pd
from sklearn import metrics


def init_out_dict(scores: Union[list[str], None] = None) -> dict:
"""Initiates the main model evaluation dictionary for a range of model metric scores.
Expand Down
14 changes: 10 additions & 4 deletions copro/io.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import pandas as pd
import numpy as np
from typing import Union
from pathlib import Path
import os
import shutil
from pathlib import Path
from typing import Union

import click
import numpy as np
import pandas as pd


def make_and_collect_output_dirs(
Expand All @@ -27,6 +29,10 @@ def make_and_collect_output_dirs(
)
click.echo(f"Saving output to main output folder {out_dir}.")

# Check if out_dir exists and delete its contents if it does
if os.path.exists(out_dir):
shutil.rmtree(out_dir)

# initialize list for all out-dirs
all_out_dirs = []
# create reference output dir '_REF' under main output dir
Expand Down
21 changes: 11 additions & 10 deletions copro/machine_learning.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
import pickle
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing, model_selection, inspection
from typing import Union, Tuple
import click
from pathlib import Path
from typing import Tuple, Union

import click
import numpy as np
import pandas as pd
from sklearn import ensemble, inspection, model_selection, preprocessing
from sklearn.model_selection import GridSearchCV, KFold


Expand Down Expand Up @@ -151,23 +152,23 @@ def fit_predict(


def load_estimators(config: dict, out_dir: str) -> list[str]:
"""Loads the paths to all previously fitted classifiers to a list.
Classifiers were saved to file in fit_predict().
"""Loads the paths to all previously fitted estimators to a list.
Estimators were saved to file in `fit_predict()`.
With this list, the estimators can be loaded again during projections.

Args:
config (dict): Parsed configuration-settings of the model.
out_dir (path): path to output folder.

Returns:
list: list with file names of classifiers.
list: list with file names of estimators.
"""

estimators = os.listdir(os.path.join(out_dir, "estimators"))

if len(estimators) != config["machine_learning"]["n_runs"]:
raise ValueError(
"Number of loaded classifiers does not match the specified number of runs in cfg-file!"
"Number of loaded estimators does not match the specified number of runs in reference yaml-file!"
)

return estimators
Expand Down Expand Up @@ -326,7 +327,7 @@ def apply_gridsearchCV(
grid_search = GridSearchCV(
estimator=estimator,
param_grid=param_grid,
cv=KFold(n_splits=5, shuffle=True, random_state=42),
cv=KFold(n_splits=5, shuffle=True),
n_jobs=n_jobs,
verbose=verbose,
scoring=scoring,
Expand Down
93 changes: 50 additions & 43 deletions copro/models.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
from copro import machine_learning, conflict, evaluation, utils, xydata, settings
import os
import pickle
from configparser import RawConfigParser
from sklearn import ensemble
from sklearn.utils.validation import check_is_fitted
from pathlib import Path
import pandas as pd
import numpy as np
from typing import Union, Tuple
import geopandas as gpd
from typing import Tuple, Union

import click
import os
import pickle
import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.utils.validation import check_is_fitted

from copro import conflict, evaluation, machine_learning, settings, utils, xydata


class MainModel:
Expand Down Expand Up @@ -47,6 +49,7 @@ def __init__(
self.estimator = estimator
self.out_dir = out_dir
self.n_jobs = n_jobs
click.echo(f"Number of jobs to run in parallel: {n_jobs}.")
self.verbose = verbose

def run(
Expand Down Expand Up @@ -179,26 +182,26 @@ def run_prediction(
config_REF = main_dict["_REF"][0]
out_dir_REF = main_dict["_REF"][1]

clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF)
estimators, all_y_df = _init_prediction_run(config_REF, out_dir_REF)

# going through each projection specified
for each_key, _ in config_REF.items():
for projection, _ in config_REF["projections"].items():

# get config-object and out-dir per projection
click.echo(f"Loading config-object for projection run: {each_key}.")
config_PROJ = main_dict[str(each_key)][0][0]
out_dir_PROJ = main_dict[str(each_key)][1]
click.echo(f"Loading config-object for projection run: {projection}.")
config_PROJ = main_dict[str(projection)][0][0]
out_dir_PROJ = main_dict[str(projection)][1]

click.echo(f"Storing output for this projections to folder {out_dir_PROJ}.")
click.echo(f"Storing output for this projection to folder {out_dir_PROJ}.")
Path.mkdir(
Path(os.path.join(out_dir_PROJ, "clfs")), parents=True, exist_ok=True
Path(os.path.join(out_dir_PROJ, "estimators")),
parents=True,
exist_ok=True,
)

# get projection period for this projection
# defined as all years starting from end of reference run until specified end of projections
projection_period = settings.determine_projection_period(
config_REF, config_PROJ
)
projection_period = settings.determine_projection_period(config_REF)

# for this projection, go through all years
for i, proj_year in enumerate(projection_period):
Expand All @@ -219,7 +222,7 @@ def run_prediction(
out_dir_REF,
"files",
"conflicts_in_{}.csv".format(
config_REF.getint("settings", "y_end")
config_REF["general"]["y_end"]
),
)
)
Expand All @@ -229,7 +232,7 @@ def run_prediction(
out_dir_REF,
"files",
"conflicts_in_{}.csv".format(
config_REF.getint("settings", "y_end")
config_REF["general"]["y_end"]
),
),
index_col=0,
Expand All @@ -242,35 +245,37 @@ def run_prediction(
y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])

# now load all classifiers created in the reference run
for clf in clfs:
for estimator in estimators:

# creating an individual output folder per classifier
if not os.path.isdir(
os.path.join(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
)
)
):
os.makedirs(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
)
)

# load the pickled objects
# TODO: keep them in memory, i.e. after reading the clfs-folder above
with open(os.path.join(out_dir_REF, "clfs", clf), "rb") as f:
with open(
os.path.join(out_dir_REF, "estimators", estimator), "rb"
) as f:
click.echo(
"Loading classifier {} from {}".format(
clf, os.path.join(out_dir_REF, "clfs")
"Loading estimator {} from {}".format(
estimator, os.path.join(out_dir_REF, "estimators")
)
)
clf_obj = pickle.load(f)
estimator_obj = pickle.load(f)

# for all other projection years than the first one,
# we need to read projected conflict from the previous projection year
Expand All @@ -279,17 +284,17 @@ def run_prediction(
"Reading previous conflicts from file {}".format(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf),
"estimators",
str(estimator),
"projection_for_{}.csv".format(proj_year - 1),
)
)
)
conflict_data = pd.read_csv(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
"projection_for_{}.csv".format(proj_year - 1),
),
index_col=0,
Expand All @@ -307,15 +312,15 @@ def run_prediction(
# here the data will be used to make projections with various classifiers
# returns the prediction based on one individual classifier
y_df_clf = machine_learning.predictive(
X, clf_obj, self.scaler_all_data
X, estimator_obj, self.scaler_all_data
)

# storing the projection per clf to be used in the following timestep
y_df_clf.to_csv(
os.path.join(
out_dir_PROJ,
"clfs",
str(clf).rsplit(".", maxsplit=1)[0],
"estimators",
str(estimator).rsplit(".", maxsplit=1)[0],
"projection_for_{}.csv".format(proj_year),
)
)
Expand All @@ -333,8 +338,9 @@ def run_prediction(
y_df, global_df, make_proj=True
)
gdf_hit.to_file(
os.path.join(out_dir_PROJ, f"output_in_{proj_year}.geojson"),
driver="GeoJSON",
os.path.join(out_dir_PROJ, f"output_in_{proj_year}.gpkg"),
driver="GPKG",
crs="EPSG:4326",
)

# create one major output dataframe containing all output for all projections with all classifiers
Expand All @@ -346,20 +352,21 @@ def run_prediction(
def _init_prediction_run(
config_REF: RawConfigParser, out_dir_REF: str
) -> Tuple[list, pd.DataFrame]:
"""Initializes the prediction run by loading all classifiers created in the reference run.
"""Initializes the prediction run by loading all estimators created in the reference run.
Also initiates an empty dataframe to store the predictions.

Args:
config_REF (RawConfigParser): Reference configuration object.
out_dir_REF (str): Output directory for reference run.

Returns:
Tuple[list, pd.DataFrame]: List with classifiers and initiated empty dataframe for predictions.
list: List with estimators.
pd.DataFrame: Initiated empty dataframe for predictions.
"""

clfs = machine_learning.load_clfs(config_REF, out_dir_REF)
estimators = machine_learning.load_estimators(config_REF, out_dir_REF)

# initiate output dataframe
all_y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"])

return clfs, all_y_df
return estimators, all_y_df
4 changes: 2 additions & 2 deletions copro/nb.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import click
import pandas as pd
import numpy as np
import geopandas as gpd
import numpy as np
import pandas as pd


def neighboring_polys(
Expand Down
Loading