Better NaN handling and dataframe support #200

Merged · 15 commits · Aug 27, 2024

Changes from all commits

5 changes: 2 additions & 3 deletions copro/evaluation.py
@@ -88,11 +88,10 @@ def polygon_model_accuracy(

# - per polygon ID, compute sum of all conflict data points and add to dataframe
if not make_proj:
df_count["nr_observed_conflicts"] = df.y_test.groupby(df.ID).sum()
df_count["nr_observed_conflicts"] = df.y_test.groupby(df.ID).sum().astype(float)

# - per polygon ID, compute sum of all predicted conflict data points and add to dataframe
df_count["nr_predicted_conflicts"] = df.y_pred.groupby(df.ID).sum()

df_count["nr_predicted_conflicts"] = df.y_pred.groupby(df.ID).sum().astype(float)
# - per polygon ID, compute average probability that conflict occurs
df_count["min_prob_1"] = pd.to_numeric(df.y_prob_1).groupby(df.ID).min()
df_count["probability_of_conflict"] = (
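A note on the `.astype(float)` casts above: a groupby-sum on an integer column yields an int64 series, which cannot hold NaN, so later steps that have to mark polygons with missing data run into silent upcasts. The cast makes the dtype explicit up front. A minimal pandas sketch with invented IDs (not data from this project):

```python
import pandas as pd

# toy stand-in for the per-polygon dataframe handled in polygon_model_accuracy()
df = pd.DataFrame({"ID": [1, 1, 2, 2], "y_test": [0, 1, 1, 1]})

counts_int = df.y_test.groupby(df.ID).sum()                 # dtype: int64
counts_flt = df.y_test.groupby(df.ID).sum().astype(float)   # dtype: float64

# a float series represents polygons without observations directly as NaN
counts_flt = counts_flt.reindex([1, 2, 3])  # polygon 3 has no data -> NaN
print(counts_flt)
```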
4 changes: 3 additions & 1 deletion copro/io.py
@@ -22,7 +22,9 @@ def make_and_collect_output_dirs(
"""

# get path to main output directory as specified in cfg-file
out_dir = os.path.join(root_dir, config["general"]["output_dir"])
out_dir = os.path.join(
root_dir, config["general"]["output_dir"], config["general"]["simulation_name"]
)
click.echo(f"Saving output to main output folder {out_dir}.")

# initialize list for all out-dirs
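With this change the simulation name becomes part of the output path, underneath the main output directory. A small sketch of the resulting layout, using an invented root path and configuration (only the two `general` keys shown are read by this function):

```python
import os

root_dir = "/data/copro_runs"  # hypothetical project root
config = {"general": {"output_dir": "OUT", "simulation_name": "reference_run"}}

out_dir = os.path.join(
    root_dir, config["general"]["output_dir"], config["general"]["simulation_name"]
)
print(out_dir)  # /data/copro_runs/OUT/reference_run
```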
118 changes: 70 additions & 48 deletions copro/machine_learning.py
@@ -7,14 +7,26 @@
import click
from pathlib import Path
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier


class MachineLearning:
def __init__(self, config: dict) -> None:
def __init__(
self,
config: dict,
estimator: Union[
ensemble.RandomForestClassifier, ensemble.RandomForestRegressor
],
) -> None:
"""Class for all ML related stuff.
Embedded in more top-level `models.MainModel()` class.

Args:
config (dict): Parsed configuration-settings of the model.
estimator (Union[ ensemble.RandomForestClassifier, ensemble.RandomForestRegressor ]): ML model.
"""
self.config = config
self.scaler = define_scaling(config)
self.clf = ensemble.RandomForestClassifier(random_state=42)
self.estimator = estimator

def split_scale_train_test_split(
self, X: Union[np.ndarray, pd.DataFrame], Y: np.ndarray
@@ -100,11 +112,11 @@ def fit_predict(

if tune_hyperparameters:
fitted_estimator = apply_gridsearchCV(
self.clf, X_train, y_train, n_jobs=n_jobs, verbose=verbose
self.estimator, X_train, y_train, n_jobs=n_jobs, verbose=verbose
)
else:
# fit the classifier with training data
fitted_estimator = self.clf.fit(X_train, y_train)
fitted_estimator = self.estimator.fit(X_train, y_train)

# compute permutation importance
click.echo("Computing permutation importance.")
@@ -120,12 +132,14 @@ def fit_predict(
perm_importances_arr = perm_importances["importances"].T

# create folder to store all classifiers with pickle
clf_pickle_rep = os.path.join(out_dir, "clfs")
Path.mkdir(Path(clf_pickle_rep), parents=True, exist_ok=True)
estimator_pickle_rep = os.path.join(out_dir, "estimators")
Path.mkdir(Path(estimator_pickle_rep), parents=True, exist_ok=True)

# save the fitted classifier to file via pickle.dump()
click.echo(f"Dumping classifier to {clf_pickle_rep}.")
with open(os.path.join(clf_pickle_rep, "clf_{}.pkl".format(run_nr)), "wb") as f:
click.echo(f"Dumping classifier to {estimator_pickle_rep}.")
with open(
os.path.join(estimator_pickle_rep, "estimator_{}.pkl".format(run_nr)), "wb"
) as f:
pickle.dump(fitted_estimator, f)

# make prediction
@@ -136,7 +150,7 @@ def fit_predict(
return y_pred, y_prob, perm_importances_arr


def load_clfs(config: dict, out_dir: str) -> list[str]:
def load_estimators(config: dict, out_dir: str) -> list[str]:
"""Loads the paths to all previously fitted classifiers to a list.
Classifiers were saved to file in fit_predict().
With this list, the classifiers can be loaded again during projections.
@@ -149,14 +163,14 @@ def load_clfs(config: dict, out_dir: str) -> list[str]:
list: list with file names of classifiers.
"""

clfs = os.listdir(os.path.join(out_dir, "clfs"))
estimators = os.listdir(os.path.join(out_dir, "estimators"))

if len(clfs) != config["machine_learning"]["n_runs"]:
if len(estimators) != config["machine_learning"]["n_runs"]:
raise ValueError(
"Number of loaded classifiers does not match the specified number of runs in cfg-file!"
)

return clfs
return estimators


def _split_conflict_geom_data(
@@ -201,27 +215,23 @@ def define_scaling(
"""

if config["machine_learning"]["scaler"] == "MinMaxScaler":
scaler = preprocessing.MinMaxScaler()
elif config["machine_learning"]["scaler"] == "StandardScaler":
scaler = preprocessing.StandardScaler()
elif config["machine_learning"]["scaler"] == "RobustScaler":
scaler = preprocessing.RobustScaler()
elif config["machine_learning"]["scaler"] == "QuantileTransformer":
scaler = preprocessing.QuantileTransformer(random_state=42)
else:
raise ValueError(
"no supported scaling-algorithm selected - \
choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer"
)

click.echo(f"Chosen scaling method is {scaler}.")

return scaler
return preprocessing.MinMaxScaler()
if config["machine_learning"]["scaler"] == "StandardScaler":
return preprocessing.StandardScaler()
if config["machine_learning"]["scaler"] == "RobustScaler":
return preprocessing.RobustScaler()
if config["machine_learning"]["scaler"] == "QuantileTransformer":
return preprocessing.QuantileTransformer(random_state=42)

raise ValueError(
"no supported scaling-algorithm selected - \
choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer"
)


def predictive(
X: np.ndarray,
clf: ensemble.RandomForestClassifier,
estimator: ensemble.RandomForestClassifier,
scaler: Union[
preprocessing.MinMaxScaler,
preprocessing.StandardScaler,
@@ -236,7 +246,7 @@ def predictive(

Args:
X (np.ndarray): array containing the variable values plus unique identifier and geometry information.
clf (RandomForestClassifier): the fitted RandomForestClassifier.
estimator (RandomForestClassifier): the fitted RandomForestClassifier.
scaler (scaler): the fitted specified scaling method instance.

Returns:
@@ -251,10 +261,10 @@ def predictive(
X_ft = scaler.transform(X_data)

# make projection with transformed data
y_pred = clf.predict(X_ft)
y_pred = estimator.predict(X_ft)

# predict probabilities of outcomes
y_prob = clf.predict_proba(X_ft)
y_prob = estimator.predict_proba(X_ft)
y_prob_0 = y_prob[:, 0] # probability to predict 0
y_prob_1 = y_prob[:, 1] # probability to predict 1

@@ -268,37 +278,49 @@


def apply_gridsearchCV(
estimator: RandomForestClassifier,
estimator: Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor],
X_train: np.ndarray,
y_train: np.ndarray,
n_jobs=2,
verbose=0,
) -> RandomForestClassifier:
) -> Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]:
"""Applies grid search to find the best hyperparameters for the RandomForestClassifier.

Args:
estimator (RandomForestClassifier): Estimator to be used in the grid search.
estimator (Union[RandomForestClassifier, RandomForestRegressor]): Estimator to be used in the grid search.
X_train (np.ndarray): Feature matrix.
y_train (np.ndarray): Target vector.
n_jobs (int, optional): Number of cores to be used. Defaults to 2.
verbose (int, optional): Verbosity level. Defaults to 0.

Returns:
RandomForestClassifier: Best estimator of the grid search.
Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]: Best estimator of the grid search.
"""

click.echo("Tuning hyperparameters with GridSearchCV.")
# Define the parameter grid
param_grid = {
"n_estimators": [50, 100, 200],
"criterion": ["gini", "entropy"],
"min_impurity_decrease": [0, 0.5, 1],
"max_features": ("sqrt", "log2"),
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
"class_weight": [{1: 75}, {1: 100}, {1: 150}],
# 'bootstrap': [True, False]
}
if isinstance(estimator, ensemble.RandomForestClassifier):
param_grid = {
"n_estimators": [50, 100, 200],
"criterion": ["gini", "entropy"],
"min_impurity_decrease": [0, 0.5, 1],
"max_features": ("sqrt", "log2"),
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
"class_weight": [{1: 75}, {1: 100}, {1: 150}],
# 'bootstrap': [True, False]
}
scoring = "roc_auc"
else:
param_grid = {
"n_estimators": [10, 50, 100],
"criterion": ("squared_error", "absolute_error", "friedman_mse"),
"max_features": ("sqrt", "log2"),
"min_samples_split": [2, 5, 20],
"min_impurity_decrease": [0, 0.5, 1],
"min_samples_leaf": [1, 5, 10],
}
scoring = "r2"

# Instantiate the grid search model
grid_search = GridSearchCV(
@@ -307,7 +329,7 @@ def apply_gridsearchCV(
cv=KFold(n_splits=5, shuffle=True, random_state=42),
n_jobs=n_jobs,
verbose=verbose,
scoring="roc_auc",
scoring=scoring,
)

# Fit the grid search to the data
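Because the estimator is now injected rather than hard-coded, the same `MachineLearning` class, and the tuning helper, which switches its parameter grid and scoring (roc_auc vs r2) on the estimator type, can be driven by a regressor. A rough usage sketch, assuming only the config keys referenced in this module; this is not code from the repository's tests:

```python
import numpy as np
from sklearn import ensemble
from copro import machine_learning

config = {"machine_learning": {"scaler": "MinMaxScaler", "n_runs": 1}}

# classification, as before ...
ml_clf = machine_learning.MachineLearning(
    config, estimator=ensemble.RandomForestClassifier(random_state=42)
)

# ... or regression, now handled by the same code path
ml_reg = machine_learning.MachineLearning(
    config, estimator=ensemble.RandomForestRegressor(random_state=42)
)

# grid search picks the regressor grid and r2 scoring automatically
# (toy data; exploring the full grid still takes a while)
X_train, y_train = np.random.rand(100, 4), np.random.rand(100)
best_estimator = machine_learning.apply_gridsearchCV(
    ml_reg.estimator, X_train, y_train, n_jobs=2, verbose=0
)
```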
18 changes: 11 additions & 7 deletions copro/models.py
@@ -16,18 +16,23 @@ class MainModel:
def __init__(
self,
X: Union[np.ndarray, pd.DataFrame],
Y: np.ndarray,
config: RawConfigParser,
Y: Union[np.ndarray, pd.DataFrame],
estimator: Union[
ensemble.RandomForestClassifier, ensemble.RandomForestRegressor
],
config: dict,
out_dir: str,
n_jobs=2,
verbose=0,
):
"""Constructor for the MainModel class.
Under the hood, the class uses the `machine_learning.MachineLearning()` class to run the computations.

Args:
X (np.ndarray, pd.DataFrame): array containing the variable values plus IDs and geometry information.
Y (np.ndarray): array containing merely the binary conflict classifier data.
config (RawConfigParser): object containing the parsed configuration-settings of the model.
estimator (Union[ensemble.RandomForestClassifier, ensemble.RandomForestRegressor]): ML model.
config (dict): object containing the parsed configuration-settings of the model.
out_dir (str): path to output folder.
n_jobs (int, optional): Number of jobs to run in parallel. Defaults to 2.
verbose (int, optional): Verbosity level. Defaults to 0.
Expand All @@ -37,11 +42,9 @@ def __init__(
self.config = config
self.scaler = machine_learning.define_scaling(config)
self.scaler_all_data = self.scaler.fit(
X[:, 2:]
X.iloc[:, 2:]
) # NOTE: supposed to be used in projections
self.clf = ensemble.RandomForestClassifier(
n_estimators=1000, class_weight={1: 100}, random_state=42
)
self.estimator = estimator
self.out_dir = out_dir
self.n_jobs = n_jobs
self.verbose = verbose
@@ -108,6 +111,7 @@ def _n_run(

MLmodel = machine_learning.MachineLearning(
self.config,
self.estimator,
)

# split X into training-set and test-set, scale training-set data
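`MainModel` likewise receives the estimator from the caller, so nothing inside the class decides between classification and regression anymore; note also that `X` is now treated as a dataframe (the scaler is fitted on `X.iloc[:, 2:]`, i.e. everything after the ID and geometry columns). A minimal construction sketch with placeholder data and a hypothetical output folder:

```python
import numpy as np
import pandas as pd
from sklearn import ensemble
from copro import models

# placeholder features: first two columns are ID and geometry, the rest are variables
X = pd.DataFrame({
    "ID": [1, 2, 3, 4],
    "geometry": [None, None, None, None],
    "var_a": [0.1, 0.4, 0.2, 0.9],
    "var_b": [1.0, 0.3, 0.8, 0.5],
})
Y = np.array([0, 1, 0, 1])

config = {"machine_learning": {"scaler": "MinMaxScaler", "n_runs": 1}}

workflow = models.MainModel(
    X=X,
    Y=Y,
    estimator=ensemble.RandomForestClassifier(random_state=42),
    config=config,
    out_dir="./OUT/example_run",  # hypothetical output folder
    n_jobs=2,
    verbose=0,
)
```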
17 changes: 11 additions & 6 deletions copro/scripts/copro_runner.py
@@ -5,10 +5,6 @@
import pandas as pd
import os

import warnings

warnings.filterwarnings("ignore")


@click.command()
@click.argument("cfg", type=click.Path())
@@ -44,12 +40,15 @@ def cli(cfg: click.Path, cores: int, verbose: int):

click.echo(click.style("\nINFO: reference run started\n", fg="cyan"))

estimator = settings.define_model(config_REF)
target_var = settings.define_target_var(config_REF, estimator)

# - selecting conflicts and getting area-of-interest and aggregation level
conflict_gdf, extent_active_polys_gdf, global_df = selection.select(
config_REF, out_dir_REF, root_dir
)

XY_class = xydata.XYData(config_REF)
XY_class = xydata.XYData(config_REF, target_var)
X, Y = XY_class.create_XY(
out_dir=out_dir_REF,
root_dir=root_dir,
@@ -59,7 +58,13 @@

# - defining scaling and model algorithms
ModelWorkflow = models.MainModel(
config=config_REF, X=X, Y=Y, out_dir=out_dir_REF, n_jobs=cores, verbose=verbose
config=config_REF,
X=X,
Y=Y,
estimator=estimator,
out_dir=out_dir_REF,
n_jobs=cores,
verbose=verbose,
)

# - fit-transform on scaler to be used later during projections
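The runner now obtains the estimator and the target variable via `settings.define_model` and `settings.define_target_var` (their implementations are not part of this diff) and passes a plain dict as configuration. Collecting the keys this pull request reads from that dict gives roughly the following shape; the values, and the assumption that the cfg-file is parsed into nested sections like this, are illustrative only:

```python
# Illustrative sketch of the parsed configuration dict -- key names are the ones
# accessed in this diff, values are invented.
config_REF = {
    "general": {
        "output_dir": "OUT",             # io.make_and_collect_output_dirs
        "simulation_name": "reference",  # io.make_and_collect_output_dirs
        "y_start": 2000,                 # selection._filter_conflict_properties
        "y_end": 2015,
    },
    "machine_learning": {
        "scaler": "MinMaxScaler",        # machine_learning.define_scaling
        "n_runs": 10,                    # machine_learning.load_estimators
    },
    "data": {
        "conflict": {
            "thresholds": {              # selection._filter_conflict_properties
                "best": {"vmin": 25},    # hypothetical column name and limit
            }
        }
    },
}
```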
14 changes: 11 additions & 3 deletions copro/selection.py
@@ -69,17 +69,25 @@ def _filter_conflict_properties(
gpd.GeoDataFrame: geo-dataframe containing filtered entries.
"""

gdf = gdf[
(gdf.year >= config["general"]["y_start"])
& (gdf.year <= config["general"]["y_end"])
]

# if no thresholding options are found, return the original dataframe
if "thresholds" not in config["data"]["conflict"]:
click.echo("No thresholding options found in configuration file.")
return gdf

# go through all criteria
# otherwise, go through all variables for which thresholding is specified
for key, value in config["data"]["conflict"]["thresholds"].items():

# if variable is not found in the dataframe, skip it
if key not in gdf.columns:
warnings.warn(
f"{key} is not found in geodataframe columns, will be skipped."
f"{key} is not found in geodataframe columns, thresholding be skipped."
)
# otherwise, check which option is specified and apply it
else:
click.echo(f"Tresholding conflict data on {key}.")
for v, k in value.items():
@@ -93,7 +101,7 @@ def _filter_conflict_properties(
click.echo(f"Selecting datapoints less or equal to {k}.")
gdf = gdf[gdf[key] <= k]
else:
raise ValueError(
warnings.warn(
f"{v} is not a recognized tresholding option - use 'values', 'vmin' or 'vmax'."
)

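To make the thresholding loop concrete: each entry under `thresholds` maps a conflict-data column to one or more of `values`, `vmin` or `vmax`. A toy pandas sketch of the intended effect (column names, limits, and the exact semantics of `values`/`vmin` are assumptions based on the option names, not taken from the project's documentation):

```python
import pandas as pd

# toy conflict records (plain DataFrame instead of a GeoDataFrame for brevity)
conflicts = pd.DataFrame({
    "year": [2009, 2012, 2015, 2021],
    "type_of_violence": [1, 2, 3, 1],
    "best": [5, 60, 300, 40],
})

thresholds = {
    "type_of_violence": {"values": [1, 2]},  # keep selected categories (assumed meaning)
    "best": {"vmin": 10},                    # keep entries >= 10 (assumed meaning)
}

# year window first, as in _filter_conflict_properties()
gdf = conflicts[(conflicts.year >= 2010) & (conflicts.year <= 2020)]

# then the per-column thresholds
for key, options in thresholds.items():
    for option, limit in options.items():
        if option == "values":
            gdf = gdf[gdf[key].isin(limit)]
        elif option == "vmin":
            gdf = gdf[gdf[key] >= limit]
        elif option == "vmax":
            gdf = gdf[gdf[key] <= limit]

print(gdf)  # only the 2012 record passes the year window and both thresholds
```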