Add ACLED support and use "yaml" file type for model settings #199

Merged · 3 commits · Aug 26, 2024
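
For context on the "yaml" half of this PR: the model settings move from an INI-style cfg-file parsed with RawConfigParser to a YAML file parsed into nested dicts. Below is a minimal sketch of such a settings file; the key names are taken from the lookups in the diffs (general.output_dir, general.y_end, machine_learning.scaler, machine_learning.train_fraction, machine_learning.n_runs, and the optional projections section), while the values and the projections entry are invented for illustration.

import yaml  # PyYAML

# Hypothetical settings; key names mirror the config lookups in this PR.
SETTINGS = """
general:
  output_dir: ./OUT
  y_end: 2015
machine_learning:
  scaler: MinMaxScaler
  train_fraction: 0.7
  n_runs: 10
projections:              # optional; copro_runner.py skips projections when absent
  proj_2050: ./proj_2050.yaml
"""

config = yaml.safe_load(SETTINGS)
assert config["machine_learning"]["scaler"] == "MinMaxScaler"
print(config["general"]["y_end"])  # 2015, already an int

One nicety of the switch: YAML preserves scalar types, so the getint()/getfloat() calls required by RawConfigParser become plain dict lookups throughout.
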
31 changes: 17 additions & 14 deletions copro/conflict.py
@@ -16,7 +16,8 @@ def conflict_in_year_bool(
extent_gdf: gpd.GeoDataFrame,
sim_year: int,
out_dir: click.Path,
identifier="watprovID",
poly_identifier="watprovID",
conflict_identifier="event_id_cnty",
) -> list:
"""Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not.

@@ -46,17 +47,17 @@

# determine the aggregated amount of fatalities in one region (e.g. water province)
fatalities_per_poly = (
data_merged["best"]
.groupby(data_merged[identifier])
.sum()
data_merged[conflict_identifier]
.groupby(data_merged[poly_identifier])
.count()
.to_frame()
.rename(columns={"best": "total_fatalities"})
.rename(columns={conflict_identifier: "total_fatalities"})
)

out_dir = os.path.join(out_dir, "files")
Path.mkdir(Path(out_dir), exist_ok=True)

- if sim_year == config.getint("settings", "y_end"):
+ if sim_year == config["general"]["y_end"]:
_store_boolean_conflict_data_to_csv(
fatalities_per_poly, extent_gdf, sim_year, out_dir
)
@@ -65,7 +66,7 @@
# if so, this means that there was conflict and thus assign value 1
list_out = []
for i, _ in extent_gdf.iterrows():
- i_poly = extent_gdf.iloc[i][identifier]
+ i_poly = extent_gdf.iloc[i][poly_identifier]
if i_poly in fatalities_per_poly.index.values:
list_out.append(1)
else:
@@ -80,7 +81,8 @@ def conflict_in_previous_year_bool(
sim_year: int,
check_neighbors: bool = False,
neighboring_matrix: Union[None, pd.DataFrame] = None,
identifier="watprovID",
poly_identifier="watprovID", # TODO: no kwarg, should come from config
conflict_identifier="event_id_cnty", # TODO: no kwarg, should come from config
) -> list:
"""Creates a list for each timestep with boolean information whether
a conflict took place in the previous year in a polygon or not.
@@ -104,27 +106,28 @@
else:
click.echo("Checking for conflict event in polygon at t-1")

# TODO: screening whether there is any conflict data in sim_year should be done earlier
# get conflicts at t-1
- temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year - 1]
+ temp_sel_year = conflict_gdf[conflict_gdf.year == sim_year - 1]
if temp_sel_year.empty:
warnings.warn(
f"No conflicts were found in sampled conflict data set for year {sim_year - 1}."
)

# merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
data_merged = gpd.sjoin(temp_sel_year, extent_gdf)

conflicts_per_poly = (
- data_merged.id.groupby(data_merged[identifier])
+ data_merged[conflict_identifier]
+ .groupby(data_merged[poly_identifier])
.count()
.to_frame()
.rename(columns={"id": "conflict_count"})
.rename(columns={conflict_identifier: "conflict_count"})
)
# NOTE: WORKS UNTIL HERE

# loop through all polygons
list_out = []
for i in range(len(extent_gdf)):
- i_poly = extent_gdf[identifier].iloc[i]
+ i_poly = extent_gdf[poly_identifier].iloc[i]
# check if polygon is in list with conflict polygons
if i_poly in conflicts_per_poly.index.values:
# if so, check if neighboring polygons contain conflict and assign boolean value
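
The functional change in this file: per polygon, conflict is now aggregated by counting ACLED event IDs (event_id_cnty) rather than summing the UCDP "best" fatality estimate. A toy pandas sketch of the before/after aggregation, with invented values and made-up event IDs:

import pandas as pd

data_merged = pd.DataFrame({
    "watprovID": [1, 1, 2],                               # polygon identifier
    "event_id_cnty": ["NIG1001", "NIG1002", "NIG2001"],   # ACLED event IDs (made up)
    "best": [10, 5, 3],                                   # UCDP best fatality estimate
})

# old behaviour: total fatalities per polygon
old = data_merged["best"].groupby(data_merged["watprovID"]).sum()
print(old.to_dict())  # {1: 15, 2: 3}

# new behaviour: number of conflict events per polygon
new = (
    data_merged["event_id_cnty"]
    .groupby(data_merged["watprovID"])
    .count()
    .to_frame()
    .rename(columns={"event_id_cnty": "total_fatalities"})
)
print(new["total_fatalities"].to_dict())  # {1: 2, 2: 1}

Note that the count column keeps the old total_fatalities name, so downstream code now reads event counts under a fatalities label.
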
9 changes: 4 additions & 5 deletions copro/io.py
@@ -1,20 +1,19 @@
import pandas as pd
import numpy as np
from typing import Union
- from configparser import RawConfigParser
from pathlib import Path
import os
import click


def make_and_collect_output_dirs(
- config: RawConfigParser, root_dir: click.Path, config_dict: dict
+ config: dict, root_dir: click.Path, config_dict: dict
) -> dict:
"""Creates the output folder at location specfied in cfg-file
"""Creates the output folder at location specfied in YAML-file
and returns dictionary with config-objects and out-dir per run.

Args:
- config (RawConfigParser): object containing the parsed configuration-settings of the model.
+ config (dict): dictionary containing the parsed configuration-settings of the model.
root_dir (Path): absolute path to location of configurations-file
config_dict (dict): dictionary containing config-objects for reference run and all projection.

@@ -23,7 +22,7 @@ def make_and_collect_output_dirs(
"""

# get path to main output directory as specified in cfg-file
- out_dir = os.path.join(root_dir, config.get("general", "output_dir"))
+ out_dir = os.path.join(root_dir, config["general"]["output_dir"])
click.echo(f"Saving output to main output folder {out_dir}.")

# initalize list for all out-dirs
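
The mechanical pattern in this file (and in the files below) is the cfg-to-YAML access migration. A small sketch contrasting the two, with an invented output_dir value:

from configparser import RawConfigParser

# old: INI-style cfg-file, typed getters on a parser object
ini = RawConfigParser()
ini.read_string("[general]\noutput_dir = ./OUT\n")
out_old = ini.get("general", "output_dir")

# new: YAML parses straight into nested dicts
config = {"general": {"output_dir": "./OUT"}}
out_new = config["general"]["output_dir"]

assert out_old == out_new == "./OUT"
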
40 changes: 18 additions & 22 deletions copro/machine_learning.py
@@ -2,17 +2,16 @@
import pickle
import pandas as pd
import numpy as np
- from configparser import RawConfigParser
from sklearn import ensemble, preprocessing, model_selection, inspection
from typing import Union, Tuple
import click
from pathlib import Path
- from sklearn.model_selection import GridSearchCV
+ from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier


class MachineLearning:
- def __init__(self, config: RawConfigParser) -> None:
+ def __init__(self, config: dict) -> None:
self.config = config
self.scaler = define_scaling(config)
self.clf = ensemble.RandomForestClassifier(random_state=42)
@@ -49,7 +48,7 @@ def split_scale_train_test_split(
X_train, X_test, y_train, y_test = model_selection.train_test_split(
X_cs,
Y,
- test_size=1 - self.config.getfloat("machine_learning", "train_fraction"),
+ test_size=1 - self.config["machine_learning"]["train_fraction"],
)

# for training-set and test-set, split in ID, geometry, and values
@@ -77,7 +76,7 @@ def fit_predict(
tune_hyperparameters=False,
n_jobs=2,
verbose=0,
- ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Fits classifier based on training-data and makes predictions.
The fitted classifier is dumped to file with pickle to be used again during projections.
Makes prediction with test-data including probabilities of those predictions.
@@ -96,7 +95,7 @@
Returns:
np.ndarray: array with the predictions made.
np.ndarray: array with probabilities of the predictions made.
- pd.DataFrame: dataframe containing permutation importances of variables.
+ np.ndarray: dataframe containing permutation importances of variables.
"""

if tune_hyperparameters:
@@ -117,11 +116,8 @@
random_state=42,
n_jobs=n_jobs,
)
- sorted_importances_idx = perm_importances.importances_mean.argsort()
- perm_importances_df = pd.DataFrame(
-     perm_importances.importances[sorted_importances_idx].T,
-     # columns=X_train.columns[sorted_importances_idx],
- )
+ # transpose because by default features are in rows
+ perm_importances_arr = perm_importances["importances"].T

# create folder to store all classifiers with pickle
clf_pickle_rep = os.path.join(out_dir, "clfs")
@@ -137,16 +133,16 @@
# make prediction of probability
y_prob = fitted_estimator.predict_proba(X_test)

- return y_pred, y_prob, perm_importances_df
+ return y_pred, y_prob, perm_importances_arr


- def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]:
+ def load_clfs(config: dict, out_dir: str) -> list[str]:
"""Loads the paths to all previously fitted classifiers to a list.
Classifiers were saved to file in fit_predict().
With this list, the classifiers can be loaded again during projections.

Args:
- config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+ config (dict): Parsed configuration-settings of the model.
out_dir (path): path to output folder.

Returns:
Expand All @@ -155,7 +151,7 @@ def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]:

clfs = os.listdir(os.path.join(out_dir, "clfs"))

- if len(clfs) != config.getint("machine_learning", "n_runs"):
+ if len(clfs) != config["machine_learning"]["n_runs"]:
raise ValueError(
"Number of loaded classifiers does not match the specified number of runs in cfg-file!"
)
@@ -188,7 +184,7 @@ def _split_conflict_geom_data(


def define_scaling(
- config: RawConfigParser,
+ config: dict,
) -> Union[
preprocessing.MinMaxScaler,
preprocessing.StandardScaler,
@@ -198,19 +194,19 @@
"""Defines scaling method based on model configurations.

Args:
- config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+ config (dict): Parsed configuration-settings of the model.

Returns:
scaler: the specified scaling method instance.
"""

if config.get("machine_learning", "scaler") == "MinMaxScaler":
if config["machine_learning"]["scaler"] == "MinMaxScaler":
scaler = preprocessing.MinMaxScaler()
elif config.get("machine_learning", "scaler") == "StandardScaler":
elif config["machine_learning"]["scaler"] == "StandardScaler":
scaler = preprocessing.StandardScaler()
elif config.get("machine_learning", "scaler") == "RobustScaler":
elif config["machine_learning"]["scaler"] == "RobustScaler":
scaler = preprocessing.RobustScaler()
elif config.get("machine_learning", "scaler") == "QuantileTransformer":
elif config["machine_learning"]["scaler"] == "QuantileTransformer":
scaler = preprocessing.QuantileTransformer(random_state=42)
else:
raise ValueError(
@@ -308,7 +304,7 @@ def apply_gridsearchCV(
grid_search = GridSearchCV(
estimator=estimator,
param_grid=param_grid,
- cv=5,
+ cv=KFold(n_splits=5, shuffle=True, random_state=42),
n_jobs=n_jobs,
verbose=verbose,
scoring="roc_auc",
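
Two sklearn details in this file are worth spelling out. permutation_importance returns a dict-like Bunch whose "importances" entry has shape (n_features, n_repeats), hence the transpose before runs are stacked; and replacing the integer cv=5 (unshuffled folds, stratified for classifiers) with KFold(shuffle=True, random_state=42) randomizes fold membership reproducibly, at the cost of dropping the stratification the integer form implies. A self-contained sketch on toy data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV, KFold

X, y = make_classification(n_samples=120, n_features=4, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(X, y)

# Bunch supports dict-style access; importances are (n_features, n_repeats)
perm = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
print(perm["importances"].shape)    # (4, 10): features in rows
print(perm["importances"].T.shape)  # (10, 4): one row per repeat, ready to stack

# shuffled, reproducible cross-validation folds, as in apply_gridsearchCV
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={"n_estimators": [10, 50]},  # toy grid
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring="roc_auc",
)
grid.fit(X, y)
print(grid.best_params_)
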
24 changes: 11 additions & 13 deletions copro/models.py
@@ -58,7 +58,7 @@ def run(
Returns:
pd.DataFrame: Prediction dataframes.
pd.DataFrame: model output on polygon-basis.
- pd.DataFrame: containing permutation importances for all runs.
+ np.ndarray: containing permutation importances for all runs.
dict: evaluation dictionary.
"""

Expand All @@ -67,33 +67,31 @@ def run(
# - initializing output variables
out_X_df = pd.DataFrame()
out_y_df = pd.DataFrame()
- out_perm_importances_df = pd.DataFrame()
+ out_perm_importances_arr = np.array([]).reshape(0, self.X.shape[1] - 2)
out_dict = evaluation.init_out_dict()

click.echo("Training and testing machine learning model")
for n in range(number_runs):
click.echo(f"Run {n+1} of {number_runs}.")

# - run machine learning model and return outputs
- X_df, y_df, eval_dict, perm_importances_df_n = self._n_run(
+ X_df, y_df, eval_dict, perm_importances_arr_n = self._n_run(
run_nr=n, tune_hyperparameters=tune_hyperparameters
)

# - append per model execution
out_X_df = pd.concat([out_X_df, X_df], axis=0, ignore_index=True)
out_y_df = pd.concat([out_y_df, y_df], axis=0, ignore_index=True)
- out_perm_importances_df = pd.concat(
-     [out_perm_importances_df, perm_importances_df_n],
-     axis=0,
-     ignore_index=True,
+ out_perm_importances_arr = np.vstack(
+     [out_perm_importances_arr, perm_importances_arr_n]
)
out_dict = evaluation.fill_out_dict(out_dict, eval_dict)

- return out_X_df, out_y_df, out_perm_importances_df, out_dict
+ return out_X_df, out_y_df, out_perm_importances_arr, out_dict

def _n_run(
self, run_nr: int, tune_hyperparameters=False
- ) -> tuple[pd.DataFrame, pd.DataFrame, dict, pd.DataFrame]:
+ ) -> tuple[pd.DataFrame, pd.DataFrame, dict, np.ndarray]:
"""Runs workflow per specified number of runs.
The model workflow is executed for each classifier.

@@ -105,7 +103,7 @@ def _n_run(
pd.DataFrame: containing the test-data X-array values.
pd.DataFrame: containing model output on polygon-basis.
dict: dictionary containing evaluation metrics per simulation.
- pd.DataFrame: containing permutation importances for run n.
+ np.ndarray: containing permutation importances for run n.
"""

MLmodel = machine_learning.MachineLearning(
@@ -128,7 +126,7 @@ def _n_run(
X_df = pd.DataFrame(X_test)

# fit classifier and make prediction with test-set
- y_pred, y_prob, perm_importances_df_n = MLmodel.fit_predict(
+ y_pred, y_prob, perm_importances_arr_n = MLmodel.fit_predict(
X_train,
y_train,
X_test,
@@ -149,7 +147,7 @@ def _n_run(
X_test_ID, X_test_geom, y_test, y_pred, y_prob_0, y_prob_1
)

- return X_df, y_df, eval_dict, perm_importances_df_n
+ return X_df, y_df, eval_dict, perm_importances_arr_n

def run_prediction(
self,
@@ -180,7 +178,7 @@ def run_prediction(
clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF)

# going through each projection specified
for each_key, _ in config_REF.items("PROJ_files"):
for each_key, _ in config_REF.items():

# get config-object and out-dir per projection
click.echo(f"Loading config-object for projection run: {each_key}.")
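
The accumulation in run() now seeds an empty array with the right number of columns (self.X.shape[1] - 2, presumably dropping the ID and geometry columns) so that each run's (n_repeats, n_features) importances can be stacked with np.vstack. A shape-only sketch with invented sizes:

import numpy as np

n_features = 3  # stands in for self.X.shape[1] - 2
out = np.array([]).reshape(0, n_features)  # empty seed with matching width

for run in range(2):  # two toy "runs"
    run_importances = np.random.default_rng(run).random((10, n_features))
    out = np.vstack([out, run_importances])

print(out.shape)  # (20, 3): n_runs * n_repeats rows, one column per feature
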
31 changes: 17 additions & 14 deletions copro/scripts/copro_runner.py
@@ -2,6 +2,7 @@

import click
import numpy as np
+ import pandas as pd
import os

import warnings
@@ -62,17 +63,19 @@ def cli(cfg: click.Path, cores: int, verbose: int):
)

# - fit-transform on scaler to be used later during projections

- _, out_y_df, out_perm_importances_df, out_dict = ModelWorkflow.run(
-     config_REF.getint("machine_learning", "n_runs"), tune_hyperparameters=True
+ _, out_y_df, out_perm_importances_arr, out_dict = ModelWorkflow.run(
+     config_REF["machine_learning"]["n_runs"], tune_hyperparameters=True
)

# - save output to files
- out_perm_importances_df.columns = [
-     key
-     for key in XY_class.XY_dict
-     if key not in ["poly_ID", "poly_geometry", "conflict"]
- ]
+ out_perm_importances_df = pd.DataFrame(
+     data=out_perm_importances_arr,
+     columns=[
+         key
+         for key in XY_class.XY_dict
+         if key not in ["poly_ID", "poly_geometry", "conflict"]
+     ],
+ )
out_perm_importances_df.to_parquet(
os.path.join(out_dir_REF, "perm_importances.parquet")
)
@@ -84,7 +87,7 @@ def cli(cfg: click.Path, cores: int, verbose: int):
click.echo(
"Average {} of run with {} repetitions is {:0.3f}".format(
key,
config_REF.getint("machine_learning", "n_runs"),
config_REF["machine_learning"]["n_runs"],
np.mean(value),
)
)
@@ -97,10 +100,10 @@ def cli(cfg: click.Path, cores: int, verbose: int):

click.echo(click.style("\nINFO: reference run succesfully finished\n", fg="cyan"))

click.echo(click.style("INFO: starting projections\n", fg="cyan"))

# - running prediction runs
# TODO: scaler_fitted is now not part of the class
ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf)
if "projections" in config_REF.keys():
click.echo(click.style("INFO: starting projections\n", fg="cyan"))
# - running prediction runs
# TODO: scaler_fitted is now not part of the class
ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf)

click.echo(click.style("\nINFO: all projections succesfully finished\n", fg="cyan"))