From 2425c336f97b8f123703c6232c044782d6d893aa Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Tue, 28 May 2024 13:43:19 +0200 Subject: [PATCH 1/3] first steps --- copro/selection.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/copro/selection.py b/copro/selection.py index 8a66025..71da5c6 100644 --- a/copro/selection.py +++ b/copro/selection.py @@ -68,26 +68,30 @@ def _filter_conflict_properties( gpd.GeoDataFrame: geo-dataframe containing filtered entries. """ - # create dictionary with all selection criteria - selection_criteria = { - "best": config.getint("conflict", "min_nr_casualties"), - "type_of_violence": (config.get("conflict", "type_of_violence")).rsplit(","), - } + conflict_items = dict(config.items("conflict")) + + # # create dictionary with all selection criteria + # selection_criteria = { + # "best": config.getint("conflict", "min_nr_casualties"), + # "type_of_violence": (config.get("conflict", "type_of_violence")).rsplit(","), + # } click.echo("Filtering based on conflict properties.") # go through all criteria - for key, value in selection_criteria.items(): - - # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold - if key == "best" and value != "": - click.echo(f"Filtering key {key} with lower value {value}.") - gdf = gdf[gdf["best"] >= value] - # for other criteria, select all entries matching the specified value(s) per criterion - if key == "type_of_violence" and value != "": - click.echo(f"Filtering key {key} with value(s) {value}.") - # NOTE: check if this works like this - values = [literal_eval(i) for i in value] - gdf = gdf[gdf[key].isin(values)] + for key, value in conflict_items.items(): + + if key not in ["conflict_file", "predicted_var"]: + + # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold + if key == "best" and value != "": + click.echo(f"Filtering key {key} with lower value {value}.") + gdf = gdf[gdf["best"] >= value] + # for other criteria, select all entries matching the specified value(s) per criterion + if key == "type_of_violence" and value != "": + click.echo(f"Filtering key {key} with value(s) {value}.") + # NOTE: check if this works like this + values = [literal_eval(i) for i in value] + gdf = gdf[gdf[key].isin(values)] return gdf From de683e6b88fe1a0d691aa598648df43251b81da9 Mon Sep 17 00:00:00 2001 From: Jannis Hoch <10956703+JannisHoch@users.noreply.github.com> Date: Thu, 30 May 2024 19:18:45 +0200 Subject: [PATCH 2/3] Switch from cfg to yaml (#194) * parse YAML file * parsing settings from YAML-file works * improved conflict property selection based on YAML-file input * fix _clip_to_extent * reading indicator data works, also with options * adding poly_id and conflict_id * added todos * fixed problem with perm. 
importance and debugged workflow to be compatible with yaml input --- copro/conflict.py | 31 +++--- copro/io.py | 9 +- copro/machine_learning.py | 36 ++++--- copro/models.py | 24 +++-- copro/scripts/copro_runner.py | 31 +++--- copro/selection.py | 82 ++++++++-------- copro/settings.py | 40 ++++---- copro/utils.py | 11 +-- copro/variables.py | 171 +++++++++++----------------------- copro/xydata.py | 139 +++++++++++++-------------- 10 files changed, 252 insertions(+), 322 deletions(-) diff --git a/copro/conflict.py b/copro/conflict.py index a85bb5b..bb80283 100644 --- a/copro/conflict.py +++ b/copro/conflict.py @@ -16,7 +16,8 @@ def conflict_in_year_bool( extent_gdf: gpd.GeoDataFrame, sim_year: int, out_dir: click.Path, - identifier="watprovID", + poly_identifier="watprovID", + conflict_identifier="event_id_cnty", ) -> list: """Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not. @@ -46,17 +47,17 @@ def conflict_in_year_bool( # determine the aggregated amount of fatalities in one region (e.g. water province) fatalities_per_poly = ( - data_merged["best"] - .groupby(data_merged[identifier]) - .sum() + data_merged[conflict_identifier] + .groupby(data_merged[poly_identifier]) + .count() .to_frame() - .rename(columns={"best": "total_fatalities"}) + .rename(columns={conflict_identifier: "total_fatalities"}) ) out_dir = os.path.join(out_dir, "files") Path.mkdir(Path(out_dir), exist_ok=True) - if sim_year == config.getint("settings", "y_end"): + if sim_year == config["general"]["y_end"]: _store_boolean_conflict_data_to_csv( fatalities_per_poly, extent_gdf, sim_year, out_dir ) @@ -65,7 +66,7 @@ def conflict_in_year_bool( # if so, this means that there was conflict and thus assign value 1 list_out = [] for i, _ in extent_gdf.iterrows(): - i_poly = extent_gdf.iloc[i][identifier] + i_poly = extent_gdf.iloc[i][poly_identifier] if i_poly in fatalities_per_poly.index.values: list_out.append(1) else: @@ -80,7 +81,8 @@ def conflict_in_previous_year_bool( sim_year: int, check_neighbors: bool = False, neighboring_matrix: Union[None, pd.DataFrame] = None, - identifier="watprovID", + poly_identifier="watprovID", # TODO: no kwarg, should come from config + conflict_identifier="event_id_cnty", # TODO: no kwarg, should come from config ) -> list: """Creates a list for each timestep with boolean information whether a conflict took place in the previous year in a polygon or not. @@ -104,27 +106,28 @@ def conflict_in_previous_year_bool( else: click.echo("Checking for conflict event in polygon at t-1") + # TODO: screening whether there is any conflict data in sim_year should be done earlier # get conflicts at t-1 - temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year - 1] + temp_sel_year = conflict_gdf[conflict_gdf.year == sim_year - 1] if temp_sel_year.empty: warnings.warn( f"No conflicts were found in sampled conflict data set for year {sim_year - 1}." 
) - # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions data_merged = gpd.sjoin(temp_sel_year, extent_gdf) - conflicts_per_poly = ( - data_merged.id.groupby(data_merged[identifier]) + data_merged[conflict_identifier] + .groupby(data_merged[poly_identifier]) .count() .to_frame() - .rename(columns={"id": "conflict_count"}) + .rename(columns={conflict_identifier: "conflict_count"}) ) + # NOTE: WORKS UNTIL HERE # loop through all polygons list_out = [] for i in range(len(extent_gdf)): - i_poly = extent_gdf[identifier].iloc[i] + i_poly = extent_gdf[poly_identifier].iloc[i] # check if polygon is in list with conflict polygons if i_poly in conflicts_per_poly.index.values: # if so, check if neighboring polygons contain conflict and assign boolean value diff --git a/copro/io.py b/copro/io.py index 719699c..1a9e9e3 100644 --- a/copro/io.py +++ b/copro/io.py @@ -1,20 +1,19 @@ import pandas as pd import numpy as np from typing import Union -from configparser import RawConfigParser from pathlib import Path import os import click def make_and_collect_output_dirs( - config: RawConfigParser, root_dir: click.Path, config_dict: dict + config: dict, root_dir: click.Path, config_dict: dict ) -> dict: - """Creates the output folder at location specfied in cfg-file + """Creates the output folder at location specfied in YAML-file and returns dictionary with config-objects and out-dir per run. Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): dictionary containing the parsed configuration-settings of the model. root_dir (Path): absolute path to location of configurations-file config_dict (dict): dictionary containing config-objects for reference run and all projection. @@ -23,7 +22,7 @@ def make_and_collect_output_dirs( """ # get path to main output directory as specified in cfg-file - out_dir = os.path.join(root_dir, config.get("general", "output_dir")) + out_dir = os.path.join(root_dir, config["general"]["output_dir"]) click.echo(f"Saving output to main output folder {out_dir}.") # initalize list for all out-dirs diff --git a/copro/machine_learning.py b/copro/machine_learning.py index 48e6a0c..8dc5f21 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -2,7 +2,6 @@ import pickle import pandas as pd import numpy as np -from configparser import RawConfigParser from sklearn import ensemble, preprocessing, model_selection, inspection from typing import Union, Tuple import click @@ -12,7 +11,7 @@ class MachineLearning: - def __init__(self, config: RawConfigParser) -> None: + def __init__(self, config: dict) -> None: self.config = config self.scaler = define_scaling(config) self.clf = ensemble.RandomForestClassifier(random_state=42) @@ -49,7 +48,7 @@ def split_scale_train_test_split( X_train, X_test, y_train, y_test = model_selection.train_test_split( X_cs, Y, - test_size=1 - self.config.getfloat("machine_learning", "train_fraction"), + test_size=1 - self.config["machine_learning"]["train_fraction"], ) # for training-set and test-set, split in ID, geometry, and values @@ -77,7 +76,7 @@ def fit_predict( tune_hyperparameters=False, n_jobs=2, verbose=0, - ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """Fits classifier based on training-data and makes predictions. The fitted classifier is dumped to file with pickle to be used again during projections. Makes prediction with test-data including probabilities of those predictions. 
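[Editor's note, not part of the patch] The next hunk returns the permutation importances as a plain array instead of a DataFrame. A minimal sketch (toy dataset, illustrative only) of the scikit-learn shape convention that the transposed array relies on:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance

    X, y = make_classification(n_samples=200, n_features=5, random_state=42)
    clf = RandomForestClassifier(random_state=42).fit(X, y)

    result = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
    # result["importances"] has shape (n_features, n_repeats); transposing gives
    # one row per repeat and one column per feature, so rows from several runs
    # can later be stacked with numpy.vstack and labelled with variable names.
    perm_importances_arr = result["importances"].T
    print(perm_importances_arr.shape)  # (10, 5)
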
@@ -96,7 +95,7 @@ def fit_predict( Returns: np.ndarray: array with the predictions made. np.ndarray: array with probabilities of the predictions made. - pd.DataFrame: dataframe containing permutation importances of variables. + np.ndarray: dataframe containing permutation importances of variables. """ if tune_hyperparameters: @@ -117,11 +116,8 @@ def fit_predict( random_state=42, n_jobs=n_jobs, ) - sorted_importances_idx = perm_importances.importances_mean.argsort() - perm_importances_df = pd.DataFrame( - perm_importances.importances[sorted_importances_idx].T, - # columns=X_train.columns[sorted_importances_idx], - ) + # transpose because by default features are in rows + perm_importances_arr = perm_importances["importances"].T # create folder to store all classifiers with pickle clf_pickle_rep = os.path.join(out_dir, "clfs") @@ -137,16 +133,16 @@ def fit_predict( # make prediction of probability y_prob = fitted_estimator.predict_proba(X_test) - return y_pred, y_prob, perm_importances_df + return y_pred, y_prob, perm_importances_arr -def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]: +def load_clfs(config: dict, out_dir: str) -> list[str]: """Loads the paths to all previously fitted classifiers to a list. Classifiers were saved to file in fit_predict(). With this list, the classifiers can be loaded again during projections. Args: - config (ConfigParser-object): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. out_dir (path): path to output folder. Returns: @@ -155,7 +151,7 @@ def load_clfs(config: RawConfigParser, out_dir: str) -> list[str]: clfs = os.listdir(os.path.join(out_dir, "clfs")) - if len(clfs) != config.getint("machine_learning", "n_runs"): + if len(clfs) != config["machine_learning"]["n_runs"]: raise ValueError( "Number of loaded classifiers does not match the specified number of runs in cfg-file!" ) @@ -188,7 +184,7 @@ def _split_conflict_geom_data( def define_scaling( - config: RawConfigParser, + config: dict, ) -> Union[ preprocessing.MinMaxScaler, preprocessing.StandardScaler, @@ -198,19 +194,19 @@ def define_scaling( """Defines scaling method based on model configurations. Args: - config (ConfigParser-object): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. Returns: scaler: the specified scaling method instance. """ - if config.get("machine_learning", "scaler") == "MinMaxScaler": + if config["machine_learning"]["scaler"] == "MinMaxScaler": scaler = preprocessing.MinMaxScaler() - elif config.get("machine_learning", "scaler") == "StandardScaler": + elif config["machine_learning"]["scaler"] == "StandardScaler": scaler = preprocessing.StandardScaler() - elif config.get("machine_learning", "scaler") == "RobustScaler": + elif config["machine_learning"]["scaler"] == "RobustScaler": scaler = preprocessing.RobustScaler() - elif config.get("machine_learning", "scaler") == "QuantileTransformer": + elif config["machine_learning"]["scaler"] == "QuantileTransformer": scaler = preprocessing.QuantileTransformer(random_state=42) else: raise ValueError( diff --git a/copro/models.py b/copro/models.py index 1ee5766..9cbcb80 100644 --- a/copro/models.py +++ b/copro/models.py @@ -58,7 +58,7 @@ def run( Returns: pd.DataFrame: Prediction dataframes. pd.DataFrame: model output on polygon-basis. - pd.DataFrame: containing permutation importances for all runs. + np.ndarray: containing permutation importances for all runs. 
dict: evaluation dictionary. """ @@ -67,7 +67,7 @@ def run( # - initializing output variables out_X_df = pd.DataFrame() out_y_df = pd.DataFrame() - out_perm_importances_df = pd.DataFrame() + out_perm_importances_arr = np.array([]).reshape(0, self.X.shape[1] - 2) out_dict = evaluation.init_out_dict() click.echo("Training and testing machine learning model") @@ -75,25 +75,23 @@ def run( click.echo(f"Run {n+1} of {number_runs}.") # - run machine learning model and return outputs - X_df, y_df, eval_dict, perm_importances_df_n = self._n_run( + X_df, y_df, eval_dict, perm_importances_arr_n = self._n_run( run_nr=n, tune_hyperparameters=tune_hyperparameters ) # - append per model execution out_X_df = pd.concat([out_X_df, X_df], axis=0, ignore_index=True) out_y_df = pd.concat([out_y_df, y_df], axis=0, ignore_index=True) - out_perm_importances_df = pd.concat( - [out_perm_importances_df, perm_importances_df_n], - axis=0, - ignore_index=True, + out_perm_importances_arr = np.vstack( + [out_perm_importances_arr, perm_importances_arr_n] ) out_dict = evaluation.fill_out_dict(out_dict, eval_dict) - return out_X_df, out_y_df, out_perm_importances_df, out_dict + return out_X_df, out_y_df, out_perm_importances_arr, out_dict def _n_run( self, run_nr: int, tune_hyperparameters=False - ) -> tuple[pd.DataFrame, pd.DataFrame, dict, pd.DataFrame]: + ) -> tuple[pd.DataFrame, pd.DataFrame, dict, np.ndarray]: """Runs workflow per specified number of runs. The model workflow is executed for each classifier. @@ -105,7 +103,7 @@ def _n_run( pd.DataFrame: containing the test-data X-array values. pd.DataFrame: containing model output on polygon-basis. dict: dictionary containing evaluation metrics per simulation. - pd.DataFrame: containing permutation importances for run n. + np.ndarray: containing permutation importances for run n. 
""" MLmodel = machine_learning.MachineLearning( @@ -128,7 +126,7 @@ def _n_run( X_df = pd.DataFrame(X_test) # fit classifier and make prediction with test-set - y_pred, y_prob, perm_importances_df_n = MLmodel.fit_predict( + y_pred, y_prob, perm_importances_arr_n = MLmodel.fit_predict( X_train, y_train, X_test, @@ -149,7 +147,7 @@ def _n_run( X_test_ID, X_test_geom, y_test, y_pred, y_prob_0, y_prob_1 ) - return X_df, y_df, eval_dict, perm_importances_df_n + return X_df, y_df, eval_dict, perm_importances_arr_n def run_prediction( self, @@ -180,7 +178,7 @@ def run_prediction( clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF) # going through each projection specified - for each_key, _ in config_REF.items("PROJ_files"): + for each_key, _ in config_REF.items(): # get config-object and out-dir per projection click.echo(f"Loading config-object for projection run: {each_key}.") diff --git a/copro/scripts/copro_runner.py b/copro/scripts/copro_runner.py index 389bffc..dc64662 100644 --- a/copro/scripts/copro_runner.py +++ b/copro/scripts/copro_runner.py @@ -2,6 +2,7 @@ import click import numpy as np +import pandas as pd import os import warnings @@ -62,17 +63,19 @@ def cli(cfg: click.Path, cores: int, verbose: int): ) # - fit-transform on scaler to be used later during projections - - _, out_y_df, out_perm_importances_df, out_dict = ModelWorkflow.run( - config_REF.getint("machine_learning", "n_runs"), tune_hyperparameters=True + _, out_y_df, out_perm_importances_arr, out_dict = ModelWorkflow.run( + config_REF["machine_learning"]["n_runs"], tune_hyperparameters=True ) # - save output to files - out_perm_importances_df.columns = [ - key - for key in XY_class.XY_dict - if key not in ["poly_ID", "poly_geometry", "conflict"] - ] + out_perm_importances_df = pd.DataFrame( + data=out_perm_importances_arr, + columns=[ + key + for key in XY_class.XY_dict + if key not in ["poly_ID", "poly_geometry", "conflict"] + ], + ) out_perm_importances_df.to_parquet( os.path.join(out_dir_REF, "perm_importances.parquet") ) @@ -84,7 +87,7 @@ def cli(cfg: click.Path, cores: int, verbose: int): click.echo( "Average {} of run with {} repetitions is {:0.3f}".format( key, - config_REF.getint("machine_learning", "n_runs"), + config_REF["machine_learning"]["n_runs"], np.mean(value), ) ) @@ -97,10 +100,10 @@ def cli(cfg: click.Path, cores: int, verbose: int): click.echo(click.style("\nINFO: reference run succesfully finished\n", fg="cyan")) - click.echo(click.style("INFO: starting projections\n", fg="cyan")) - - # - running prediction runs - # TODO: scaler_fitted is now not part of the class - ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf) + if "projections" in config_REF.keys(): + click.echo(click.style("INFO: starting projections\n", fg="cyan")) + # - running prediction runs + # TODO: scaler_fitted is now not part of the class + ModelWorkflow.run_prediction(main_dict, root_dir, extent_active_polys_gdf) click.echo(click.style("\nINFO: all projections succesfully finished\n", fg="cyan")) diff --git a/copro/selection.py b/copro/selection.py index 71da5c6..3a84896 100644 --- a/copro/selection.py +++ b/copro/selection.py @@ -2,14 +2,13 @@ import pandas as pd import os from copro import utils -from configparser import RawConfigParser import click from typing import Tuple -from ast import literal_eval +import warnings def select( - config: RawConfigParser, out_dir: click.Path, root_dir: click.Path + config: dict, out_dir: click.Path, root_dir: click.Path ) -> Tuple[gpd.GeoDataFrame, 
gpd.GeoDataFrame, pd.DataFrame]: """Main function performing the selection procedure. First, selects only conflicts matching specified properties. @@ -17,7 +16,7 @@ def select( Third, retrieves the geometry of all polygons in the spatial extent and assigns IDs. Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. out_dir (Path): path to output folder. root_dir (Path): path to location of cfg-file for reference run. @@ -55,66 +54,71 @@ def select( def _filter_conflict_properties( - gdf: gpd.GeoDataFrame, config: RawConfigParser + gdf: gpd.GeoDataFrame, config: dict ) -> gpd.GeoDataFrame: - """Filters conflict database according to certain conflict properties - such as number of casualties, type of violence or country. + """Filters conflict database according to certain treshold options. + These options are 'values', 'vmin' and 'vmax'. + These options and the conflict properties to which they are applied + need to be specified in the YAML-file. Args: - gdf (gpd.GeoDataFrame): geo-dataframe containing entries with conflicts. - config (RawConfigParser): object containing the parsed configuration-settings of the model. + gdf (gpd.GeoDataFrame): Geodataframe containing entries with conflicts. + config (dict): Parsed configuration-settings of the model. Returns: gpd.GeoDataFrame: geo-dataframe containing filtered entries. """ - conflict_items = dict(config.items("conflict")) + if "thresholds" not in config["data"]["conflict"]: + click.echo("No thresholding options found in configuration file.") + return gdf - # # create dictionary with all selection criteria - # selection_criteria = { - # "best": config.getint("conflict", "min_nr_casualties"), - # "type_of_violence": (config.get("conflict", "type_of_violence")).rsplit(","), - # } - - click.echo("Filtering based on conflict properties.") # go through all criteria - for key, value in conflict_items.items(): - - if key not in ["conflict_file", "predicted_var"]: - - # for criterion 'best' (i.e. best estimate of fatalities), select all entries above threshold - if key == "best" and value != "": - click.echo(f"Filtering key {key} with lower value {value}.") - gdf = gdf[gdf["best"] >= value] - # for other criteria, select all entries matching the specified value(s) per criterion - if key == "type_of_violence" and value != "": - click.echo(f"Filtering key {key} with value(s) {value}.") - # NOTE: check if this works like this - values = [literal_eval(i) for i in value] - gdf = gdf[gdf[key].isin(values)] + for key, value in config["data"]["conflict"]["thresholds"].items(): + + if key not in gdf.columns: + warnings.warn( + f"{key} is not found in geodataframe columns, will be skipped." + ) + else: + click.echo(f"Tresholding conflict data on {key}.") + for v, k in value.items(): + if v == "values": + click.echo(f"Selecting datapoints with values {k}.") + gdf = gdf[gdf[key].isin(k)] + elif v == "vmin": + click.echo(f"Selecting datapoints greater or equal to {k}.") + gdf = gdf[gdf[key] >= k] + elif v == "vmax": + click.echo(f"Selecting datapoints less or equal to {k}.") + gdf = gdf[gdf[key] <= k] + else: + raise ValueError( + f"{v} is not a recognized tresholding option - use 'values', 'vmin' or 'vmax'." 
+ ) return gdf def _clip_to_extent( - conflict_gdf: gpd.GeoDataFrame, config: RawConfigParser, root_dir: click.Path + conflict_gdf: gpd.GeoDataFrame, config: dict, root_dir: click.Path ) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: """As the original conflict data has global extent, this function clips the database to those entries which have occured on a specified continent. Args: - conflict_gdf (geo-dataframe): geo-dataframe containing entries with conflicts. - config (ConfigParser-object): object containing the parsed configuration-settings of the model. - root_dir (str): path to location of cfg-file. + conflict_gdf (gpd.GeoDataFrame): Geodataframe containing entries with conflicts. + config (dict): Parsed configuration-settings of the model. + root_dir (str): Path to location of cfg-file. Returns: - geo-dataframe: geo-dataframe containing filtered entries. - geo-dataframe: geo-dataframe containing country polygons of selected continent. + gpd.GeoDataFrame: Geodataframe containing filtered entries. + gpd.GeoDataFrame: Geodataframe containing country polygons of selected continent. """ # get path to file with polygons for which analysis is carried out shp_fo = os.path.join( - root_dir, config.get("general", "input_dir"), config.get("extent", "shp") + root_dir, config["general"]["input_dir"], config["data"]["extent"]["file"] ) # read file @@ -126,7 +130,7 @@ def _clip_to_extent( extent_gdf.geometry = extent_gdf.buffer(0) # clip the conflict dataframe to the specified polygons - click.echo("Clipping clipping conflict dataset to extent.") + click.echo("Clipping conflict dataset to extent.") conflict_gdf = gpd.clip(conflict_gdf, extent_gdf) return conflict_gdf, extent_gdf diff --git a/copro/settings.py b/copro/settings.py index f22c3d7..b53d695 100644 --- a/copro/settings.py +++ b/copro/settings.py @@ -6,6 +6,8 @@ from typing import Tuple from copro import utils, io +import yaml + def initiate_setup(settings_file: click.Path) -> Tuple[dict, str]: """Initiates the model set-up. @@ -53,25 +55,25 @@ def initiate_setup(settings_file: click.Path) -> Tuple[dict, str]: return main_dict, root_dir -def _parse_settings(settings_file: click.Path) -> RawConfigParser: - """Reads the model configuration file. +def _parse_settings(settings_file: click.Path) -> dict: + """Reads the model configuration YAML-file and returns contant as dictionary. Args: settings_file (Path): path to settings-file (cfg-file). Returns: - RawConfigParser: parsed model configuration. + dict: parsed model configuration. """ click.echo(f"Parsing settings from file {settings_file}.") - config = RawConfigParser(allow_no_value=True, inline_comment_prefixes="#") - config.optionxform = lambda option: option - config.read(settings_file) + + with open(settings_file, "r") as stream: + config = yaml.safe_load(stream) return config -def _collect_simulation_settings(config: RawConfigParser, root_dir: click.Path) -> dict: +def _collect_simulation_settings(config: dict, root_dir: click.Path) -> dict: """Collects the configuration settings for the reference run and all projection runs. These cfg-files need to be specified one by one in the PROJ_files section of the cfg-file for the reference run. The function returns then a dictionary with the name of the run and the associated config-object. 
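[Editor's note, not part of the patch] For reference, a hypothetical YAML settings file consistent with the keys read throughout this patch could look as follows; the indicator name, file names and all values are placeholders, not taken from the repository:

    import yaml

    example_settings = """
    general:
      input_dir: ./data
      output_dir: ./output
      y_start: 2000
      y_end: 2015
    data:
      conflict:
        file: conflicts.csv
        thresholds:
          best:
            vmin: 1
          type_of_violence:
            values: [1, 2]
      extent:
        file: water_provinces.shp
      indicators:
        precipitation:
          file: precipitation.nc
          log: true
          stat: mean
    machine_learning:
      scaler: MinMaxScaler
      train_fraction: 0.7
      n_runs: 10
    """

    config = yaml.safe_load(example_settings)
    print(config["machine_learning"]["n_runs"])              # 10
    print(config["data"]["conflict"]["thresholds"]["best"])  # {'vmin': 1}
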
@@ -86,7 +88,7 @@ def _collect_simulation_settings(config: RawConfigParser, root_dir: click.Path) config_dict = {'_REF': [config_REF], 'run1': [config_run1], 'run2': [config_run2]} Args: - config (ConfigParser-object): object containing the parsed configuration-settings \ + config (dict): dictionariy containing the parsed configuration-settings \ of the model for the reference run. root_dir (Path): path to location of the cfg-file for the reference run. @@ -99,17 +101,19 @@ def _collect_simulation_settings(config: RawConfigParser, root_dir: click.Path) # first entry is config-object for reference run config_dict["_REF"] = config - # loop through all keys and values in PROJ_files section of reference config-object - for (each_key, each_val) in config.items("PROJ_files"): + if "PROJ_files" in config["general"].keys(): + + # loop through all keys and values in PROJ_files section of reference config-object + for (each_key, each_val) in config.items("PROJ_files"): - # for each value (here representing the cfg-files of the projections), get the absolute path - each_val = os.path.abspath(os.path.join(root_dir, each_val)) + # for each value (here representing the cfg-files of the projections), get the absolute path + each_val = os.path.abspath(os.path.join(root_dir, each_val)) - # parse each config-file specified - each_config = _parse_settings(each_val) + # parse each config-file specified + each_config = _parse_settings(each_val) - # update the output dictionary with key and config-object - config_dict[each_key] = [each_config] + # update the output dictionary with key and config-object + config_dict[each_key] = [each_config] return config_dict @@ -131,8 +135,8 @@ def determine_projection_period( # get all years of projection period projection_period = np.arange( - config_REF.getint("settings", "y_end") + 1, - config_PROJ.getint("settings", "y_proj") + 1, + config_REF["general"]["y_end"] + 1, + config_PROJ["general"]["y_proj"] + 1, 1, ) # convert to list diff --git a/copro/utils.py b/copro/utils.py index fa83f3a..e505579 100644 --- a/copro/utils.py +++ b/copro/utils.py @@ -2,14 +2,13 @@ import pandas as pd import numpy as np import os -from configparser import RawConfigParser from datetime import date import click from copro import __version__, __author__, __email__ def get_conflict_geodataframe( - config: RawConfigParser, + config: dict, root_dir: click.Path, longitude="longitude", latitude="latitude", @@ -18,21 +17,21 @@ def get_conflict_geodataframe( """Converts a csv-file containing geo-referenced conflict data to a geodataframe. Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. root_dir (Path): path to location of cfg-file. longitude (str, optional): column name with longitude coordinates. Defaults to 'longitude'. latitude (str, optional): column name with latitude coordinates. Defaults to 'latitude'. crs (str, optional): coordinate system to be used for georeferencing. Defaults to 'EPSG:4326'. Returns: - geo-dataframe: geo-referenced conflict data. + gpd.GeoDataFrame: geo-referenced conflict data. 
""" # get path to file containing data conflict_fo = os.path.join( root_dir, - config.get("general", "input_dir"), - config.get("conflict", "conflict_file"), + config["general"]["input_dir"], + config["data"]["conflict"]["file"], ) # read file to pandas dataframe diff --git a/copro/variables.py b/copro/variables.py index 0db563e..7dd0c63 100644 --- a/copro/variables.py +++ b/copro/variables.py @@ -7,7 +7,6 @@ import os import math import click -from configparser import RawConfigParser import warnings @@ -16,85 +15,63 @@ def nc_with_float_timestamp( extent_gdf: gpd.GeoDataFrame, - config: RawConfigParser, + config: dict, root_dir: str, var_name: str, sim_year: int, ) -> list: - """This function extracts a value from a netCDF-file (specified in the cfg-file) + """This function extracts a value from a netCDF-file (specified in the yaml-file) for each polygon specified in extent_gdf for a given year. - In the cfg-file, it must also be specified whether the value is log-transformed or not, + In the yaml-file, it must also be specified whether the value is log-transformed or not, and which statistical method is applied. .. note:: - The key in the cfg-file must be identical to variable name in netCDF-file. + The key in the yaml-file must be identical to variable name in netCDF-file. .. note:: Works only with nc-files with annual data. Args: extent_gdf (gpd.GeoDataFrame): One or more polygons with geometry information for which values are extracted. - config (RawConfigParser): parsed configuration settings of run. - root_dir (str): path to location of cfg-file. - var_name (str): name of variable in nc-file. Must be the same as is specified in cfg-file. - sim_year (int): year for which data is extracted. + config (dict): Parsed configuration settings of run. + root_dir (str): Path to location of yaml-file. + var_name (str): Name of variable in nc-file. Must be the same as is specified in yaml-file. + sim_year (int): Year for which data is extracted. Returns: - list: list containing statistical value per polygon, i.e. with same length as extent_gdf. + list: List containing statistical value per polygon, i.e. with same length as extent_gdf. 
""" - # get the filename, True/False whether log-transform shall be applied, and statistical method from cfg-file as list - data_fo = os.path.join( - root_dir, config.get("general", "input_dir"), config.get("data", var_name) - ).rsplit(",") - - # if not all of these three aspects are provided, raise error - if len(data_fo) != 3: - raise ValueError( - "Not all settings for input data set {} provided - \ - it must contain of path, False/True, and statistical method".format( - os.path.join( - root_dir, - config.get("general", "input_dir"), - config.get("data", var_name), - ) - ) - ) - - # if not, split the list into separate variables - nc_fo = data_fo[0] - ln_flag = bool(data_fo[1]) - stat_method = str(data_fo[2]) + nc_fo = os.path.join( + root_dir, + config["general"]["input_dir"], + config["data"]["indicators"][var_name]["file"], + ) + if "log" not in config["data"]["indicators"][var_name].keys(): + ln_flag = False + else: + ln_flag = config["data"]["indicators"][var_name]["log"] + if "stat" not in config["data"]["indicators"][var_name].keys(): + stat_method = "mean" + else: + stat_method = config["data"]["indicators"][var_name]["stat"] LAG_TIME = 1 - click.echo(f"Applying {LAG_TIME} year lag time.") - sim_year = sim_year - LAG_TIME + click.echo(f"\tuse log: {ln_flag}.") + click.echo(f"\tstatistical method: {stat_method}.") + click.echo(f"\tlag time: {LAG_TIME} year(s).") - if ln_flag: - click.echo( - "Calculating log-transformed {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) - else: - click.echo( - "Calculating {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) + sim_year = sim_year - LAG_TIME # open nc-file with xarray as dataset nc_ds = xr.open_dataset(nc_fo) # get xarray data-array for specified variable nc_var = nc_ds[var_name] - if ln_flag: - nc_var = np.log(nc_var) # open nc-file with rasterio to get affine information affine = rio.open(nc_fo).transform # get values from data-array for specified year - nc_arr = nc_var.sel(time=sim_year) - nc_arr_vals = nc_arr.values + nc_arr_vals = nc_var.sel({"time": sim_year}).values if nc_arr_vals.size == 0: raise ValueError( f"No data was found for this year in the nc-file {nc_fo}, check if all is correct." @@ -142,106 +119,66 @@ def nc_with_float_timestamp( def nc_with_continous_datetime_timestamp( extent_gdf: gpd.GeoDataFrame, - config: RawConfigParser, + config: dict, root_dir: str, var_name: str, sim_year: int, ) -> list: - """This function extracts a value from a netCDF-file (specified in the cfg-file) + """This function extracts a value from a netCDF-file (specified in the yaml-file) for each polygon specified in extent_gdf for a given year. - In the cfg-file, it must also be specified whether the value is log-transformed or not, + In the yaml-file, it must also be specified whether the value is log-transformed or not, and which statistical method is applied. .. note:: - The key in the cfg-file must be identical to variable name in netCDF-file. + The key in the yaml-file must be identical to variable name in netCDF-file. .. note:: Works only with nc-files with annual data. Args: extent_gdf (gpd.GeoDataFrame): One or more polygons with geometry information for which values are extracted. - config (RawConfigParser): parsed configuration settings of run. - root_dir (str): path to location of cfg-file. - var_name (str): name of variable in nc-file. Must be the same as in the cfg-file. 
- sim_year (int): year for which data is extracted. + config (config): Parsed configuration settings of run. + root_dir (str): Path to location of yaml-file. + var_name (str): Name of variable in nc-file. Must be the same as in the yaml-file. + sim_year (int): Year for which data is extracted. Returns: - list: list containing statistical value per polygon, i.e. with same length as extent_gdf. + list: List containing statistical value per polygon, i.e. with same length as extent_gdf. """ - # get the filename, True/False whether log-transform shall be applied, and statistical method from cfg-file as list - data_fo = os.path.join( - root_dir, config.get("general", "input_dir"), config.get("data", var_name) - ).rsplit(",") - - # if not all of these three aspects are provided, raise error - if len(data_fo) != 3: - raise ValueError( - "Not all settings for input data set {} provided - \ - it must contain of path, False/True, and statistical method".format( - os.path.join( - root_dir, - config.get("general", "input_dir"), - config.get("data", var_name), - ) - ) - ) - - # if not, split the list into separate variables - nc_fo = data_fo[0] - ln_flag = bool(data_fo[1]) - stat_method = str(data_fo[2]) - - LAG_TIME = 1 - click.echo(f"Applying {LAG_TIME} year lag time for variable {var_name}.") - sim_year = sim_year - LAG_TIME + nc_fo = os.path.join( + root_dir, + config["general"]["input_dir"], + config["data"]["indicators"][var_name]["file"], + ) - if ln_flag: - click.echo( - "Calculating log-transformed {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) + if "log" not in config["data"]["indicators"][var_name].keys(): + ln_flag = False else: - click.echo( - "Calculating {0} {1} per aggregation unit from file {2} for year {3}".format( - stat_method, var_name, nc_fo, sim_year - ) - ) + ln_flag = config["data"]["indicators"][var_name]["log"] + if "stat" not in config["data"]["indicators"][var_name].keys(): + stat_method = "mean" + else: + stat_method = config["data"]["indicators"][var_name]["stat"] + LAG_TIME = 1 + click.echo(f"\tuse log: {ln_flag}.") + click.echo(f"\tstatistical method: {stat_method}.") + click.echo(f"\tlag time: {LAG_TIME} year(s).") + sim_year = sim_year - LAG_TIME # open nc-file with xarray as dataset nc_ds = xr.open_dataset(nc_fo) # get xarray data-array for specified variable nc_var = nc_ds[var_name] - # get years contained in nc-file as integer array to be compatible with sim_year - years = ( - pd.to_datetime(nc_ds.time.values) - .to_period(freq="Y") - .strftime("%Y") - .to_numpy(dtype=int) - ) - if sim_year not in years: - warnings.warn( - f"The simulation year {sim_year} can not be found in file {nc_fo}." 
- ) - warnings.warn( - "Using the next following year instead (yes that is an ugly solution...)" - ) - sim_year = sim_year + 1 - # raise ValueError('ERROR: the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo)) - # get index which corresponds with sim_year in years in nc-file - sim_year_idx = int(np.where(years == sim_year)[0]) # get values from data-array for specified year based on index - nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx]) - nc_arr_vals = nc_arr.values + nc_arr_vals = nc_var.sel({"time": pd.to_datetime(sim_year, format="%Y")}).values if nc_arr_vals.size == 0: raise ValueError( "No data was found for this year in the nc-file {}, check if all is correct".format( nc_fo ) ) - # open nc-file with rasterio to get affine information affine = rio.open(nc_fo).transform diff --git a/copro/xydata.py b/copro/xydata.py index 330adc5..a35327a 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -1,5 +1,4 @@ from copro import conflict, variables, nb, utils -from configparser import RawConfigParser from typing import Tuple import click import numpy as np @@ -10,7 +9,7 @@ class XYData: - def __init__(self, config: RawConfigParser): + def __init__(self, config: dict): self.XY_dict = {} self.__XY_dict_initiated__ = False self.config = config @@ -26,8 +25,8 @@ def _initiate_XY_data(self): # some entries are set by default, besides the ones corresponding to input data variables self.XY_dict["poly_ID"] = pd.Series() self.XY_dict["poly_geometry"] = pd.Series() - for key in self.config.items("data"): - self.XY_dict[str(key[0])] = pd.Series(dtype=float) + for key in self.config["data"]["indicators"]: + self.XY_dict[key] = pd.Series(dtype=float) self.XY_dict["conflict_t_min_1"] = pd.Series(dtype=bool) self.XY_dict["conflict_t_min_1_nb"] = pd.Series(dtype=float) self.XY_dict["conflict"] = pd.Series(dtype=bool) @@ -44,45 +43,33 @@ def create_XY( root_dir: click.Path, polygon_gdf: gpd.GeoDataFrame, conflict_gdf: gpd.GeoDataFrame, - ) -> Tuple[np.array, np.array]: + ) -> Tuple[np.ndarray, np.ndarray]: """Top-level function to create the X-array and Y-array. If the XY-data was pre-computed and specified in cfg-file, the data is loaded. If not, variable values and conflict data are read from file and stored in array. The resulting array is by default saved as npy-format to file. Args: - config (ConfigParser-object): object containing the parsed configuration-settings of the model. out_dir (str): path to output folder. root_dir (str): path to location of cfg-file. polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons. conflict_gdf (geo-dataframe): geo-dataframe containing the selected conflicts. Returns: - array: X-array containing variable values. - array: Y-array containing conflict data. + np.ndarray: X-array containing variable values. + np.ndarray: Y-array containing conflict data. """ - # if nothing is specified in cfg-file, then initiate and fill XY data from scratch - if self.config.get("pre_calc", "XY") != " ": - self._initiate_XY_data() - # fill the dictionary and get array - XY_arr = _fill_XY( - self.XY_dict, self.config, root_dir, conflict_gdf, polygon_gdf, out_dir - ) - # save array to XY.npy out_dir - click.echo( - f"Saving XY data by default to file {os.path.join(out_dir, 'XY.npy')}." - ) - np.save(os.path.join(out_dir, "XY"), XY_arr) - # if path to XY.npy is specified, read the data intead - else: - click.echo( - f"Loading XY data from file {os.path.join(root_dir, self.config.get('pre_calc', 'XY'))}." 
- ) - XY_arr = np.load( - os.path.join(root_dir, self.config.get("pre_calc", "XY")), - allow_pickle=True, - ) + self._initiate_XY_data() + # fill the dictionary and get array + XY_arr = _fill_XY( + self.XY_dict, self.config, root_dir, conflict_gdf, polygon_gdf, out_dir + ) + # save array to XY.npy out_dir + click.echo( + f"Saving XY data by default to file {os.path.join(out_dir, 'XY.npy')}." + ) + np.save(os.path.join(out_dir, "XY"), XY_arr) # split the XY data into sample data X and target values Y X, Y = _split_XY_data(XY_arr) @@ -90,41 +77,41 @@ def create_XY( return X, Y -def initiate_X_data(config: RawConfigParser) -> dict: - """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data. - This is needed for each time step of each projection run. - By default, the first column is for the polygon ID and the second for polygon geometry. - The penultimate column is for boolean information about conflict at t-1 - while the last column is for boolean information about conflict at t-1 in neighboring polygons. - All remaining columns correspond to the variables provided in the cfg-file. +# def initiate_X_data(config: RawConfigParser) -> dict: +# """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data. +# This is needed for each time step of each projection run. +# By default, the first column is for the polygon ID and the second for polygon geometry. +# The penultimate column is for boolean information about conflict at t-1 +# while the last column is for boolean information about conflict at t-1 in neighboring polygons. +# All remaining columns correspond to the variables provided in the cfg-file. - Args: - config (RawConfigParser): object containing the parsed configuration-settings of the model. +# Args: +# config (RawConfigParser): object containing the parsed configuration-settings of the model. - Returns: - dict: emtpy dictionary to be filled, containing keys for each variable (X) plus meta-data. - """ +# Returns: +# dict: emtpy dictionary to be filled, containing keys for each variable (X) plus meta-data. +# """ - # Initialize dictionary - # some entries are set by default, besides the ones corresponding to input data variables - X = {} - X["poly_ID"] = pd.Series() - X["poly_geometry"] = pd.Series() - for key in config.items("data"): - X[str(key[0])] = pd.Series(dtype=float) - X["conflict_t_min_1"] = pd.Series(dtype=bool) - X["conflict_t_min_1_nb"] = pd.Series(dtype=float) +# # Initialize dictionary +# # some entries are set by default, besides the ones corresponding to input data variables +# X = {} +# X["poly_ID"] = pd.Series() +# X["poly_geometry"] = pd.Series() +# for key in config.items("data"): +# X[str(key[0])] = pd.Series(dtype=float) +# X["conflict_t_min_1"] = pd.Series(dtype=bool) +# X["conflict_t_min_1_nb"] = pd.Series(dtype=float) - click.echo("The columns in the sample matrix used are:") - for key in X: - click.echo(f"...{key}") +# click.echo("The columns in the sample matrix used are:") +# for key in X: +# click.echo(f"...{key}") - return X +# return X def fill_X_sample( X: dict, - config: RawConfigParser, + config: dict, root_dir: str, polygon_gdf: gpd.GeoDataFrame, proj_year: int, @@ -136,7 +123,7 @@ def fill_X_sample( Args: X (dict): dictionary containing keys to be sampled. - config (RawConfigParser): object containing the parsed configuration-settings of the model. + config (dict): Parsed configuration-settings of the model. root_dir (str): path to location of cfg-file of reference run. 
polygon_gdf (gpd.GeoDataFrame): geo-dataframe containing the selected polygons. proj_year (int): year for which projection is made. @@ -173,9 +160,9 @@ def fill_X_sample( nc_ds = xr.open_dataset( os.path.join( root_dir, - config.get("general", "input_dir"), - config.get("data", key), - ).rsplit(",")[0] + config["general"]["input_dir"], + config["data"]["indicators"][key]["file"], + ) ) if (np.dtype(nc_ds.time) == np.float32) or ( @@ -205,8 +192,8 @@ def fill_X_sample( "This file has an unsupported dtype for the time variable: {}".format( os.path.join( root_dir, - config.get("general", "input_dir"), - config.get("data", key), + config["general"]["input_dir"], + config["data"]["indicators"][key]["file"], ) ) ) @@ -264,7 +251,7 @@ def fill_X_conflict( def _fill_XY( # noqa: R0912 XY: dict, - config: RawConfigParser, + config: dict, root_dir: click.Path, conflict_data: gpd.GeoDataFrame, polygon_gdf: gpd.GeoDataFrame, @@ -276,19 +263,19 @@ def _fill_XY( # noqa: R0912 Args: XY (dict): initiated, i.e. empty, XY-dictionary - config (ConfigParser-object): object containing the parsed configuration-settings of the model. - root_dir (str): path to location of cfg-file. - conflict_data (geo-dataframe): geo-dataframe containing the selected conflicts. - polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons. - out_dir (path): path to output folder. + config (dict): Parsed configuration-settings of the model. + root_dir (str): Path to location of cfg-file. + conflict_data (gpd.GeoDataFrame): Geodataframe containing the selected conflicts. + polygon_gdf (gpd.GeoDataFrame): Geodataframe containing the selected polygons. + out_dir (path): Path to output folder. Returns: - array: filled array containing the variable values (X) and binary conflict data (Y) plus meta-data. + np.ndarray: Filled array containing the variable values (X) and binary conflict data (Y) plus meta-data. 
""" # go through all simulation years as specified in config-file model_period = np.arange( - config.getint("settings", "y_start"), config.getint("settings", "y_end") + 1, 1 + config["general"]["y_start"], config["general"]["y_end"] + 1, 1 ) click.echo(f"Reading data for period from {model_period[0]} to {model_period[-1]}.") @@ -297,7 +284,7 @@ def _fill_XY( # noqa: R0912 for (sim_year, i) in zip(model_period, range(len(model_period))): if i == 0: - click.echo(f"Skipping first year {sim_year} to start up model") + click.echo(f"Skipping first year {sim_year} to start up model.") else: click.echo(f"Entering year {sim_year}.") # go through all keys in dictionary @@ -360,13 +347,13 @@ def _fill_XY( # noqa: R0912 else: - nc_ds = xr.open_dataset( - os.path.join( - root_dir, - config.get("general", "input_dir"), - config.get("data", key), - ).rsplit(",")[0] + nc_fo = os.path.join( + root_dir, + config["general"]["input_dir"], + config["data"]["indicators"][key]["file"], ) + click.echo(f"Reading data for indicator {key} from {nc_fo}.") + nc_ds = xr.open_dataset(nc_fo) if (np.dtype(nc_ds.time) == np.float32) or ( np.dtype(nc_ds.time) == np.float64 From 8219d766b08e0649dc38df3993aa12208ff5493a Mon Sep 17 00:00:00 2001 From: JannisHoch Date: Sat, 24 Aug 2024 21:03:47 +0200 Subject: [PATCH 3/3] add KFold to GridSearchCV --- copro/machine_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/copro/machine_learning.py b/copro/machine_learning.py index 8dc5f21..6bf8641 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -6,7 +6,7 @@ from typing import Union, Tuple import click from pathlib import Path -from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import GridSearchCV, KFold from sklearn.ensemble import RandomForestClassifier @@ -304,7 +304,7 @@ def apply_gridsearchCV( grid_search = GridSearchCV( estimator=estimator, param_grid=param_grid, - cv=5, + cv=KFold(n_splits=5, shuffle=True, random_state=42), n_jobs=n_jobs, verbose=verbose, scoring="roc_auc",