From ecdc385ebe5bfa39d067c705db33e0b6a2d9bd34 Mon Sep 17 00:00:00 2001 From: Jannis Hoch <10956703+JannisHoch@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:06:34 +0100 Subject: [PATCH] Fix/projections (#201) * option for either Regression or Classification model * fine tuned selection of conflict data points * option to provide ML target variable * use simulation_name as output dir * data extraction from netCDF files in a separate function * target variable and estimator used determined in main script * should belong to previous commit * first step towards flexible target_vars when extracting conflict (Y) data * added class docstrings * constructing X and Y data as dataframes instead of arrays * no log-scale support for now, more consistent treatment of polygons w/o feature data * removing all polygons with 1 or more NaNs * fully pd.dataframe support implemented * save only selected conflicts which fall in simulation period * finetuning print output * no random state set for Kfold in GridSearchCV to ensure all n models are fitted on different data * better handling of cores via command line * remove content of output dir to avoid conflicts with expected files * settings parsed for projections with new yaml file structure * udpated docstring for load_estimators * fixed definition of projection period * reinitated initiate_X_data function * saving files as GPKG instead GeoJSON * no output for None output from rasterstats * make run_prediction() work * apply isort * no ML target var for nwo * fix Geopandas driver * corrected number of function arguments --- copro/conflict.py | 14 +- copro/evaluation.py | 7 +- copro/io.py | 14 +- copro/machine_learning.py | 21 +- copro/models.py | 93 ++++---- copro/nb.py | 4 +- copro/plots.py | 91 ++++---- copro/scripts/copro_runner.py | 9 +- copro/scripts/postprocessing/avg_over_time.py | 202 ++++++++++++------ copro/scripts/postprocessing/geojson2gif.py | 108 ++++++---- .../postprocessing/plot_polygon_vals.py | 75 ++++--- .../postprocessing/plot_value_over_time.py | 11 +- copro/selection.py | 18 +- copro/settings.py | 29 +-- copro/utils.py | 10 +- copro/variables.py | 18 +- copro/xydata.py | 126 ++++------- 17 files changed, 463 insertions(+), 387 deletions(-) diff --git a/copro/conflict.py b/copro/conflict.py index bb80283b..aef2f262 100644 --- a/copro/conflict.py +++ b/copro/conflict.py @@ -1,13 +1,15 @@ -from copro import utils, nb +import os +import warnings from configparser import RawConfigParser from pathlib import Path -from typing import Union, Literal -import pandas as pd +from typing import Literal, Union + +import click import geopandas as gpd import numpy as np -import os -import click -import warnings +import pandas as pd + +from copro import nb, utils def conflict_in_year_bool( diff --git a/copro/evaluation.py b/copro/evaluation.py index c4469ca9..20e98fed 100644 --- a/copro/evaluation.py +++ b/copro/evaluation.py @@ -1,8 +1,9 @@ -from sklearn import metrics -import pandas as pd -import geopandas as gpd from typing import Union +import geopandas as gpd +import pandas as pd +from sklearn import metrics + def init_out_dict(scores: Union[list[str], None] = None) -> dict: """Initiates the main model evaluatoin dictionary for a range of model metric scores. diff --git a/copro/io.py b/copro/io.py index d1c22134..22bd5549 100644 --- a/copro/io.py +++ b/copro/io.py @@ -1,9 +1,11 @@ -import pandas as pd -import numpy as np -from typing import Union -from pathlib import Path import os +import shutil +from pathlib import Path +from typing import Union + import click +import numpy as np +import pandas as pd def make_and_collect_output_dirs( @@ -27,6 +29,10 @@ def make_and_collect_output_dirs( ) click.echo(f"Saving output to main output folder {out_dir}.") + # Check if out_dir exists and delete its contents if it does + if os.path.exists(out_dir): + shutil.rmtree(out_dir) + # initalize list for all out-dirs all_out_dirs = [] # create reference output dir '_REF' under main output dir diff --git a/copro/machine_learning.py b/copro/machine_learning.py index 805a1e9a..d7f06aa6 100644 --- a/copro/machine_learning.py +++ b/copro/machine_learning.py @@ -1,11 +1,12 @@ import os import pickle -import pandas as pd -import numpy as np -from sklearn import ensemble, preprocessing, model_selection, inspection -from typing import Union, Tuple -import click from pathlib import Path +from typing import Tuple, Union + +import click +import numpy as np +import pandas as pd +from sklearn import ensemble, inspection, model_selection, preprocessing from sklearn.model_selection import GridSearchCV, KFold @@ -151,8 +152,8 @@ def fit_predict( def load_estimators(config: dict, out_dir: str) -> list[str]: - """Loads the paths to all previously fitted classifiers to a list. - Classifiers were saved to file in fit_predict(). + """Loads the paths to all previously fitted estimators to a list. + Estimators were saved to file in `fit_predict()`. With this list, the classifiers can be loaded again during projections. Args: @@ -160,14 +161,14 @@ def load_estimators(config: dict, out_dir: str) -> list[str]: out_dir (path): path to output folder. Returns: - list: list with file names of classifiers. + list: list with file names of estimators. """ estimators = os.listdir(os.path.join(out_dir, "estimators")) if len(estimators) != config["machine_learning"]["n_runs"]: raise ValueError( - "Number of loaded classifiers does not match the specified number of runs in cfg-file!" + "Number of loaded estimators does not match the specified number of runs in reference yaml-file!" ) return estimators @@ -326,7 +327,7 @@ def apply_gridsearchCV( grid_search = GridSearchCV( estimator=estimator, param_grid=param_grid, - cv=KFold(n_splits=5, shuffle=True, random_state=42), + cv=KFold(n_splits=5, shuffle=True), n_jobs=n_jobs, verbose=verbose, scoring=scoring, diff --git a/copro/models.py b/copro/models.py index e8516563..92657480 100644 --- a/copro/models.py +++ b/copro/models.py @@ -1,15 +1,17 @@ -from copro import machine_learning, conflict, evaluation, utils, xydata, settings +import os +import pickle from configparser import RawConfigParser -from sklearn import ensemble -from sklearn.utils.validation import check_is_fitted from pathlib import Path -import pandas as pd -import numpy as np -from typing import Union, Tuple -import geopandas as gpd +from typing import Tuple, Union + import click -import os -import pickle +import geopandas as gpd +import numpy as np +import pandas as pd +from sklearn import ensemble +from sklearn.utils.validation import check_is_fitted + +from copro import conflict, evaluation, machine_learning, settings, utils, xydata class MainModel: @@ -47,6 +49,7 @@ def __init__( self.estimator = estimator self.out_dir = out_dir self.n_jobs = n_jobs + click.echo(f"Number of jobs to run in parallel: {n_jobs}.") self.verbose = verbose def run( @@ -179,26 +182,26 @@ def run_prediction( config_REF = main_dict["_REF"][0] out_dir_REF = main_dict["_REF"][1] - clfs, all_y_df = _init_prediction_run(config_REF, out_dir_REF) + estimators, all_y_df = _init_prediction_run(config_REF, out_dir_REF) # going through each projection specified - for each_key, _ in config_REF.items(): + for projection, _ in config_REF["projections"].items(): # get config-object and out-dir per projection - click.echo(f"Loading config-object for projection run: {each_key}.") - config_PROJ = main_dict[str(each_key)][0][0] - out_dir_PROJ = main_dict[str(each_key)][1] + click.echo(f"Loading config-object for projection run: {projection}.") + config_PROJ = main_dict[str(projection)][0][0] + out_dir_PROJ = main_dict[str(projection)][1] - click.echo(f"Storing output for this projections to folder {out_dir_PROJ}.") + click.echo(f"Storing output for this projection to folder {out_dir_PROJ}.") Path.mkdir( - Path(os.path.join(out_dir_PROJ, "clfs")), parents=True, exist_ok=True + Path(os.path.join(out_dir_PROJ, "estimators")), + parents=True, + exist_ok=True, ) # get projection period for this projection # defined as all years starting from end of reference run until specified end of projections - projection_period = settings.determine_projection_period( - config_REF, config_PROJ - ) + projection_period = settings.determine_projection_period(config_REF) # for this projection, go through all years for i, proj_year in enumerate(projection_period): @@ -219,7 +222,7 @@ def run_prediction( out_dir_REF, "files", "conflicts_in_{}.csv".format( - config_REF.getint("settings", "y_end") + config_REF["general"]["y_end"] ), ) ) @@ -229,7 +232,7 @@ def run_prediction( out_dir_REF, "files", "conflicts_in_{}.csv".format( - config_REF.getint("settings", "y_end") + config_REF["general"]["y_end"] ), ), index_col=0, @@ -242,35 +245,37 @@ def run_prediction( y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"]) # now load all classifiers created in the reference run - for clf in clfs: + for estimator in estimators: # creating an individual output folder per classifier if not os.path.isdir( os.path.join( os.path.join( out_dir_PROJ, - "clfs", - str(clf).rsplit(".", maxsplit=1)[0], + "estimators", + str(estimator).rsplit(".", maxsplit=1)[0], ) ) ): os.makedirs( os.path.join( out_dir_PROJ, - "clfs", - str(clf).rsplit(".", maxsplit=1)[0], + "estimators", + str(estimator).rsplit(".", maxsplit=1)[0], ) ) # load the pickled objects # TODO: keep them in memory, i.e. after reading the clfs-folder above - with open(os.path.join(out_dir_REF, "clfs", clf), "rb") as f: + with open( + os.path.join(out_dir_REF, "estimators", estimator), "rb" + ) as f: click.echo( - "Loading classifier {} from {}".format( - clf, os.path.join(out_dir_REF, "clfs") + "Loading estimator {} from {}".format( + estimator, os.path.join(out_dir_REF, "estimators") ) ) - clf_obj = pickle.load(f) + estimator_obj = pickle.load(f) # for all other projection years than the first one, # we need to read projected conflict from the previous projection year @@ -279,8 +284,8 @@ def run_prediction( "Reading previous conflicts from file {}".format( os.path.join( out_dir_PROJ, - "clfs", - str(clf), + "estimators", + str(estimator), "projection_for_{}.csv".format(proj_year - 1), ) ) @@ -288,8 +293,8 @@ def run_prediction( conflict_data = pd.read_csv( os.path.join( out_dir_PROJ, - "clfs", - str(clf).rsplit(".", maxsplit=1)[0], + "estimators", + str(estimator).rsplit(".", maxsplit=1)[0], "projection_for_{}.csv".format(proj_year - 1), ), index_col=0, @@ -307,15 +312,15 @@ def run_prediction( # here the data will be used to make projections with various classifiers # returns the prediction based on one individual classifier y_df_clf = machine_learning.predictive( - X, clf_obj, self.scaler_all_data + X, estimator_obj, self.scaler_all_data ) # storing the projection per clf to be used in the following timestep y_df_clf.to_csv( os.path.join( out_dir_PROJ, - "clfs", - str(clf).rsplit(".", maxsplit=1)[0], + "estimators", + str(estimator).rsplit(".", maxsplit=1)[0], "projection_for_{}.csv".format(proj_year), ) ) @@ -333,8 +338,9 @@ def run_prediction( y_df, global_df, make_proj=True ) gdf_hit.to_file( - os.path.join(out_dir_PROJ, f"output_in_{proj_year}.geojson"), - driver="GeoJSON", + os.path.join(out_dir_PROJ, f"output_in_{proj_year}.gpkg"), + driver="GPKG", + crs="EPSG:4326", ) # create one major output dataframe containing all output for all projections with all classifiers @@ -346,7 +352,7 @@ def run_prediction( def _init_prediction_run( config_REF: RawConfigParser, out_dir_REF: str ) -> Tuple[list, pd.DataFrame]: - """Initializes the prediction run by loading all classifiers created in the reference run. + """Initializes the prediction run by loading all estimators created in the reference run. Also initiates an empty dataframe to store the predictions. Args: @@ -354,12 +360,13 @@ def _init_prediction_run( out_dir_REF (str): Output directory for reference run. Returns: - Tuple[list, pd.DataFrame]: List with classifiers and initiated empty dataframe for predictions. + list: List with estimators. + pd.DataFrame: Initiated empty dataframe for predictions. """ - clfs = machine_learning.load_clfs(config_REF, out_dir_REF) + estimators = machine_learning.load_estimators(config_REF, out_dir_REF) # initiate output dataframe all_y_df = pd.DataFrame(columns=["ID", "geometry", "y_pred"]) - return clfs, all_y_df + return estimators, all_y_df diff --git a/copro/nb.py b/copro/nb.py index 08b6a39d..df67cbc7 100644 --- a/copro/nb.py +++ b/copro/nb.py @@ -1,7 +1,7 @@ import click -import pandas as pd -import numpy as np import geopandas as gpd +import numpy as np +import pandas as pd def neighboring_polys( diff --git a/copro/plots.py b/copro/plots.py index 568a63b0..1b89bb96 100644 --- a/copro/plots.py +++ b/copro/plots.py @@ -1,14 +1,10 @@ -import matplotlib -matplotlib.use('Agg') import matplotlib.pyplot as plt -import geopandas as gpd import seaborn as sns -sns.set_palette('colorblind') -import numpy as np -import os, sys -from sklearn import metrics from copro import evaluation +sns.set_palette("colorblind") + + def selected_polygons(polygon_gdf, **kwargs): """Creates a plotting instance of the boundaries of all selected polygons. @@ -19,13 +15,14 @@ def selected_polygons(polygon_gdf, **kwargs): Geopandas-supported keyword arguments. Returns: - ax: Matplotlib axis object. - """ + ax: Matplotlib axis object. + """ ax = polygon_gdf.boundary.plot(**kwargs) return ax + def selected_conflicts(conflict_gdf, **kwargs): """Creates a plotting instance of the best casualties estimates of the selected conflicts. @@ -36,13 +33,14 @@ def selected_conflicts(conflict_gdf, **kwargs): Geopandas-supported keyword arguments. Returns: - ax: Matplotlib axis object. - """ + ax: Matplotlib axis object. + """ - ax = conflict_gdf.plot(column='best', **kwargs) + ax = conflict_gdf.plot(column="best", **kwargs) return ax + def metrics_distribution(out_dict, metrics, **kwargs): """Plots the value distribution of a range of evaluation metrics based on all model simulations. @@ -53,19 +51,27 @@ def metrics_distribution(out_dict, metrics, **kwargs): Matplotlib-supported keyword arguments. Returns: - ax: Matplotlib axis object. - """ + ax: Matplotlib axis object. + """ - fig, ax = plt.subplots(1, 1, **kwargs) + _, ax = plt.subplots(1, 1, **kwargs) - for metric, color in zip(metrics, sns.color_palette('colorblind')): + for metric, color in zip(metrics, sns.color_palette("colorblind")): - sns.histplot(out_dict[str(metric)], ax=ax, kde=True, stat='density', color=color, label=str(metric)) + sns.histplot( + out_dict[str(metric)], + ax=ax, + kde=True, + stat="density", + color=color, + label=str(metric), + ) plt.legend() return ax + def correlation_matrix(df, **kwargs): """Plots the correlation matrix of a dataframe. @@ -76,15 +82,16 @@ def correlation_matrix(df, **kwargs): Seaborn-supported keyword arguments. Returns: - ax: Matplotlib axis object. - """ + ax: Matplotlib axis object. + """ df_corr = evaluation.calc_correlation_matrix(df) ax = sns.heatmap(df_corr, **kwargs) - + return ax + def plot_ROC_curve_n_times(ax, clf, X_test, y_test, tprs, aucs, mean_fpr, **kwargs): """Plots the ROC-curve per model simulation to a pre-initiated matplotlib-instance. @@ -99,22 +106,12 @@ def plot_ROC_curve_n_times(ax, clf, X_test, y_test, tprs, aucs, mean_fpr, **kwar Returns: list: lists with true positive rates and area-under-curve values per plot. - """ + """ - raise DeprecationWarning('Plotting API in sklearn is changed, function needs updating.') - - viz = metrics.plot_roc_curve(clf, X_test, y_test, ax=ax, - alpha=0.15, color='b', lw=1, label=None, **kwargs) - - # rfc_disp = metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax, - # alpha=0.15, color='b', lw=1, label=None, **kwargs) + raise NotImplementedError( + "Plotting API in sklearn is changed, function needs updating and reimplementation." + ) - interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr) - interp_tpr[0] = 0.0 - tprs.append(interp_tpr) - aucs.append(viz.roc_auc) - - return tprs, aucs def plot_ROC_curve_n_mean(ax, tprs, aucs, mean_fpr, **kwargs): """Plots the mean ROC-curve to a pre-initiated matplotlib-instance. @@ -124,24 +121,8 @@ def plot_ROC_curve_n_mean(ax, tprs, aucs, mean_fpr, **kwargs): tprs (list): list with false positive rates. aucs (list): list with area-under-curve values. mean_fpr (array): array with mean false positive rate. - """ - - raise DeprecationWarning('Plotting API in sklearn is changed, function needs updating.') - - mean_tpr = np.mean(tprs, axis=0) - mean_tpr[-1] = 1.0 - mean_auc = metrics.auc(mean_fpr, mean_tpr) - std_auc = np.std(aucs) - ax.plot(mean_fpr, mean_tpr, color='r', - label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), - lw=2, alpha=.8, **kwargs) - - std_tpr = np.std(tprs, axis=0) - tprs_upper = np.minimum(mean_tpr + std_tpr, 1) - tprs_lower = np.maximum(mean_tpr - std_tpr, 0) - ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=None, **kwargs) - - ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], **kwargs) - - ax.legend(loc="lower right") - + """ + + raise NotImplementedError( + "Plotting API in sklearn is changed, function needs updating and reimplementation." + ) diff --git a/copro/scripts/copro_runner.py b/copro/scripts/copro_runner.py index 4a581505..bc9717d1 100644 --- a/copro/scripts/copro_runner.py +++ b/copro/scripts/copro_runner.py @@ -1,9 +1,10 @@ -from copro import settings, selection, evaluation, io, models, xydata +import os import click import numpy as np import pandas as pd -import os + +from copro import evaluation, io, models, selection, settings, xydata @click.command() @@ -13,7 +14,7 @@ "-c", type=int, default=5, - help="Number of jobs to run in parallel. Default is 0.", + help="Number of jobs to run in parallel. Default is 5.", ) @click.option( "--verbose", @@ -100,7 +101,7 @@ def cli(cfg: click.Path, cores: int, verbose: int): # - create accuracy values per polygon and save to output folder gdf_hit = evaluation.polygon_model_accuracy(out_y_df, global_df) gdf_hit.to_file( - os.path.join(out_dir_REF, "output_for_REF.geojson"), driver="GeoJSON" + os.path.join(out_dir_REF, "output_for_REF.gpkg"), driver="GPKG", crs="EPSG:4326" ) click.echo(click.style("\nINFO: reference run succesfully finished\n", fg="cyan")) diff --git a/copro/scripts/postprocessing/avg_over_time.py b/copro/scripts/postprocessing/avg_over_time.py index deb09f87..f9eb5b5d 100644 --- a/copro/scripts/postprocessing/avg_over_time.py +++ b/copro/scripts/postprocessing/avg_over_time.py @@ -1,25 +1,46 @@ -import click import glob -import numpy as np +import os + +import click import geopandas as gpd -import pandas as pd import matplotlib.pyplot as plt -from copro import utils, evaluation -import os, sys +import numpy as np +import pandas as pd + +from copro import utils +from pathlib import Path + + +def check_year_settings(start_year, end_year): + + # check if start/end time settings are consistent + if ((start_year is not None) and (end_year is None)) or ( + (end_year is not None) and (start_year is None) + ): + raise ValueError( + "ERROR: if start or end year is specified, the pendant must be specified too!" + ) + @click.command() -@click.option('-t0', '--start-year', type=int) -@click.option('-t1', '--end-year', type=int) -@click.option('-c', '--column', help='column name', default='chance_of_conflict', type=str) -@click.option('--geojson/--no-geojson', help='save output to geojson or not', default=False) -@click.option('--png/--no-png', help='save output to png or not', default=True) -@click.option('--verbose/--no-verbose', help='verbose on/off', default=False) -@click.argument('input_dir', type=click.Path()) -@click.argument('output_dir',type=click.Path()) -@click.argument('selected_polygons', type=click.Path()) - -def main(input_dir=None, output_dir=None, selected_polygons=None, start_year=None, end_year=None, geojson=None, png=None, column=None, verbose=None): - """Post-processing script to calculate average model output over a user-specifeid period or all output geoJSON-files stored in input-dir. +@click.option("-t0", "--start-year", type=int) +@click.option("-t1", "--end-year", type=int) +@click.option( + "-c", "--column", help="column name", default="chance_of_conflict", type=str +) +@click.argument("input_dir", type=click.Path()) +@click.argument("output_dir", type=click.Path()) +@click.argument("selected_polygons", type=click.Path()) +def main( + input_dir=None, + output_dir=None, + selected_polygons=None, + start_year=None, + end_year=None, + column=None, +): + """Post-processing script to calculate average model output over a user-specifeid period + or all output geoJSON-files stored in input-dir. Computed average values can be outputted as geoJSON-file or png-file or both. Args: @@ -30,44 +51,53 @@ def main(input_dir=None, output_dir=None, selected_polygons=None, start_year=Non Output: geoJSON-file with average column value per polygon (if geojson is set). png-file with plot of average column value per polygon (if png is set) - """ + """ # check if start/end time settings are consistent - if ((start_year != None) and (end_year == None)) or ((end_year != None) and (start_year == None)): - raise ValueError('ERROR: if start or end year is specified, the pendant must be specified too!') + check_year_settings(start_year, end_year) # read a shp-file with geometries of all selected polygons - click.echo('\nreading shp-file with all polygons from {}'.format(os.path.abspath(selected_polygons))) + click.echo( + "\nreading shp-file with all polygons from {}".format( + os.path.abspath(selected_polygons) + ) + ) selected_polygons_gdf = gpd.read_file(os.path.abspath(selected_polygons)) # create dataframe global_df = utils.global_ID_geom_info(selected_polygons_gdf) # find all geojson-files in input-dir input_dir = os.path.abspath(input_dir) - click.echo('getting geojson-files from {}'.format(input_dir)) - all_files = sorted(glob.glob(os.path.join(input_dir, '*.geojson'))) + click.echo("getting geojson-files from {}".format(input_dir)) + all_files = sorted(glob.glob(os.path.join(input_dir, "*.geojson"))) # if specific start/end time is specified, find only those geojson-files for specified period - if (start_year != None) and (end_year != None): + if (start_year is not None) and (end_year is not None): # define period between start and ent time - period = np.arange(start_year, end_year+1, 1) - click.echo('using all geojson-files for years {} to {}'.format(period[0], period[-1])) + period = np.arange(start_year, end_year + 1, 1) + click.echo( + "using all geojson-files for years {} to {}".format(period[0], period[-1]) + ) # creating suffix for file saving later - suffix = '{}_to_{}'.format(period[0], period[-1]) + suffix = "{}_to_{}".format(period[0], period[-1]) # initiate empty list for selected geojson-files selected_files = [] # select for fo in all_files: # if the year-suffix of geojson-file matches year in period, add to list - year = int(str(str(os.path.basename(fo)).rsplit('.')[0]).rsplit('_')[-1]) + year = int( + str(str(os.path.basename(fo)).rsplit(".", maxsplit=1)[0]).rsplit( + "_", maxsplit=1 + )[-1] + ) if year in period: - if verbose: print('adding to selection') + print("adding to selection") selected_files.append(fo) # if not end/start time is specified, use all geojson-files in input-dir else: - click.echo('using all geojson-files in input-dir') + click.echo("using all geojson-files in input-dir") # also here, create suffix for file saving laster - suffix = 'all_years' + suffix = "all_years" selected_files = all_files # initiatie empyt dataframe for collecting all annual output @@ -76,57 +106,91 @@ def main(input_dir=None, output_dir=None, selected_polygons=None, start_year=Non # go through all geojson-files left over after selection step for geojson in selected_files: # read files and convert to datatrame - if verbose: click.echo('reading file {}'.format(geojson)) - gdf = gpd.read_file(geojson, driver='GeoJSON') + click.echo("reading file {}".format(geojson)) + gdf = gpd.read_file(geojson, driver="GeoJSON") df = pd.DataFrame(gdf) - # append to dataframe + # append to dataframe y_df = y_df.append(df, ignore_index=True) - + # initiate dataframe for time-averaged output - click.echo('creating one output dataframe from all geojson-files') + click.echo("creating one output dataframe from all geojson-files") y_out = pd.DataFrame() # get all unique IDs of polygons - y_out['ID'] = y_df.ID.unique() - click.echo('reading from column {}'.format(column)) - if column == 'chance_of_conflict': + y_out["ID"] = y_df.ID.unique() + click.echo("reading from column {}".format(column)) + if column == "chance_of_conflict": # add number of predictiosn made over all selected years - y_out = pd.merge(y_out, y_df.nr_predictions.groupby(y_df.ID).sum().to_frame(), on='ID') + y_out = pd.merge( + y_out, y_df.nr_predictions.groupby(y_df.ID).sum().to_frame(), on="ID" + ) # add number of predicted conflicts over all selected years - y_out = pd.merge(y_out, y_df.nr_predicted_conflicts.groupby(y_df.ID).sum().to_frame(), on='ID') + y_out = pd.merge( + y_out, + y_df.nr_predicted_conflicts.groupby(y_df.ID).sum().to_frame(), + on="ID", + ) # determine chance of conflict over all selected years y_out[column] = y_out.nr_predicted_conflicts / y_out.nr_predictions - elif column == 'avg_prob_1': - y_out = pd.merge(y_out, pd.to_numeric(y_df[column]).groupby(y_df.ID).mean().to_frame(), on='ID') + elif column == "avg_prob_1": + y_out = pd.merge( + y_out, + pd.to_numeric(y_df[column]).groupby(y_df.ID).mean().to_frame(), + on="ID", + ) else: - raise ValueError('ERROR: column {} is not yet supported'.format(column)) + raise ValueError("ERROR: column {} is not yet supported".format(column)) # add geometry informatoin for each polygon - y_out = pd.merge(y_out, global_df, on='ID', how='left') + y_out = pd.merge(y_out, global_df, on="ID", how="left") - if not os.path.isdir(os.path.abspath(output_dir)): - click.echo('creating output folder {}'.format(os.path.abspath(output_dir))) - os.makedirs(os.path.abspath(output_dir)) + Path(os.path.abspath(output_dir)).mkdir(parents=True, exist_ok=True) # convert to geo-dataframe gdf_out = gpd.GeoDataFrame(y_out, geometry=y_out.geometry) - # if specified, save as geojson-file to output-dir - if geojson: - click.echo('saving to {}'.format(os.path.abspath(os.path.join(output_dir, '{}_merged_{}.geojson'.format(column, suffix))))) - gdf_out.to_file(os.path.abspath(os.path.join(output_dir, '{}_merged_{}.geojson'.format(column, suffix))), driver='GeoJSON') - - # if specified, save as png-file to output-dir - if png: - fig, ax = plt.subplots(1, 1) - gdf_out.plot(column=column, ax=ax, - cmap='Reds', - vmin=0, vmax=1, - legend=True, - legend_kwds={'label': column, 'orientation': "vertical"}) - - click.echo('saving to {}'.format(os.path.abspath(os.path.join(output_dir, '{}_merged_{}.png'.format(column, suffix))))) - plt.savefig(os.path.abspath(os.path.join(output_dir, '{}_merged_{}.png'.format(column, suffix))), dpi=300, bbox_inches='tight') - plt.close() - -if __name__ == '__main__': - - main() \ No newline at end of file + # save as geojson-file to output-dir + click.echo( + "saving to {}".format( + os.path.abspath( + os.path.join(output_dir, "{}_merged_{}.geojson".format(column, suffix)) + ) + ) + ) + gdf_out.to_file( + os.path.abspath( + os.path.join(output_dir, "{}_merged_{}.geojson".format(column, suffix)) + ), + driver="GeoJSON", + ) + + # save as png-file to output-dir + _, ax = plt.subplots(1, 1) + gdf_out.plot( + column=column, + ax=ax, + cmap="Reds", + vmin=0, + vmax=1, + legend=True, + legend_kwds={"label": column, "orientation": "vertical"}, + ) + + click.echo( + "saving to {}".format( + os.path.abspath( + os.path.join(output_dir, "{}_merged_{}.png".format(column, suffix)) + ) + ) + ) + plt.savefig( + os.path.abspath( + os.path.join(output_dir, "{}_merged_{}.png".format(column, suffix)) + ), + dpi=300, + bbox_inches="tight", + ) + plt.close() + + +if __name__ == "__main__": + + main() diff --git a/copro/scripts/postprocessing/geojson2gif.py b/copro/scripts/postprocessing/geojson2gif.py index 82bf6e38..c8c07b93 100644 --- a/copro/scripts/postprocessing/geojson2gif.py +++ b/copro/scripts/postprocessing/geojson2gif.py @@ -1,24 +1,37 @@ -import click import glob -import matplotlib.pyplot as plt -import geopandas as gpd +import os from shutil import rmtree + +import click +import geopandas as gpd +import matplotlib.pyplot as plt from PIL import Image -import os + @click.command() -@click.option('-c', '--column', help='column name', default='chance_of_conflict', type=str) -@click.option('-cmap', '--color-map', default='brg', type=str) -@click.option('-v0', '--minimum-value', default=0, type=float) -@click.option('-v1', '--maximum-value', default=1, type=float) -@click.option('--delete/--no-delete', help='whether or not to delete png-files', default=True) -@click.argument('input-dir', type=click.Path()) -@click.argument('output-dir', type=click.Path()) - -def main(input_dir=None, column=None, color_map=None, minimum_value=None, maximum_value=None, delete=None, output_dir=None): +@click.option( + "-c", "--column", help="column name", default="chance_of_conflict", type=str +) +@click.option("-cmap", "--color-map", default="brg", type=str) +@click.option("-v0", "--minimum-value", default=0, type=float) +@click.option("-v1", "--maximum-value", default=1, type=float) +@click.option( + "--delete/--no-delete", help="whether or not to delete png-files", default=True +) +@click.argument("input-dir", type=click.Path()) +@click.argument("output-dir", type=click.Path()) +def main( + input_dir=None, + column=None, + color_map=None, + minimum_value=None, + maximum_value=None, + delete=None, + output_dir=None, +): """Function to convert column values of all geoJSON-files in a directory into one GIF-file. The function provides several options to modify the design of the GIF-file. - The GIF-file is based on png-files of column value per geoJSON-file. + The GIF-file is based on png-files of column value per geoJSON-file. It is possible to keep these png-file as simple plots of values per time step. Args: @@ -31,53 +44,66 @@ def main(input_dir=None, column=None, color_map=None, minimum_value=None, maximu # get path to geoJSON-files input_dir = os.path.abspath(input_dir) - click.echo('\ngetting geojson-files from {}'.format(input_dir)) + click.echo("\ngetting geojson-files from {}".format(input_dir)) # create folder where intermediate png-files are stored - png_dir = os.path.join(output_dir, 'png') - click.echo('creating png-folder {}'.format(png_dir)) + png_dir = os.path.join(output_dir, "png") + click.echo("creating png-folder {}".format(png_dir)) if not os.path.isdir(png_dir): os.mkdir(png_dir) # collect all geoJSON-files - all_files = glob.glob(os.path.join(input_dir, '*.geojson')) - + all_files = glob.glob(os.path.join(input_dir, "*.geojson")) + # plot values per geoJSON-file - click.echo('plotting column {}'.format(column)) + click.echo("plotting column {}".format(column)) for geojson in all_files: - click.echo('reading file {}'.format(geojson)) + click.echo("reading file {}".format(geojson)) # read geoJSON-file - gdf = gpd.read_file(geojson, driver='GeoJSON') + gdf = gpd.read_file(geojson, driver="GeoJSON") # retrieve year information from filename of geoJSON-file - year = int(str(str(os.path.basename(geojson)).rsplit('.')[0]).rsplit('_')[-1]) + year = int( + str(str(os.path.basename(geojson)).rsplit(".", maxsplit=1)[0]).rsplit( + "_", maxsplit=1 + )[-1] + ) - fig, ax = plt.subplots(1, 1) - gdf.plot(column=column, - ax=ax, - cmap=color_map, - vmin=minimum_value, vmax=maximum_value, - legend=True, - legend_kwds={'label': str(column), - 'orientation': "vertical"}) + _, ax = plt.subplots(1, 1) + gdf.plot( + column=column, + ax=ax, + cmap=color_map, + vmin=minimum_value, + vmax=maximum_value, + legend=True, + legend_kwds={"label": str(column), "orientation": "vertical"}, + ) ax.set_title(str(year)) # save plot - click.echo('saving plot to png-folder') - plt.savefig(os.path.join(png_dir, 'plt_{}_{}.png'.format(column, year)), dpi=300, bbox_inches='tight') - + click.echo("saving plot to png-folder") + plt.savefig( + os.path.join(png_dir, "plt_{}_{}.png".format(column, year)), + dpi=300, + bbox_inches="tight", + ) + # create GIF and save - click.echo('creating GIF from saved plots') + click.echo("creating GIF from saved plots") # based on: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#gif - fp_in = os.path.join(png_dir, '*_{}_*.png'.format(column)) - fp_out = os.path.join(output_dir, '{}_over_time.gif'.format(column)) + fp_in = os.path.join(png_dir, "*_{}_*.png".format(column)) + fp_out = os.path.join(output_dir, "{}_over_time.gif".format(column)) img, *imgs = [Image.open(f) for f in sorted(glob.glob(fp_in))] - img.save(fp=fp_out, format='GIF', append_images=imgs, save_all=True, duration=500, loop=0) + img.save( + fp=fp_out, format="GIF", append_images=imgs, save_all=True, duration=500, loop=0 + ) # if specified, delete all (intermediate) png-files if delete: - click.echo('removing png-folder') + click.echo("removing png-folder") rmtree(png_dir) -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + + main() diff --git a/copro/scripts/postprocessing/plot_polygon_vals.py b/copro/scripts/postprocessing/plot_polygon_vals.py index f6a03a31..84c32d7a 100644 --- a/copro/scripts/postprocessing/plot_polygon_vals.py +++ b/copro/scripts/postprocessing/plot_polygon_vals.py @@ -1,19 +1,27 @@ +import os + import click -import matplotlib.pyplot as plt -import pandas as pd import geopandas as gpd -import os +import matplotlib.pyplot as plt + @click.command() -@click.argument('file-object', type=click.Path()) -@click.option('-c', '--column', help='column name') -@click.option('-t', '--title', help='title for plot and file_object name') -@click.option('-v0', '--minimum-value', default=0, type=float) -@click.option('-v1', '--maximum-value', default=1, type=float) -@click.option('-cmap', '--color-map', default='brg', type=str) -@click.argument('output-dir', type=click.Path()) - -def main(file_object=None, column=None, title=None, minimum_value=None, maximum_value=None, color_map=None, output_dir=None): +@click.argument("file-object", type=click.Path()) +@click.option("-c", "--column", help="column name") +@click.option("-t", "--title", help="title for plot and file_object name") +@click.option("-v0", "--minimum-value", default=0, type=float) +@click.option("-v1", "--maximum-value", default=1, type=float) +@click.option("-cmap", "--color-map", default="brg", type=str) +@click.argument("output-dir", type=click.Path()) +def main( + file_object=None, + column=None, + title=None, + minimum_value=None, + maximum_value=None, + color_map=None, + output_dir=None, +): """Quick and dirty function to plot the column values of a geojson file with minimum user input, and save plot. Mainly used for quick inspection of model output in specific years. @@ -27,28 +35,39 @@ def main(file_object=None, column=None, title=None, minimum_value=None, maximum_ # get absolute path to geoJSON-file fo = os.path.abspath(file_object) - click.echo('\nreading file_object {}'.format(fo)) + click.echo("\nreading file_object {}".format(fo)) # read file df = gpd.read_file(fo) # plot - click.echo('plotting column {}'.format(column)) - fig, ax = plt.subplots(1, 1) - df.plot(column=column, - ax=ax, - cmap=color_map, - vmin=minimum_value, vmax=maximum_value, - legend=True, - legend_kwds={'label': str(column), - 'orientation': "vertical"}) - if title != None: + click.echo("plotting column {}".format(column)) + _, ax = plt.subplots(1, 1) + df.plot( + column=column, + ax=ax, + cmap=color_map, + vmin=minimum_value, + vmax=maximum_value, + legend=True, + legend_kwds={"label": str(column), "orientation": "vertical"}, + ) + if title is not None: ax.set_title(str(title)) # save plot to file - file_object_name = str(os.path.basename(file_object)).rsplit('.')[0] - click.echo('saving plot to file_object {}'.format(os.path.abspath(os.path.join(output_dir, '{}.png'.format(file_object_name))))) - plt.savefig(os.path.abspath(os.path.join(output_dir, '{}.png'.format(file_object_name))), dpi=300, bbox_inches='tight') + file_object_name = str(os.path.basename(file_object)).rsplit(".", maxsplit=1)[0] + click.echo( + "saving plot to file_object {}".format( + os.path.abspath(os.path.join(output_dir, "{}.png".format(file_object_name))) + ) + ) + plt.savefig( + os.path.abspath(os.path.join(output_dir, "{}.png".format(file_object_name))), + dpi=300, + bbox_inches="tight", + ) + -if __name__ == '__main__': +if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/copro/scripts/postprocessing/plot_value_over_time.py b/copro/scripts/postprocessing/plot_value_over_time.py index 88301d55..5dea7054 100644 --- a/copro/scripts/postprocessing/plot_value_over_time.py +++ b/copro/scripts/postprocessing/plot_value_over_time.py @@ -1,13 +1,14 @@ -import click import glob -import matplotlib.pyplot as plt -import pandas as pd -import geopandas as gpd -import numpy as np import os import warnings from pathlib import Path +import click +import geopandas as gpd +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + def define_val(data_series: pd.Series, statistics: str) -> float: diff --git a/copro/selection.py b/copro/selection.py index ece39283..53b609b2 100644 --- a/copro/selection.py +++ b/copro/selection.py @@ -1,10 +1,12 @@ +import os +import warnings +from typing import Tuple + +import click import geopandas as gpd import pandas as pd -import os + from copro import utils -import click -from typing import Tuple -import warnings def select( @@ -40,13 +42,13 @@ def select( # save conflict data and polygon to shp-file conflict_gdf.to_file( - os.path.join(out_dir, "selected_conflicts.geojson"), - driver="GeoJSON", + os.path.join(out_dir, "selected_conflicts.gpkg"), + driver="GPKG", crs="EPSG:4326", ) extent_gdf.to_file( - os.path.join(out_dir, "selected_polygons.geojson"), - driver="GeoJSON", + os.path.join(out_dir, "selected_polygons.gpkg"), + driver="GPKG", crs="EPSG:4326", ) diff --git a/copro/settings.py b/copro/settings.py index 4155b2b0..30e428fc 100644 --- a/copro/settings.py +++ b/copro/settings.py @@ -1,13 +1,13 @@ -import click import os -import numpy as np -from configparser import RawConfigParser from shutil import copyfile from typing import Tuple, Union -from copro import utils, io -from sklearn import ensemble +import click +import numpy as np import yaml +from sklearn import ensemble + +from copro import io, utils def initiate_setup(settings_file: click.Path) -> Tuple[dict, str]: @@ -102,13 +102,17 @@ def _collect_simulation_settings(config: dict, root_dir: click.Path) -> dict: # first entry is config-object for reference run config_dict["_REF"] = config - if "PROJ_files" in config["general"].keys(): + if "projections" in config_dict["_REF"]: # loop through all keys and values in PROJ_files section of reference config-object - for (each_key, each_val) in config.items("PROJ_files"): + for (each_key, each_val) in config_dict["_REF"]["projections"].items(): # for each value (here representing the cfg-files of the projections), get the absolute path - each_val = os.path.abspath(os.path.join(root_dir, each_val)) + each_val = os.path.abspath( + os.path.join( + root_dir, config_dict["_REF"]["projections"][each_key]["file"] + ) + ) # parse each config-file specified each_config = _parse_settings(each_val) @@ -170,16 +174,13 @@ def define_target_var( return None -def determine_projection_period( - config_REF: RawConfigParser, config_PROJ: RawConfigParser -) -> list: +def determine_projection_period(config_REF: dict) -> list: """Determines the period for which projections need to be made. This is defined as the period between the end year of the reference run and the specified projection year for each projection. Args: - config_REF (RawConfigParser): model configuration-settings for the reference run. - config_PROJ (RawConfigParser): model configuration-settings for a projection run.. + config_REF (dict): model configuration-settings for the reference run. Returns: list: all years of the projection period. @@ -188,7 +189,7 @@ def determine_projection_period( # get all years of projection period projection_period = np.arange( config_REF["general"]["y_end"] + 1, - config_PROJ["general"]["y_proj"] + 1, + config_REF["projections"]["proj_2020_to_2023"]["y_end"] + 1, 1, ) # convert to list diff --git a/copro/utils.py b/copro/utils.py index e5055791..4e22a55e 100644 --- a/copro/utils.py +++ b/copro/utils.py @@ -1,10 +1,12 @@ -import geopandas as gpd -import pandas as pd -import numpy as np import os from datetime import date + import click -from copro import __version__, __author__, __email__ +import geopandas as gpd +import numpy as np +import pandas as pd + +from copro import __author__, __email__, __version__ def get_conflict_geodataframe( diff --git a/copro/variables.py b/copro/variables.py index d718a883..b9656689 100644 --- a/copro/variables.py +++ b/copro/variables.py @@ -1,12 +1,13 @@ -import xarray as xr -import rasterio as rio -import pandas as pd -import geopandas as gpd -import rasterstats as rstats -import numpy as np import os import warnings + import click +import geopandas as gpd +import numpy as np +import pandas as pd +import rasterio as rio +import rasterstats as rstats +import xarray as xr def nc_with_float_timestamp( @@ -201,11 +202,6 @@ def nc_with_continous_datetime_timestamp( # warn if result is NaN if val is None: - warnings.warn( - f"`None` computed for {config['data']['extent']['id']}" - f"{row[config['data']['extent']['id']]}, setting to `np.nan`!" - ) - val = np.nan list_out.append(val) diff --git a/copro/xydata.py b/copro/xydata.py index 10bde9c0..5ab7050c 100644 --- a/copro/xydata.py +++ b/copro/xydata.py @@ -1,11 +1,13 @@ -from copro import conflict, variables, nb, utils +import os from typing import Tuple, Union + import click +import geopandas as gpd import numpy as np -import xarray as xr import pandas as pd -import geopandas as gpd -import os +import xarray as xr + +from copro import conflict, nb, utils, variables class XYData: @@ -77,7 +79,7 @@ def create_XY( self.config, root_dir, conflict_gdf, - self.target_var, + # self.target_var, polygon_gdf, out_dir, ) @@ -95,36 +97,39 @@ def create_XY( return X, Y -# def initiate_X_data(config: RawConfigParser) -> dict: -# """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data. -# This is needed for each time step of each projection run. -# By default, the first column is for the polygon ID and the second for polygon geometry. -# The penultimate column is for boolean information about conflict at t-1 -# while the last column is for boolean information about conflict at t-1 in neighboring polygons. -# All remaining columns correspond to the variables provided in the cfg-file. +def initiate_X_data(config: dict) -> dict: + """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data. + This is needed for each time step of each projection run. + By default, the first column is for the polygon ID and the second for polygon geometry. + The penultimate column is for boolean information about conflict at t-1 + while the last column is for boolean information about conflict at t-1 in neighboring polygons. + All remaining columns correspond to the variables provided in the cfg-file. -# Args: -# config (RawConfigParser): object containing the parsed configuration-settings of the model. + ..todo:: + Can this be better aligned with the XYData-class? -# Returns: -# dict: emtpy dictionary to be filled, containing keys for each variable (X) plus meta-data. -# """ + Args: + config (dict): object containing the parsed configuration-settings of the model. -# # Initialize dictionary -# # some entries are set by default, besides the ones corresponding to input data variables -# X = {} -# X["poly_ID"] = pd.Series() -# X["poly_geometry"] = pd.Series() -# for key in config.items("data"): -# X[str(key[0])] = pd.Series(dtype=float) -# X["conflict_t_min_1"] = pd.Series(dtype=bool) -# X["conflict_t_min_1_nb"] = pd.Series(dtype=float) + Returns: + dict: emtpy dictionary to be filled, containing keys for each variable (X) plus meta-data. + """ -# click.echo("The columns in the sample matrix used are:") -# for key in X: -# click.echo(f"...{key}") + # Initialize dictionary + # some entries are set by default, besides the ones corresponding to input data variables + X = {} + X["poly_ID"] = pd.Series() + X["poly_geometry"] = pd.Series() + for key in config["data"]["indicators"].keys(): + X[key] = pd.Series(dtype=float) + X["conflict_t_min_1"] = pd.Series(dtype=bool) + X["conflict_t_min_1_nb"] = pd.Series(dtype=float) -# return X + click.echo("The columns in the sample matrix used are:") + for key in X: + click.echo(f"...{key}") + + return X def fill_X_sample( @@ -175,47 +180,10 @@ def fill_X_sample( if key not in ["conflict_t_min_1", "conflict_t_min_1_nb"]: - nc_ds = xr.open_dataset( - os.path.join( - root_dir, - config["general"]["input_dir"], - config["data"]["indicators"][key]["file"], - ) + X[key] = _read_data_from_netCDF( + root_dir, config, key, value, polygon_gdf, proj_year ) - if (np.dtype(nc_ds.time) == np.float32) or ( - np.dtype(nc_ds.time) == np.float64 - ): - data_series = value - data_list = variables.nc_with_float_timestamp( - polygon_gdf, config, root_dir, key, proj_year - ) - data_series = pd.concat( - [data_series, pd.Series(data_list)], axis=0, ignore_index=True - ) - X[key] = data_series - - elif np.dtype(nc_ds.time) == "datetime64[ns]": - data_series = value - data_list = variables.nc_with_continous_datetime_timestamp( - polygon_gdf, config, root_dir, key, proj_year - ) - data_series = pd.concat( - [data_series, pd.Series(data_list)], axis=0, ignore_index=True - ) - X[key] = data_series - - else: - raise ValueError( - "This file has an unsupported dtype for the time variable: {}".format( - os.path.join( - root_dir, - config["general"]["input_dir"], - config["data"]["indicators"][key]["file"], - ) - ) - ) - return X @@ -272,7 +240,7 @@ def _fill_XY( # noqa: R0912 config: dict, root_dir: click.Path, conflict_data: gpd.GeoDataFrame, - target_var: Union[str, None], + # target_var: Union[str, None], polygon_gdf: gpd.GeoDataFrame, out_dir: click.Path, ) -> pd.DataFrame: @@ -315,14 +283,14 @@ def _fill_XY( # noqa: R0912 data_series = value # TODO: guess for target_vars others than None, a dedicasted function is needed - if target_var is None: - data_list = conflict.conflict_in_year_bool( - config, conflict_data, polygon_gdf, sim_year, out_dir - ) - else: - raise NotImplementedError( - "Implementation of target_var did not happen yet." - ) + # if target_var is None: + # data_list = conflict.conflict_in_year_bool( + # config, conflict_data, polygon_gdf, sim_year, out_dir + # ) + # else: + # raise NotImplementedError( + # "Implementation of target_var did not happen yet." + # ) data_list = conflict.conflict_in_year_bool( config, conflict_data, polygon_gdf, sim_year, out_dir ) @@ -381,8 +349,6 @@ def _fill_XY( # noqa: R0912 root_dir, config, key, value, polygon_gdf, sim_year ) - click.echo("All data read.") - return pd.DataFrame.from_dict(XY) # .to_numpy()