diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..83b1a54
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,8 @@
+name: Ruff lint
+on: [push, pull_request]
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: chartboost/ruff-action@v1
\ No newline at end of file
diff --git a/example/dnn_examples/demo_jackknife.py b/example/dnn_examples/demo_jackknife.py
index eb3ea65..9fde532 100644
--- a/example/dnn_examples/demo_jackknife.py
+++ b/example/dnn_examples/demo_jackknife.py
@@ -1,8 +1,9 @@
-import pickle
-import pandas as pd
 import logging
+import pickle
 import sys
-from motrainer.dnn import NNTrain
+
+import pandas as pd
+
 from motrainer.jackknife import JackknifeGPI
 
 logging.basicConfig(
@@ -53,7 +54,7 @@
                        val_split_year,
                        input_list,
                        output_list,
-                       outpath='{}/gpi{}'.format(out_path, gpi_num))
+                       outpath=f'{out_path}/gpi{gpi_num}')
 
     gpi.train(searching_space=searching_space,
               optimize_space=optimize_space,
diff --git a/example/dnn_examples/demo_slurm/jackknife_train_one.py b/example/dnn_examples/demo_slurm/jackknife_train_one.py
index bb8891a..c59dc08 100644
--- a/example/dnn_examples/demo_slurm/jackknife_train_one.py
+++ b/example/dnn_examples/demo_slurm/jackknife_train_one.py
@@ -1,68 +1,70 @@
-import pickle
-import sys
-from motrainer.jackknife import JackknifeGPI
-
-if __name__ == "__main__":
-    # Parsing input
-    gpi_id = int(sys.argv[1])
-
-    # Manual input
-    out_path = './results/'
-    file_data = '../example_data/example_data.pickle'
-    val_split_year = 2017
-    output_list = ['sig', 'slop', 'curv']
-    input_list = [
-        'TG1', 'TG2', 'TG3', 'WG1', 'WG2', 'WG3', 'BIOMA1', 'BIOMA2']
-    searching_space = {'learning_rate': [5e-4, 1e-2], 'activation': ['relu']}
-    optimize_space = {
-        'best_loss': 1,
-        'n_calls': 15,
-        'noise': 0.01,
-        'n_jobs': -1,
-        'kappa': 5,
-        'validation_split': 0.2,
-        'x0': [1e-3, 1, 4, 13, 'relu', 64]
-    }
-
-    # Read example data
-    df_all_gpi = pd.read_pickle(file_data)
-
-    gpi_data = df_all_gpi.iloc[gpi_id]['data']
-    gpi_data = gpi_data.dropna()
-
-    if len(gpi_data) > 0:
-        gpi = JackknifeGPI(gpi_data,
-                           val_split_year,
-                           input_list,
-                           output_list,
-                           outpath='{}/gpi{}'.format(out_path, gpi_id))
-
-        gpi.train(searching_space=searching_space,
-                  optimize_space=optimize_space,
-                  normalize_method='standard',
-                  training_method='dnn',
-                  performance_method='rmse',
-                  val_split_year=val_split_year)
-
-        gpi.export_best()
-
-        # Export apriori performance
-        path_apriori_performance = '{}/apriori_performance_{}'.format(
-            gpi.outpath, gpi.best_year)
-        with open(path_apriori_performance, 'wb') as f:
-            pickle.dump(gpi.apr_perf, f)
-
-        # Export postpriori performance
-        path_postpriori_performance = '{}/postpriori_performance_{}'.format(
-            gpi.outpath, gpi.best_year)
-        with open(path_postpriori_performance, 'wb') as f:
-            pickle.dump(gpi.post_perf, f)
-
-
-        print("=========================================")
-        print(" GPI " + str(gpi_id) + " done")
-        print("=========================================")
-    else:
-        print("=========================================")
-        print(" GPI" + str(gpi_id) + " is empty")
-        print("=========================================")
+import pickle
+import sys
+
+import pandas as pd
+
+from motrainer.jackknife import JackknifeGPI
+
+if __name__ == "__main__":
+    # Parsing input
+    gpi_id = int(sys.argv[1])
+
+    # Manual input
+    out_path = './results/'
+    file_data = '../example_data/example_data.pickle'
+    val_split_year = 2017
+    output_list = ['sig', 'slop', 'curv']
+    input_list = [
+        'TG1', 'TG2', 'TG3', 'WG1', 'WG2', 'WG3', 'BIOMA1', 'BIOMA2']
+    searching_space = {'learning_rate': [5e-4, 1e-2], 'activation': ['relu']}
+    optimize_space = {
+        'best_loss': 1,
+        'n_calls': 15,
+        'noise': 0.01,
+        'n_jobs': -1,
+        'kappa': 5,
+        'validation_split': 0.2,
+        'x0': [1e-3, 1, 4, 13, 'relu', 64]
+    }
+
+    # Read example data
+    df_all_gpi = pd.read_pickle(file_data)
+
+    gpi_data = df_all_gpi.iloc[gpi_id]['data']
+    gpi_data = gpi_data.dropna()
+
+    if len(gpi_data) > 0:
+        gpi = JackknifeGPI(gpi_data,
+                           val_split_year,
+                           input_list,
+                           output_list,
+                           outpath=f'{out_path}/gpi{gpi_id}')
+
+        gpi.train(searching_space=searching_space,
+                  optimize_space=optimize_space,
+                  normalize_method='standard',
+                  training_method='dnn',
+                  performance_method='rmse',
+                  val_split_year=val_split_year)
+
+        gpi.export_best()
+
+        # Export apriori performance
+        path_apriori_performance = f'{gpi.outpath}/apriori_performance_{gpi.best_year}'
+        with open(path_apriori_performance, 'wb') as f:
+            pickle.dump(gpi.apr_perf, f)
+
+        # Export postpriori performance
+        path_postpriori_performance = '{}/postpriori_performance_{}'.format(
+            gpi.outpath, gpi.best_year)
+        with open(path_postpriori_performance, 'wb') as f:
+            pickle.dump(gpi.post_perf, f)
+
+
+        print("=========================================")
+        print(" GPI " + str(gpi_id) + " done")
+        print("=========================================")
+    else:
+        print("=========================================")
+        print(" GPI" + str(gpi_id) + " is empty")
+        print("=========================================")
diff --git a/motrainer/__init__.py b/motrainer/__init__.py
index ce948d3..21c4bb1 100644
--- a/motrainer/__init__.py
+++ b/motrainer/__init__.py
@@ -1,3 +1,3 @@
-from motrainer.splitter import is_splitable, dataset_split, train_test_split
+from motrainer.splitter import dataset_split, is_splitable, train_test_split
 
 __all__ = ("is_splitable", "dataset_split", "train_test_split")
diff --git a/motrainer/dnn.py b/motrainer/dnn.py
index d4e63a6..32f456f 100644
--- a/motrainer/dnn.py
+++ b/motrainer/dnn.py
@@ -1,23 +1,24 @@
 import logging
-import tensorflow as tf
-import skopt
 import pickle
 from pathlib import Path
-from skopt.space import Real, Categorical, Integer
-from motrainer.model import keras_dnn, keras_dnn_lossweight
 
 # disable WARNING:absl:Found untraced functions such as _update_step_xla while saving
 # see https://github.com/tensorflow/tensorflow/issues/47554
 import absl.logging
+import skopt
+import tensorflow as tf
+from skopt.space import Categorical, Integer, Real
+
+from motrainer.model import keras_dnn, keras_dnn_lossweight
+
 absl.logging.set_verbosity(absl.logging.ERROR)
 
 logger = logging.getLogger(__name__)
 
 
-class NNTrain(object):
-    """
-    Neuron Network trainning object
+class NNTrain:
+    """Neuron Network trainning object.
 
     Methods
     -------
@@ -29,9 +30,9 @@ class NNTrain(object):
         Optimize the neuron network within the searching space by given
         optimization settings
     """
+
     def __init__(self, train_input, train_output):
-        """
-        Initialize NNTrain object
+        """Initialize NNTrain object.
 
         Parameters
         ----------
@@ -62,13 +63,11 @@ def __init__(self, train_input, train_output):
         self.model = None
 
     def update_space(self, **kwrags):
-        """
-        Update searching space of optimization.
-        """
+        """Update searching space of optimization."""
         for key, value in kwrags.items():
-            logger.debug('Update seaching sapce: {}={}'.format(key, value))
+            logger.debug(f'Update seaching sapce: {key}={value}')
 
             # skopt.space instances
-            if isinstance(value, (Real, Categorical, Integer)):
+            if isinstance(value, Real | Categorical | Integer):
                 self.dimensions[key] = value
                 self.dimensions[key].name = key
 
@@ -77,11 +76,10 @@ def update_space(self, **kwrags):
                 assert len(value) == 2
                 if any([isinstance(obj, int) for obj in value]):
                     logger.warning(
-                        'Mixed fload/int type found in {}:{}. '
+                        f'Mixed fload/int type found in {key}:{value}. '
                         'The search space will be interpreted as float. '
                         'If this behavior is not desired, try to specify'
-                        'all elements in {} with the same type.'.format(
-                            key, value, key))
+                        f'all elements in {key} with the same type.')
                 self.dimensions[key] = Real(low=value[0],
                                             high=value[1],
                                             prior='log-uniform',
@@ -100,8 +98,7 @@
 
             else:
                 logger.error(
-                    'Do not understand searching space: {}:{}.'.format(
-                        key, value))
+                    f'Do not understand searching space: {key}:{value}.')
                 raise NotImplementedError
 
     def optimize(self,
@@ -112,13 +109,11 @@
                  best_loss=1,
                  n_calls=15,
                  noise=0.01,
                  n_jobs=-1,
                  kappa=5,
                  validation_split=0.2,
-                 x0=[1e-3, 1, 4, 13, 'relu', 64],
+                 x0=None,
                  training_method='dnn',
                  loss_weights=None,
                  verbose=0):
-        """
-        Optimize the neuron network within the searching space by given
-        optimization settings
+        """Optimize the neuron network within the searching space..
 
         Parameters
         ----------
@@ -154,14 +149,13 @@
             Control the verbosity.
             By default 0, which means no screen feedback.
         """
-
         self.best_loss = best_loss
         self.keras_verbose = verbose
         self.loss_weights = loss_weights
 
         @skopt.utils.use_named_args(dimensions=list(self.dimensions.values()))
         def func(**dimensions):
-            logger.info('optimizing with dimensions: {}'.format(dimensions))
+            logger.info(f'optimizing with dimensions: {dimensions}')
 
             # setup model
             earlystop = tf.keras.callbacks.EarlyStopping(
@@ -178,8 +172,7 @@ def func(**dimensions):
                 if self.loss_weights is None:
                     self.loss_weights = [1] * self.train_output.shape[1]
                     logger.warning('loss_weights is None.'
-                                   'Using default weights {}'.format(
-                                       self.loss_weights))
+                                   f'Using default weights {self.loss_weights}')
                 model = keras_dnn_lossweight(dimensions,
                                              self.train_input.shape[1],
                                              self.train_output.shape[1],
@@ -208,6 +201,9 @@ def func(**dimensions):
             tf.keras.backend.clear_session()
             return loss
 
+        if x0 is None:
+            x0 = [1e-3, 1, 4, 13, 'relu', 64]
+
         self.gp_result = skopt.gp_minimize(func=func,
                                            dimensions=list(
                                                self.dimensions.values()),
@@ -218,10 +214,7 @@
                                            x0=x0)
 
     def export(self, path_model=None, path_hyperparameters=None):
-        """
-        Export model and hyperparameters from tranning.
-        """
-
+        """Export model and hyperparameters from tranning."""
         if path_model is not None:
             Path(path_model).parent.mkdir(parents=True, exist_ok=True)
             self.model.save(path_model)
@@ -232,5 +225,9 @@
             with open(path_hyperparameters, 'wb') as f:
                 pickle.dump([
                     sorted(
-                        zip(self.gp_result.func_vals, self.gp_result.x_iters))
+                        zip(
+                            self.gp_result.func_vals,
+                            self.gp_result.x_iters,
+                            strict=True
+                        ))
                 ], f)
diff --git a/motrainer/jackknife.py b/motrainer/jackknife.py
index 542bd3a..c569d48 100644
--- a/motrainer/jackknife.py
+++ b/motrainer/jackknife.py
@@ -1,18 +1,18 @@
-import numpy as np
-import logging
 import json
+import logging
 from pathlib import Path
+
+import numpy as np
 from sklearn.model_selection import LeaveOneOut
+
 from motrainer.dnn import NNTrain
-from motrainer.util import performance, normalize
+from motrainer.util import normalize, performance
 
 logger = logging.getLogger(__name__)
 
 
-class JackknifeGPI(object):
-    """
-    GPI object oriented for neuron netowork training using Jackknife resampling
-    method.
+class JackknifeGPI:
+    """GPI object for neuron netowork training using Jackknife resampling method.
 
     Methods
     -------
@@ -30,8 +30,7 @@ def __init__(self,
                  output_list,
                  export_all_years=True,
                  outpath='./jackknife_results'):
-        """
-        Initialize JackknifeGPI object.
+        """Initialize JackknifeGPI object.
 
         Parameters
         ----------
@@ -52,10 +51,9 @@
            Results exporting path, by default './jackknife_results'
        """
        logger.info('Initializing Jackkinfe trainning:\n'
-                   'val_split_year: {}\n'
-                   'input_list: {}\n'
-                   'output_list: {}\n'.format(val_split_year, input_list,
-                                              output_list))
+                   f'val_split_year: {val_split_year}\n'
+                   f'input_list: {input_list}\n'
+                   f'output_list: {output_list}\n')
 
        assert not (
            gpi_data.isnull().values.any()), 'Nan value(s) in gpi_data!'
@@ -77,8 +75,8 @@ def train(self,
              performance_method='rmse',
              training_method='dnn',
              verbose=0):
-        """
-        Train neuron network with Jackknife resampling method.
+        """Train neuron network with Jackknife resampling method.
+
        Procedures:
        1. Reserve in/output after self.val_split_year for later benchmarking.
        2. From the rest in/output data, leave out one year as validation data.
@@ -108,10 +106,8 @@
            Control the verbosity.
            By default 0, which means no screen feedback.
        """
-
        # Data normalization
-        logger.debug('Normalizing input/output data. Method: {}.'.format(
-            normalize_method))
+        logger.debug(f'Normalizing input/output data. Method: {normalize_method}.')
        self.gpi_input[:], scaler_input = normalize(self.gpi_input,
                                                    normalize_method)
        self.gpi_output[:], scaler_output = normalize(self.gpi_output,
@@ -132,7 +128,7 @@
        # Jackknife in time
        loo = LeaveOneOut()
        best_perf_sum = None
-        for train_index, test_index in loo.split(year_list):
+        for train_index, test_index in loo.split(year_list):  # noqa: B007
            this_year = year_list[test_index[0]]
 
            input_years = jackknife_input.index.year
@@ -144,14 +140,18 @@
 
            # check if train_input and train_output are empty, raise value error
            if train_input.empty or train_output.empty:
-                raise ValueError('Trainning data is empty. Please check the val_split_year.')
+                raise ValueError(
+                    'Trainning data is empty. Please check the val_split_year.'
+                )
 
            test_input = jackknife_input[input_years == this_year]
            test_output = jackknife_output[output_years == this_year]
 
            # check if test_input and test_output are empty, raise value error
            if test_input.empty or test_output.empty:
-                raise ValueError('Testing data is empty. Please check the val_split_year.')
+                raise ValueError(
+                    'Testing data is empty. Please check the val_split_year.'
+                )
 
            # Execute training
            training = NNTrain(train_input, train_output)
@@ -166,13 +166,11 @@
 
            # TODO: Add warning if no model selected for the year
            if training.model is None:
-                logger.warning('No best model was found for year: {}.'.format(
-                    str(this_year)))
+                logger.warning(f'No best model was found for year: {str(this_year)}.')
                continue
 
            if self.export_all_years:
-                path_model = '{}/all_years/optimized_model_{}'.format(
-                    self.outpath, this_year)
+                path_model = f'{self.outpath}/all_years/optimized_model_{this_year}'
                path_hyperparas = '{}/all_years/hyperparameters_{}'.format(
                    self.outpath, this_year)
                training.export(path_model=path_model,
                                path_hyperparameters=path_hyperparas)
@@ -192,30 +190,24 @@
                                            performance_method, scaler_output)
                self.best_train = training
                self.best_year = this_year
-        logger.info('Found best year: {}'
-                    'A-priori performance: {}'
-                    'Post-priori performance: {}'.format(
-                        str(self.best_year), self.apr_perf, self.post_perf))
+        logger.info(f'Found best year: {str(self.best_year)}'
+                    f'A-priori performance: {self.apr_perf}'
+                    f'Post-priori performance: {self.post_perf}')
 
    def export_best(self, model_name='best_optimized_model',
                    hyper_name='best_hyperparameters'):
-        """
-        export the best results in Jackknife process.
-        """
+        """Export the best results in Jackknife process."""
        logger.info(
            'Exporting model and hyperparameters of year {} to {}'.format(
                self.best_year, self.outpath))
 
        if model_name is not None:
-            path_model = '{}/{}_{}'.format(
-                self.outpath, model_name, self.best_year)
+            path_model = f'{self.outpath}/{model_name}_{self.best_year}'
        else:
-            path_model = '{}/best_optimized_model_{}'.format(
-                self.outpath, self.best_year)
+            path_model = f'{self.outpath}/best_optimized_model_{self.best_year}'
 
        if hyper_name is not None:
-            path_hyperparameters = '{}/{}_{}'.format(
-                self.outpath, hyper_name, self.best_year)
+            path_hyperparameters = f'{self.outpath}/{hyper_name}_{self.best_year}'
        else:
            path_hyperparameters = '{}/best_hyperparameters_{}'.format(
                self.outpath, self.best_year)
@@ -224,7 +216,7 @@
                                  path_hyperparameters=path_hyperparameters)
 
        # write metadata
-        f_metadata = '{}/metadata.json'.format(self.outpath)
+        f_metadata = f'{self.outpath}/metadata.json'
        metedata = dict()
        metedata['input_list'] = self.input_list
        metedata['output_list'] = self.input_list
diff --git a/motrainer/model.py b/motrainer/model.py
index b83185f..f641a72 100644
--- a/motrainer/model.py
+++ b/motrainer/model.py
@@ -1,9 +1,11 @@
-"""
+"""Implementing different types of neural network.
+
 This script is for the implementation of different types of neural network,
-including different structures, different loss functions
+including different structures, different loss functions.
 """
 import os
+
 import tensorflow as tf
 
 # Force tensorflow debug logging off, keep only error logging
@@ -11,8 +13,8 @@
 
 
 def keras_dnn(dimensions, input_shape, output_shape):
-    """
-    Deep Neural Network implemented by Keras
+    """Deep Neural Network implemented by Keras.
+
     by default:
     dimension consists of:
         learning_rate, num_dense_layers,num_input_nodes,
         num_dense_nodes, activation, batch_size
     """
     model = tf.keras.models.Sequential()
@@ -27,7 +29,7 @@
                               activation=dimensions['activation']))
 
     for i in range(dimensions['num_dense_layers']):
-        name = 'layer_dense_{0}'.format(i + 1)
+        name = f'layer_dense_{i + 1}'
         model.add(
             tf.keras.layers.Dense(dimensions['num_dense_nodes'],
                                   activation=dimensions['activation'],
@@ -42,14 +44,14 @@
 
 
 def keras_dnn_lossweight(dimensions, input_shape, output_shape, loss_weights):
-    """
-    Deep Neural Network implemented by Keras.
+    """Deep Neural Network implemented by Keras.
+
     Implemented to adapt 'loss_weights'.
     """
     inputs = tf.keras.Input(shape=(input_shape, ))
 
     for i in range(dimensions['num_dense_layers']):
-        name = 'layer_dense_{0}'.format(i + 1)
+        name = f'layer_dense_{i + 1}'
         if i == 0:
             hidden = tf.keras.layers.Dense(dimensions['num_input_nodes'],
                                            activation=dimensions['activation'],
@@ -63,7 +65,7 @@
 
     outputs = []
     for i in range(output_shape):
-        name = 'out{}'.format(i + 1)
+        name = f'out{i + 1}'
         outputs.append(tf.keras.layers.Dense(1, name=name)(hidden))
 
     adam = tf.keras.optimizers.Adam(learning_rate=dimensions['learning_rate'])
diff --git a/motrainer/splitter.py b/motrainer/splitter.py
index 0db66b9..3f97b41 100644
--- a/motrainer/splitter.py
+++ b/motrainer/splitter.py
@@ -1,7 +1,8 @@
 import warnings
-import xarray as xr
-import numpy as np
+
 import dask.bag as db
+import numpy as np
+import xarray as xr
 
 MOT_DIMS = ["space", "time"]  # Expected xr.Dataset dimensions
diff --git a/motrainer/util.py b/motrainer/util.py
index de4ea6d..60f4a2e 100644
--- a/motrainer/util.py
+++ b/motrainer/util.py
@@ -1,8 +1,9 @@
-import numpy as np
-import tensorflow as tf
 import os
-import sklearn.preprocessing
 import random
+
+import numpy as np
+import sklearn.preprocessing
+import tensorflow as tf
 from scipy.stats import pearsonr, spearmanr
 
 # Force tensorflow debug logging off, keep only error logging
@@ -10,8 +11,7 @@
 
 
 def performance(data_input, data_label, model, method, scaler_output=None):
-    """
-    Compute performance of trained neuron netowrk.
+    """Compute performance of trained neuron netowrk.
 
     Parameters
     ----------
@@ -37,7 +37,7 @@
     """
     # Temporally SL the model because of TF graph execution issue
     # TODO: fix the model prediction issue
-    tmp_path = '/tmp/tmp_model{}'.format(random.getrandbits(64))
+    tmp_path = f'/tmp/tmp_model{random.getrandbits(64)}'
     model.save(tmp_path)
     model = tf.keras.models.load_model(tmp_path)
     predicted = model.predict(data_input)
@@ -76,8 +76,7 @@
 
 
 def normalize(data, method):
-    """
-    Pre-normalization for input/output
+    """Pre-normalization for input/output.
 
     Parameters
     ----------
@@ -93,7 +92,6 @@
         normalization.
 
     """
-
     if method == 'standard':
         scaler = sklearn.preprocessing.StandardScaler()
     elif method == 'min_max':
diff --git a/pyproject.toml b/pyproject.toml
index 8195e0c..4790b5d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -97,11 +97,54 @@
 ignore = [
   "D100", "D101", "D104", "D105", "D106", "D107", "D203", "D213"
 ] # docstring style
+# Allow fix for all enabled rules (when `--fix`) is provided.
+fixable = ["ALL"]
+unfixable = []
+
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+    "docs",
+]
+
 line-length = 88
-exclude = ["docs", "build"]
+indent-width = 4
+
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 target-version = "py310"
 
 [tool.ruff.per-file-ignores]
 "tests/**" = ["D"]
+
+[tool.ruff.format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
diff --git a/tests/__init__.py b/tests/__init__.py
index 40a96af..e69de29 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1 +0,0 @@
-# -*- coding: utf-8 -*-
diff --git a/tests/test_jackkife.py b/tests/test_jackkife.py
index bc54115..c50bc97 100644
--- a/tests/test_jackkife.py
+++ b/tests/test_jackkife.py
@@ -1,7 +1,8 @@
+import json
+
 import numpy as np
 import pandas as pd
 import pytest
-import json
 
 from motrainer.jackknife import JackknifeGPI
diff --git a/tests/test_splitter.py b/tests/test_splitter.py
index b2daa7e..904b9c7 100644
--- a/tests/test_splitter.py
+++ b/tests/test_splitter.py
@@ -1,7 +1,8 @@
-import xarray as xr
 import numpy as np
 import pytest
-from motrainer import is_splitable, dataset_split, train_test_split
+import xarray as xr
+
+from motrainer import dataset_split, is_splitable, train_test_split
 from motrainer.splitter import _regulate_identifier, _validate_train_test_split
diff --git a/tests/test_util.py b/tests/test_util.py
index 75ce96d..009fec5 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,6 +1,8 @@
 import unittest
+
 import numpy as np
 import tensorflow as tf
+
 from motrainer import util
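
Note on the `x0` change in `NNTrain.optimize` (motrainer/dnn.py above): the patch replaces the mutable default `x0=[1e-3, 1, 4, 13, 'relu', 64]` with `x0=None` plus an `if x0 is None:` fallback before `skopt.gp_minimize` is called. This is the standard fix for Python's mutable-default-argument pitfall (flagged by ruff as B006): even if the current code never mutates `x0`, the None-default form is the conventional defensive style. A minimal standalone sketch of the pitfall, independent of motrainer:

def risky(history=[]):
    # The default list is created once, at function definition time,
    # so every call that omits `history` mutates the same object.
    history.append("call")
    return len(history)

def safe(history=None):
    # The pattern adopted in the diff: default to None, allocate inside.
    if history is None:
        history = []
    history.append("call")
    return len(history)

assert risky() == 1 and risky() == 2  # state leaks across calls
assert safe() == 1 and safe() == 1    # each call gets a fresh list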
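
Note on `strict=True` in `NNTrain.export`: the `strict` flag of `zip` comes from PEP 618 and is available since Python 3.10, consistent with `target-version = "py310"` in pyproject.toml. Presumably `gp_result.func_vals` and `gp_result.x_iters` always have equal length, so the flag only turns silent truncation into a loud error; a lint rule such as ruff's B905 asks for it explicitly. A short sketch with hypothetical values:

func_vals = [0.31, 0.27, 0.25]      # hypothetical losses per iteration
x_iters = [[1e-3, 4], [5e-3, 2]]    # deliberately one entry short

# Default zip stops at the shorter input and silently drops the last loss.
assert len(list(zip(func_vals, x_iters))) == 2

# strict=True raises instead, surfacing the length mismatch immediately.
try:
    list(zip(func_vals, x_iters, strict=True))
except ValueError as err:
    print(err)  # zip() argument 2 is shorter than argument 1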
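
Note on `isinstance(value, Real | Categorical | Integer)` in `update_space`: the `X | Y` union syntax inside `isinstance` checks is PEP 604, also Python 3.10+, and is behaviorally equivalent to the tuple form it replaces. A tiny illustration with builtin types rather than the skopt classes:

import types

value = 3.5

# Pre-3.10 tuple form and PEP 604 union form accept the same values.
assert isinstance(value, (int, float))
assert isinstance(value, int | float)

# The union expression builds a types.UnionType object at runtime.
assert isinstance(int | float, types.UnionType)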