diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 2c1374c..145edb8 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -35,7 +35,7 @@ jobs:
       run: |
         python -m pip install poetry
         poetry lock --no-update
-        poetry install -E dev -E rnn
+        poetry install -E dev -E rnn -E visuals
     - name: Verify that we can build the package
       run: poetry build
     - name: Test with pytest
diff --git a/grainlearning/bayesian_calibration.py b/grainlearning/bayesian_calibration.py
index d35e7fc..17a06b9 100644
--- a/grainlearning/bayesian_calibration.py
+++ b/grainlearning/bayesian_calibration.py
@@ -6,7 +6,8 @@
 from numpy import argmax
 from grainlearning.dynamic_systems import DynamicSystem, IODynamicSystem
 from grainlearning.iterative_bayesian_filter import IterativeBayesianFilter
-from grainlearning.tools import plot_param_stats, plot_posterior, plot_param_data, plot_obs_and_sim
+from grainlearning.tools import plot_param_stats, plot_posterior, plot_param_data, plot_obs_and_sim, plot_pdf, \
+    close_plots


 class BayesianCalibration:
@@ -180,6 +181,18 @@ def load_and_process(self, sigma: float = 0.1):
         self.calibration.inference.data_assimilation_loop(sigma, self.system)
         self.system.compute_estimated_params(self.calibration.inference.posteriors)

+    def load_all(self):
+        """Load all previous iterations of Bayesian calibration
+        """
+        for _ in range(self.num_iter - 1):
+            print(f"Bayesian calibration iter No. {self.curr_iter}")
+            self.load_system()
+            self.calibration.add_curr_param_data_to_list(self.system.param_data)
+            self.calibration.run_inference(self.system)
+            self.calibration.sigma_list.append(self.system.sigma_max)
+            self.plot_uq_in_time()
+            self.increase_curr_iter()
+
     def resample(self):
         """Learn and resample from a proposal distribution
         todo this should be refactored
@@ -239,6 +252,15 @@ def plot_uq_in_time(self):
             self.save_fig
         )

+        plot_pdf(
+            fig_name,
+            self.system.param_names,
+            self.calibration.param_data_list,
+            self.save_fig,
+        )
+
+        close_plots(self.save_fig)
+
     def get_most_prob_params(self):
         """Return the most probable set of parameters

diff --git a/grainlearning/dynamic_systems.py b/grainlearning/dynamic_systems.py
index a579121..b270e9c 100644
--- a/grainlearning/dynamic_systems.py
+++ b/grainlearning/dynamic_systems.py
@@ -651,15 +651,19 @@ def load_param_data(self):
         # if param_data_file does not exist, get parameter data from text files
         files = glob(self.sim_data_dir + f'/iter{self.curr_iter}/{self.sim_name}*_param*{self.sim_data_file_ext}')
         self.num_samples = len(files)
-        self.sim_data_files = sorted(files)
-        self.param_data = np.zeros([self.num_samples, self.num_params])
-        for i, sim_data_file in enumerate(self.sim_data_files):
-            if self.sim_data_file_ext == '.npy':
-                data = np.load(sim_data_file, allow_pickle=True).item()
-            else:
-                data = get_keys_and_data(sim_data_file)
-            params = [data[key][0] for key in self.param_names]
-            self.param_data[i, :] = params
+        # proceed only if simulation data files were found
+        if self.num_samples != 0:
+            self.sim_data_files = sorted(files)
+            self.param_data = np.zeros([self.num_samples, self.num_params])
+            for i, sim_data_file in enumerate(self.sim_data_files):
+                if self.sim_data_file_ext == '.npy':
+                    data = np.load(sim_data_file, allow_pickle=True).item()
+                else:
+                    data = get_keys_and_data(sim_data_file)
+                params = [data[key][0] for key in self.param_names]
+                self.param_data[i, :] = params
+        else:
+            raise RuntimeError(f'No simulation data found for iteration {self.curr_iter}')

     def run(self):
         """
diff --git a/grainlearning/inference.py b/grainlearning/inference.py
index 18e9ccd..bec8253 100644
--- a/grainlearning/inference.py
+++ b/grainlearning/inference.py
@@ -190,6 +190,10 @@ def data_assimilation_loop(self, sigma: float, system: Type["DynamicSystem"], pr
         self.compute_effective_sample_size()
         return (self.ess[-1] - self.ess_target) ** 2

+    def set_posteriors(self, posteriors: np.ndarray = None):
+        """Set the posterior distribution"""
+        self.posteriors = posteriors
+
     def compute_effective_sample_size(self):
         """Compute the effective sample size"""
         # compute the effective sample size for every time step
diff --git a/grainlearning/tools.py b/grainlearning/tools.py
index 2c679b7..c267edf 100644
--- a/grainlearning/tools.py
+++ b/grainlearning/tools.py
@@ -409,28 +409,22 @@ def plot_param_stats(fig_name, param_names, means, covs, save_fig=0):
         plt.subplot(2, n_cols, i + 1)
         plt.plot(means[:, i])
         plt.xlabel("'Time' step")
-        plt.ylabel(r'$|' + param_names[i] + r'|$')
+        plt.ylabel(f'Mean of {param_names[i]}')
         plt.grid(True)
     plt.tight_layout()
     if save_fig:
         plt.savefig(f'{fig_name}_param_means.png')
-    else:
-        plt.show()
-    plt.close()

     plt.figure('Posterior coefficients of variance of the parameters')
     for i in range(num):
         plt.subplot(2, n_cols, i + 1)
         plt.plot(covs[:, i])
         plt.xlabel("'Time' step")
-        plt.ylabel(r'$COV(' + param_names[i] + ')$')
+        plt.ylabel(f'Coefficient of variation of {param_names[i]}')
         plt.grid(True)
     plt.tight_layout()
     if save_fig:
         plt.savefig(f'{fig_name}_param_covs.png')
-    else:
-        plt.show()
-    plt.close()


 def plot_posterior(fig_name, param_names, param_data, posterior, save_fig=0):
@@ -445,27 +439,27 @@
     try:
         import matplotlib.pylab as plt
     except ImportError:
-        print('matplotlib is not installed, cannot plot posterior distribution. Please install with grainlearning[plot]')
+        print(
+            'matplotlib is not installed, cannot plot posterior distribution. Please install with grainlearning[plot]')
     num_steps = posterior.shape[0]
     for i, name in enumerate(param_names):
         plt.figure(f'Posterior distribution of {name}')
         for j in range(6):
             plt.subplot(2, 3, j + 1)
             plt.plot(param_data[:, i], posterior[int(num_steps * (j + 1) / 6 - 1), :], 'o')
-            plt.title(f"'Time' step No.{int(num_steps * (j + 1) / 6 - 1):3d} ")
+            plt.title(f"'Time' step {int(num_steps * (j + 1) / 6 - 1):3d}")
             plt.xlabel(r'$' + name + '$')
-            plt.ylabel('Posterior distribution')
+            plt.ylabel('Posterior probability mass')
             plt.grid(True)
         plt.tight_layout()
         if save_fig:
             plt.savefig(f'{fig_name}_posterior_{name}.png')
-        else:
-            plt.show()
-        plt.close()


 def plot_param_data(fig_name, param_names, param_data_list, save_fig=0):
     import matplotlib.pylab as plt
+    import seaborn as sns
+    flare_cmap = sns.color_palette("flare", as_cmap=True)
     num = len(param_names)
     n_cols = int(np.ceil(num / 2))
     num = num - 1
@@ -474,7 +468,8 @@
     for j in range(num):
         plt.subplot(2, n_cols, j + 1)
         for i in range(num_iter):
-            plt.plot(param_data_list[i][:, j], param_data_list[i][:, j + 1], 'o', label=f'iterNo. {i:d}')
+            plt.scatter(param_data_list[i][:, j], param_data_list[i][:, j + 1], edgecolors='white',
+                        color=flare_cmap(i / max(num_iter - 1, 1)), label=f'Iter No. {i:d}')
         plt.xlabel(r'$' + param_names[j] + '$')
         plt.ylabel(r'$' + param_names[j + 1] + '$')
         plt.legend()
@@ -482,9 +477,6 @@
     plt.tight_layout()
     if save_fig:
         plt.savefig(f'{fig_name}_param_space.png')
-    else:
-        plt.show()
-    plt.close()


 def plot_obs_and_sim(fig_name, ctrl_name, obs_names, ctrl_data, obs_data, sim_data, posteriors, save_fig=0):
@@ -534,7 +526,52 @@
     plt.tight_layout()
     if save_fig:
         plt.savefig(f'{fig_name}_obs_and_sim.png')
-    else:
+
+
+def plot_pdf(fig_name, param_names, samples, save_fig=0):
+    """
+    Plot the estimated posterior probability density function of the parameters
+    :param fig_name: string
+    :param param_names: list of strings containing parameter names
+    :param samples: list of ndarrays of shape (num_samples, num_params)
+    :param save_fig: bool defaults to False
+    """
+    import seaborn as sns
+    import pandas as pd
+    import matplotlib.pylab as plt
+
+    # create a deep copy of the samples (FIXME: this is a workaround for a bug in iBF constructor)
+    new_samples = [np.copy(sample) for sample in samples]
+    # pad with NaNs so that all sample arrays have the same length
+    max_len = int(max([len(sample) for sample in new_samples]))
+    for i in range(len(new_samples)):
+        new_samples[i] = np.pad(new_samples[i], ((0, max_len - len(new_samples[i])), (0, 0)), 'constant',
+                                constant_values=np.nan)
+    # append the iteration number as an extra column
+    stacked_samples = np.vstack([new_samples[i] for i in range(len(new_samples))])
+    third_dim_indices = np.repeat(np.arange(len(new_samples)), new_samples[0].shape[0])
+    new_samples = np.hstack([stacked_samples, third_dim_indices[:, np.newaxis]])
+
+    # convert the array into a pandas dataframe
+    new_names = param_names + ['Iter']
+    df = pd.DataFrame(new_samples, columns=new_names)
+
+    # plot the parameter distributions in a pair grid of scatter, contour, and density plots
+    fig = sns.PairGrid(df, diag_sharey=False, hue="Iter", palette='flare')
+    fig.map_upper(sns.scatterplot, s=15)
+    fig.map_lower(sns.kdeplot, thresh=1e-3, levels=5)
+    fig.map_diag(sns.kdeplot, lw=2)
+    fig.add_legend(loc='upper right')
+    fig.fig.canvas.manager.set_window_title('Estimated posterior probability density function')
+
+    fig.tight_layout()
+    if save_fig:
+        fig.savefig(f'{fig_name}_scatterplot.png')
+
+
+def close_plots(save_fig=0):
+    import matplotlib.pylab as plt
+    if not save_fig:
         plt.show()
     plt.close()
diff --git a/pyproject.toml b/pyproject.toml
index 1c15c7e..62b67f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,12 +32,14 @@
 h5py = {version ="^3.7.0", optional = true}
 wandb = {version ="^0.13.4", optional = true}
 tensorflow = {version ="2.10.0", optional = true}
 ipykernel = {version = "*", optional = true}
+seaborn = {version = "^0.12.2", optional = true}

 [tool.poetry.extras]
 docs = ["Sphinx", "sphinx-autodoc-typehints", "sphinx-mdinclude", "sphinx-rtd-theme"]
 dev = ["pytest", "pytest-cov", "prospector", "pyroma", "h5py"]
 rnn = ["wandb", "tensorflow"]
 tutorials = ["ipykernel"]
+visuals = ["seaborn"]

 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/tutorials/simple_regression/linear_regression/linear_regression_load_all.py b/tutorials/simple_regression/linear_regression/linear_regression_load_all.py
new file mode 100644
index 0000000..b435c1b
--- /dev/null
+++ b/tutorials/simple_regression/linear_regression/linear_regression_load_all.py
@@ -0,0 +1,41 @@
+"""
+This tutorial shows how to load all previous iterations of a Bayesian calibration of a linear regression model.
+"""
+import os
+from grainlearning import BayesianCalibration
+from grainlearning.dynamic_systems import IODynamicSystem
+
+PATH = os.path.abspath(os.path.dirname(__file__))
+
+calibration = BayesianCalibration.from_dict(
+    {
+        "num_iter": 10,
+        "system": {
+            "system_type": IODynamicSystem,
+            "param_min": [0.001, 0.001],
+            "param_max": [1, 10],
+            "param_names": ['a', 'b'],
+            "num_samples": 20,
+            "obs_data_file": PATH + '/linear_obs.dat',
+            "obs_names": ['f'],
+            "ctrl_name": 'u',
+            "sim_name": 'linear',
+            "sim_data_dir": PATH + '/sim_data/',
+            "sim_data_file_ext": '.txt',
+            "sigma_tol": 0.01,
+        },
+        "calibration": {
+            "inference": {"ess_target": 0.3},
+            "sampling": {
+                "max_num_components": 2,
+                "n_init": 1,
+                "random_state": 0,
+                "covariance_type": "full",
+            }
+        },
+        "save_fig": 0,
+    }
+)
+
+# load all previous iterations of Bayesian calibration and replot the results
+calibration.load_all()
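Reviewer note, not part of the patch: below is a minimal, self-contained sketch of the NaN-padding and seaborn PairGrid technique that the new plot_pdf helper relies on. The sample data, variable names, and output file name are hypothetical; only numpy, pandas, and seaborn (the new "visuals" extra) are assumed to be installed.

import numpy as np
import pandas as pd
import seaborn as sns

# two calibration iterations with different numbers of samples (hypothetical data)
samples = [np.random.rand(20, 2), np.random.rand(15, 2)]
param_names = ['a', 'b']

# pad the shorter arrays with NaN rows so every iteration has the same length;
# seaborn drops missing values, so the padding never shows up in the plots
max_len = max(len(s) for s in samples)
padded = [np.pad(s, ((0, max_len - len(s)), (0, 0)), constant_values=np.nan) for s in samples]

# stack the iterations vertically and append the iteration index as a hue column;
# PairGrid excludes the hue column from the plotted variables by default
stacked = np.vstack(padded)
iter_idx = np.repeat(np.arange(len(padded)), max_len)
df = pd.DataFrame(np.hstack([stacked, iter_idx[:, np.newaxis]]), columns=param_names + ['Iter'])

# scatter plots above the diagonal, KDE contours below, marginal KDEs on the diagonal
grid = sns.PairGrid(df, diag_sharey=False, hue='Iter', palette='flare')
grid.map_upper(sns.scatterplot, s=15)
grid.map_lower(sns.kdeplot, thresh=1e-3, levels=5)
grid.map_diag(sns.kdeplot, lw=2)
grid.add_legend(loc='upper right')
grid.savefig('pdf_sketch.png')  # hypothetical output path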