add Ledoit Wolf shrinkage estimator for cross-validation covariance
different priors

change prior again

add reg into cov-mat for optimization

example tweaks

add reg to scalar method and widen default prior

fix lorenz

fix lorenz

major refactor of RF prior cov

fix lorenz

format

refactored user interface

examples work with new interface

format

resolve merge

format

bug-fix n_hp calc

updates to prior - remove scalings

src for new kernel structure

updated Emulator and Lorenz example with Kernel interface

bugfix symbol checks

Cholesky factorization of K

remove unnecessary warning

needed branch in if-else

error message typo

better nugget term for LRF

sensible results for RF emulator example

bypass JLD2 errors

make low-rank into more of a diagonal decomp

bugfix scatter bounds

added learnable scaling

Lorenz 63 example learning update map

gets l63 pdf

moved L63 into Emulator examples

remove duplicate svd

rebalance loss function (sqrt complexity, zero out scaled coeffs, remove approx_sig2 from noise), greatly improved robustness

example tweaks and fixes for plotting, convergence etc.

clima format

tests pass

clima format

change plots to makie, and small changes

change plots to makie, and small changes

format

docs fix

test shrinkage

format
odunbar committed Oct 6, 2023
1 parent 89724aa commit 2b49075
Showing 20 changed files with 1,650 additions and 682 deletions.
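The headline change is the Ledoit-Wolf shrinkage estimator used when estimating covariances for cross-validation. As a minimal sketch of the idea only (the function below is illustrative, not the code added in this commit), the estimator blends the sample covariance with a scaled-identity target, with an intensity chosen from the data:

using LinearAlgebra, Statistics

# Minimal sketch of Ledoit-Wolf (2004) shrinkage toward a scaled identity.
# Illustrative only -- not the implementation added in this commit.
# X holds n samples (rows) of a p-dimensional variable (columns).
function ledoit_wolf_shrinkage(X::AbstractMatrix)
    n, p = size(X)
    Xc = X .- mean(X, dims = 1)          # center each column
    S = (Xc' * Xc) / n                   # sample covariance (1/n normalization)
    mu = tr(S) / p                       # scale of the identity target mu * I
    d2 = sum(abs2, S - mu * I)           # squared distance of S from the target
    # average squared distance of single-sample covariances x * x' from S
    b2 = sum(sum(abs2, x * x' - S) for x in eachrow(Xc)) / n^2
    rho = min(b2 / d2, 1.0)              # shrinkage intensity, clipped to [0, 1]
    return (1 - rho) * S + rho * mu * I  # well-conditioned shrunk estimate
end

The intensity rho grows toward 1 when few samples make S noisy and shrinks toward 0 as n increases, recovering the empirical covariance; the shrunk matrix stays full rank even when n < p.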
2 changes: 1 addition & 1 deletion docs/src/API/RandomFeatures.md
@@ -31,7 +31,7 @@ get_n_features
get_input_dim
get_output_dim
get_rng
-get_diagonalize_input
+get_kernel_structure
get_feature_decomposition
get_optimizer_options
optimize_hyperparameters!(::ScalarRandomFeatureInterface)
2 changes: 2 additions & 0 deletions examples/EDMF_data/Project.toml
@@ -1,9 +1,11 @@
[deps]
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
23 changes: 13 additions & 10 deletions examples/EDMF_data/plot_posterior.jl
@@ -1,5 +1,8 @@
# Import modules
-using StatsPlots
+ENV["GKSwstype"] = "100"
+using Plots

+using CairoMakie, PairPlots
using JLD2
using Dates

@@ -11,11 +14,11 @@ using CalibrateEmulateSample.ParameterDistributions

# 2-parameter calibration exp
exp_name = "ent-det-calibration"
-date_of_run = Date(2022, 7, 15)
+date_of_run = Date(2023, 10, 5)

# 5-parameter calibration exp
#exp_name = "ent-det-tked-tkee-stab-calibration"
-#date_of_run = Date(2022,7,14)
+#date_of_run = Date(2023,10,4)

# Output figure read/write directory
figure_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run))
@@ -24,7 +27,7 @@ data_save_directory = joinpath(@__DIR__, "output", exp_name, string(date_of_run))
# load
posterior_filepath = joinpath(data_save_directory, "posterior.jld2")
if !isfile(posterior_filepath)
-LoadError(posterior_filepath * " not found. Please check experiment name and date")
+throw(ArgumentError(posterior_filepath * " not found. Please check experiment name and date"))
else
println("Loading posterior distribution from: " * posterior_filepath)
posterior = load(posterior_filepath)["posterior"]
@@ -40,10 +43,10 @@ density_filepath = joinpath(figure_save_directory, "posterior_dist_comp.png")
transformed_density_filepath = joinpath(figure_save_directory, "posterior_dist_phys.png")
labels = get_name(posterior)

-gr(dpi = 300, size = (nparam_plots * 300, nparam_plots * 300))
-# lower triangular marginal cornerplot
-p = cornerplot(permutedims(posterior_samples, (2, 1)), label = labels, compact = true)
-trans_p = cornerplot(permutedims(transformed_posterior_samples, (2, 1)), label = labels, compact = true)
+data = (; [(Symbol(labels[i]), posterior_samples[i, :]) for i in 1:length(labels)]...)
+transformed_data = (; [(Symbol(labels[i]), transformed_posterior_samples[i, :]) for i in 1:length(labels)]...)

-savefig(p, density_filepath)
-savefig(trans_p, transformed_density_filepath)
+p = pairplot(data => (PairPlots.Scatter(),))
+trans_p = pairplot(transformed_data => (PairPlots.Scatter(),))
+save(density_filepath, p)
+save(transformed_density_filepath, trans_p)
127 changes: 117 additions & 10 deletions examples/EDMF_data/uq_for_edmf.jl
@@ -1,9 +1,10 @@
#include(joinpath(@__DIR__, "..", "ci", "linkfig.jl"))
-PLOT_FLAG = true
+PLOT_FLAG = false

# Import modules
using Distributions # probability distributions and associated functions
using LinearAlgebra
ENV["GKSwstype"] = "100"
using Plots
using Random
using JLD2
@@ -17,6 +18,7 @@ using CalibrateEmulateSample.MarkovChainMonteCarlo
using CalibrateEmulateSample.ParameterDistributions
using CalibrateEmulateSample.DataContainers
using CalibrateEmulateSample.EnsembleKalmanProcesses
using CalibrateEmulateSample.EnsembleKalmanProcesses.Localizers
using CalibrateEmulateSample.Utilities

rng_seed = 42424242
@@ -126,13 +128,55 @@ function main()
end

# load and create prior distributions
#=
prior_filepath = joinpath(exp_dir, "prior.jld2")
if !isfile(prior_filepath)
LoadError("prior file \"prior.jld2\" not found in directory \"" * exp_dir * "/\"")
else
prior_dict_raw = load(prior_filepath) #using JLD2
prior = prior_dict_raw["prior"]
end
=#
# build prior if jld2 does not work
function get_prior_config(s::SS) where {SS <: AbstractString}
config = Dict()
if s == "ent-det-calibration"
config["constraints"] =
Dict("entrainment_factor" => [bounded(0.0, 1.0)], "detrainment_factor" => [bounded(0.0, 1.0)])
config["prior_mean"] = Dict("entrainment_factor" => 0.13, "detrainment_factor" => 0.51)
config["unconstrained_σ"] = 1.0
elseif s == "ent-det-tked-tkee-stab-calibration"
config["constraints"] = Dict(
"entrainment_factor" => [bounded(0.0, 1.0)],
"detrainment_factor" => [bounded(0.0, 1.0)],
"tke_ed_coeff" => [bounded(0.01, 1.0)],
"tke_diss_coeff" => [bounded(0.01, 1.0)],
"static_stab_coeff" => [bounded(0.01, 1.0)],
)
config["prior_mean"] = Dict(
"entrainment_factor" => 0.13,
"detrainment_factor" => 0.51,
"tke_ed_coeff" => 0.14,
"tke_diss_coeff" => 0.22,
"static_stab_coeff" => 0.4,
)
config["unconstrained_σ"] = 1.0
else
throw(ArgumentError("prior for experiment $s not found, please implement in uq_for_edmf.jl"))
end
return config
end
prior_config = get_prior_config(exp_name)
means = prior_config["prior_mean"]
std = prior_config["unconstrained_σ"]
constraints = prior_config["constraints"]


prior = combine_distributions([
ParameterDistribution(
Dict("name" => name, "distribution" => Parameterized(Normal(mean, std)), "constraint" => constraints[name]),
) for (name, mean) in means
])

# Option (ii) load EKP object
# max_ekp_it = 10 # use highest available iteration file
@@ -145,27 +189,90 @@ function main()
# end
# input_output_pairs = Utilities.get_training_points(ekpobj, max_ekp_it)

println("Completed calibration stage")
println("Completed calibration loading stage")
println(" ")
##############################################
# [3. ] Build Emulator from calibration data #
##############################################
println("Begin Emulation stage")
# Create GP object

-gppackage = Emulators.SKLJL()
-pred_type = Emulators.YType()
-gauss_proc = GaussianProcess(
-gppackage;
-kernel = nothing, # use default squared exponential kernel
-prediction_type = pred_type,
-noise_learn = false,
-)
cases = [
"GP", # diagonalize, train scalar GP, assume diag inputs
"RF-scalar", # diagonalize, train scalar RF, don't asume diag inputs
"RF-vector-svd-diag",
"RF-vector-svd-nondiag",
"RF-vector-svd-nonsep",
]
case = cases[5]

overrides = Dict(
"verbose" => true,
"train_fraction" => 0.95,
"scheduler" => DataMisfitController(terminate_at = 100),
"cov_sample_multiplier" => 0.5,
"n_iteration" => 5,
# "n_ensemble" => 20,
# "localization" => SEC(0.1), # localization / sample error correction for small ensembles
)
nugget = 0.01
rng_seed = 99330
rng = Random.MersenneTwister(rng_seed)
input_dim = size(get_inputs(input_output_pairs), 1)
output_dim = size(get_outputs(input_output_pairs), 1)
if case == "GP"

gppackage = Emulators.SKLJL()
pred_type = Emulators.YType()
mlt = GaussianProcess(
gppackage;
kernel = nothing, # use default squared exponential kernel
prediction_type = pred_type,
noise_learn = false,
)
elseif case ["RF-scalar"]
n_features = 100
kernel_structure = SeparableKernel(CholeskyFactor(nugget), OneDimFactor())
mlt = ScalarRandomFeatureInterface(
n_features,
input_dim,
rng = rng,
kernel_structure = kernel_structure,
optimizer_options = overrides,
)
elseif case ["RF-vector-svd-diag", "RF-vector-svd-nondiag"]
# do we want to assume that the outputs are decorrelated in the machine-learning problem?
kernel_structure =
case ["RF-vector-svd-diag"] ? SeparableKernel(LowRankFactor(1, nugget), DiagonalFactor(nugget)) :
SeparableKernel(LowRankFactor(2, nugget), LowRankFactor(2, nugget))
n_features = 500

mlt = VectorRandomFeatureInterface(
n_features,
input_dim,
output_dim,
rng = rng,
kernel_structure = kernel_structure,
optimizer_options = overrides,
)
elseif case ["RF-vector-svd-nonsep"]
kernel_structure = NonseparableKernel(LowRankFactor(3, nugget))
n_features = 500

mlt = VectorRandomFeatureInterface(
n_features,
input_dim,
output_dim,
rng = rng,
kernel_structure = kernel_structure,
optimizer_options = overrides,
)
end

# Fit an emulator to the data
normalized = true

-emulator = Emulator(gauss_proc, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized)
+emulator = Emulator(mlt, input_output_pairs; obs_noise_cov = truth_cov, normalize_inputs = normalized)

# Optimize the GP hyperparameters for better fit
optimize_hyperparameters!(emulator)
3 changes: 2 additions & 1 deletion examples/Emulator/GaussianProcess/plot_GP.jl
@@ -11,6 +11,7 @@ using CalibrateEmulateSample.DataContainers

plot_flag = true
if plot_flag
ENV["GKSwstype"] = "100"
using Plots
gr(size = (1500, 700))
Plots.scalefontsizes(1.3)
@@ -78,7 +79,7 @@ gaussian_process = GaussianProcess(gppackage, noise_learn = false)
# The observables y are related to the parameters x by:
# y = G(x1, x2) + η,
# where G(x1, x2) := [sin(x1) + cos(x2), sin(x1) - cos(x2)], and η ~ N(0, Σ)
-n = 100 # number of training points
+n = 150 # number of training points
p = 2 # input dim
d = 2 # output dim

9 changes: 9 additions & 0 deletions examples/Emulator/L63/Project.toml
@@ -0,0 +1,9 @@
[deps]
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CalibrateEmulateSample = "95e48a1f-0bec-4818-9538-3db4340308e3"
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
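Taken together, the diffs above replace the old GaussianProcess-only path with the new kernel-structure interface. A hedged end-to-end sketch, assembled from calls that appear in this commit; the toy data, the omitted obs_noise_cov keyword, and the PairedDataContainer construction are assumptions for illustration, not part of the diff:

using Random
using CalibrateEmulateSample.Emulators
using CalibrateEmulateSample.DataContainers

rng = Random.MersenneTwister(99330)
nugget = 0.01

# toy 2-input / 1-output training data; columns are data points (assumed layout)
n_train = 50
inputs = rand(rng, 2, n_train)
outputs = reshape(sin.(inputs[1, :]) .+ cos.(inputs[2, :]), 1, n_train)
input_output_pairs = PairedDataContainer(inputs, outputs, data_are_columns = true)

# scalar random-feature emulator built with the new kernel-structure types
kernel_structure = SeparableKernel(CholeskyFactor(nugget), OneDimFactor())
mlt = ScalarRandomFeatureInterface(100, 2, rng = rng, kernel_structure = kernel_structure)

# fit as in uq_for_edmf.jl above
emulator = Emulator(mlt, input_output_pairs, normalize_inputs = true)
optimize_hyperparameters!(emulator)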