From 0b56db08e51d751a58307a9d3764fb85cc11d024 Mon Sep 17 00:00:00 2001
From: Renato Lui Geh <renatogeh@gmail.com>
Date: Thu, 24 Jun 2021 09:16:22 -0300
Subject: [PATCH] Add Ensemble I/O

---
 Project.toml                               |  1 +
 src/LoadSave/circuit_loaders.jl            | 44 +++++++++++++++++++++-
 src/LoadSave/circuit_savers.jl             | 29 +++++++++++++-
 src/ensembles/bmc.jl                       |  3 +-
 src/ensembles/ensembles.jl                 |  4 +-
 src/structurelearner/sample_psdd.jl        |  7 ++--
 test/ensembles/bmc_tests.jl                | 17 +++++----
 test/ensembles/ensembles_tests.jl          | 36 +++++++++++++-----
 test/structurelearner/sample_psdd_tests.jl | 17 +++++----
 9 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/Project.toml b/Project.toml
index 7ec5c207..db3b73b4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,6 +28,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 ThreadPools = "b189fb0b-2eb5-4ed4-bc0c-d34c51242431"
 TikzGraphs = "b4f28e30-c73f-5eaf-a395-8a9db949a742"
+ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 
 [compat]
 CUDA = "2, 3.0"
diff --git a/src/LoadSave/circuit_loaders.jl b/src/LoadSave/circuit_loaders.jl
index 84c58b28..f68e49f7 100644
--- a/src/LoadSave/circuit_loaders.jl
+++ b/src/LoadSave/circuit_loaders.jl
@@ -1,5 +1,5 @@
 export zoo_clt, zoo_clt_file, zoo_psdd, zoo_lc, load_prob_circuit, 
-load_struct_prob_circuit, load_logistic_circuit
+load_struct_prob_circuit, load_logistic_circuit, load_as_ensemble
 
 using LogicCircuits
 using Pkg.Artifacts
@@ -96,3 +96,54 @@ function parse_clt(filename::String)::MetaDiGraph
     return clt
 end
 
+"""
+    load_as_ensemble(name::String; quiet::Bool = false)
+
+Load an ensemble of structured probabilistic circuits from the `.esbl` archive `name`.
+
+The archive is read as a zip file. It must hold a `.meta` entry whose first line is the
+number of circuits `n` and whose second line lists `n` whitespace-separated weights, plus
+one `k.psdd` and one `k.vtree` entry for every `k ∈ 1:n`. Progress is printed to standard
+output unless `quiet` is `true`. The archive is closed even when loading fails.
+"""
+function load_as_ensemble(name::String; quiet::Bool = false)::Ensemble{StructProbCircuit}
+    @assert endswith(name, ".esbl")
+    archive = ZipFile.Reader(name)
+    try
+        W, n = Vector{Float64}(), -1
+        for f ∈ archive.files
+            if endswith(f.name, ".meta")
+                n = parse(Int, readline(f))
+                W = Float64[parse(Float64, x) for x ∈ split(readline(f))]
+            end
+        end
+        @assert n > 0 && length(W) == n "Ensemble file format corrupted, empty or missing meta file."
+        # P[k] holds the positions in `archive.files` of the k-th circuit's (.psdd, .vtree)
+        # entries; 0 marks an entry that has not been found.
+        P = Tuple{Int, Int}[(0, 0) for _ ∈ 1:n]
+        for (i, f) ∈ enumerate(archive.files)
+            if endswith(f.name, ".psdd")
+                j = parse(Int, f.name[1:end-5])
+                @assert j > 0 && j <= n "Either .meta file is corrupted or .psdd is misnamed (faulty: $(f.name))."
+                P[j] = (i, P[j][2])
+            elseif endswith(f.name, ".vtree")
+                j = parse(Int, f.name[1:end-6])
+                @assert j > 0 && j <= n "Either .meta file is corrupted or .vtree is misnamed (faulty: $(f.name))."
+                P[j] = (P[j][1], i)
+            end
+        end
+        C = Vector{StructProbCircuit}(undef, n)
+        !quiet && print("Loading circuits...\n  ")
+        for (k, (i, j)) ∈ enumerate(P)
+            @assert i > 0 "Missing .psdd file for the $k-th circuit."
+            @assert j > 0 "Missing .vtree file for the $k-th circuit."
+            psdd, _ = load_struct_prob_circuit(archive.files[i], archive.files[j])
+            C[k] = psdd
+            !quiet && print('*')
+        end
+        !quiet && print('\n')
+        return Ensemble{StructProbCircuit}(C, W)
+    finally
+        close(archive)
+    end
+end
diff --git a/src/LoadSave/circuit_savers.jl b/src/LoadSave/circuit_savers.jl
index ccfb8545..ba36dfc3 100644
--- a/src/LoadSave/circuit_savers.jl
+++ b/src/LoadSave/circuit_savers.jl
@@ -1,5 +1,6 @@
-export save_circuit, save_as_dot, save_as_psdd, save_as_logistic
+export save_circuit, save_as_dot, save_as_psdd, save_as_logistic, save_as_ensemble
 
+using ZipFile
 using LogicCircuits.LoadSave: SDDElement, 
     PSDDElement, 
     save_lines,
@@ -195,3 +196,35 @@ function save_as_dot(file::String, circuit::ProbCircuit)
     flush(f)
     close(f)
 end
+
+"""
+    save_as_ensemble(name::String, ensemble::Ensemble{StructProbCircuit}; quiet::Bool = false)
+
+Save `ensemble` to the `.esbl` archive `name`.
+
+The archive is written as a zip file holding an `ensemble.meta` entry (first line: the
+number of circuits; second line: the space-separated weights `ensemble.W`) followed by an
+`i.psdd` and an `i.vtree` entry for the `i`-th circuit of `ensemble.C`. Progress is
+printed to standard output unless `quiet` is `true`. The archive is closed even when
+saving fails. Returns `nothing`.
+"""
+function save_as_ensemble(name::String, ensemble::Ensemble{StructProbCircuit}; quiet::Bool = false)
+    @assert endswith(name, ".esbl")
+    archive = ZipFile.Writer(name)
+    try
+        meta = ZipFile.addfile(archive, "ensemble.meta")
+        write(meta, "$(length(ensemble.C))\n")
+        write(meta, join(ensemble.W, ' '))
+        close(meta)
+        !quiet && print("Saving circuits...\n  ")
+        for (i, circuit) ∈ enumerate(ensemble.C)
+            save_as_psdd(ZipFile.addfile(archive, "$(i).psdd"), circuit, circuit.vtree)
+            save_vtree(ZipFile.addfile(archive, "$(i).vtree"), circuit.vtree)
+            !quiet && print('*')
+        end
+        !quiet && print('\n')
+    finally
+        close(archive)
+    end
+    return nothing
+end
diff --git a/src/ensembles/bmc.jl b/src/ensembles/bmc.jl
index 02f815b5..1881b5fd 100644
--- a/src/ensembles/bmc.jl
+++ b/src/ensembles/bmc.jl
@@ -1,3 +1,5 @@
+export BayesModelComb, bmc_sample_psdd
+
 using Distributions: Dirichlet
 
 "Bayesian Model Combination."
@@ -46,7 +48,6 @@ function bmc_sample_psdd(n::Integer, ϕ::Diagram, k::Integer, D::DataFrame, q::I
     LL .= LL ./ sum(LL)
     return BayesModelComb(E, log.(LL))
 end
-export bmc_sample_psdd
 
 function weighted_query(B::BayesModelComb{T}, D::DataFrame, f::Function; kwargs...)::Vector{Float64} where T <: ProbCircuit
     n, m = nrow(D), length(B.E)
diff --git a/src/ensembles/ensembles.jl b/src/ensembles/ensembles.jl
index 4d3e14d8..5ffe446c 100644
--- a/src/ensembles/ensembles.jl
+++ b/src/ensembles/ensembles.jl
@@ -1,3 +1,5 @@
+export Ensemble, ensemble_sample_psdd, sample_vtree
+
 using ThreadPools
 
 """Split `X` into two partitions `A` and `B`, where `A` is a Bernoulli sample of each element in
@@ -25,7 +27,6 @@ function sample_vtree(n::Int, p::Float64)::Vtree
     end
     return p < 0 ? Vtree(n, :random) : passdown(shuffle!(collect(1:n)))
 end
-export sample_vtree
 
 "Weighted ensemble of probabilistic circuits."
 mutable struct Ensemble{T <: ProbCircuit}
@@ -70,7 +71,6 @@ function ensemble_sample_psdd(n::Integer, ϕ::Diagram, k::Int, D::DataFrame; vtr
     @assert strategy == :uniform "Unrecognized ensemble strategy."
     return E
 end
-export ensemble_sample_psdd
 
 "Learns the weights of the Ensemble by the likelihood value of data `D`."
 function learn_ensemble_llw!(E::Ensemble{T}, D::DataFrame)::Ensemble{T} where T <: ProbCircuit
diff --git a/src/structurelearner/sample_psdd.jl b/src/structurelearner/sample_psdd.jl
index 09c0c71c..6ce025b8 100644
--- a/src/structurelearner/sample_psdd.jl
+++ b/src/structurelearner/sample_psdd.jl
@@ -1,6 +1,8 @@
+export sample_psdd
+
 using StatsFuns
-using BinaryDecisionDiagrams: Diagram, BinaryDecisionDiagrams
-const BDD = BinaryDecisionDiagrams
+using BinaryDecisionDiagrams: Diagram
+import BinaryDecisionDiagrams as BDD
 
 "Samples an element from a Binomial distribution with p=0.5."
 function sample_row(n::Int)::Int
@@ -134,7 +136,6 @@ Samples a PSDD from a BDD `ϕ` and vtree `V` with at most `k` elements in each d
     !randomize_weights && estimate_parameters(C, D; pseudocount)
     return C
 end
-export sample_psdd
 
 function sample_psdd_r(ϕ::Diagram, V::Vtree, k::Integer, leaves::Dict{Int32, StructProbLiteralNode},
         randomize_weights::Bool, opts::SamplingOpts, fact_on_⊤::Bool, ⊤_k::Integer, p_mr::Real,
diff --git a/test/ensembles/bmc_tests.jl b/test/ensembles/bmc_tests.jl
index e4cd8e7d..7b70049b 100644
--- a/test/ensembles/bmc_tests.jl
+++ b/test/ensembles/bmc_tests.jl
@@ -1,13 +1,14 @@
 using Test
 using ProbabilisticCircuits
 using DataFrames
-using BinaryDecisionDiagrams
+using BinaryDecisionDiagrams: Diagram
+import BinaryDecisionDiagrams as BDD
 
 @testset "BMC tests with SamplePSDD" begin
     # Set up a logic constraint ϕ as a BDD and scope size n.
     function case(ϕ::Diagram, n::Integer; atol::Real = 0)
         # All possible valuations (including impossible ones).
-        M = all_valuations(collect(1:n))
+        M = BDD.all_valuations(collect(1:n))
         # Get only possible worlds.
         W = M[findall(ϕ.(eachrow(M))),:]
         # Assign random probabilities for each world in W.
@@ -24,10 +25,10 @@ using BinaryDecisionDiagrams
         @test isapprox(evi[findall(>(0), evi)], (R/sum(R)); atol)
     end
 
-    case((1 ∧ 2) ∨ (3 ∧ ¬4) ∨ (¬1 ∧ 5), 5)
-    case((1 → 3) ∧ (5 → ¬2), 5)
-    case(and(1, 2, 3) ∨ and(4, 5), 5)
-    case(exactly(3, collect(1:5)), 5)
-    case(atleast(3, collect(1:5)), 5)
-    case(atmost(3, collect(1:5)), 5)
+    case(BDD.or(BDD.and(1, 2), BDD.and(3, BDD.:¬(4)), BDD.and(BDD.:¬(1), 5)), 5)
+    case(BDD.and(BDD.:→(1, 3), BDD.:→(5, BDD.:¬(2))), 5)
+    case(BDD.or(BDD.and(1, 2, 3), BDD.and(4, 5)), 5)
+    case(BDD.exactly(3, collect(1:5)), 5)
+    case(BDD.atleast(3, collect(1:5)), 5)
+    case(BDD.atmost(3, collect(1:5)), 5)
 end
diff --git a/test/ensembles/ensembles_tests.jl b/test/ensembles/ensembles_tests.jl
index 3f7e9730..cbbf5e7b 100644
--- a/test/ensembles/ensembles_tests.jl
+++ b/test/ensembles/ensembles_tests.jl
@@ -1,13 +1,14 @@
 using Test
 using ProbabilisticCircuits
 using DataFrames
-using BinaryDecisionDiagrams
+using BinaryDecisionDiagrams: Diagram
+import BinaryDecisionDiagrams as BDD
 
 @testset "ensemble tests with SamplePSDD" begin
     # Set up a logic constraint ϕ as a BDD and scope size n. Sample m PSDDs.
-    function case(ϕ::Diagram, n::Integer, strategy::Symbol; m::Integer = 20, atol::Real = 1e-2)
+    function case(ϕ::Diagram, n::Integer, strategy::Symbol; m::Integer = 20, atol::Real = 1e-2)::Ensemble{StructProbCircuit}
         # All possible valuations (including impossible ones).
-        M = all_valuations(collect(1:n))
+        M = BDD.all_valuations(collect(1:n))
         # Get only possible worlds.
         W = M[findall(ϕ.(eachrow(M))),:]
         # Assign random probabilities for each world in W.
@@ -23,14 +24,31 @@ using BinaryDecisionDiagrams
         # Test probabilities.
         evi = exp.(EVI(E, T))
         @test isapprox(evi[findall(>(0), evi)], (R/sum(R)); atol)
+        return E
     end
 
+    Es = Vector{Ensemble{StructProbCircuit}}()
     for strategy ∈ [:likelihood, :uniform, :em, :stacking]
-        case((1 ∧ 2) ∨ (3 ∧ ¬4) ∨ (¬1 ∧ 5), 5, strategy)
-        case((1 → 3) ∧ (5 → ¬2), 5, strategy)
-        case(and(1, 2, 3) ∨ and(4, 5), 5, strategy)
-        case(exactly(3, collect(1:5)), 5, strategy)
-        case(atleast(3, collect(1:5)), 5, strategy)
-        case(atmost(3, collect(1:5)), 5, strategy)
+        push!(Es, case(BDD.or(BDD.and(1, 2), BDD.and(3, BDD.:¬(4)), BDD.and(BDD.:¬(1), 5)), 5, strategy))
+        push!(Es, case(BDD.and(BDD.:→(1, 3), BDD.:→(5, BDD.:¬(2))), 5, strategy))
+        push!(Es, case(BDD.or(BDD.and(1, 2, 3), BDD.and(4, 5)), 5, strategy))
+        push!(Es, case(BDD.exactly(3, collect(1:5)), 5, strategy))
+        push!(Es, case(BDD.atleast(3, collect(1:5)), 5, strategy))
+        push!(Es, case(BDD.atmost(3, collect(1:5)), 5, strategy))
+    end
+
+    tmp = mktempdir()
+    @testset "Saving and loading ensembles" begin
+        for (i, E) ∈ enumerate(Es)
+            @test_nowarn save_as_ensemble("$tmp/$i.esbl", E; quiet = true)
+        end
+    end
+    Rs = Vector{Ensemble{StructProbCircuit}}()
+    T = DataFrame(BDD.all_valuations(1:5))
+    @testset "Loading ensembles" begin
+        for i ∈ 1:length(Es)
+            E = load_as_ensemble("$tmp/$i.esbl"; quiet = true)
+            @test EVI(E, T) ≈ EVI(Es[i], T)
+        end
     end
 end
diff --git a/test/structurelearner/sample_psdd_tests.jl b/test/structurelearner/sample_psdd_tests.jl
index e01abf6e..0e286aee 100644
--- a/test/structurelearner/sample_psdd_tests.jl
+++ b/test/structurelearner/sample_psdd_tests.jl
@@ -1,14 +1,15 @@
 using Test
 using ProbabilisticCircuits
 using DataFrames
-using BinaryDecisionDiagrams
+using BinaryDecisionDiagrams: Diagram
+import BinaryDecisionDiagrams as BDD
 import LogicCircuits: Vtree
 
 @testset "SamplePSDD tests" begin
     # Set up a logic constraint ϕ as a BDD and scope size n. Sample m PSDDs.
     function case(ϕ::Diagram, n::Integer; m::Integer = 20, atol::Real = 0)
         # All possible valuations (including impossible ones).
-        M = all_valuations(collect(1:n))
+        M = BDD.all_valuations(collect(1:n))
         # Get only possible worlds.
         W = M[findall(ϕ.(eachrow(M))),:]
         # Assign random probabilities for each world in W.
@@ -30,10 +31,10 @@ import LogicCircuits: Vtree
         end
     end
 
-    case((1 ∧ 2) ∨ (3 ∧ ¬4) ∨ (¬1 ∧ 5), 5)
-    case((1 → 3) ∧ (5 → ¬2), 5)
-    case(and(1, 2, 3) ∨ and(4, 5), 5)
-    case(exactly(3, collect(1:5)), 5)
-    case(atleast(3, collect(1:5)), 5)
-    case(atmost(3, collect(1:5)), 5)
+    case(BDD.or(BDD.and(1, 2), BDD.and(3, BDD.:¬(4)), BDD.and(BDD.:¬(1), 5)), 5)
+    case(BDD.and(BDD.:→(1, 3), BDD.:→(5, BDD.:¬(2))), 5)
+    case(BDD.or(BDD.and(1, 2, 3), BDD.and(4, 5)), 5)
+    case(BDD.exactly(3, collect(1:5)), 5)
+    case(BDD.atleast(3, collect(1:5)), 5)
+    case(BDD.atmost(3, collect(1:5)), 5)
 end