Added tracking and output of exploration levels (#33)

* Added level tracking for species and reactions * Added level tracking to docs * Bump to v0.5.12 * Fixed API docs for `SpeciesData`/`RxData`
Kinetica-jl · Oct 3, 2024 · 1e56aab · 1e56aab
1 parent 0b05dee
commit 1e56aab
Show file tree

Hide file tree

Showing 10 changed files with 189 additions and 75 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Kinetica"
 uuid = "3847ce11-53ec-444b-aa85-3b6606472139"
 authors = ["joegilkes <[email protected]>"]
-version = "0.5.11"
+version = "0.5.12"
 
 [deps]
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"

diff --git a/docs/src/api/kinetica/exploration.md b/docs/src/api/kinetica/exploration.md
@@ -6,17 +6,17 @@
 
 ```@docs
 SpeciesData
-Base.push!(::SpeciesData, ::String, ::Dict{String, Any})
-Base.push!(::SpeciesData, ::Vector{String}, ::Vector{Any})
-Base.push!(::SpeciesData, ::String)
+Base.push!(::SpeciesData, ::String, ::Dict{String, Any}, ::Int)
+Base.push!(::SpeciesData, ::Vector{String}, ::Vector{Any}, ::Int)
+Base.push!(::SpeciesData, ::String, ::Int)
 push_unique!
 ```
 
 ### Representing Reactions
 
 ```@docs
 RxData
-Base.push!(::RxData{iType, fType}, ::SpeciesData, ::Vector{Vector{String}}, ::Vector{Vector{String}}, ::Vector{Dict{String, Any}}, ::Vector{Dict{String, Any}}, ::Vector{fType}) where {iType, fType <: AbstractFloat}
+Base.push!(::RxData{iType, fType}, ::SpeciesData, ::Vector{Vector{String}}, ::Vector{Vector{String}}, ::Vector{Dict{String, Any}}, ::Vector{Dict{String, Any}}, ::Vector{fType}, ::Int) where {iType, fType <: AbstractFloat}
 Base.splice!(::RxData, ::Vector{Int})
 ```
 

diff --git a/docs/src/assets/crn11.png b/docs/src/assets/crn11.png
diff --git a/docs/src/development/crn-representation.md b/docs/src/development/crn-representation.md
@@ -18,7 +18,7 @@ At the core of [`SpeciesData`](@ref) is a bidirectional mapping between species
 !!! note "Species Within ODESystems"
     When a CRN gets converted to a ModelingToolkit `ODESystem`, every species needs to have a symbolic representation that can be mapped to its concentration during kinetic simulations. While it may be natural to assume we'd simply use the species' SMILES representations here, these contain special characters that cannot be present in `Symbol`s. To also facilitate simple scaling to any number of species, species are automatically mapped to time-dependent symbolic variables `spec(t)[i]`, where `i` becomes an integer ID within the range `1:SpeciesData.n`. This is one of the core reasons why this SMILES to ID mapping is required.
 
-An instance of [`SpeciesData`](@ref) can be constructed in one of three ways:
+An instance of [`SpeciesData`](@ref) can be constructed in one of four ways:
 
 * From an array of SMILES strings and a corresponding array of geometries (ExtXYZ `frame`s).
 * From a single XYZ file containing one or more species.
@@ -32,7 +32,8 @@ Once created, more species can be added, again from SMILES and their respective
 Aside from the aforementioned `toStr` and `toInt` fields, [`SpeciesData`](@ref) structs can hold a wealth of information:
 
 * `SpeciesData.n` holds the total number of species that have been added.
-* `SpeciesData.xyz` holds a `Vector` of species geometries, indexed by species ID, in ExtXYZ.jl `frame` format. These store the cartesian coordinates of all atoms within a species, but can be used to store additional information related to a species.
+* `SpeciesData.xyz` holds a `Dict` of species geometries, indexed by species ID, in ExtXYZ.jl `frame` format. These store the Cartesian coordinates of all atoms within a species, but can be used to store additional information related to a species.
+* `SpeciesData.level_found` holds a `Dict` mapping species ID to the numerical exploration level in which each species was found. In [`DirectExplore`](@ref)-based simulations, every species defaults to being from level 1. 
 * `SpeciesData.cache` holds an empty `Dict`, which can be used to store temporary information related to a species. For example, caling [`get_species_stats!`] populates the cache with `Dict`s that map species IDs to properties including molecular weight and hard-sphere radius.
 
 ### Use of SMILES
@@ -67,10 +68,11 @@ Instances of [`RxData`](@ref) contain the following fields:
 
 * `RxData.nr` counts the number of reactions currently in the struct.
 * `RxData.mapped_rxns` holds the atom-mapped reaction SMILES representations of each reaction in the struct. These can be useful for generating correctly atom-indexed reactant/product systems (see [Analysing the CRN](@ref) for an example).
-* `id_reacs` and `id_prods` hold `Vector`s of the unique reactant/product IDs in each reaction. For example, if the CRN's [`SpeciesData`](@ref) dictated that species `"CC"` had ID `1` and `"[CH3]"` had ID `2`, and reaction `5` was `CC -> 2 [CH3]` (heterolytic cleavage of ethane), then `id_reacs[5] == ["CC"]` and `id_prods[5] == ["[CH3]"]`.
-* `stoic_reacs` and `stoic_prods` hold `Vector`s of the stoichiometry of each species in the reactions in `id_reacs` and `id_prods` respectively. This is why in the above example, `id_prods[5]` does not contain two `"[CH3]"`s - because `stoic_reacs[5] == [1]` and `stoic_prods[5] == [2]`.
-* `dH` holds each reaction's enthalpy of reaction, ``\Delta H_r``, as calculated by the level of electronic structure theory used within CDE during reaction generation.
-* `rhash` holds a unique reaction hash for every reaction in the struct. This is calculated by taking the `sha256` of the concatenation of the sorted reactant and product SMILES, and is used for checking whether new reactions are duplicates of any others already in the struct.
+* `RxData.id_reacs` and `RxData.id_prods` hold `Vector`s of the unique reactant/product IDs in each reaction. For example, if the CRN's [`SpeciesData`](@ref) dictated that species `"CC"` had ID `1` and `"[CH3]"` had ID `2`, and reaction `5` was `CC -> 2 [CH3]` (heterolytic cleavage of ethane), then `id_reacs[5] == ["CC"]` and `id_prods[5] == ["[CH3]"]`.
+* `RxData.stoic_reacs` and `RxData.stoic_prods` hold `Vector`s of the stoichiometry of each species in the reactions in `RxData.id_reacs` and `RxData.id_prods` respectively. This is why in the above example, `id_prods[5]` does not contain two `"[CH3]"`s - because `stoic_reacs[5] == [1]` and `stoic_prods[5] == [2]`.
+* `RxData.dH` holds each reaction's enthalpy of reaction, ``\Delta H_r``, as calculated by the level of electronic structure theory used within CDE during reaction generation.
+* `RxData.rhash` holds a unique reaction hash for every reaction in the struct. This is calculated by taking the `sha256` of the concatenation of the sorted reactant and product SMILES, and is used for checking whether new reactions are duplicates of any others already in the struct.
+* `RxData.level_found` contains, similarly to the same field in [`SpeciesData`](@ref) structs, the exploration level in which reactions were found. In [`DirectExplore`](@ref)-based simulations, every reaction defaults to being from level 1. 
 
 ### New Reaction Criteria
 

diff --git a/docs/src/tutorials/results-analysis.md b/docs/src/tutorials/results-analysis.md
@@ -36,6 +36,8 @@ nothing # hide
 !!! note "Using format_rxn"
     [`print_rxn`](@ref) is just a wrapper to print the string returned by [`format_rxn`](@ref), which uses the reactants/products and stoichiometries in `rd` and the species SMILES in `sd` to generate a human-readable string describing a reaction. If you need to print more information alongside a reaction, consider something like `println("$(format_rxn(sd, rd, rid)): $(extra_data[rid])")`.
 
+    Both [`print_rxn`](@ref) and [`format_rxn`](@ref) accept a keyword argument `display_level`, which also tags the requested reaction with the exploration level (see [Iterative CRN Exploration](@ref)) in which it was discovered.
+
 The total number of reactions in a CRN is stored in `RxData.nr`, so printing all of the reactions in a CRN is as simple as this loop:
 
 ```@example results_analysis
@@ -113,10 +115,14 @@ savegraph(g, "../assets/tutorials/results_analysis/modified_graph.svg", "svg");
 
 ![](../assets/tutorials/results_analysis/modified_graph.svg)
 
+The exploration level that each species and reaction was found in (see [Iterative CRN Exploration](@ref)) is additionally added as a Graphviz attribute to each respective node. While this is (probably) not useful within the context of Graphviz's layout engines, more advanced graphical interfaces such as [Gephi](https://gephi.org/) can be used to create level-driven layouts:
+
+![](../assets/crn11.png)
+
 !!! note "On Graphviz Installation"
     Graphviz is one of Kinetica's Python dependencies, and as such is always installed at the same time as Kinetica. This is not the case for Catalyst.jl, which either requires a user-installed version of Graphviz, or it uses the [Graphviz_jll](https://github.com/JuliaBinaryWrappers/Graphviz_jll.jl) package. As discussed in the section on the [Graphviz](@ref) dependency, we avoid the JLL due to it missing some key features which Kinetica's CRNs typically need.
 
-    Kinetica always adds its Python dependencies to the end of the current `PATH`, so if you have your own installation that you'd like to use, just make sure it's in your `PATH` before you start Julia!
+    Kinetica always adds its Python dependencies to the end of the current `PATH`, so if you have your own installation of Graphviz that you'd like to use, just make sure it's in your `PATH` before you start Julia!
 
 ### Species Analysis
 

diff --git a/src/analysis/graph.jl b/src/analysis/graph.jl
@@ -26,6 +26,8 @@ properties.
 Additionally allows for plotting with SMILES node labels.
 This is disabled by default, as many of the special
 characters in SMILES cannot currently be rendered properly.
+The exploration level in which species and reactions were
+found is also exported as a node attribute.
 
 The resulting `Catalyst.Graph` can be rendered in a notebook,
 or saved to file using `Catalyst.savegraph`, both of which
@@ -52,12 +54,19 @@ function Catalyst.Graph(sd::SpeciesData, rd::RxData;
     rxs = reactions(rs)
     specs = species(rs)
     spec_names = use_smiles ? [sd.toStr[i] for i in 1:sd.n] : ["S"*subscript(i) for i in 1:sd.n]
-    if remove_inactive_species
-        statenodes = [Catalyst.Node(spec_names[i], sattrs) for i in unique(reduce(vcat, [rd.id_reacs; rd.id_prods]))]
-    else
-        statenodes = [Catalyst.Node(spec_names[i], sattrs) for i in 1:sd.n]
+    statenodes = []
+    states = remove_inactive_species ? unique(reduce(vcat, [rd.id_reacs; rd.id_prods])) : 1:sd.n
+    for i in states
+        this_sattrs = deepcopy(sattrs)
+        this_sattrs[:level] = string(sd.level_found[i])
+        push!(statenodes, Catalyst.Node(spec_names[i], this_sattrs))
+    end
+    transnodes = []
+    for (i, r) in enumerate(rxs)
+        this_rattrs = deepcopy(rattrs)
+        this_rattrs[:level] = string(rd.level_found[i])
+        push!(transnodes, Catalyst.Node("R"*subscript(i), this_rattrs))
     end
-    transnodes = [Catalyst.Node("R"*subscript(i), rattrs) for (i, r) in enumerate(rxs)]
 
     stmts = vcat(statenodes, transnodes)
     edges = map(enumerate(rxs)) do (i, r)

diff --git a/src/analysis/io.jl b/src/analysis/io.jl
@@ -109,7 +109,8 @@ function save_output(out::ODESolveOutput, saveto::String)
         :sd => Dict(
             :toInt => out.sd.toInt,
             :n => out.sd.n,
-            :xyz => out.sd.xyz
+            :xyz => out.sd.xyz,
+            :level_found => out.sd.level_found
         ),
         :rd => Dict(
             :nr => out.rd.nr,
@@ -119,7 +120,8 @@ function save_output(out::ODESolveOutput, saveto::String)
             :stoic_reacs => out.rd.stoic_reacs,
             :stoic_prods => out.rd.stoic_prods,
             :dH => out.rd.dH,
-            :rhash => out.rd.rhash
+            :rhash => out.rd.rhash,
+            :level_found => out.rd.level_found
         ),
         :pars => Dict(
             :tspan => out.pars.tspan,
@@ -176,11 +178,13 @@ function load_output(outfile::String)
 
     sd_iType = typeof(savedict[:sd][:n])
     sd_toStr = Dict{sd_iType, String}(value => key for (key, value) in savedict[:sd][:toInt])
+    sd_levels = get(savedict[:sd], :level_found, Dict{sd_iType, Int}(i => 1 for i in 1:savedict[:sd][:n]))
     sd = SpeciesData(
         savedict[:sd][:toInt],
         sd_toStr,
         savedict[:sd][:n],
         savedict[:sd][:xyz],
+        sd_levels,
         Dict()
     )
     # For some reason ExtXYZ dicts can get a bit type unstable, species arrays need special handling.
@@ -193,14 +197,16 @@ function load_output(outfile::String)
     if length(rd_mapped_rxns) == 0
         @warn "No reaction atom maps found in output."
     end
+    rd_levels = get(savedict[:rd], :level_found, ones(Int, savedict[:rd][:nr]))
     rd = RxData(
         savedict[:rd][:nr], String[rxn for rxn in rd_mapped_rxns],
         Vector{rd_iType}[reac for reac in savedict[:rd][:id_reacs]], 
         Vector{rd_iType}[prod for prod in savedict[:rd][:id_prods]],
         Vector{rd_iType}[sreac for sreac in savedict[:rd][:stoic_reacs]], 
         Vector{rd_iType}[sprod for sprod in savedict[:rd][:stoic_prods]],
         savedict[:rd][:dH], 
-        Vector{UInt8}[hash for hash in savedict[:rd][:rhash]]
+        Vector{UInt8}[hash for hash in savedict[:rd][:rhash]],
+        rd_levels
     )
 
     pars_fields = fieldnames(ODESimulationParams)

diff --git a/src/exploration/explore_utils.jl b/src/exploration/explore_utils.jl
@@ -47,45 +47,47 @@ end
 
 
 """
-    import_mechanism(rdir::String, rcount[, max_molecularity=2])
+    import_mechanism(loc::ExploreLoc, rcount[, max_molecularity=2])
 
 Create a CRN's initial `SpeciesData` and `RxData` from a CDE generated mechanism(s).
 
 Reads in the results of a CDE run at the `rcount` reaction
-directory under `rdir`. Returns a new `SpeciesData` and `RxData`
+directory specified by `loc`. Returns a new `SpeciesData` and `RxData`
 containing the unique species and reactions within these results,
 provided these reactions do not exceed the maximum molecularity
 set by `max_molecularity`, which defaults to only accepting
 unimolecular and bimolecular reactions.
 """
-function import_mechanism(rdir::String, rcount; max_molecularity=2)
+function import_mechanism(loc::ExploreLoc, rcount; max_molecularity=2)
+    rdir = pathof(loc)
     rsmis, rxyzs, rsys, psmis, pxyzs, psys, dHs = ingest_cde_run(rdir, rcount)
     all_smis = vcat(reduce(vcat, rsmis), reduce(vcat, psmis))
     all_xyzs = vcat(reduce(vcat, rxyzs), reduce(vcat, pxyzs))
-    sd = SpeciesData(all_smis, all_xyzs)
-    rd = RxData(sd, rsmis, psmis, rsys, psys, dHs; max_molecularity=max_molecularity)
+    sd = SpeciesData(all_smis, all_xyzs, loc.level)
+    rd = RxData(sd, rsmis, psmis, rsys, psys, dHs, loc.level; max_molecularity=max_molecularity)
     return sd, rd
 end
 
 """
-    import_mechanism!(sd::SpeciesData, rd::RxData, rdir::String, rcount[, max_molecularity=2])
+    import_mechanism!(sd::SpeciesData, rd::RxData, loc::ExploreLoc, rcount[, max_molecularity=2])
 
 Extend a CRN's `SpeciesData` and `RxData` from a CDE generated mechanism(s).
 
 Reads in the results of a CDE run at the `rcount` reaction
-directory under `rdir`. Extends `sd` and `rd` with the unique 
+directory specified by `loc`. Extends `sd` and `rd` with the unique 
 species and reactions within these results, provided these 
 reactions do not exceed the maximum molecularity set by 
 `max_molecularity`, which defaults to only accepting unimolecular
 and bimolecular reactions.
 """
-function import_mechanism!(sd::SpeciesData, rd::RxData, rdir::String, rcount;
+function import_mechanism!(sd::SpeciesData, rd::RxData, loc::ExploreLoc, rcount;
         max_molecularity=2)
+    rdir = pathof(loc)
     rsmis, rxyzs, rsys, psmis, pxyzs, psys, dHs = ingest_cde_run(rdir, rcount)
     all_smis = vcat(reduce(vcat, rsmis), reduce(vcat, psmis))
     all_xyzs = vcat(reduce(vcat, rxyzs), reduce(vcat, pxyzs))
-    push_unique!(sd, all_smis, all_xyzs)
-    push!(rd, sd, rsmis, psmis, rsys, psys, dHs; max_molecularity=max_molecularity)
+    push_unique!(sd, all_smis, all_xyzs, loc.level)
+    push!(rd, sd, rsmis, psmis, rsys, psys, dHs, loc.level; max_molecularity=max_molecularity)
     return
 end
 
@@ -106,7 +108,8 @@ function import_network(rdir_head::String)
     @info "Importing all reactions in level tree under $(rdir_head)"; flush_log()
     level_dirs = readdir(rdir_head)
     level_dirs = level_dirs[startswith.(level_dirs, "level_")]
-    if length(level_dirs) == 0
+    n_levels = length(level_dirs)
+    if length(n_levels) == 0
         error("ERROR: No network levels found in rdir_head.")
     end
 
@@ -124,25 +127,28 @@ function import_network(rdir_head::String)
     # Add inert species, if there are any.
     for spec in inert_species
         xyz = frame_from_smiles(spec)
-        push_unique!(sd, spec, xyz)
+        push_unique!(sd, spec, xyz, 0)
     end
 
     # Loop through each level, adding each subspace.
-    for lv in level_dirs
-        lv_dir = joinpath(rdir_head, lv)
-        ss_dirs = readdir(lv_dir)
+    loc = ExploreLoc(rdir_head, 1, 1)
+    for i in 1:n_levels
+        reset_subspace!(loc)
+        ss_dirs = readdir(pathof(loc; to_level=true))
         ss_dirs = ss_dirs[startswith.(ss_dirs, "subspace_")]
-        for ss in ss_dirs
-            ss_dir = joinpath(lv_dir, ss)
-            rcount = make_rcount(joinpath(ss_dir, "rcount"))
+        n_subspaces = length(ss_dirs)
+        for j in 1:n_subspaces
+            rcount = make_rcount(joinpath(pathof(loc), "rcount"))
             for reac in 1:rcount
-                import_mechanism!(sd, rd, ss_dir, reac)
+                import_mechanism!(sd, rd, loc, reac)
             end
+            inc_subspace!(loc)
         end
+        inc_level!(loc)
     end
 
     @info "Finished network import."
-    @info "Network contains $(sd.n) species over $(rd.nr) reactions.\n"
+    @info "Network contains $(sd.n) species over $(rd.nr) reactions, explored over $(n_levels) levels.\n"
     flush_log()
 
     return sd, rd

diff --git a/src/exploration/methods.jl b/src/exploration/methods.jl
@@ -344,11 +344,11 @@ function explore_subspace!(sd::SpeciesData, rd::RxData, loc::ExploreLoc, explore
         n_reacs_prev = rd.nr
         if exploremethod.cde.parallel_runs > 1
             for rc in rcountrange
-                import_mechanism!(sd, rd, pathof(loc), rc)
+                import_mechanism!(sd, rd, loc, rc)
             end
             rcount += length(rcountrange)-1
         else
-            import_mechanism!(sd, rd, pathof(loc), rcount)
+            import_mechanism!(sd, rd, loc, rcount)
         end
         @info "   - Reaction network now contains $(rd.nr) reactions over $(sd.n) unique fragments."
         flush_log()