Merge pull request #150 from JuliaGNI/linear_symplectic_transformer
Linear symplectic transformer
Showing 54 changed files with 1,348 additions and 160 deletions.

# Linear Symplectic Transformer

The linear symplectic transformer consists of a combination of [linear symplectic attention](@ref "Linear Symplectic Attention") and [gradient](@ref "SympNet Gradient Layer") layers and is visualized below:

```@example
Main.include_graphics("../tikz/linear_symplectic_transformer"; caption = raw"Visualization of the linear symplectic transformer architecture. \texttt{n\_sympnet} refers to the number of SympNet layers (\texttt{n\_sympnet=2} in this figure) and \texttt{L} refers to the number of transformer blocks (\texttt{L=1} in this figure).", width = .3) # hide
```
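
The listing below is a hypothetical usage sketch rather than a definitive recipe: the positional arguments (system dimension and sequence length) and the keyword names `n_sympnet` and `L` mirror the figure caption, but the exact constructor signature should be taken from the docstring in the next section.

```julia
using GeometricMachineLearning

# Hypothetical sketch: argument names and order are assumptions that mirror
# the figure above; consult the docstring below for the actual signature.
arch = LinearSymplecticTransformer(2, 3; n_sympnet = 2, L = 1)  # 2d system, sequence length 3

# Wrapping the architecture in a NeuralNetwork allocates (random) parameters.
nn = NeuralNetwork(arch)
```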

## Library Functions

```@docs; canonical=false
LinearSymplecticTransformer
```

# Neural Network Integrators

In `GeometricMachineLearning` we can divide most neural network architectures (that are used for applications to physical systems) into two categories: autoencoders and integrators. *Integrator* in its most general form refers to an approximation of the flow of an ODE (see [the section on the existence and uniqueness theorem](@ref "The Existence-And-Uniqueness Theorem")) by a numerical scheme. Traditionally these numerical schemes were constructed by defining certain relationships between a known time step ``z^{(t)}`` and a future unknown one ``z^{(t+1)}`` [hairer2006geometric, leimkuhler2004simulating](@cite):

```math
f(z^{(t)}, z^{(t+1)}) = 0.
```

One usually refers to such a relationship as an "integration scheme". If this relationship can be reformulated as

```math
z^{(t+1)} = g(z^{(t)}),
```

then we refer to the scheme as *explicit*; if it cannot be reformulated in such a way, then we refer to it as *implicit*. Implicit schemes are typically more expensive to solve than explicit ones. The `Julia` library `GeometricIntegrators` [Kraus:2020:GeometricIntegrators](@cite) offers a wide variety of integration schemes, both implicit and explicit.
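
As a small illustration of this difference (not part of the library), consider the ODE ``\dot{z} = v(z)``: explicit Euler can be written directly as a map ``g``, while the implicit midpoint rule only defines a relationship ``f(z^{(t)}, z^{(t+1)}) = 0`` that has to be solved for ``z^{(t+1)}``, here with a simple fixed-point iteration.

```julia
v(z) = -z  # toy vector field for the ODE ż = v(z)
h = 0.1    # step size

# Explicit Euler: z^(t+1) = g(z^(t)) can be evaluated directly.
g_explicit(z) = z + h * v(z)

# Implicit midpoint: z^(t+1) = z^(t) + h * v((z^(t) + z^(t+1)) / 2) cannot be
# rearranged into an explicit map; here it is solved with a fixed-point iteration.
function g_implicit(z; iterations = 10)
    z_next = z
    for _ in 1:iterations
        z_next = z + h * v((z + z_next) / 2)
    end
    return z_next
end

z = [1.0, 2.0]
g_explicit(z), g_implicit(z)
```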

The neural network integrators in `GeometricMachineLearning` (the corresponding type is [`NeuralNetworkIntegrator`](@ref)) are all explicit integration schemes where the function ``g`` above is modeled with a neural network.

Neural networks, as an alternative to traditional methods, are employed because of (i) potentially superior performance and (ii) an ability to learn unknown dynamics from data.

## Multi-step methods

*Multi-step method* [feng1987symplectic, ge1988approximation](@cite) refers to schemes of the form[^1]:

[^1]: We again assume that all the steps up to and including ``t`` are known.

```math
f(z^{(t - \mathtt{sl} + 1)}, z^{(t - \mathtt{sl} + 2)}, \ldots, z^{(t)}, z^{(t + 1)}, \ldots, z^{(t + \mathtt{pw})}) = 0,
```
where `sl` is short for *sequence length* and `pw` is short for *prediction window*. In contrast to traditional single-step methods, `sl` and `pw` can be greater than 1. An explicit multi-step method has the following form:

```math
[z^{(t+1)}, \ldots, z^{(t+\mathtt{pw})}] = g(z^{(t - \mathtt{sl} + 1)}, \ldots, z^{(t)}).
```
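
To make the roles of `sl` and `pw` concrete, the following package-independent sketch rolls out such an explicit multi-step map: `g` consumes the last `sl` known steps and returns the next `pw` steps; here `g` is a trivial stand-in rather than a trained network.

```julia
# Generic rollout of an explicit multi-step method: `g` maps the last `sl`
# steps to the next `pw` steps. This is a sketch, not the library's API.
function rollout(g, initial_steps::Vector, sl::Int, pw::Int, n_iterations::Int)
    @assert length(initial_steps) == sl
    trajectory = copy(initial_steps)              # the first sl steps have to be known
    for _ in 1:n_iterations
        window = trajectory[(end - sl + 1):end]   # the last sl known steps
        append!(trajectory, g(window))            # g returns pw new steps
    end
    return trajectory
end

# Stand-in for a trained network with sl = 3 and pw = 2: it simply repeats the last step.
g_dummy(window) = fill(window[end], 2)

trajectory = rollout(g_dummy, [0.0, 0.5, 1.0], 3, 2, 4)   # 3 + 4 * 2 = 11 entries
```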

There are essentially two ways to construct multi-step methods with neural networks: the older one uses recurrent neural networks such as long short-term memory cells (LSTMs, [hochreiter1997long](@cite)) and the newer one uses transformer neural networks [vaswani2017attention](@cite). Both of these approaches have been successfully employed to learn multi-step methods (see [fresca2021comprehensive, lee2020model](@cite) for the former and [hemmasian2023reduced, solera2023beta, brantner2024volume](@cite) for the latter), but because the transformer architecture exhibits superior performance on modern hardware and can be imbued with geometric properties, it is recommended to always use a transformer-derived architecture when dealing with time series[^2].

[^2]: `GeometricMachineLearning` also has an LSTM implementation, but this may be deprecated in the future.

Explicit multi-step methods derived from the transformer are always subtypes of the type [`TransformerIntegrator`](@ref) in `GeometricMachineLearning`. The [standard transformer](@ref "Standard Transformer"), the [volume-preserving transformer](@ref "Volume-Preserving Transformer") and the [linear symplectic transformer](@ref "Linear Symplectic Transformer") are implemented.

## Library Functions

```@docs; canonical=false
NeuralNetworkIntegrator
TransformerIntegrator
```

## References

```@bibliography
Pages = []
Canonical = false
hairer2006geometric
leimkuhler2004simulating
Kraus:2020:GeometricIntegrators
feng1987symplectic
ge1988approximation
feng1998step
hochreiter1997long
vaswani2017attention
fresca2021comprehensive
lee2020model
hemmasian2023reduced
solera2023beta
brantner2024volume
```

# Standard Transformer

The transformer is a relatively modern neural network architecture [vaswani2017attention](@cite) that has come to dominate the field of natural language processing (NLP, [patwardhan2023transformers](@cite)) and has replaced the previously dominant long short-term memory cells (LSTMs, [hochreiter1997long](@cite)). Its success is due to a variety of factors:
- unlike LSTMs it consists of very simple building blocks and hence is easier to interpret mathematically,
- it is very flexible in its application and the data it is fed with do not have to conform to a rigid pattern,
- transformers utilize modern hardware (especially GPUs) very effectively.

The transformer architecture is sketched below:

```@example
Main.include_graphics("../tikz/transformer_encoder") # hide
```

It is nothing more than a combination of a [multihead attention layer](@ref "Multihead Attention") and a residual neural network[^1] (ResNet); a minimal sketch of such a block is given below.

[^1]: A ResNet is nothing more than a neural network to whose output we again add the input, i.e. every ResNet is of the form ``\mathrm{ResNet}(x) = x + \mathcal{NN}(x)``.
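
The sketch below (plain Julia, not the `GeometricMachineLearning` implementation) shows this structure for a single attention head: a self-attention step and a feedforward step, each wrapped in a residual connection ``x \mapsto x + \mathcal{NN}(x)``.

```julia
using LinearAlgebra

# Column-wise softmax used for the attention weights.
function softmax_cols(A)
    E = exp.(A .- maximum(A; dims = 1))
    return E ./ sum(E; dims = 1)
end

# One single-head transformer block (a sketch, not the library implementation):
# attention and feedforward are both applied as residual (ResNet-style) updates.
struct TransformerBlock{T}
    Wq::Matrix{T}  # query projection
    Wk::Matrix{T}  # key projection
    Wv::Matrix{T}  # value projection
    W::Matrix{T}   # feedforward weight
    b::Vector{T}   # feedforward bias
end

TransformerBlock(dim::Int) =
    TransformerBlock(randn(dim, dim), randn(dim, dim), randn(dim, dim), randn(dim, dim), randn(dim))

function (block::TransformerBlock)(Z::AbstractMatrix)
    # Z has size (dim, seq_length); every column is one time step.
    Q, K, V = block.Wq * Z, block.Wk * Z, block.Wv * Z
    A = softmax_cols(K' * Q ./ sqrt(size(Z, 1)))  # attention weights, size (seq_length, seq_length)
    Z = Z + V * A                                 # residual attention update
    return Z + tanh.(block.W * Z .+ block.b)      # residual feedforward update
end

Z = randn(4, 3)  # three time steps of a four-dimensional system
TransformerBlock(4)(Z)
```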

## Library Functions

```@docs; canonical=false
StandardTransformerIntegrator
```

# Volume-Preserving Feedforward Neural Network

## Neural network architecture

The constructor produces the following architecture[^1]:

[^1]: Based on the input arguments `n_linear` and `n_blocks`. In this example `init_upper` is set to false, which means that the first layer is of type *lower*, followed by a layer of type *upper*.

```@example
Main.include_graphics("../tikz/vp_feedforward") # hide
```

Here *LinearLowerLayer* performs ``x \mapsto x + Lx`` and *NonLinearLowerLayer* performs ``x \mapsto x + \sigma(Lx + b)``. The activation function ``\sigma`` is the fourth input argument to the constructor and `tanh` by default.
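
The following is a minimal sketch of these two maps (not the library code), under the assumption that ``L`` is strictly lower-triangular, which is what makes both maps volume-preserving.

```julia
using LinearAlgebra

# Sketch (not the library implementation) of the two layer types above,
# assuming L is strictly lower-triangular. The Jacobians I + L and
# I + Diagonal(σ'.(Lx + b)) * L are then lower-triangular with unit diagonal,
# so both maps have determinant one and preserve volume.
strictly_lower(A) = tril(A, -1)

linear_lower(x, L) = x + strictly_lower(L) * x                           # x ↦ x + Lx
nonlinear_lower(x, L, b; σ = tanh) = x + σ.(strictly_lower(L) * x .+ b)  # x ↦ x + σ(Lx + b)

dim = 4
x, L, b = randn(dim), randn(dim, dim), randn(dim)
y = nonlinear_lower(linear_lower(x, L), L, b)
```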

## Note on SympNets

As [SympNets](@ref "SympNet Architecture") are symplectic maps, they also conserve phase space volume and therefore form a subcategory of volume-preserving feedforward layers.

## Library Functions

```@docs; canonical=false
VolumePreservingFeedForward
```