From 911ae4ceeb5c2fe9fd7389b4e7dbf9342bffb6db Mon Sep 17 00:00:00 2001 From: "Documenter.jl" Date: Tue, 11 Jun 2024 13:34:16 +0000 Subject: [PATCH] build based on 87e1f5d --- latest/.documenter-siteinfo.json | 2 +- latest/GeometricMachineLearning.bib | 9 + latest/Optimizer/index.html | 2 +- latest/architectures/autoencoders/index.html | 4 +- .../linear_symplectic_transformer/index.html | 2 +- .../neural_network_integrators/index.html | 2 +- .../symplectic_autoencoder/index.html | 2 +- latest/architectures/sympnet/index.html | 6 +- latest/architectures/transformer/index.html | 2 +- .../volume_preserving_feedforward/index.html | 2 +- .../volume_preserving_transformer/index.html | 2 +- .../arrays/global_tangent_spaces/index.html | 177 ++ .../grassmann_lie_alg_hor_matrix/index.html | 12 - .../arrays/skew_symmetric_matrix/index.html | 160 +- .../stiefel_lie_alg_horizontal/index.html | 8 - latest/data_loader/TODO/index.html | 2 +- latest/data_loader/data_loader/index.html | 16 +- latest/data_loader/snapshot_matrix/index.html | 4 +- latest/index.html | 2 +- latest/layers/attention_layer/index.html | 6 +- .../linear_symplectic_attention/index.html | 2 +- .../multihead_attention_layer/index.html | 2 +- latest/layers/sympnet_gradient/index.html | 4 +- .../volume_preserving_feedforward/index.html | 4 +- latest/library/index.html | 289 ++- latest/manifolds/basic_topology/index.html | 2 +- .../index.html | 4 +- .../manifolds/homogeneous_spaces/index.html | 76 +- .../inverse_function_theorem/index.html | 4 +- latest/manifolds/manifolds/index.html | 4 +- .../metric_and_vector_spaces/index.html | 4 +- .../manifolds/riemannian_manifolds/index.html | 6 +- latest/objects.inv | Bin 7255 -> 7964 bytes latest/optimizers/adam_optimizer/index.html | 2 +- latest/optimizers/bfgs_optimizer/index.html | 6 +- .../general_optimization/index.html | 2 +- .../manifold_related/cayley/index.html | 4 +- .../manifold_related/geodesic/index.html | 2 +- .../global_sections/index.html | 4 +- .../horizontal_lift/index.html | 2 +- .../manifold_related/retractions/index.html | 2 +- .../computation_of_pullbacks/index.html | 4 +- .../autoencoder/index.html | 2 +- .../kolmogorov_n_width/index.html | 2 +- .../projection_reduction_errors/index.html | 4 +- .../symplectic_autoencoder/index.html | 4 +- latest/references/index.html | 2 +- latest/search_index.js | 2 +- .../{feb29668.svg => 1790e027.svg} | 1953 ++++++++--------- .../{6f2cf159.svg => a08bbe8e.svg} | 60 +- .../{a35e350a.svg => fed7261b.svg} | 1896 ++++++++-------- latest/tutorials/grassmann_layer/index.html | 8 +- .../{59dd74ca.svg => 8b9246ed.svg} | 80 +- .../{a58e3d30.svg => 9fb33f4f.svg} | 64 +- .../linear_symplectic_transformer/index.html | 6 +- latest/tutorials/mnist_tutorial/index.html | 6 +- .../{a7fb2654.svg => 25132818.svg} | 72 +- .../{34c1e399.svg => e80b6398.svg} | 68 +- .../symplectic_autoencoder/index.html | 6 +- .../{8e47bd72.svg => 362d7549.svg} | 56 +- .../{105b2366.svg => bd088bbf.svg} | 72 +- latest/tutorials/sympnet_tutorial/index.html | 726 +++--- .../{338983b3.svg => 0e515873.svg} | 80 +- .../{b5294845.svg => 21efc475.svg} | 68 +- .../{5d3be767.svg => 23654bc1.svg} | 88 +- .../{c1333eea.svg => 244e9e6f.svg} | 80 +- .../{8bc8bd61.svg => 6cc831d5.svg} | 76 +- .../{ccbb9215.svg => 76f565ff.svg} | 60 +- .../{b1e4a67b.svg => 92210c6c.svg} | 64 +- .../{30157b05.svg => b94db844.svg} | 84 +- .../volume_preserving_attention/index.html | 14 +- 71 files changed, 3519 insertions(+), 3035 deletions(-) create mode 100644 latest/arrays/global_tangent_spaces/index.html 
delete mode 100644 latest/arrays/grassmann_lie_alg_hor_matrix/index.html delete mode 100644 latest/arrays/stiefel_lie_alg_horizontal/index.html rename latest/tutorials/grassmann_layer/{feb29668.svg => 1790e027.svg} (65%) rename latest/tutorials/grassmann_layer/{6f2cf159.svg => a08bbe8e.svg} (70%) rename latest/tutorials/grassmann_layer/{a35e350a.svg => fed7261b.svg} (65%) rename latest/tutorials/linear_symplectic_transformer/{59dd74ca.svg => 8b9246ed.svg} (85%) rename latest/tutorials/linear_symplectic_transformer/{a58e3d30.svg => 9fb33f4f.svg} (95%) rename latest/tutorials/symplectic_autoencoder/{a7fb2654.svg => 25132818.svg} (87%) rename latest/tutorials/symplectic_autoencoder/{34c1e399.svg => e80b6398.svg} (85%) rename latest/tutorials/sympnet_tutorial/{8e47bd72.svg => 362d7549.svg} (88%) rename latest/tutorials/sympnet_tutorial/{105b2366.svg => bd088bbf.svg} (86%) rename latest/tutorials/volume_preserving_attention/{338983b3.svg => 0e515873.svg} (85%) rename latest/tutorials/volume_preserving_attention/{b5294845.svg => 21efc475.svg} (92%) rename latest/tutorials/volume_preserving_attention/{5d3be767.svg => 23654bc1.svg} (90%) rename latest/tutorials/volume_preserving_attention/{c1333eea.svg => 244e9e6f.svg} (85%) rename latest/tutorials/volume_preserving_attention/{8bc8bd61.svg => 6cc831d5.svg} (88%) rename latest/tutorials/volume_preserving_attention/{ccbb9215.svg => 76f565ff.svg} (94%) rename latest/tutorials/volume_preserving_attention/{b1e4a67b.svg => 92210c6c.svg} (94%) rename latest/tutorials/volume_preserving_attention/{30157b05.svg => b94db844.svg} (85%) diff --git a/latest/.documenter-siteinfo.json b/latest/.documenter-siteinfo.json index ddd4769d8..ee976098f 100644 --- a/latest/.documenter-siteinfo.json +++ b/latest/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.3","generation_timestamp":"2024-06-03T15:15:52","documenter_version":"1.4.1"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-06-11T13:34:10","documenter_version":"1.4.1"}} \ No newline at end of file diff --git a/latest/GeometricMachineLearning.bib b/latest/GeometricMachineLearning.bib index 4ebde5c1d..b50dac1e3 100644 --- a/latest/GeometricMachineLearning.bib +++ b/latest/GeometricMachineLearning.bib @@ -350,6 +350,15 @@ @inproceedings{feng1987symplectic organization={Springer} } +@book{holm2009geometric, + title={Geometric mechanics and symmetry: from finite to infinite dimensions}, + author={Holm, Darryl D and Schmah, Tanya and Stoica, Cristina}, + volume={12}, + year={2009}, + publisher={Oxford University Press}, + address={Oxford, UK} +} + @article{ge1988approximation, title={On the approximation of linear Hamiltonian systems}, author={Ge, Zhong and Feng, Kang}, diff --git a/latest/Optimizer/index.html b/latest/Optimizer/index.html index c8bcc4940..e0848e901 100644 --- a/latest/Optimizer/index.html +++ b/latest/Optimizer/index.html @@ -1,2 +1,2 @@ -Optimizers · GeometricMachineLearning.jl

Optimizer

In order to generalize neural network optimizers to homogeneous spaces, a class of manifolds we often encounter in machine learning, we have to find a global tangent space representation which we call $\mathfrak{g}^\mathrm{hor}$ here.

Starting from an element of the tangent space $T_Y\mathcal{M}$[1], we need to perform two mappings to arrive at $\mathfrak{g}^\mathrm{hor}$, which we refer to by $\Omega$ and a red horizontal arrow:

Here the mapping $\Omega$ is a horizontal lift from the tangent space onto the horizontal component of the Lie algebra at $Y$.

The red line maps the horizontal component at $Y$, i.e. $\mathfrak{g}^{\mathrm{hor},Y}$, to the horizontal component at $\mathfrak{g}^\mathrm{hor}$.

The $\mathrm{cache}$ stores information about previous optimization steps and is dependent on the optimizer. The elements of the $\mathrm{cache}$ are also in $\mathfrak{g}^\mathrm{hor}$. Based on this, the optimizer (Adam in this case) computes a final velocity, which is the input of a retraction. Because this update is done for $\mathfrak{g}^{\mathrm{hor}}\equiv{}T_Y\mathcal{M}$, we still need to perform a mapping, called apply_section here, that then finally updates the network parameters. The two red lines are described in global sections.

References

[36]
B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).
  • 1In practice this is obtained by first using an AD routine on a loss function $L$, and then computing the Riemannian gradient based on this. See the section on the Stiefel manifold for an example of this.
+Optimizers · GeometricMachineLearning.jl

Optimizer

In order to generalize neural network optimizers to homogeneous spaces, a class of manifolds we often encounter in machine learning, we have to find a global tangent space representation which we call $\mathfrak{g}^\mathrm{hor}$ here.

Starting from an element of the tangent space $T_Y\mathcal{M}$[1], we need to perform two mappings to arrive at $\mathfrak{g}^\mathrm{hor}$, which we refer to by $\Omega$ and a red horizontal arrow:

Here the mapping $\Omega$ is a horizontal lift from the tangent space onto the horizontal component of the Lie algebra at $Y$.

The red line maps the horizontal component at $Y$, i.e. $\mathfrak{g}^{\mathrm{hor},Y}$, to the horizontal component at $\mathfrak{g}^\mathrm{hor}$.

The $\mathrm{cache}$ stores information about previous optimization steps and is dependent on the optimizer. The elements of the $\mathrm{cache}$ are also in $\mathfrak{g}^\mathrm{hor}$. Based on this, the optimizer (Adam in this case) computes a final velocity, which is the input of a retraction. Because this update is done for $\mathfrak{g}^{\mathrm{hor}}\equiv{}T_Y\mathcal{M}$, we still need to perform a mapping, called apply_section here, that then finally updates the network parameters. The two red lines are described in global sections.
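The following is a minimal sketch of the first step, the horizontal lift, for the Stiefel manifold, written in plain Julia. The formulas used for the Riemannian gradient and for $\Omega$ are the standard ones for the canonical metric and are assumptions of this sketch; this is not the package implementation.

using LinearAlgebra

# draw an element of the Stiefel manifold St(n, N) via a QR decomposition
N, n = 10, 4
Y = Matrix(qr(randn(N, n)).Q)[:, 1:n]

# Euclidean gradient of some loss function (a random stand-in here)
∇L = randn(N, n)

# Riemannian gradient for the canonical metric (assumed formula)
Δ = ∇L - Y * (∇L' * Y)

# horizontal lift Ω: maps a tangent vector Δ ∈ T_Y M to g^{hor,Y} ⊂ so(N) (assumed formula)
Ω(Y, Δ) = (I - Y * Y' / 2) * Δ * Y' - Y * Δ' * (I - Y * Y' / 2)

B_Y = Ω(Y, Δ)
@assert B_Y ≈ -B_Y'    # B_Y is skew-symmetric, i.e. it lies in the Lie algebra so(N)
@assert B_Y * Y ≈ Δ    # Ω is a lift: acting with B_Y on Y recovers the tangent vector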

References

[38]
B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).
  • 1In practice this is obtained by first using an AD routine on a loss function $L$, and then computing the Riemannian gradient based on this. See the section on the Stiefel manifold for an example of this.
diff --git a/latest/architectures/autoencoders/index.html b/latest/architectures/autoencoders/index.html index 919a0e900..7b9e8f7af 100644 --- a/latest/architectures/autoencoders/index.html +++ b/latest/architectures/autoencoders/index.html @@ -1,3 +1,3 @@ -Variational Autoencoders · GeometricMachineLearning.jl

Variational Autoencoders

Variational autoencoders (Lee and Carlberg, 2020) train on the following set:

\[\mathcal{X}(\mathbb{P}_\mathrm{train}) := \{\mathbf{x}^k(\mu) - \mathbf{x}^0(\mu):0\leq{}k\leq{}K,\mu\in\mathbb{P}_\mathrm{train}\},\]

where $\mathbf{x}^k(\mu)\approx\mathbf{x}(t^k;\mu)$. Note that $\mathbf{0}\in\mathcal{X}(\mathbb{P}_\mathrm{train})$ as $k$ can also be zero.

The encoder $\Psi^\mathrm{enc}$ and decoder $\Psi^\mathrm{dec}$ are then trained on this set $\mathcal{X}(\mathbb{P}_\mathrm{train})$ by minimizing the reconstruction error:

\[|| \mathbf{x} - \Psi^\mathrm{dec}\circ\Psi^\mathrm{enc}(\mathbf{x}) ||\text{ for $\mathbf{x}\in\mathcal{X}(\mathbb{P}_\mathrm{train})$}.\]

Initial condition

No matter the parameter $\mu$ the initial condition in the reduced system is always $\mathbf{x}_{r,0}(\mu) = \mathbf{x}_{r,0} = \Psi^\mathrm{enc}(\mathbf{0})$.

Reconstructed solution

In order to arrive at the reconstructed solution one first has to decode the reduced state and then add the reference state:

\[\mathbf{x}^\mathrm{reconstr}(t;\mu) = \mathbf{x}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\mathbf{x}_r(t;\mu)),\]

where $\mathbf{x}^\mathrm{ref}(\mu) = \mathbf{x}(t_0;\mu) - \Psi^\mathrm{dec}\circ\Psi^\mathrm{enc}(\mathbf{0})$.

Symplectic reduced vector field

A symplectic vector field is one whose flow conserves the symplectic structure $\mathbb{J}$. This is equivalent[1] to there existing a Hamiltonian $H$ s.t. the vector field $X$ can be written as $X = \mathbb{J}\nabla{}H$.

If the full-order Hamiltonian is $H^\mathrm{full}\equiv{}H$ we can obtain another Hamiltonian on the reduced space by simply setting:

\[H^\mathrm{red}(\mathbf{x}_r(t;\mu)) = H(\mathbf{x}^\mathrm{reconstr}(t;\mu)) = H(\mathbf{x}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\mathbf{x}_r(t;\mu))).\]

The ODE associated to this Hamiltonian is also the one corresponding to Manifold Galerkin ROM (see (Lee and Carlberg, 2020)).

Manifold Galerkin ROM

Define the FOM ODE residual as:

\[r: (\mathbf{v}, \xi, \tau; \mu) \mapsto \mathbf{v} - f(\xi, \tau; \mu).\]

The reduced ODE is then defined to be:

\[\dot{\hat{\mathbf{x}}}(t;\mu) = \mathrm{arg\,{}min}_{\hat{\mathbf{v}}\in\mathbb{R}^p}|| r(\mathcal{J}(\hat{\mathbf{x}}(t;\mu))\hat{\mathbf{v}},\hat{\mathbf{x}}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\hat{\mathbf{x}}(t;\mu)),t;\mu) ||_2^2,\]

where $\mathcal{J}$ is the Jacobian of the decoder $\Psi^\mathrm{dec}$. This leads to:

\[\mathcal{J}(\hat{\mathbf{x}}(t;\mu))\hat{\mathbf{v}} - f(\hat{\mathbf{x}}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\hat{\mathbf{x}}(t;\mu)), t; \mu) \overset{!}{=} 0 \implies \hat{\mathbf{v}} = \mathcal{J}(\hat{\mathbf{x}}(t;\mu))^+f(\hat{\mathbf{x}}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\hat{\mathbf{x}}(t;\mu)), t; \mu),\]

where $\mathcal{J}(\hat{\mathbf{x}}(t;\mu))^+$ is the pseudoinverse of $\mathcal{J}(\hat{\mathbf{x}}(t;\mu))$. Because $\mathcal{J}(\hat{\mathbf{x}}(t;\mu))$ is a symplectic matrix the pseudoinverse is the symplectic inverse (see (Peng and Mohseni, 2016)).

Furthermore, because $f$ is Hamiltonian, the vector field describing $\dot{\hat{\mathbf{x}}}(t;\mu)$ will also be Hamiltonian.

References

  • K. Lee and K. Carlberg. “Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders”. In: Journal of Computational Physics 404 (2020), p. 108973.

  • L. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38(1), A1–A27 (2016).
  • 1Technically speaking the definitions are equivalent only for simply-connected manifolds, so also for vector spaces.
+Variational Autoencoders · GeometricMachineLearning.jl

Variational Autoencoders

Variational autoencoders (Lee and Carlberg, 2020) train on the following set:

\[\mathcal{X}(\mathbb{P}_\mathrm{train}) := \{\mathbf{x}^k(\mu) - \mathbf{x}^0(\mu):0\leq{}k\leq{}K,\mu\in\mathbb{P}_\mathrm{train}\},\]

where $\mathbf{x}^k(\mu)\approx\mathbf{x}(t^k;\mu)$. Note that $\mathbf{0}\in\mathcal{X}(\mathbb{P}_\mathrm{train})$ as $k$ can also be zero.

The encoder $\Psi^\mathrm{enc}$ and decoder $\Psi^\mathrm{dec}$ are then trained on this set $\mathcal{X}(\mathbb{P}_\mathrm{train})$ by minimizing the reconstruction error:

\[|| \mathbf{x} - \Psi^\mathrm{dec}\circ\Psi^\mathrm{enc}(\mathbf{x}) ||\text{ for $\mathbf{x}\in\mathcal{X}(\mathbb{P}_\mathrm{train})$}.\]

Initial condition

No matter the parameter $\mu$ the initial condition in the reduced system is always $\mathbf{x}_{r,0}(\mu) = \mathbf{x}_{r,0} = \Psi^\mathrm{enc}(\mathbf{0})$.

Reconstructed solution

In order to arrive at the reconstructed solution one first has to decode the reduced state and then add the reference state:

\[\mathbf{x}^\mathrm{reconstr}(t;\mu) = \mathbf{x}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\mathbf{x}_r(t;\mu)),\]

where $\mathbf{x}^\mathrm{ref}(\mu) = \mathbf{x}(t_0;\mu) - \Psi^\mathrm{dec}\circ\Psi^\mathrm{enc}(\mathbf{0})$.

Symplectic reduced vector field

A symplectic vector field is one whose flow conserves the symplectic structure $\mathbb{J}$. This is equivalent[1] to there existing a Hamiltonian $H$ s.t. the vector field $X$ can be written as $X = \mathbb{J}\nabla{}H$.

If the full-order Hamiltonian is $H^\mathrm{full}\equiv{}H$ we can obtain another Hamiltonian on the reduced space by simply setting:

\[H^\mathrm{red}(\mathbf{x}_r(t;\mu)) = H(\mathbf{x}^\mathrm{reconstr}(t;\mu)) = H(\mathbf{x}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\mathbf{x}_r(t;\mu))).\]

The ODE associated to this Hamiltonian is also the one corresponding to Manifold Galerkin ROM (see (Lee and Carlberg, 2020)).

Manifold Galerkin ROM

Define the FOM ODE residual as:

\[r: (\mathbf{v}, \xi, \tau; \mu) \mapsto \mathbf{v} - f(\xi, \tau; \mu).\]

The reduced ODE is then defined to be:

\[\dot{\hat{\mathbf{x}}}(t;\mu) = \mathrm{arg\,{}min}_{\hat{\mathbf{v}}\in\mathbb{R}^p}|| r(\mathcal{J}(\hat{\mathbf{x}}(t;\mu))\hat{\mathbf{v}},\hat{\mathbf{x}}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\hat{\mathbf{x}}(t;\mu)),t;\mu) ||_2^2,\]

where $\mathcal{J}$ is the Jacobian of the decoder $\Psi^\mathrm{dec}$. This leads to:

\[\mathcal{J}(\hat{\mathbf{x}}(t;\mu))\hat{\mathbf{v}} - f(\hat{\mathbf{x}}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\hat{\mathbf{x}}(t;\mu)), t; \mu) \overset{!}{=} 0 \implies \hat{\mathbf{v}} = \mathcal{J}(\hat{\mathbf{x}}(t;\mu))^+f(\hat{\mathbf{x}}^\mathrm{ref}(\mu) + \Psi^\mathrm{dec}(\hat{\mathbf{x}}(t;\mu)), t; \mu),\]

where $\mathcal{J}(\hat{\mathbf{x}}(t;\mu))^+$ is the pseudoinverse of $\mathcal{J}(\hat{\mathbf{x}}(t;\mu))$. Because $\mathcal{J}(\hat{\mathbf{x}}(t;\mu))$ is a symplectic matrix the pseudoinverse is the symplectic inverse (see (Peng and Mohseni, 2016)).
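To illustrate the last point, here is a small sketch in plain Julia of the symplectic inverse $A^+ = \mathbb{J}_{2n}^T A^T \mathbb{J}_{2N}$ of a symplectic matrix $A\in\mathbb{R}^{2N\times2n}$. The block-diagonal test matrix and the function names are made up for this example.

using LinearAlgebra

# Poisson/symplectic structure matrix of size 2k × 2k
J(k) = [zeros(k, k) I(k); -I(k) zeros(k, k)]

# symplectic inverse of a tall symplectic matrix A ∈ R^{2N×2n}
symplectic_inverse(A, N, n) = J(n)' * A' * J(N)

N, n = 4, 2
E = Matrix(qr(randn(N, n)).Q)[:, 1:n]     # E has orthonormal columns
A = [E zeros(N, n); zeros(N, n) E]        # a simple symplectic lift of the form diag(E, E)

@assert A' * J(N) * A ≈ J(n)                     # A is symplectic
@assert symplectic_inverse(A, N, n) * A ≈ I(2n)  # the symplectic inverse is a left inverse of A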

Furthermore, because $f$ is Hamiltonian, the vector field describing $\dot{\hat{\mathbf{x}}}(t;\mu)$ will also be Hamiltonian.

References

  • K. Lee and K. Carlberg. “Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders”. In: Journal of Computational Physics 404 (2020), p. 108973.

  • L. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38(1), A1–A27 (2016).
  • 1Technically speaking the definitions are equivalent only for simply-connected manifolds, so also for vector spaces.
diff --git a/latest/architectures/linear_symplectic_transformer/index.html b/latest/architectures/linear_symplectic_transformer/index.html index 4915084bd..8afa83250 100644 --- a/latest/architectures/linear_symplectic_transformer/index.html +++ b/latest/architectures/linear_symplectic_transformer/index.html @@ -1,2 +1,2 @@ -Linear Symplectic Transformer · GeometricMachineLearning.jl

Linear Symplectic Transformer

The linear symplectic transformer consists of a combination of linear symplectic attention and gradient layers and is visualized below:

Library Functions

GeometricMachineLearning.LinearSymplecticTransformerType

Realizes the linear Symplectic Transformer.

Constructor:

The constructor is called with the following arguments

  1. dim::Int: System dimension
  2. seq_length::Int: Number of time steps that the transformer considers.

Optional keyword arguments:

  • n_sympnet::Int=2: The number of sympnet layers in the transformer.
  • upscaling_dimension::Int=2*dim: The upscaling that is done by the gradient layer.
  • L::Int=1: The number of transformer units.
  • activation=tanh: The activation function for the SympNet layers.
  • init_upper::Bool=true: Specifies if the first layer is a $Q$-type layer (init_upper=true) or if it is a $P$-type layer (init_upper=false).
source
+Linear Symplectic Transformer · GeometricMachineLearning.jl

Linear Symplectic Transformer

The linear symplectic transformer consists of a combination of linear symplectic attention and gradient layers and is visualized below:

Library Functions

GeometricMachineLearning.LinearSymplecticTransformerType

Realizes the linear Symplectic Transformer.

Constructor:

The constructor is called with the following arguments

  1. dim::Int: System dimension
  2. seq_length::Int: Number of time steps that the transformer considers.

Optional keyword arguments:

  • n_sympnet::Int=2: The number of sympnet layers in the transformer.
  • upscaling_dimension::Int=2*dim: The upscaling that is done by the gradient layer.
  • L::Int=1: The number of transformer units.
  • activation=tanh: The activation function for the SympNet layers.
  • init_upper::Bool=true: Specifies if the first layer is a $Q$-type layer (init_upper=true) or if it is a $P$-type layer (init_upper=false).
source
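As an illustration of the constructor described above, a call following the documented signature might look as follows (this is only a sketch; whether further steps, e.g. wrapping the architecture into a neural network, are needed depends on the package API and is not shown here):

using GeometricMachineLearning

# system dimension 4, sequence length 8; keyword arguments as documented above
arch = LinearSymplecticTransformer(4, 8; n_sympnet = 2, upscaling_dimension = 8, L = 1, activation = tanh)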
diff --git a/latest/architectures/neural_network_integrators/index.html b/latest/architectures/neural_network_integrators/index.html index 832f308cc..0aea2770c 100644 --- a/latest/architectures/neural_network_integrators/index.html +++ b/latest/architectures/neural_network_integrators/index.html @@ -1,2 +1,2 @@ -Neural Network Integrators · GeometricMachineLearning.jl

Neural Network Integrators

In GeometricMachineLearning we can divide most neural network architectures (that are used for applications to physical systems) into two categories: autoencoders and integrators. Integrator in its most general form refers to an approximation of the flow of an ODE (see the section on the existence and uniqueness theorem) by a numerical scheme. Traditionally these numerical schemes were constructed by defining certain relationships between a known time step $z^{(t)}$ and a future unknown one $z^{(t+1)}$ [7, 22]:

\[ f(z^{(t)}, z^{(t+1)}) = 0.\]

One usually refers to such a relationship as an "integration scheme". If this relationship can be reformulated as

\[ z^{(t+1)} = g(z^{(t)}),\]

then we refer to the scheme as explicit, if it cannot be reformulated in such a way then we refer to it as implicit. Implicit schemes are typically more expensive to solve than explicit ones. The Julia library GeometricIntegrators [23] offers a wide variety of integration schemes both implicit and explicit.

The neural network integrators in GeometricMachineLearning (the corresponding type is NeuralNetworkIntegrator) are all explicit integration schemes where the function $g$ above is modeled with a neural network.

Neural networks, as an alternative to traditional methods, are employed because of (i) potentially superior performance and (ii) an ability to learn unknown dynamics from data.

Multi-step methods

Multi-step method [19, 20] refers to schemes that are of the form[1]:

\[ f(z^{(t - \mathtt{sl} + 1)}, z^{(t - \mathtt{sl} + 2)}, \ldots, z^{(t)}, z^{(t + 1)}, \ldots, z^{(\mathtt{pw} + 1)}) = 0,\]

where sl is short for sequence length and pw is short for prediction window. In contrast to traditional single-step methods, sl and pw can be greater than 1. An explicit multi-step method has the following form:

\[[z^{(t+1)}, \ldots, z^{(t+\mathtt{pw})}] = g(z^{(t - \mathtt{sl} + 1)}, \ldots, z^{(t)}).\]

There are essentially two ways to construct multi-step methods with neural networks: the older one is using recurrent neural networks such as long short-term memory cells (LSTMs, [24]) and the newer one is using transformer neural networks [14]. Both of these approaches have been successfully employed to learn multi-step methods (see [25, 26] for the former and [21, 27, 28] for the latter), but because the transformer architecture exhibits superior performance on modern hardware and can be imbued with geometric properties it is recommended to always use a transformer-derived architecture when dealing with time series[2].

Explicit multi-step methods derived from the transformer are always subtypes of the type TransformerIntegrator in GeometricMachineLearning. In GeometricMachineLearning the standard transformer, the volume-preserving transformer and the linear symplectic transformer are implemented.

Library Functions

References

[7]
E. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).
[22]
B. Leimkuhler and S. Reich. Simulating hamiltonian dynamics. No. 14 (Cambridge university press, 2004).
[23]
[17]
K. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).
[24]
S. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).
[14]
A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).
[25]
S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).
[26]
K. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).
[27]
A. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).
[28]
A. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, $\beta$-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).
[21]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312.11166v2 (2024).
  • 1We again assume that all the steps up to and including $t$ are known.
  • 2GeometricMachineLearning also has an LSTM implementation, but this may be deprecated in the future.
+Neural Network Integrators · GeometricMachineLearning.jl

Neural Network Integrators

In GeometricMachineLearning we can divide most neural network architectures (that are used for applications to physical systems) into two categories: autoencoders and integrators. Integrator in its most general form refers to an approximation of the flow of an ODE (see the section on the existence and uniqueness theorem) by a numerical scheme. Traditionally these numerical schemes were constructed by defining certain relationships between a known time step $z^{(t)}$ and a future unknown one $z^{(t+1)}$ [7, 24]:

\[ f(z^{(t)}, z^{(t+1)}) = 0.\]

One usually refers to such a relationship as an "integration scheme". If this relationship can be reformulated as

\[ z^{(t+1)} = g(z^{(t)}),\]

then we refer to the scheme as explicit, if it cannot be reformulated in such a way then we refer to it as implicit. Implicit schemes are typically more expensive to solve than explicit ones. The Julia library GeometricIntegrators [25] offers a wide variety of integration schemes both implicit and explicit.

The neural network integrators in GeometricMachineLearning (the corresponding type is NeuralNetworkIntegrator) are all explicit integration schemes where the function $g$ above is modeled with a neural network.

Neural networks, as an alternative to traditional methods, are employed because of (i) potentially superior performance and (ii) an ability to learn unknown dynamics from data.

Multi-step methods

Multi-step method [21, 22] refers to schemes that are of the form[1]:

\[ f(z^{(t - \mathtt{sl} + 1)}, z^{(t - \mathtt{sl} + 2)}, \ldots, z^{(t)}, z^{(t + 1)}, \ldots, z^{(\mathtt{pw} + 1)}) = 0,\]

where sl is short for sequence length and pw is short for prediction window. In contrast to traditional single-step methods, sl and pw can be greater than 1. An explicit multi-step method has the following form:

\[[z^{(t+1)}, \ldots, z^{(t+\mathtt{pw})}] = g(z^{(t - \mathtt{sl} + 1)}, \ldots, z^{(t)}).\]
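As a small illustration of how such an explicit multi-step map is used, the following plain-Julia sketch applies a map g (a crude hand-written extrapolation rule, not a trained network) in a sliding-window fashion: it consumes the last sl states and appends pw predicted ones.

# toy explicit multi-step method: predict pw future states from the last sl states
sl, pw = 3, 2

# stand-in for a learned map g (a simple linear extrapolation rule, not a neural network)
g(window) = [window[end] .+ k .* (window[end] .- window[1]) ./ (sl - 1) for k in 1:pw]

# roll out a trajectory from sl known states by repeatedly applying g to the latest window
function rollout(g, initial, n_steps)
    traj = copy(initial)
    while length(traj) < length(initial) + n_steps
        append!(traj, g(traj[end - sl + 1:end]))
    end
    return traj[1:length(initial) + n_steps]
end

z0 = [[0.0, 1.0], [0.1, 0.9], [0.2, 0.8]]   # sl = 3 known time steps of a 2-dimensional system
trajectory = rollout(g, z0, 6)              # predict 6 further steps, pw at a time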

There are essentially two ways to construct multi-step methods with neural networks: the older one is using recurrent neural networks such as long short-term memory cells (LSTMs, [26]) and the newer one is using transformer neural networks [17]. Both of these approaches have been successfully employed to learn multi-step methods (see [27, 28] for the former and [23, 29, 30] for the latter), but because the transformer architecture exhibits superior performance on modern hardware and can be imbued with geometric properties it is recommended to always use a transformer-derived architecture when dealing with time series[2].

Explicit multi-step methods derived from the transformer are always subtypes of the type TransformerIntegrator in GeometricMachineLearning. In GeometricMachineLearning the standard transformer, the volume-preserving transformer and the linear symplectic transformer are implemented.

Library Functions

References

[7]
E. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).
[24]
B. Leimkuhler and S. Reich. Simulating hamiltonian dynamics. No. 14 (Cambridge university press, 2004).
[25]
[19]
K. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).
[26]
S. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).
[17]
A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).
[27]
S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).
[28]
K. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).
[29]
A. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).
[30]
A. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, $\beta$-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).
[23]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312.11166v2 (2024).
  • 1We again assume that all the steps up to and including $t$ are known.
  • 2GeometricMachineLearning also has an LSTM implementation, but this may be deprecated in the future.
diff --git a/latest/architectures/symplectic_autoencoder/index.html b/latest/architectures/symplectic_autoencoder/index.html index b12e1b00d..bd5e011bf 100644 --- a/latest/architectures/symplectic_autoencoder/index.html +++ b/latest/architectures/symplectic_autoencoder/index.html @@ -1,2 +1,2 @@ -Symplectic Autoencoders · GeometricMachineLearning.jl

Symplectic Autoencoder

A visualization of an instance of SymplecticAutoencoder is shown below:

The intermediate dimension $M$ is calculated via n : (N - n) ÷ (n_blocks - 1) : N. Further we have the following choices:

  • n_encoder_layers::Integer = 4
  • n_encoder_blocks::Integer = 2
  • n_decoder_layers::Integer = 2
  • n_decoder_blocks::Integer = 3
  • encoder_init_q::Bool = true
  • decoder_init_q::Bool = true

Note that all of these are keyword arguments that can be supplied to SymplecticAutoencoder.

+Symplectic Autoencoders · GeometricMachineLearning.jl

Symplectic Autoencoder

A visualization of an instance of SymplecticAutoencoder is shown below:

The intermediate dimension $M$ is calculated via n : (N - n) ÷ (n_blocks - 1) : N. Further we have the following choices:

  • n_encoder_layers::Integer = 4
  • n_encoder_blocks::Integer = 2
  • n_decoder_layers::Integer = 2
  • n_decoder_blocks::Integer = 3
  • encoder_init_q::Bool = true
  • decoder_init_q::Bool = true

Note that all of these are keyword arguments that can be supplied to SymplecticAutoencoder.
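A hypothetical call combining these keyword arguments could look like the following sketch. The two positional arguments (full and reduced system dimension) are an assumption of this illustration and may differ from the actual constructor signature.

using GeometricMachineLearning

# assumed signature: SymplecticAutoencoder(full_dim, reduced_dim; keyword arguments from the list above)
arch = SymplecticAutoencoder(100, 10; n_encoder_layers = 4, n_encoder_blocks = 2,
                                      n_decoder_layers = 2, n_decoder_blocks = 3)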

diff --git a/latest/architectures/sympnet/index.html b/latest/architectures/sympnet/index.html index 390f1b3f9..d00347575 100644 --- a/latest/architectures/sympnet/index.html +++ b/latest/architectures/sympnet/index.html @@ -1,5 +1,5 @@ -SympNet · GeometricMachineLearning.jl

SympNet Architecture

This document discusses the SympNet architecture and its implementation in GeometricMachineLearning.jl.

Quick overview of the theory of SympNets

Principle

SympNets (see [29] for the eponymous paper) are a type of neural network that can model the trajectory of a Hamiltonian system in phase space. Take $(q^T,p^T)^T=(q_1,\ldots,q_d,p_1,\ldots,p_d)^T\in \mathbb{R}^{2d}$ as the coordinates in phase space, where $q=(q_1, \ldots, q_d)^T\in \mathbb{R}^{d}$ is referred to as the position and $p=(p_1, \ldots, p_d)^T\in \mathbb{R}^{d}$ as the momentum. Given a point $(q^T,p^T)^T$ in $\mathbb{R}^{2d}$ the SympNet aims to compute the next position $((q')^T,(p')^T)^T$ and thus predicts the trajectory while preserving the symplectic structure of the system. SympNets enforce symplecticity strongly, meaning that this property is hard-coded into the network architecture. The layers are reminiscent of traditional neural network feedforward layers, but have a strong restriction imposed on them in order to be symplectic.

SympNets can be viewed as a "symplectic integrator" (see [7] and [22]). Their goal is to predict, based on an initial condition $((q^{(0)})^T,(p^{(0)})^T)^T$, a sequence of points in phase space that fit the training data as well as possible:

\[\begin{pmatrix} q^{(0)} \\ p^{(0)} \end{pmatrix}, \cdots, \begin{pmatrix} \tilde{q}^{(1)} \\ \tilde{p}^{(1)} \end{pmatrix}, \cdots \begin{pmatrix} \tilde{q}^{(n)} \\ \tilde{p}^{(n)} \end{pmatrix}.\]

The tilde in the above equation indicates predicted data. The time step between predictions is not a parameter we can choose but is related to the temporal frequency of the training data. This means that if data is recorded in an interval of e.g. 0.1 seconds, then this will be the time step of our integrator.

There are two types of SympNet architectures: $LA$-SympNets and $G$-SympNets.

$LA$-SympNet

The first type of SympNets, $LA$-SympNets, are obtained from composing two types of layers: symplectic linear layers and symplectic activation layers. For a given integer $n$, a symplectic linear layer is defined by

\[\mathcal{L}^{n,q} \begin{pmatrix} q \\ p \end{pmatrix} = \begin{pmatrix} I & S^n/\mathbb{O} \\ \mathbb{O}/S^n & I \end{pmatrix} \cdots \begin{pmatrix} I & \mathbb{O} \\ S^2 & I \end{pmatrix} \begin{pmatrix} I & S^1 \\ \mathbb{O} & I \end{pmatrix} \begin{pmatrix} q \\ p \end{pmatrix} + b .\]

+SympNet · GeometricMachineLearning.jl

SympNet Architecture

This document discusses the SympNet architecture and its implementation in GeometricMachineLearning.jl.

Quick overview of the theory of SympNets

Principle

SympNets (see [31] for the eponymous paper) are a type of neural network that can model the trajectory of a Hamiltonian system in phase space. Take $(q^T,p^T)^T=(q_1,\ldots,q_d,p_1,\ldots,p_d)^T\in \mathbb{R}^{2d}$ as the coordinates in phase space, where $q=(q_1, \ldots, q_d)^T\in \mathbb{R}^{d}$ is referred to as the position and $p=(p_1, \ldots, p_d)^T\in \mathbb{R}^{d}$ as the momentum. Given a point $(q^T,p^T)^T$ in $\mathbb{R}^{2d}$ the SympNet aims to compute the next position $((q')^T,(p')^T)^T$ and thus predicts the trajectory while preserving the symplectic structure of the system. SympNets enforce symplecticity strongly, meaning that this property is hard-coded into the network architecture. The layers are reminiscent of traditional neural network feedforward layers, but have a strong restriction imposed on them in order to be symplectic.

SympNets can be viewed as a "symplectic integrator" (see [7] and [24]). Their goal is to predict, based on an initial condition $((q^{(0)})^T,(p^{(0)})^T)^T$, a sequence of points in phase space that fit the training data as well as possible:

\[\begin{pmatrix} q^{(0)} \\ p^{(0)} \end{pmatrix}, \cdots, \begin{pmatrix} \tilde{q}^{(1)} \\ \tilde{p}^{(1)} \end{pmatrix}, \cdots \begin{pmatrix} \tilde{q}^{(n)} \\ \tilde{p}^{(n)} \end{pmatrix}.\]

The tilde in the above equation indicates predicted data. The time step between predictions is not a parameter we can choose but is related to the temporal frequency of the training data. This means that if data is recorded in an interval of e.g. 0.1 seconds, then this will be the time step of our integrator.

There are two types of SympNet architectures: $LA$-SympNets and $G$-SympNets.

$LA$-SympNet

The first type of SympNets, $LA$-SympNets, are obtained from composing two types of layers: symplectic linear layers and symplectic activation layers. For a given integer $n$, a symplectic linear layer is defined by

\[\mathcal{L}^{n,q} \begin{pmatrix} q \\ p \end{pmatrix} = \begin{pmatrix} I & S^n/\mathbb{O} \\ \mathbb{O}/S^n & I \end{pmatrix} \cdots \begin{pmatrix} I & \mathbb{O} \\ S^2 & I \end{pmatrix} \begin{pmatrix} I & S^1 \\ \mathbb{O} & I \end{pmatrix} \begin{pmatrix} q \\ p \end{pmatrix} + b .\]

The superscripts $q$ and $p$ indicate whether the $q$ or the $p$ part is changed. The learnable parameters are the symmetric matrices $S^i\in\mathbb{R}^{d\times d}$ and the bias $b\in\mathbb{R}^{2d}$. The integer $n$ is the width of the symplectic linear layer. It can be shown that five of these layers, i.e. $n\geq{}5$, can represent any linear symplectic map (see [30]), so $n$ need not be larger than five. We denote the set of symplectic linear layers by $\mathcal{M}^L$.

The second type of layer needed for $LA$-SympNets are so-called activation layers:

\[ \mathcal{A}^{q} \begin{pmatrix} q \\ p \end{pmatrix} = \begin{bmatrix} I & \hat{\sigma}^{a} \\ \mathbb{O} & I \end{bmatrix} \begin{pmatrix} q \\ p \end{pmatrix} := \begin{pmatrix} \mathrm{diag}(a)\sigma(p) + q \\ p \end{pmatrix}.\]

The superscripts $q$ and $p$ indicate whether the $q$ or the $p$ part is changed. The learnable parameters are the symmetric matrices $S^i\in\mathbb{R}^{d\times d}$ and the bias $b\in\mathbb{R}^{2d}$. The integer $n$ is the width of the symplectic linear layer. It can be shown that five of these layers, i.e. $n\geq{}5$, can represent any linear symplectic map (see [32]), so $n$ need not be larger than five. We denote the set of symplectic linear layers by $\mathcal{M}^L$.

The second type of layer needed for $LA$-SympNets are so-called activation layers:

\[ \mathcal{A}^{q} \begin{pmatrix} q \\ p \end{pmatrix} = \begin{bmatrix} I & \hat{\sigma}^{a} \\ \mathbb{O} & I \end{bmatrix} \begin{pmatrix} q \\ p \end{pmatrix} := \begin{pmatrix} \mathrm{diag}(a)\sigma(p) + q \\ p \end{pmatrix}.\]

\[\begin{pmatrix} q \\ p \end{pmatrix} \mapsto \begin{pmatrix} q \\ K^T \mathrm{diag}(a)\sigma(Kq+b)+p \end{pmatrix}.\]

The parameters of this layer are the scaling matrix $K\in\mathbb{R}^{m\times d}$, the bias $b\in\mathbb{R}^{m}$ and the scaling vector $a\in\mathbb{R}^{m}$. The name "gradient layer" has its origin in the fact that the expression $[K^T\mathrm{diag}(a)\sigma(Kq+b)]_i = \sum_jk_{ji}a_j\sigma(\sum_\ell{}k_{j\ell}q_\ell+b_j)$ is the gradient of a function $\sum_ja_j\tilde{\sigma}(\sum_\ell{}k_{j\ell}q_\ell+b_j)$, where $\tilde{\sigma}$ is the antiderivative of $\sigma$. The first dimension of $K$ we refer to as the upscaling dimension.

If we denote by $\mathcal{M}^G$ the set of gradient layers, a $G$-SympNet is a function of the form $\Psi=g_k \circ g_{k-1} \circ \cdots \circ g_0$ where $(g_i)_{0\leq i\leq k} \subset (\mathcal{M}^G)^k$. The index $k$ is again the number of hidden layers.

Further note here the different roles played by round and square brackets: the latter indicates a nonlinear operation as opposed to a regular vector or matrix.

Universal approximation theorems

In order to state the universal approximation theorem for both architectures we first need a few definitions:

Let $U$ be an open set of $\mathbb{R}^{2d}$, and let us denote by $\mathcal{SP}^r(U)$ the set of $C^r$ smooth symplectic maps on $U$. We now define a topology on $C^r(K, \mathbb{R}^n)$, the set of $C^r$-smooth maps from a compact set $K\subset\mathbb{R}^{n}$ to $\mathbb{R}^{n}$ through the norm

\[||f||_{C^r(K,\mathbb{R}^{n})} = \underset{|\alpha|\leq r}{\sum} \underset{1\leq i \leq n}{\max}\underset{x\in K}{\sup} |D^\alpha f_i(x)|,\]

where the differential operator $D^\alpha$ is defined by

\[D^\alpha f = \frac{\partial^{|\alpha|} f}{\partial x_1^{\alpha_1}...x_n^{\alpha_n}},\]

with $|\alpha| = \alpha_1 +...+ \alpha_n$.

Definition $\sigma$ is $r$-finite if $\sigma\in C^r(\mathbb{R},\mathbb{R})$ and $\int |D^r\sigma(x)|dx <+\infty$.

Definition Let $m,n,r\in \mathbb{N}$ with $m,n>0$ be given, $U$ an open set of $\mathbb{R}^m$, and $I,J\subset C^r(U,\mathbb{R}^n)$. We say $J$ is $r$-uniformly dense on compacta in $I$ if $J \subset I$ and for any $f\in I$, $\epsilon>0$, and any compact $K\subset U$, there exists $g\in J$ such that $||f-g||_{C^r(K,\mathbb{R}^{n})} < \epsilon$.

We can now state the universal approximation theorems:

Theorem (Approximation theorem for LA-SympNets) For any positive integer $r>0$ and open set $U\subset \mathbb{R}^{2d}$, the set of $LA$-SympNets is $r$-uniformly dense on compacta in $\mathcal{SP}^r(U)$ if the activation function $\sigma$ is $r$-finite.

Theorem (Approximation theorem for G-SympNets) For any positive integer $r>0$ and open set $U\subset \mathbb{R}^{2d}$, the set of $G$-SympNets is $r$-uniformly dense on compacta in $\mathcal{SP}^r(U)$ if the activation function $\sigma$ is $r$-finite.

There are many $r$-finite activation functions commonly used in neural networks, for example:

  • sigmoid $\sigma(x)=\frac{1}{1+e^{-x}}$ for any positive integer $r$,
  • tanh $\tanh(x)=\frac{e^x-e^{-x}}{e^x+e^{-x}}$ for any positive integer $r$.

The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on $\mathbb{R}^{2d}$. But this does not tell us anything about how to optimize the network. This can be done with any common neural network optimizer, and these optimizers always rely on a corresponding loss function.

Loss function

To train the SympNet, one needs data along a trajectory such that the model is trained to perform an integration. These data are $(Q,P)$ where $Q[i,j]$ (respectively $P[i,j]$) is the real number $q_j(t_i)$ (respectively $p_j(t_i)$), i.e. the $j$-th coordinate of the generalized position (respectively momentum) at the $i$-th time step. One also needs a loss function, defined as:

\[Loss(Q,P) = \underset{i}{\sum} d(\Phi(Q[i,-],P[i,-]), [Q[i+1,-] P[i+1,-]]^T),\]

where $d$ is a distance on $\mathbb{R}^{2d}$.

See the tutorial section for an introduction into using SympNets with GeometricMachineLearning.jl.

References

[29]
P. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).
  • 1Note that if $k=1$ then the $LA$-SympNet consists of only one linear layer.
\[\begin{pmatrix} q \\ p \end{pmatrix} \mapsto \begin{pmatrix} q \\ K^T \mathrm{diag}(a)\sigma(Kq+b)+p \end{pmatrix}.\]

The parameters of this layer are the scaling matrix $K\in\mathbb{R}^{m\times d}$, the bias $b\in\mathbb{R}^{m}$ and the scaling vector $a\in\mathbb{R}^{m}$. The name "gradient layer" has its origin in the fact that the expression $[K^T\mathrm{diag}(a)\sigma(Kq+b)]_i = \sum_jk_{ji}a_j\sigma(\sum_\ell{}k_{j\ell}q_\ell+b_j)$ is the gradient of a function $\sum_ja_j\tilde{\sigma}(\sum_\ell{}k_{j\ell}q_\ell+b_j)$, where $\tilde{\sigma}$ is the antiderivative of $\sigma$. The first dimension of $K$ we refer to as the upscaling dimension.
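A minimal plain-Julia sketch of such a gradient layer acting on the $p$ component (only an illustration of the formula above, not the layer implementation in the package):

# gradient layer acting on p: (q, p) ↦ (q, p + Kᵀ diag(a) σ(Kq + b))
d, m = 2, 10                  # phase space dimension d and upscaling dimension m
K, a, b = randn(m, d), randn(m), randn(m)
σ(x) = tanh(x)

gradient_layer_p(q, p) = (q, p + K' * (a .* σ.(K * q .+ b)))

q, p = randn(d), randn(d)
q′, p′ = gradient_layer_p(q, p)   # q is unchanged: the layer is a symplectic shear in p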

If we denote by $\mathcal{M}^G$ the set of gradient layers, a $G$-SympNet is a function of the form $\Psi=g_k \circ g_{k-1} \circ \cdots \circ g_0$ where $(g_i)_{0\leq i\leq k} \subset (\mathcal{M}^G)^k$. The index $k$ is again the number of hidden layers.

Further note here the different roles played by round and square brackets: the latter indicates a nonlinear operation as opposed to a regular vector or matrix.

Universal approximation theorems

In order to state the universal approximation theorem for both architectures we first need a few definitions:

Let $U$ be an open set of $\mathbb{R}^{2d}$, and let us denote by $\mathcal{SP}^r(U)$ the set of $C^r$ smooth symplectic maps on $U$. We now define a topology on $C^r(K, \mathbb{R}^n)$, the set of $C^r$-smooth maps from a compact set $K\subset\mathbb{R}^{n}$ to $\mathbb{R}^{n}$ through the norm

\[||f||_{C^r(K,\mathbb{R}^{n})} = \underset{|\alpha|\leq r}{\sum} \underset{1\leq i \leq n}{\max}\underset{x\in K}{\sup} |D^\alpha f_i(x)|,\]

where the differential operator $D^\alpha$ is defined by

\[D^\alpha f = \frac{\partial^{|\alpha|} f}{\partial x_1^{\alpha_1}...x_n^{\alpha_n}},\]

with $|\alpha| = \alpha_1 +...+ \alpha_n$.

Definition $\sigma$ is $r$-finite if $\sigma\in C^r(\mathbb{R},\mathbb{R})$ and $\int |D^r\sigma(x)|dx <+\infty$.

Definition Let $m,n,r\in \mathbb{N}$ with $m,n>0$ be given, $U$ an open set of $\mathbb{R}^m$, and $I,J\subset C^r(U,\mathbb{R}^n)$. We say $J$ is $r$-uniformly dense on compacta in $I$ if $J \subset I$ and for any $f\in I$, $\epsilon>0$, and any compact $K\subset U$, there exists $g\in J$ such that $||f-g||_{C^r(K,\mathbb{R}^{n})} < \epsilon$.

We can now state the universal approximation theorems:

Theorem (Approximation theorem for LA-SympNets) For any positive integer $r>0$ and open set $U\subset \mathbb{R}^{2d}$, the set of $LA$-SympNets is $r$-uniformly dense on compacta in $\mathcal{SP}^r(U)$ if the activation function $\sigma$ is $r$-finite.

Theorem (Approximation theorem for G-SympNets) For any positive integer $r>0$ and open set $U\subset \mathbb{R}^{2d}$, the set of $G$-SympNets is $r$-uniformly dense on compacta in $\mathcal{SP}^r(U)$ if the activation function $\sigma$ is $r$-finite.

There are many $r$-finite activation functions commonly used in neural networks, for example:

  • sigmoid $\sigma(x)=\frac{1}{1+e^{-x}}$ for any positive integer $r$,
  • tanh $\tanh(x)=\frac{e^x-e^{-x}}{e^x+e^{-x}}$ for any positive integer $r$.

The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on $\mathbb{R}^{2d}$. But this does not tell us anything about how to optimize the network. This can be done with any common neural network optimizer, and these optimizers always rely on a corresponding loss function.

Loss function

To train the SympNet, one needs data along a trajectory such that the model is trained to perform an integration. These data are $(Q,P)$ where $Q[i,j]$ (respectively $P[i,j]$) is the real number $q_j(t_i)$ (respectively $p_j(t_i)$), i.e. the $j$-th coordinate of the generalized position (respectively momentum) at the $i$-th time step. One also needs a loss function, defined as:

\[Loss(Q,P) = \underset{i}{\sum} d(\Phi(Q[i,-],P[i,-]), [Q[i+1,-] P[i+1,-]]^T),\]

where $d$ is a distance on $\mathbb{R}^{2d}$.
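A sketch of this loss in plain Julia; it assumes that $\Phi$ maps the state at one time step to the state at the next one and that $d$ is the Euclidean distance (both are choices of this illustration):

using LinearAlgebra: norm

# Q and P store the trajectory: Q[i, j] = q_j(t_i) and P[i, j] = p_j(t_i)
function sympnet_loss(Φ, Q, P)
    loss = 0.0
    for i in 1:size(Q, 1) - 1
        prediction = Φ(vcat(Q[i, :], P[i, :]))        # prediction for time step i + 1
        target     = vcat(Q[i + 1, :], P[i + 1, :])   # training data at time step i + 1
        loss += norm(prediction - target)             # d is chosen as the Euclidean distance
    end
    return loss
end

# call signature demonstrated with the identity as a (useless) stand-in for Φ
Q, P = rand(5, 2), rand(5, 2)
sympnet_loss(identity, Q, P)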

See the tutorial section for an introduction into using SympNets with GeometricMachineLearning.jl.

References

[31]
P. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).
  • 1Note that if $k=1$ then the $LA$-SympNet consists of only one linear layer.
diff --git a/latest/architectures/transformer/index.html b/latest/architectures/transformer/index.html index 5ce139a53..476f5a4e2 100644 --- a/latest/architectures/transformer/index.html +++ b/latest/architectures/transformer/index.html @@ -1,2 +1,2 @@ -Standard Transformer · GeometricMachineLearning.jl

Standard Transformer

The transformer is a relatively modern neural network architecture [14] that has come to dominate the field of natural language processing (NLP, [31]) and replaced the previously dominant long short-term memory cells (LSTM, [24]). Its success is due to a variety of factors:

  • unlike LSTMs it consists of very simple building blocks and hence is easier to interpret mathematically,
  • it is very flexible in its application and the data it is fed with do not have to conform to a rigid pattern,
  • transformers utilize modern hardware (especially GPUs) very effectively.

The transformer architecture is sketched below:

It is nothing more than a combination of a multihead attention layer and a residual neural network[1] (ResNet).

Library Functions

GeometricMachineLearning.StandardTransformerIntegratorType

The regular transformer used as an integrator (multi-step method).

The constructor is called with one argument:

  • sys_dim::Int

The following are keyword arguments:

  • transformer_dim::Int: the default is transformer_dim = sys_dim.
  • n_blocks::Int: The default is 1.
  • n_heads::Int: the number of heads in the multihead attention layer (default is n_heads = sys_dim)
  • L::Int the number of transformer blocks (default is L = 2).
  • upscaling_activation: by default identity
  • resnet_activation: by default tanh
  • add_connection::Bool=true: if the input should be added to the output.
source
  • 1A ResNet is nothing more than a neural network to whose output we again add the input, i.e. every ResNet is of the form $\mathrm{ResNet}(x) = x + \mathcal{NN}(x)$.
+Standard Transformer · GeometricMachineLearning.jl

Standard Transformer

The transformer is a relatively modern neural network architecture [17] that has come to dominate the field of natural language processing (NLP, [33]) and replaced the previously dominant long short-term memory cells (LSTM, [26]). Its success is due to a variety of factors:

  • unlike LSTMs it consists of very simple building blocks and hence is easier to interpret mathematically,
  • it is very flexible in its application and the data it is fed with do not have to conform to a rigid pattern,
  • transformers utilize modern hardware (especially GPUs) very effectively.

The transformer architecture is sketched below:

It is nothing more than a combination of a multihead attention layer and a residual neural network[1] (ResNet).

Library Functions

GeometricMachineLearning.StandardTransformerIntegratorType

The regular transformer used as an integrator (multi-step method).

The constructor is called with one argument:

  • sys_dim::Int

The following are keyword arguments:

  • transformer_dim::Int: the default is transformer_dim = sys_dim.
  • n_blocks::Int: The default is 1.
  • n_heads::Int: the number of heads in the multihead attention layer (default is n_heads = sys_dim)
  • L::Int the number of transformer blocks (default is L = 2).
  • upscaling_activation: by default identity
  • resnet_activation: by default tanh
  • add_connection::Bool=true: if the input should be added to the output.
source
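A call following the documented signature and keyword arguments might look as follows (this is only an illustrative sketch; training and integration steps are not shown):

using GeometricMachineLearning

# system dimension 4; keyword arguments as documented above
arch = StandardTransformerIntegrator(4; transformer_dim = 8, n_heads = 4, L = 2, resnet_activation = tanh)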
  • 1A ResNet is nothing more than a neural network to whose output we again add the input, i.e. every ResNet is of the form $\mathrm{ResNet}(x) = x + \mathcal{NN}(x)$.
diff --git a/latest/architectures/volume_preserving_feedforward/index.html b/latest/architectures/volume_preserving_feedforward/index.html index e61fa73cf..62282a820 100644 --- a/latest/architectures/volume_preserving_feedforward/index.html +++ b/latest/architectures/volume_preserving_feedforward/index.html @@ -1,2 +1,2 @@ -Volume-Preserving FeedForward · GeometricMachineLearning.jl

Volume-Preserving Feedforward Neural Network

Neural network architecture

The constructor produces the following architecture[1]:

Here LinearLowerLayer performs $x \mapsto x + Lx$ and NonLinearLowerLayer performs $x \mapsto x + \sigma(Lx + b)$. The activation function $\sigma$ is the fourth input argument to the constructor and tanh by default.

Note on Sympnets

As SympNets are symplectic maps, they also conserve phase space volume and therefore form a subcategory of volume-preserving feedforward layers.

Library Functions

GeometricMachineLearning.VolumePreservingFeedForwardType

Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer.

Constructor

The constructor is called with the following arguments:

  • sys_dim::Int: The system dimension.
  • n_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.
  • n_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
  • activation: The activation function for the nonlinear layers in a block.
  • init_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper.
source
  • 1Based on the input arguments n_linear and n_blocks. In this example init_upper is set to false, which means that the first layer is of type lower followed by a layer of type upper.
+Volume-Preserving FeedForward · GeometricMachineLearning.jl

Volume-Preserving Feedforward Neural Network

Neural network architecture

The constructor produces the following architecture[1]:

Here LinearLowerLayer performs $x \mapsto x + Lx$ and NonLinearLowerLayer performs $x \mapsto x + \sigma(Lx + b)$. The activation function $\sigma$ is the fourth input argument to the constructor and tanh by default.
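A plain-Julia sketch of these two maps (purely illustrative; $L$ is taken to be strictly lower-triangular here, which is the assumption that makes the maps volume-preserving, since their Jacobians are then lower-triangular with ones on the diagonal):

using LinearAlgebra

dim = 4
L = tril(randn(dim, dim), -1)     # strictly lower-triangular weight
b = randn(dim)
σ(x) = tanh(x)

linear_lower(x)    = x + L * x              # x ↦ x + Lx
nonlinear_lower(x) = x + σ.(L * x .+ b)     # x ↦ x + σ(Lx + b)

x = randn(dim)
linear_lower(x), nonlinear_lower(x)

@assert det(I + L) ≈ 1            # the Jacobian of the linear map has unit determinant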

Note on Sympnets

As SympNets are symplectic maps, they also conserve phase space volume and therefore form a subcategory of volume-preserving feedforward layers.

Library Functions

GeometricMachineLearning.VolumePreservingFeedForwardType

Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer.

Constructor

The constructor is called with the following arguments:

  • sys_dim::Int: The system dimension.
  • n_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.
  • n_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
  • activation: The activation function for the nonlinear layers in a block.
  • init_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper.
source
  • 1Based on the input arguments n_linear and n_blocks. In this example init_upper is set to false, which means that the first layer is of type lower followed by a layer of type upper.
diff --git a/latest/architectures/volume_preserving_transformer/index.html b/latest/architectures/volume_preserving_transformer/index.html index d41f06380..e2e300aff 100644 --- a/latest/architectures/volume_preserving_transformer/index.html +++ b/latest/architectures/volume_preserving_transformer/index.html @@ -1,2 +1,2 @@ -Volume-Preserving Transformer · GeometricMachineLearning.jl

Volume-Preserving Transformer

The volume-preserving transformer is, similar to the standard transformer, a combination of two different neural networks: a volume-preserving attention layer and a volume-preserving feedforward layer. It is visualized below:

Library Functions

GeometricMachineLearning.VolumePreservingTransformerType

The volume-preserving transformer with the Cayley activation function and built-in upscaling.

Constructor

The arguments for the constructor are:

  1. sys_dim::Int
  2. seq_length::Int: The sequence length of the data fed into the transformer.

The following are keyword arguments:

  • n_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.
  • n_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
  • L::Int=1: The number of transformer units.
  • activation=tanh: The activation function.
  • init_upper::Bool=false: Specifies if the network first acts on the $q$ component.
  • skew_sym::Bool=false: specifies if the weight matrix is skew-symmetric or arbitrary.
source

References

[21]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312.11166v2 (2024).
+Volume-Preserving Transformer · GeometricMachineLearning.jl

Volume-Preserving Transformer

The volume-preserving transformer is, similar to the standard transformer, a combination of two different neural networks: a volume-preserving attention layer and a volume-preserving feedforward layer. It is visualized below:

Library Functions

GeometricMachineLearning.VolumePreservingTransformerType

The volume-preserving transformer with the Cayley activation function and built-in upscaling.

Constructor

The arguments for the constructor are:

  1. sys_dim::Int
  2. seq_length::Int: The sequence length of the data fed into the transformer.

The following are keyword arguments:

  • n_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.
  • n_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
  • L::Int=1: The number of transformer units.
  • activation=tanh: The activation function.
  • init_upper::Bool=false: Specifies if the network first acts on the $q$ component.
  • skew_sym::Bool=false: specifies if the weight matrix is skew-symmetric or arbitrary.
source
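A call following the documented signature might look as follows (illustrative only):

using GeometricMachineLearning

# system dimension 4, sequence length 8; keyword arguments as documented above
arch = VolumePreservingTransformer(4, 8; n_blocks = 1, n_linear = 1, L = 1, skew_sym = false)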

References

[23]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312.11166v2 (2024).
diff --git a/latest/arrays/global_tangent_spaces/index.html b/latest/arrays/global_tangent_spaces/index.html new file mode 100644 index 000000000..edcc3c77b --- /dev/null +++ b/latest/arrays/global_tangent_spaces/index.html @@ -0,0 +1,177 @@ + +Global Tangent Spaces · GeometricMachineLearning.jl

Global Tangent Spaces

In GeometricMachineLearning standard neural network optimizers are generalized to homogeneous spaces by leveraging the special structure of the tangent spaces of this class of manifolds. When we introduced homogeneous spaces we already saw that every tangent space $T_Y\mathcal{M}$ to a homogeneous space is of the form:

\[ T_Y\mathcal{M} = \mathfrak{g} \cdot Y := \{AY: A\in{}\mathfrak{g}\}.\]

We then have a decomposition of $\mathfrak{g}$ into a vertical part $\mathfrak{g}^{\mathrm{ver}, Y}$ and a horizontal part $\mathfrak{g}^{\mathrm{hor}, Y}$ and the horizontal part is isomorphic to $T_Y\mathcal{M}$.

We now identify a special element $E \in \mathcal{M}$ and designate the horizontal component $\mathfrak{g}^{\mathrm{hor}, E}$ as our global tangent space. We will refer to this global tangent space by $\mathfrak{g}^\mathrm{hor}$. We can now find a transformation from any $\mathfrak{g}^{\mathrm{hor}, Y}$ to $\mathfrak{g}^\mathrm{hor}$ and vice-versa (these spaces are isomorphic).

Theorem

Let $A\in{}G$ be an element such that $AE = Y$. Then we have

\[A^{-1}\cdot\mathfrak{g}^{\mathrm{hor},Y}\cdot{}A = \mathfrak{g}^\mathrm{hor},\]

i.e. for every element $B\in\mathfrak{g}^\mathrm{hor}$ we can find a $B^Y \in \mathfrak{g}^{\mathrm{hor},Y}$ s.t. $B = A^{-1}B^YA$ (and vice-versa).

Proof

We first show that for every $B^Y\in\mathfrak{g}^{\mathrm{hor},Y}$ the element $A^{-1}B^YA$ is in $\mathfrak{g}^{\mathrm{hor}}$. First note that $A^{-1}B^YA\in\mathfrak{g}$ by a fundamental theorem of Lie group theory (closedness of the Lie algebra under adjoint action). Now assume that $A^{-1}B^YA$ is not fully contained in $\mathfrak{g}^\mathrm{hor}$, i.e. it also has a vertical component. Then we would lose information when performing $A^{-1}B^YA \mapsto A^{-1}B^YAE = A^{-1}B^YY$, but this contradicts the fact that $B^Y\in\mathfrak{g}^{\mathrm{hor},Y}.$ We now have to prove that for every $B\in\mathfrak{g}^\mathrm{hor}$ we can find an element in $\mathfrak{g}^{\mathrm{hor}, Y}$ that is mapped to $B$. By an argument similar to the one above we can show that $ABA^{-1}\in\mathfrak{g}^\mathrm{hor, Y}$ and this element maps to $B$. Proving that the map is injective is now trivial.

We should note that we have written all Lie group and Lie algebra actions as simple matrix multiplications, like $AE = Y$. For some Lie groups and Lie algebras we should use different notations [9]. These Lie groups are however not relevant for what we use in GeometricMachineLearning and we will stick to regular matrix notation.

Global Sections

Note that the theorem above requires us to find an element $A\in{}G$ such that $AE = Y$. If we can find a mapping $\lambda:\mathcal{M}\to{}G$ we call such a mapping a global section.

Theorem

We call a mapping $\lambda:\mathcal{M} \to G$ from a homogeneous space to its associated Lie group a global section if it satisfies:

\[\lambda(Y)E = Y,\]

where $E$ is the distinct element of the homogeneous space.

Note that in general global sections are not unique because the dimension of $G$ is in general greater than that of $\mathcal{M}$. We give an example of how to construct such a global section for the Stiefel and the Grassmann manifolds below.

The Global Tangent Space for the Stiefel Manifold

We now discuss the specific form of the global tangent space for the Stiefel manifold. We choose the distinct element[1] $E$ to have an especially simple form (this matrix can be built by calling StiefelProjection):

\[E = \begin{bmatrix} \mathbb{I}_n \\ \mathbb{O} \end{bmatrix}\in{}St(n, N).\]

Based on this, elements of the vector space $\mathfrak{g}^{\mathrm{hor}, E} =: \mathfrak{g}^{\mathrm{hor}}$ are of the form:

\[\begin{pmatrix} A & -B^T \\ B & \mathbb{O} \end{pmatrix},\]

where $A$ is a skew-symmetric matrix of size $n\times{}n$ and $B$ is an arbitrary matrix of size $(N - n)\times{}n$.

Arrays of type $\mathfrak{g}^{\mathrm{hor}, E}$ are implemented in GeometricMachineLearning under the name StiefelLieAlgHorMatrix.

We can call this with e.g. a skew-symmetric matrix $A$ and an arbitrary matrix $B$:

N, n = 10, 4
+
+A = rand(SkewSymMatrix, n)
4×4 SkewSymMatrix{Float64, Vector{Float64}}:
+ 0.0       -0.618058  -0.942954  -0.94312
+ 0.618058   0.0       -0.277085  -0.65307
+ 0.942954   0.277085   0.0       -0.544225
+ 0.94312    0.65307    0.544225   0.0
B = rand(N - n, n)
6×4 Matrix{Float64}:
+ 0.599558  0.120066     0.355605  0.533721
+ 0.624685  0.00618571   0.329038  0.810655
+ 0.546357  0.675435     0.666189  0.70082
+ 0.407207  0.216524     0.90568   0.208163
+ 0.67539   0.000139574  0.317462  0.191557
+ 0.115811  0.817239     0.811343  0.853422
B1 = StiefelLieAlgHorMatrix(A, B, N, n)
10×10 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, Matrix{Float64}}:
+ 0.0       -0.618058     -0.942954  …  -0.407207  -0.67539      -0.115811
+ 0.618058   0.0          -0.277085     -0.216524  -0.000139574  -0.817239
+ 0.942954   0.277085      0.0          -0.90568   -0.317462     -0.811343
+ 0.94312    0.65307       0.544225     -0.208163  -0.191557     -0.853422
+ 0.599558   0.120066      0.355605      0.0        0.0           0.0
+ 0.624685   0.00618571    0.329038  …   0.0        0.0           0.0
+ 0.546357   0.675435      0.666189      0.0        0.0           0.0
+ 0.407207   0.216524      0.90568       0.0        0.0           0.0
+ 0.67539    0.000139574   0.317462      0.0        0.0           0.0
+ 0.115811   0.817239      0.811343      0.0        0.0           0.0

We can also call it with a matrix of shape $N\times{}N$:

B2 = Matrix(B1) # note that this does not have any special structure
+
+StiefelLieAlgHorMatrix(B2, n)
10×10 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}}:
+ 0.0       -0.618058     -0.942954  …  -0.407207  -0.67539      -0.115811
+ 0.618058   0.0          -0.277085     -0.216524  -0.000139574  -0.817239
+ 0.942954   0.277085      0.0          -0.90568   -0.317462     -0.811343
+ 0.94312    0.65307       0.544225     -0.208163  -0.191557     -0.853422
+ 0.599558   0.120066      0.355605      0.0        0.0           0.0
+ 0.624685   0.00618571    0.329038  …   0.0        0.0           0.0
+ 0.546357   0.675435      0.666189      0.0        0.0           0.0
+ 0.407207   0.216524      0.90568       0.0        0.0           0.0
+ 0.67539    0.000139574   0.317462      0.0        0.0           0.0
+ 0.115811   0.817239      0.811343      0.0        0.0           0.0

Or we can call it with a matrix of shape $N\times{}n$:

E = StiefelProjection(N, n)
10×4 StiefelProjection{Float64, Matrix{Float64}}:
+ 1.0  0.0  0.0  0.0
+ 0.0  1.0  0.0  0.0
+ 0.0  0.0  1.0  0.0
+ 0.0  0.0  0.0  1.0
+ 0.0  0.0  0.0  0.0
+ 0.0  0.0  0.0  0.0
+ 0.0  0.0  0.0  0.0
+ 0.0  0.0  0.0  0.0
+ 0.0  0.0  0.0  0.0
+ 0.0  0.0  0.0  0.0
B3 = B1 * E
+
+StiefelLieAlgHorMatrix(B3, n)
10×10 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}}:
+ 0.0       -0.618058     -0.942954  …  -0.407207  -0.67539      -0.115811
+ 0.618058   0.0          -0.277085     -0.216524  -0.000139574  -0.817239
+ 0.942954   0.277085      0.0          -0.90568   -0.317462     -0.811343
+ 0.94312    0.65307       0.544225     -0.208163  -0.191557     -0.853422
+ 0.599558   0.120066      0.355605      0.0        0.0           0.0
+ 0.624685   0.00618571    0.329038  …   0.0        0.0           0.0
+ 0.546357   0.675435      0.666189      0.0        0.0           0.0
+ 0.407207   0.216524      0.90568       0.0        0.0           0.0
+ 0.67539    0.000139574   0.317462      0.0        0.0           0.0
+ 0.115811   0.817239      0.811343      0.0        0.0           0.0

We now demonstrate how to map from an element of $\mathfrak{g}^{\mathrm{hor}, Y}$ to an element of $\mathfrak{g}^\mathrm{hor}$:

N, n = 10, 5
+
+Y = rand(StiefelManifold, N, n)
+Δ = rgrad(Y, rand(N, n))
+ΩΔ = GeometricMachineLearning.Ω(Y, Δ)
+λY = GlobalSection(Y)
+
+λY_mat = Matrix(λY)
+
+round.(λY_mat' * ΩΔ * λY_mat; digits = 3)
10×10 Matrix{Float64}:
+ -0.0     0.433   1.327  -0.196  -0.075  …  -0.364  -0.872   0.924   0.255
+ -0.433  -0.0     0.702  -0.298  -1.285     -0.09   -0.716   1.162   1.224
+ -1.327  -0.702  -0.0    -1.255  -0.904     -0.506  -0.152   0.303   0.184
+  0.196   0.298   1.255   0.0    -0.764     -0.278  -0.699   1.205   0.38
+  0.075   1.285   0.904   0.764  -0.0       -0.515  -0.365   0.489   0.754
+ -0.682  -0.262  -0.221  -0.539  -0.681  …   0.0    -0.0     0.0    -0.0
+  0.364   0.09    0.506   0.278   0.515     -0.0     0.0    -0.0     0.0
+  0.872   0.716   0.152   0.699   0.365     -0.0    -0.0     0.0     0.0
+ -0.924  -1.162  -0.303  -1.205  -0.489      0.0    -0.0    -0.0    -0.0
+ -0.255  -1.224  -0.184  -0.38   -0.754     -0.0    -0.0     0.0    -0.0

Performing this computation directly is computationally very inefficient, however, and the user is strongly discouraged from calling Matrix on an instance of GlobalSection. The better option is calling global_rep:

_round(global_rep(λY, Δ); digits = 3)
10×10 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, Matrix{Float64}}:
+  0.0     0.433   1.327  -0.196  -0.075  0.682  -0.364  -0.872  0.924  0.255
+ -0.433   0.0     0.702  -0.298  -1.285  0.262  -0.09   -0.716  1.162  1.224
+ -1.327  -0.702   0.0    -1.255  -0.904  0.221  -0.506  -0.152  0.303  0.184
+  0.196   0.298   1.255   0.0    -0.764  0.539  -0.278  -0.699  1.205  0.38
+  0.075   1.285   0.904   0.764   0.0    0.681  -0.515  -0.365  0.489  0.754
+ -0.682  -0.262  -0.221  -0.539  -0.681  0.0     0.0     0.0    0.0    0.0
+  0.364   0.09    0.506   0.278   0.515  0.0     0.0     0.0    0.0    0.0
+  0.872   0.716   0.152   0.699   0.365  0.0     0.0     0.0    0.0    0.0
+ -0.924  -1.162  -0.303  -1.205  -0.489  0.0     0.0     0.0    0.0    0.0
+ -0.255  -1.224  -0.184  -0.38   -0.754  0.0     0.0     0.0    0.0    0.0

Internally GlobalSection calls the function GeometricMachineLearning.global_section which does the following for the Stiefel manifold:

A = randn(N, N - n) # or the gpu equivalent
+A = A - Y * (Y' * A)
+Y⟂ = qr(A).Q[1:N, 1:(N - n)]

So we draw $(N - n)$ new columns randomly, subtract the part that is spanned by the columns of $Y$ and then perform a $QR$ decomposition on the resulting matrix. The $Q$ part of the decomposition is a matrix of $(N - n)$ columns that is orthogonal to $Y$ and is typically referred to as $Y_\perp$ [6, 10, 11]. We can easily check that this $Y_\perp$ is indeed orthogonal to $Y$.
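We can verify this numerically with a brief sketch; here we assume (as in the docstring example further below) that calling Matrix on the output of GeometricMachineLearning.global_section returns the thin $Q$ factor, i.e. a matrix of size $N\times(N-n)$:

using GeometricMachineLearning
using GeometricMachineLearning: global_section
using LinearAlgebra: norm
import Random

Random.seed!(123)

N, n = 10, 5
Y = rand(StiefelManifold, N, n)

Y⟂ = Matrix(global_section(Y)) # assumed: the thin Q factor, orthogonal to the columns of Y

norm(Y.A' * Y⟂) < 1e-10 # Y^T Y⟂ vanishes up to round-off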

Theorem

The matrix $Y_\perp$ constructed with the above algorithm satisfies

\[Y^TY_\perp = \mathbb{O},\]

and

\[(Y_\perp)^TY_\perp = \mathbb{I},\]

i.e. all the columns in the big matrix $[Y, Y_\perp]\in\mathbb{R}^{N\times{}N}$ are mutually orthonormal and it therefore is an element of $SO(N)$.

Proof

The second property is trivially satisfied because the $Q$ component of a $QR$ decomposition is an orthogonal matrix. For the first property note that $Y^TQR = \mathbb{O}$ because we have subtracted the $Y$ component from the matrix $QR$. The matrix $R\in\mathbb{R}^{N\times{}(N-n)}$ further has the property $[R]_{ij} = 0$ for $i > j$ and we have that

\[(Y^TQ)R = [r_{11}(Y^TQ)_{1\bullet}, r_{12}(Y^TQ)_{1\bullet} + r_{22}(Y^TQ)_{2\bullet}, \ldots, \sum_{i=1}^{N-n}r_{i(N-n)}(Y^TQ)_{i\bullet}].\]

Now all the coefficients $r_{ii}$ are non-zero because the matrix we performed the $QR$ decomposition on has full rank and we can see that if $(Y^TQ)R$ is zero $Y^TQ$ also has to be zero.

We now discuss the global tangent space for the Grassmann manifold. This is similar to the Stiefel case.

Global Tangent Space for the Grassmann Manifold

In the case of the Grassmann manifold we construct the global tangent space with respect to the distinct element $\mathcal{E}=\mathrm{span}(E)\in{}Gr(n,N)$, where $E$ is again the same matrix.

The tangent space $T_\mathcal{E}Gr(n,N)$ can be represented through matrices:

\[\begin{pmatrix} 0 & \cdots & 0 \\ \cdots & \cdots & \cdots \\ 0 & \cdots & 0 \\ b_{11} & \cdots & b_{1n} \\ \cdots & \cdots & \cdots \\ b_{(N-n)1} & \cdots & b_{(N-n)n} \end{pmatrix}.\]

This representation is based on the identification $T_\mathcal{E}Gr(n,N)\to{}T_E\mathcal{S}_E$ that was discussed in the section on the Grassmann manifold[2]. We use the following notation:

\[\mathfrak{g}^\mathrm{hor} = \mathfrak{g}^{\mathrm{hor},\mathcal{E}} = \left\{\begin{pmatrix} 0 & -B^T \\ B & 0 \end{pmatrix}: \text{$B$ arbitrary}\right\}.\]

This is equivalent to the horizontal component of $\mathfrak{g}$ for the Stiefel manifold for the case when $A$ is zero. This is a reflection of the rotational invariance of the Grassmann manifold: the skew-symmetric matrices $A$ are connected to the group of rotations $O(n)$ which is factored out in the Grassmann manifold $Gr(n,N)\simeq{}St(n,N)/O(n)$. In GeometricMachineLearning we thus treat the Grassmann manifold as being embedded in the Stiefel manifold. In [11] viewing the Grassmann manifold as a quotient space of the Stiefel manifold is important for "feasibility" in "practical computations".
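As a brief sketch, such an element of $\mathfrak{g}^\mathrm{hor}$ for the Grassmann manifold can be built with the constructor documented below by only supplying the arbitrary block $B$ (the skew-symmetric block is zero by construction):

using GeometricMachineLearning
import Random

Random.seed!(123)

N, n = 6, 2
B = rand(N - n, n)

# only the (N - n) × n block B has to be stored; the A-block is zero
GrassmannLieAlgHorMatrix(B, N, n)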

Library Functions

GeometricMachineLearning.StiefelLieAlgHorMatrixType
StiefelLieAlgHorMatrix(A::SkewSymMatrix{T}, B::AbstractMatrix{T}, N::Integer, n::Integer) where T

Build an instance of StiefelLieAlgHorMatrix based on a skew-symmetric matrix A and an arbitrary matrix B.

StiefelLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is: $\pi:S \to SE$ where

\[E = \begin{pmatrix} \mathbb{I}_{n} \\ \mathbb{O}_{(N-n)\times{}n} \end{pmatrix}.\]

The matrix (E) is implemented under StiefelProjection in GeometricMachineLearning.

An element of StiefelLieAlgHorMatrix takes the form:

\[\begin{pmatrix} A & -B^T \\ B & \mathbb{O} \end{pmatrix},\]

where $A$ is skew-symmetric (this is SkewSymMatrix in GeometricMachineLearning).

Also see GrassmannLieAlgHorMatrix.

source
GeometricMachineLearning.StiefelLieAlgHorMatrixMethod
StiefelLieAlgHorMatrix(D::AbstractMatrix, n::Integer)

Take a big matrix as input and build an instance of StiefelLieAlgHorMatrix belonging to the StiefelManifold $St(n, N)$ where $N$ is the number of rows of D.

If the constructor is called with a big $N\times{}N$ matrix, then the projection is performed the following way:

\[\begin{pmatrix} A & B_1 \\ B_2 & D \end{pmatrix} \mapsto \begin{pmatrix} \mathrm{skew}(A) & -B_2^T \\ B_2 & \mathbb{O} \end{pmatrix}.\]

The operation $\mathrm{skew}:\mathbb{R}^{n\times{}n}\to\mathcal{S}_\mathrm{skew}(n)$ is the skew-symmetrization operation. This is equivalent to calling of SkewSymMatrix with an $n\times{}n$ matrix.

This can also be seen as the operation:

\[D \mapsto \Omega(E, DE) = \mathrm{skew}\left(2 \left(\mathbb{I} - \frac{1}{2} E E^T \right) DE E^T\right).\]

Also see GeometricMachineLearning.Ω.

source
GeometricMachineLearning.GrassmannLieAlgHorMatrixType
GrassmannLieAlgHorMatrix(B::AbstractMatrix{T}, N::Integer, n::Integer) where T

Build an instance of GrassmannLieAlgHorMatrix based on an arbitrary matrix B of size $(N-n)\times{}n$.

GrassmannLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is: $\pi:S \to SE/\sim$ where

\[E = \begin{pmatrix} \mathbb{I}_{n} \\ \mathbb{O}_{(N-n)\times{}n} \end{pmatrix},\]

and the equivalence relation is

\[V_1 \sim V_2 \iff \exists A\in\mathcal{S}_\mathrm{skew}(n) \text{ such that } V_2 = V_1 + \begin{pmatrix} A \\ \mathbb{O} \end{pmatrix}.\]

An element of GrassmannLieAlgHorMatrix takes the form:

\[\begin{pmatrix} \bar{\mathbb{O}} & -B^T \\ B & \mathbb{O} \end{pmatrix},\]

where $\bar{\mathbb{O}}\in\mathbb{R}^{n\times{}n}$ and $\mathbb{O}\in\mathbb{R}^{(N - n)\times{}n}.$

source
GeometricMachineLearning.GrassmannLieAlgHorMatrixMethod
GrassmannLieAlgHorMatrix(D::AbstractMatrix, n::Integer)

Take a big matrix as input and build an instance of GrassmannLieAlgHorMatrix belonging to the GrassmannManifold $Gr(n, N)$ where $N$ is the number of rows of D.

If the constructor is called with a big $N\times{}N$ matrix, then the projection is performed the following way:

\[\begin{pmatrix} A & B_1 \\ B_2 & D \end{pmatrix} \mapsto \begin{pmatrix} \bar{\mathbb{O}} & -B_2^T \\ B_2 & \mathbb{O} \end{pmatrix}.\]

This can also be seen as the operation:

\[D \mapsto \Omega(E, DE - EE^TDE),\]

where $\Omega$ is the horizontal lift GeometricMachineLearning.Ω.

source
GeometricMachineLearning.GlobalSectionType
GlobalSection(Y::AbstractMatrix)

Construct a global section for Y.

A global section $\lambda$ is a mapping from a homogeneous space $\mathcal{M}$ to the corresponding Lie group $G$ such that

\[\lambda(Y)E = Y.\]

Also see apply_section and global_rep.

Implementation

For an implementation of GlobalSection for a custom array (especially manifolds), the function global_section has to be generalized.

source
GeometricMachineLearning.global_sectionFunction
global_section(Y::StiefelManifold)

Compute a matrix of size $N\times(N-n)$ whose columns are orthogonal to the columns in Y.

This matrix is also called $Y_\perp$ [6, 10, 11].

Examples

using GeometricMachineLearning
+using GeometricMachineLearning: global_section
+import Random
+
+Random.seed!(123)
+
+Y = StiefelManifold([1. 0.; 0. 1.; 0. 0.; 0. 0.])
+
+round.(Matrix(global_section(Y)); digits = 3)
+
+# output
+
+4×2 Matrix{Float64}:
+ 0.0    -0.0
+ 0.0     0.0
+ 0.936  -0.353
+ 0.353   0.936

Further note that we convert the QRCompactWYQ object to a Matrix before we display it.

Implementation

The implementation is done with a QR decomposition (LinearAlgebra.qr!). Internally we do:

A = randn(N, N - n) # or the gpu equivalent
+A = A - Y.A * (Y.A' * A)
+qr!(A).Q
source
global_section(Y::GrassmannManifold)

Compute a matrix of size $N\times(N-n)$ whose columns are orthogonal to the columns in Y.

The method global_section for the Grassmann manifold is equivalent to that for the StiefelManifold (we represent the Grassmann manifold as an embedding in the Stiefel manifold).

See the documentation for global_section(Y::StiefelManifold{T}) where T.

source
GeometricMachineLearning.global_repFunction
global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:StiefelManifold{T}}

Express Δ (an element of the tangent space of Y) as an instance of StiefelLieAlgHorMatrix.

This maps an element from $T_Y\mathcal{M}$ to an element of $\mathfrak{g}^\mathrm{hor}$.

These two spaces are isomorphic, and the isomorphism is established through $\lambda(Y)\in{}G$ via:

\[T_Y\mathcal{M} \to \mathfrak{g}^{\mathrm{hor}}, \Delta \mapsto \lambda(Y)^{-1}\Omega(Y, \Delta)\lambda(Y).\]

Also see GeometricMachineLearning.Ω.

Examples

using GeometricMachineLearning
+using GeometricMachineLearning: _round
+import Random 
+
+Random.seed!(123)
+
+Y = rand(StiefelManifold, 6, 3)
+Δ = rgrad(Y, randn(6, 3))
+λY = GlobalSection(Y)
+
+_round(global_rep(λY, Δ); digits = 3)
+
+# output
+
+6×6 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, Matrix{Float64}}:
+  0.0     0.679   1.925   0.981  -2.058   0.4
+ -0.679   0.0     0.298  -0.424   0.733  -0.919
+ -1.925  -0.298   0.0    -1.815   1.409   1.085
+ -0.981   0.424   1.815   0.0     0.0     0.0
+  2.058  -0.733  -1.409   0.0     0.0     0.0
+ -0.4     0.919  -1.085   0.0     0.0     0.0

Implementation

The function global_rep does not in fact perform the entire map $\lambda(Y)^{-1}\Omega(Y, \Delta)\lambda(Y)$ but only

\[\Delta \mapsto \mathrm{skew}(Y^T\Delta),\]

to get the small skew-symmetric matrix and

\[\Delta \mapsto (\lambda(Y)_{[1:N, n:N]}^T \Delta)_{[1:(N-n), 1:n]},\]

for the arbitrary matrix.

source
global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:GrassmannManifold{T}}

Express Δ (an element of the tangent space of Y) as an instance of GrassmannLieAlgHorMatrix.

The method global_rep for GrassmannManifold is similar to that for StiefelManifold.

Examples

using GeometricMachineLearning
+using GeometricMachineLearning: _round
+import Random 
+
+Random.seed!(123)
+
+Y = rand(GrassmannManifold, 6, 3)
+Δ = rgrad(Y, randn(6, 3))
+λY = GlobalSection(Y)
+
+_round(global_rep(λY, Δ); digits = 3)
+
+# output
+
+6×6 GrassmannLieAlgHorMatrix{Float64, Matrix{Float64}}:
+  0.0     0.0     0.0     0.981  -2.058   0.4
+  0.0     0.0     0.0    -0.424   0.733  -0.919
+  0.0     0.0     0.0    -1.815   1.409   1.085
+ -0.981   0.424   1.815   0.0     0.0     0.0
+  2.058  -0.733  -1.409   0.0     0.0     0.0
+ -0.4     0.919  -1.085   0.0     0.0     0.0
source

References

[6]
P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).
[10]
P.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).
[11]
T. Bendokat, R. Zimmermann and P.-A. Absil. A Grassmann manifold handbook: Basic geometry and computational aspects, arXiv preprint arXiv:2011.13699 (2020).
[38]
B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).
  • 1We already introduced this special matrix together with the Stiefel manifold.
  • 2We derived the following expression for the Riemannian gradient of the Grassmann manifold: $\mathrm{grad}_\mathcal{Y}^{Gr}L = \nabla_Y{}L - YY^T\nabla_YL$. The tangent space to the element $\mathcal{E}$ can thus be written as $\bar{B} - EE^T\bar{B}$ where $B\in\mathbb{R}^{N\times{}n}$ and the matrices in this tangent space have the desired form.
diff --git a/latest/arrays/grassmann_lie_alg_hor_matrix/index.html b/latest/arrays/grassmann_lie_alg_hor_matrix/index.html deleted file mode 100644 index 867869164..000000000 --- a/latest/arrays/grassmann_lie_alg_hor_matrix/index.html +++ /dev/null @@ -1,12 +0,0 @@ - -Grassmann Global Tangent Space · GeometricMachineLearning.jl

The horizontal component of the Lie algebra $\mathfrak{g}$ for the Grassmann manifold

Tangent space to the element $\mathcal{E}$

Consider the tangent space to the distinct element $\mathcal{E}=\mathrm{span}(E)\in{}Gr(n,N)$, where $E$ is again:

\[E = \begin{bmatrix} \mathbb{I}_n \\ \mathbb{O} \end{bmatrix}.\]

The tangent space $T_\mathcal{E}Gr(n,N)$ can be represented through matrices:

\[\begin{pmatrix} 0 & \cdots & 0 \\ \cdots & \cdots & \cdots \\ 0 & \cdots & 0 \\ a_{11} & \cdots & a_{1n} \\ \cdots & \cdots & \cdots \\ a_{(N-n)1} & \cdots & a_{(N-n)n} \end{pmatrix},\]

where we have used the identification $T_\mathcal{E}Gr(n,N)\to{}T_E\mathcal{S}_E$ that was discussed in the section on the Grassmann manifold. The Grassmann manifold can also be seen as the Stiefel manifold modulo an equivalence class. This leads to the following (which is used for optimization):

\[\mathfrak{g}^\mathrm{hor} = \mathfrak{g}^{\mathrm{hor},\mathcal{E}} = \left\{\begin{pmatrix} 0 & -B^T \\ B & 0 \end{pmatrix}: \text{$B$ arbitrary}\right\}.\]

This is equivalent to the horizontal component of $\mathfrak{g}$ for the Stiefel manifold for the case when $A$ is zero. This is a reflection of the rotational invariance of the Grassmann manifold: the skew-symmetric matrices $A$ are connected to the group of rotations $O(n)$ which is factored out in the Grassmann manifold $Gr(n,N)\simeq{}St(n,N)/O(n)$.

diff --git a/latest/arrays/skew_symmetric_matrix/index.html b/latest/arrays/skew_symmetric_matrix/index.html index a425b1c97..4a0055487 100644 --- a/latest/arrays/skew_symmetric_matrix/index.html +++ b/latest/arrays/skew_symmetric_matrix/index.html @@ -1,17 +1,145 @@ -Symmetric and Skew-Symmetric Matrices · GeometricMachineLearning.jl

SymmetricMatrix and SkewSymMatrix

There are special implementations of symmetric and skew-symmetric matrices in GeometricMachineLearning.jl. They are implemented to work on GPU and for multiplication with tensors. The following image demonstrates how the data necessary for an instance of SkewSymMatrix are stored[1]:

So what is stored internally is a vector of size $n(n-1)/2$ for the skew-symmetric matrix and a vector of size $n(n+1)/2$ for the symmetric matrix. We can sample a random skew-symmetric matrix:

A = rand(SkewSymMatrix, 5)
5×5 SkewSymMatrix{Float64, Vector{Float64}}:
- 0.0       -0.688575  -0.632069   -0.156252   -0.558559
- 0.688575   0.0       -0.176186   -0.393519   -0.411487
- 0.632069   0.176186   0.0        -0.0464104  -0.732238
- 0.156252   0.393519   0.0464104   0.0        -0.103197
- 0.558559   0.411487   0.732238    0.103197    0.0

and then access the vector:

A.S
10-element Vector{Float64}:
- 0.6885748828439957
- 0.6320687661064008
- 0.17618565721386625
- 0.1562520110945037
- 0.39351918465693936
- 0.04641040842491928
- 0.5585593124110475
- 0.4114869338527216
- 0.7322381512179701
- 0.10319684650083005
  • 1It works similarly for SymmetricMatrix.
+Symmetric and Skew-Symmetric Matrices · GeometricMachineLearning.jl

Symmetric, Skew-Symmetric and Triangular Matrices

Among the special arrays implemented in GeometricMachineLearning SymmetricMatrix, SkewSymMatrix, UpperTriangular and LowerTriangular are the most common ones and these can also be found in other libraries; LinearAlgebra.jl has an implementation of a symmetric matrix called Symmetric for example. The versions of these matrices in GeometricMachineLearning are however more memory efficient as they only store as many parameters as are necessary, i.e. $n(n+1)/2$ for the symmetric matrix and $n(n-1)/2$ for the other three. In addition operations such as matrix and tensor multiplication are implemented for these matrices to work in parallel on GPU.
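A quick sketch of this memory claim, assuming (as the docstrings below indicate) that the storage vector is accessible via the field S for both types:

using GeometricMachineLearning

n = 5
A = SkewSymMatrix(rand(n * (n - 1) ÷ 2), n)
B = SymmetricMatrix(rand(n * (n + 1) ÷ 2), n)

# 10 and 15 stored entries respectively, instead of 25 for a dense 5×5 matrix
(length(A.S), length(B.S)) == (n * (n - 1) ÷ 2, n * (n + 1) ÷ 2)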

We now show the various matrices. First UpperTriangular:

\[U = \begin{pmatrix} 0 & a_{12} & \cdots & a_{1n} \\ 0 & \ddots & & a_{2n} \\ \vdots & \ddots & \ddots & \vdots \\ 0 & \cdots & 0 & 0 \end{pmatrix}.\]

The matrix LowerTriangular:

\[L = \begin{pmatrix} 0 & 0 & \cdots & 0 \\ a_{21} & \ddots & & \vdots \\ \vdots & \ddots & \ddots & \vdots \\ a_{n1} & \cdots & a_{n(n-1)} & 0 \end{pmatrix}.\]

An instance of SkewSymMatrix can be written as $A = L - L^T$ or $A = U - U^T$:

\[A = \begin{pmatrix} 0 & -a_{21} & \cdots & -a_{n1} \\ a_{21} & \ddots & & \vdots \\ \vdots & \ddots & \ddots & \vdots \\ a_{n1} & \cdots & a_{n(n-1)} & 0 \end{pmatrix}.\]

And lastly a SymmetricMatrix:

\[B = \begin{pmatrix} a_{11} & a_{21} & \cdots & a_{n1} \\ a_{21} & \ddots & & \vdots \\ \vdots & \ddots & \ddots & \vdots \\ a_{n1} & \cdots & a_{n(n-1)} & a_{nn} \end{pmatrix}.\]

Note that any matrix $M\in\mathbb{R}^{n\times{}n}$ can be written

\[M = \frac{1}{2}(M - M^T) + \frac{1}{2}(M + M^T),\]

where the first part of this matrix is skew-symmetric and the second part is symmetric. This is also how the constructors for SkewSymMatrix and SymmetricMatrix are designed:

using GeometricMachineLearning
+
+M = rand(3, 3)
3×3 Matrix{Float64}:
+ 0.0599255  0.176197  0.0558987
+ 0.557369   0.468066  0.389839
+ 0.104867   0.247198  0.540566
A = SkewSymMatrix(M)
3×3 SkewSymMatrix{Float64, Vector{Float64}}:
+ 0.0        -0.190586   -0.0244839
+ 0.190586    0.0         0.0713205
+ 0.0244839  -0.0713205   0.0
B = SymmetricMatrix(M)
3×3 SymmetricMatrix{Float64, Vector{Float64}}:
+ 0.0599255  0.366783  0.0803826
+ 0.366783   0.468066  0.318519
+ 0.0803826  0.318519  0.540566
M  ≈ A + B
true

How are Special Matrices Stored?

The following image demonstrates how special matrices are stored in GeometricMachineLearning:

So what is stored internally is a vector of size $n(n-1)/2$ for the skew-symmetric matrix and the triangular matrices and a vector of size $n(n+1)/2$ for the symmetric matrix. We can sample a random skew-symmetric matrix:

using GeometricMachineLearning
+import Random
+Random.seed!(123)
+
+A = rand(SkewSymMatrix, 5)
5×5 SkewSymMatrix{Float64, Vector{Float64}}:
+ 0.0       -0.9063    -0.443494  -0.512083  -0.427328
+ 0.9063     0.0       -0.745673  -0.253849  -0.867547
+ 0.443494   0.745673   0.0       -0.334152  -0.581912
+ 0.512083   0.253849   0.334152   0.0       -0.311448
+ 0.427328   0.867547   0.581912   0.311448   0.0

and then access the vector:

A.S
10-element Vector{Float64}:
+ 0.906299638797481
+ 0.44349373245960455
+ 0.7456733811393941
+ 0.5120830400366143
+ 0.2538490889415096
+ 0.33415153638191886
+ 0.4273278808735992
+ 0.867547200255958
+ 0.5819123423876457
+ 0.3114475007050529

This is equivalent to sampling a vector and then assigning a matrix:

using GeometricMachineLearning
+import Random
+Random.seed!(123)
+
+S = rand(5 * (5 - 1) ÷ 2)
+SkewSymMatrix(S, 5)
5×5 SkewSymMatrix{Float64, Vector{Float64}}:
+ 0.0       -0.9063    -0.443494  -0.512083  -0.427328
+ 0.9063     0.0       -0.745673  -0.253849  -0.867547
+ 0.443494   0.745673   0.0       -0.334152  -0.581912
+ 0.512083   0.253849   0.334152   0.0       -0.311448
+ 0.427328   0.867547   0.581912   0.311448   0.0

These special matrices are important for SympNets, volume-preserving transformers and linear symplectic transformers.

Parallel Computation

The functions GeometricMachineLearning.mat_tensor_mul and GeometricMachineLearning.tensor_mat_mul are also implemented for these matrices for efficient parallel computations. This is elaborated on when we introduce pullbacks.
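As a small illustration, the sketch below assumes the signature mat_tensor_mul(A, B) with the matrix as first argument, acting slice-wise on the third axis of the tensor; it is only meant to convey the idea of batched multiplication:

using GeometricMachineLearning
using GeometricMachineLearning: mat_tensor_mul
import Random

Random.seed!(123)

A = rand(SkewSymMatrix, 5)
B = rand(5, 3, 4) # a batch of four 5×3 matrices

C = mat_tensor_mul(A, B) # assumed: applies A to every slice B[:, :, k]
C[:, :, 1] ≈ A * B[:, :, 1]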

Library Functions

GeometricMachineLearning.UpperTriangularType
UpperTriangular(S::AbstractVector, n::Int)

Build an upper-triangular matrix from a vector.

An upper-triangular matrix is an $n\times{}n$ matrix that has zeros on the diagonal and zeros in the lower-triangular part.

The data are stored in a vector $S$ similarly to other matrices. See LowerTriangular, SkewSymMatrix and SymmetricMatrix.

The struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension $n$ for $A\in\mathbb{R}^{n\times{}n}$.

Examples

using GeometricMachineLearning
+S = [1, 2, 3, 4, 5, 6]
+UpperTriangular(S, 4)
+
+# output
+
+4×4 UpperTriangular{Int64, Vector{Int64}}:
+ 0  1  2  4
+ 0  0  3  5
+ 0  0  0  6
+ 0  0  0  0
source
GeometricMachineLearning.UpperTriangularMethod
UpperTriangular(A::AbstractMatrix)

Build an upper-triangular matrix from a matrix.

This is done by taking the upper right of that matrix.

Examples

using GeometricMachineLearning
+M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
+UpperTriangular(M)
+
+# output
+
+4×4 UpperTriangular{Int64, Vector{Int64}}:
+ 0  2  3   4
+ 0  0  7   8
+ 0  0  0  12
+ 0  0  0   0
source
GeometricMachineLearning.LowerTriangularType
LowerTriangular(S::AbstractVector, n::Int)

Build a lower-triangular matrix from a vector.

A lower-triangular matrix is an $n\times{}n$ matrix that has zeros on the diagonal and zeros in the upper-triangular part.

The data are stored in a vector $S$ similarly to other matrices. See UpperTriangular, SkewSymMatrix and SymmetricMatrix.

The struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension $n$ for $A\in\mathbb{R}^{n\times{}n}$.

Examples

using GeometricMachineLearning
+S = [1, 2, 3, 4, 5, 6]
+LowerTriangular(S, 4)
+
+# output
+
+4×4 LowerTriangular{Int64, Vector{Int64}}:
+ 0  0  0  0
+ 1  0  0  0
+ 2  3  0  0
+ 4  5  6  0
source
GeometricMachineLearning.LowerTriangularMethod
LowerTriangular(A::AbstractMatrix)

Build a lower-triangular matrix from a matrix.

This is done by taking the lower left of that matrix.

Examples

using GeometricMachineLearning
+M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
+LowerTriangular(M)
+
+# output
+
+4×4 LowerTriangular{Int64, Vector{Int64}}:
+  0   0   0  0
+  5   0   0  0
+  9  10   0  0
+ 13  14  15  0
source
GeometricMachineLearning.SymmetricMatrixType
SymmetricMatrix(S::AbstractVector, n::Integer)

Instantiate a symmetric matrix with information stored in vector S.

A SymmetricMatrix $A$ is a matrix $A^T = A$.

Internally the struct saves a vector $S$ of size $n(n+1)\div2$. The conversion is done the following way:

\[[A]_{ij} = \begin{cases} S[( (i-1) i ) \div 2 + j] & \text{if $i\geq{}j$}\\ S[( (j-1) j ) \div 2 + i] & \text{else}. \end{cases}\]

So $S$ stores a concatenation of vectors taken from $A$: $S = [\tilde{a}_1, \tilde{a}_2, \ldots, \tilde{a}_n]$ with $\tilde{a}_i = [[A]_{i1},[A]_{i2},\ldots,[A]_{ii}]$.

Also see SkewSymMatrix, LowerTriangular and UpperTriangular.

Examples

using GeometricMachineLearning
+S = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+SymmetricMatrix(S, 4)
+
+# output
+
+4×4 SymmetricMatrix{Int64, Vector{Int64}}:
+ 1  2  4   7
+ 2  3  5   8
+ 4  5  6   9
+ 7  8  9  10
source
GeometricMachineLearning.SymmetricMatrixMethod
SymmetricMatrix(A::AbstractMatrix)

Perform 0.5 * (A + A') and store the matrix in an efficient way (as a vector with $n(n+1)/2$ entries).

If the constructor is called with a matrix as input it returns a symmetric matrix via the projection:

\[A \mapsto \frac{1}{2}(A + A^T).\]

Examples

using GeometricMachineLearning
+M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
+SymmetricMatrix(M)
+
+# output
+
+4×4 SymmetricMatrix{Float64, Vector{Float64}}:
+ 1.0   3.5   6.0   8.5
+ 3.5   6.0   8.5  11.0
+ 6.0   8.5  11.0  13.5
+ 8.5  11.0  13.5  16.0

Extend help

Note that the constructor is designed in such a way that it always returns matrices of type SymmetricMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.

If the user wishes to allocate a matrix SymmetricMatrix{<:Integer} the constructor SymmetricMatrix(::AbstractVector, n::Integer) has to be called.

source
GeometricMachineLearning.SkewSymMatrixType
SkewSymMatrix(S::AbstractVector, n::Integer)

Instantiate a skew-symmetric matrix with information stored in vector S.

A skew-symmetric matrix $A$ is a matrix $A^T = -A$.

Internally the struct saves a vector $S$ of size $n(n-1)\div2$. The conversion is done the following way:

\[[A]_{ij} = \begin{cases} 0 & \text{if $i=j$} \\ S[( (i-2) (i-1) ) \div 2 + j] & \text{if $i>j$}\\ -S[( (j-2) (j-1) ) \div 2 + i] & \text{else}. \end{cases}\]

Also see SymmetricMatrix, LowerTriangular and UpperTriangular.

Examples

using GeometricMachineLearning
+S = [1, 2, 3, 4, 5, 6]
+SkewSymMatrix(S, 4)
+
+# output
+
+4×4 SkewSymMatrix{Int64, Vector{Int64}}:
+ 0  -1  -2  -4
+ 1   0  -3  -5
+ 2   3   0  -6
+ 4   5   6   0
source
GeometricMachineLearning.SkewSymMatrixMethod
SkewSymMatrix(A::AbstractMatrix)

Perform 0.5 * (A - A') and store the matrix in an efficient way (as a vector with $n(n-1)/2$ entries).

If the constructor is called with a matrix as input it returns a skew-symmetric matrix via the projection:

\[A \mapsto \frac{1}{2}(A - A^T).\]

Examples

using GeometricMachineLearning
+M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
+SkewSymMatrix(M)
+
+# output
+
+4×4 SkewSymMatrix{Float64, Vector{Float64}}:
+ 0.0  -1.5  -3.0  -4.5
+ 1.5   0.0  -1.5  -3.0
+ 3.0   1.5   0.0  -1.5
+ 4.5   3.0   1.5   0.0

Extend help

Note that the constructor is designed in such a way that it always returns matrices of type SkewSymMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.

If the user wishes to allocate a matrix SkewSymMatrix{<:Integer} the constructor SkewSymMatrix(::AbstractVector, n::Integer) has to be called.

source
diff --git a/latest/arrays/stiefel_lie_alg_horizontal/index.html b/latest/arrays/stiefel_lie_alg_horizontal/index.html deleted file mode 100644 index a4d365349..000000000 --- a/latest/arrays/stiefel_lie_alg_horizontal/index.html +++ /dev/null @@ -1,8 +0,0 @@ - -Stiefel Global Tangent Space · GeometricMachineLearning.jl

Horizontal component of the Lie algebra $\mathfrak{g}$

What we use to generalize Adam (and other optimizers) to manifolds is a global tangent space representation of the homogeneous spaces.

For the Stiefel manifold, this global tangent space representation takes a simple form:

\[\mathcal{B} = \begin{bmatrix} A & -B^T \\ B & \mathbb{O} \end{bmatrix},\]

where $A\in\mathbb{R}^{n\times{}n}$ is skew-symmetric and $B\in\mathbb{R}^{(N-n)\times{}n}$ is arbitrary. In GeometricMachineLearning the struct StiefelLieAlgHorMatrix implements elements of this form.

Theoretical background

Vertical and horizontal components

The Stiefel manifold $St(n, N)$ is a homogeneous space obtained from $SO(N)$ by setting two matrices, whose first $n$ columns coincide, equivalent. Another way of expressing this is:

\[A_1 \sim A_2 \iff A_1E = A_2E\]

for

\[E = \begin{bmatrix} \mathbb{I} \\ \mathbb{O}\end{bmatrix}.\]

Because $St(n,N)$ is a homogeneous space, we can take any element $Y\in{}St(n,N)$ and $SO(N)$ acts transitively on it, i.e. can produce any other element in $St(n,N)$. A similar statement is also true regarding the tangent spaces of $St(n,N)$, namely:

\[T_YSt(n,N) = \mathfrak{g}\cdot{}Y,\]

i.e. every tangent space can be expressed through an action of the associated Lie algebra.

The kernel of the mapping $\mathfrak{g}\to{}T_YSt(n,N), B\mapsto{}BY$ is referred to as $\mathfrak{g}^{\mathrm{ver},Y}$, the vertical component of the Lie algebra at $Y$. In the case $Y=E$ it is easy to see that elements belonging to $\mathfrak{g}^{\mathrm{ver},E}$ are of the following form:

\[\begin{bmatrix} \hat{\mathbb{O}} & \tilde{\mathbb{O}}^T \\ \tilde{\mathbb{O}} & C \end{bmatrix},\]

where $\hat{\mathbb{O}}\in\mathbb{R}^{n\times{}n}$ is a "small" matrix and $\tilde{\mathbb{O}}\in\mathbb{R}^{(N-n)\times{}n}$ is a bigger one. $C\in\mathbb{R}^{(N-n)\times{}(N-n)}$ is a skew-symmetric matrix.

The orthogonal complement of the vertical component is referred to as the horizontal component and denoted by $\mathfrak{g}^{\mathrm{hor}, Y}$. It is isomorphic to $T_YSt(n,N)$ and this isomorphism can be found explicitly. In the case of the Stiefel manifold:

\[\Omega(Y, \cdot):T_YSt(n,N)\to\mathfrak{g}^{\mathrm{hor},Y},\, \Delta \mapsto (\mathbb{I} - \frac{1}{2}YY^T)\Delta{}Y^T - Y\Delta^T(\mathbb{I} - \frac{1}{2}YY^T)\]

For the special case $Y=E$ we write $\mathfrak{g}^{\mathrm{hor},E}=:\mathfrak{g}^\mathrm{hor}$; its elements are of the form described at the top of this page.

Special functions

You can also draw random elements from $\mathfrak{g}^\mathrm{hor}$ through e.g.

rand(CUDADevice(), StiefelLieAlgHorMatrix{Float32}, 10, 5)

In this example: $N=10$ and $n=5$.

diff --git a/latest/data_loader/TODO/index.html b/latest/data_loader/TODO/index.html index 2bfdc8adc..03bd3df09 100644 --- a/latest/data_loader/TODO/index.html +++ b/latest/data_loader/TODO/index.html @@ -1,2 +1,2 @@ -DATA Loader TODO · GeometricMachineLearning.jl

DATA Loader TODO

  • [x] Implement @views instead of allocating a new array in every step.
  • [x] Implement sampling without replacement.
  • [x] Store information on the epoch and the current loss.
  • [x] Usually the training loss is computed over the entire data set, we are probably going to do this for one epoch via

\[loss_e = \frac{1}{|batches|}\sum_{batch\in{}batches}loss(batch).\]

Point 4 makes sense because the output of an AD routine is the value of the loss function as well as the pullback.

+DATA Loader TODO · GeometricMachineLearning.jl

DATA Loader TODO

  • [x] Implement @views instead of allocating a new array in every step.
  • [x] Implement sampling without replacement.
  • [x] Store information on the epoch and the current loss.
  • [x] Usually the training loss is computed over the entire data set, we are probably going to do this for one epoch via

\[loss_e = \frac{1}{|batches|}\sum_{batch\in{}batches}loss(batch).\]

Point 4 makes sense because the output of an AD routine is the value of the loss function as well as the pullback.
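A minimal sketch of this averaging with hypothetical per-batch loss values:

# hypothetical losses collected for each batch during one epoch
batch_losses = [0.31, 0.27, 0.24, 0.22]

# epoch loss as the average over all batches
loss_e = sum(batch_losses) / length(batch_losses)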

diff --git a/latest/data_loader/data_loader/index.html b/latest/data_loader/data_loader/index.html index da047074b..2bf2adf0c 100644 --- a/latest/data_loader/data_loader/index.html +++ b/latest/data_loader/data_loader/index.html @@ -1,28 +1,28 @@ -Routines · GeometricMachineLearning.jl

Data Loader

Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient.

Constructor

The data loader can be called with various inputs:

  • A single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).
  • A single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
  • A single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.
  • A tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem, for example, the inputs are $n_p$ matrices (first input argument) and $n_p$ integers (second input argument).
  • A NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors.
  • An EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.

When we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.

The data loader can be called with various types of arrays as input, for example a snapshot matrix:

SnapshotMatrix = rand(Float32, 10, 100)
+Routines · GeometricMachineLearning.jl

Data Loader

Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient.

Constructor

The data loader can be called with various inputs:

  • A single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).
  • A single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
  • A single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.
  • A tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem, for example, the inputs are $n_p$ matrices (first input argument) and $n_p$ integers (second input argument).
  • A NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors.
  • An EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.

When we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.

The data loader can be called with various types of arrays as input, for example a snapshot matrix:

SnapshotMatrix = rand(Float32, 10, 100)
 
-dl = DataLoader(SnapshotMatrix)
DataLoader{Float32, Array{Float32, 3}, Nothing, :RegularData}(Float32[0.08055109; 0.50507367; … ; 0.43392932; 0.5364415;;; 0.2685191; 0.75166935; … ; 0.51979285; 0.6126436;;; 0.6639238; 0.28986597; … ; 0.83610654; 0.41034335;;; … ;;; 0.48763305; 0.62914866; … ; 0.88620144; 0.4455061;;; 0.960353; 0.043744862; … ; 0.15464705; 0.6991339;;; 0.36715263; 0.9714729; … ; 0.12899584; 0.058760107], nothing, 10, 1, 100, nothing, nothing)

or a snapshot tensor:

SnapshotTensor = rand(Float32, 10, 100, 5)
+dl = DataLoader(SnapshotMatrix)
DataLoader{Float32, Array{Float32, 3}, Nothing, :RegularData}(Float32[0.50513726; 0.74658626; … ; 0.85417485; 0.58674866;;; 0.12884498; 0.12181258; … ; 0.4270541; 0.8041934;;; 0.123054326; 0.12353808; … ; 0.55073285; 0.7408406;;; … ;;; 0.7417651; 0.50908566; … ; 0.92497385; 0.034432232;;; 0.62838274; 0.35206145; … ; 0.5772865; 0.5192775;;; 0.7972284; 0.21483058; … ; 0.5223946; 0.7761893], nothing, 10, 1, 100, nothing, nothing)

or a snapshot tensor:

SnapshotTensor = rand(Float32, 10, 100, 5)
 
-dl = DataLoader(SnapshotTensor)
DataLoader{Float32, Array{Float32, 3}, Nothing, :TimeSeries}(Float32[0.47514552 0.6462269 … 0.06573784 0.50689423; 0.98165625 0.77784 … 0.18607223 0.98225456; … ; 0.047580957 0.21996862 … 0.8547021 0.05940044; 0.7892533 0.1836735 … 0.35932755 0.8204741;;; 0.92092264 0.068466365 … 0.7334706 0.79876995; 0.066657364 0.87059236 … 0.29434735 0.0064014792; … ; 0.70073056 0.5457771 … 0.35128164 0.23886418; 0.04435748 0.31036907 … 0.8479832 0.2991631;;; 0.63582414 0.5664668 … 0.6179489 0.62593204; 0.74342746 0.43990523 … 0.04913807 0.25716132; … ; 0.43826455 0.58407176 … 0.46263957 0.27874207; 0.82068676 0.6319619 … 0.8520811 0.3968606;;; 0.790415 0.68754876 … 0.29745066 0.4063906; 0.62705886 0.1459164 … 0.2450074 0.7278039; … ; 0.6094751 0.56492376 … 0.15517104 0.5480405; 0.6015564 0.025696874 … 0.24260992 0.58764863;;; 0.07477158 0.9281514 … 0.03230077 0.7697606; 0.051793456 0.64377385 … 0.84566677 0.025487304; … ; 0.7416206 0.15734309 … 0.68909234 0.9355397; 0.9731908 0.4581895 … 0.4400956 0.9748987], nothing, 10, 100, 5, nothing, nothing)

Here the DataLoader has different properties :RegularData and :TimeSeries. This indicates that in the first case we treat all columns in the input tensor independently (this is mostly used for autoencoder problems), whereas in the second case we have time series-like data, which are mostly used for integration problems. We can also treat a problem with a matrix as input as a time series-like problem by providing an additional keyword argument: autoencoder=false:

SnapshotMatrix = rand(Float32, 10, 100)
+dl = DataLoader(SnapshotTensor)
DataLoader{Float32, Array{Float32, 3}, Nothing, :TimeSeries}(Float32[0.66194 0.9154617 … 0.6931075 0.121753514; 0.57182056 0.8062481 … 0.9143436 0.7425928; … ; 0.76190317 0.3335904 … 0.7732639 0.22695911; 0.33047318 0.40463448 … 0.66562307 0.7051216;;; 0.60021454 0.9664643 … 0.7052766 0.71521145; 0.2096209 0.5104178 … 0.011067748 0.49961013; … ; 0.9403707 0.12779117 … 0.052363217 0.79296273; 0.5510261 0.8520845 … 0.93764496 0.6518958;;; 0.10690129 0.6951603 … 0.6198033 0.46171385; 0.14894313 0.7717574 … 0.8583469 0.41567224; … ; 0.5625989 0.95870256 … 0.97457296 0.14663988; 0.3282981 0.2900036 … 0.44474506 0.6590729;;; 0.08844793 0.7936482 … 0.22784966 0.79666775; 0.3399279 0.785954 … 0.8760918 0.26853603; … ; 0.6861569 0.047835767 … 0.98782694 0.9878639; 0.3487153 0.6853073 … 0.97168374 0.39452732;;; 0.9609875 0.9626641 … 0.84352314 0.24652505; 0.615612 0.17505419 … 0.8249419 0.2309035; … ; 0.07973236 0.6703848 … 0.674812 0.37466198; 0.5187783 0.022566557 … 0.545338 0.94295365], nothing, 10, 100, 5, nothing, nothing)

Here the DataLoader has different properties :RegularData and :TimeSeries. This indicates that in the first case we treat all columns in the input tensor independently (this is mostly used for autoencoder problems), whereas in the second case we have time series-like data, which are mostly used for integration problems. We can also treat a problem with a matrix as input as a time series-like problem by providing an additional keyword argument: autoencoder=false:

SnapshotMatrix = rand(Float32, 10, 100)
 
 dl = DataLoader(SnapshotMatrix; autoencoder=false)
 dl.input_time_steps
100

DataLoader can also be called with a NamedTuple that has q and p as keys.

In this case the field input_dim of DataLoader is interpreted as the sum of the $q$- and $p$-dimensions, i.e. if $q$ and $p$ both evolve on $\mathbb{R}^n$, then input_dim is $2n$.

SymplecticSnapshotTensor = (q = rand(Float32, 10, 100, 5), p = rand(Float32, 10, 100, 5))
 
-dl = DataLoader(SymplecticSnapshotTensor)
DataLoader{Float32, @NamedTuple{q::Array{Float32, 3}, p::Array{Float32, 3}}, Nothing, :TimeSeries}((q = Float32[0.5409074 0.33400947 … 0.38502377 0.73360056; 0.06896615 0.47037792 … 0.9855435 0.63538295; … ; 0.53323656 0.62345487 … 0.46240884 0.29146194; 0.7084074 0.19235885 … 0.18216854 0.8442955;;; 0.14472288 0.14007372 … 0.8809616 0.6116157; 0.37897152 0.93576175 … 0.77061313 0.66473615; … ; 0.0056789517 0.4252711 … 0.5653655 0.5902304; 0.52615535 0.24021763 … 0.43198758 0.13711923;;; 0.6630877 0.6178773 … 0.59372556 0.9214696; 0.96434176 0.8122992 … 0.6555255 0.73922485; … ; 0.9700783 0.2381919 … 0.04027623 0.30654794; 0.8274364 0.8750849 … 0.14833093 0.6140196;;; 0.39286852 0.24402022 … 0.3899287 0.2361691; 0.63470876 0.007992506 … 0.64387584 0.08781725; … ; 0.857335 0.79250467 … 0.38706666 0.82886785; 0.5973287 0.8015378 … 0.68441206 0.8578106;;; 0.6202667 0.723482 … 0.9599344 0.73314255; 0.23462927 0.97896653 … 0.5872662 0.744849; … ; 0.02173847 0.40871108 … 0.18674421 0.8947121; 0.43480593 0.7880513 … 0.63367516 0.0323202], p = Float32[0.9739549 0.5731187 … 0.305143 0.17167062; 0.96901894 0.507784 … 0.9328522 0.48191214; … ; 0.3829015 0.87084776 … 0.14383078 0.6287044; 0.39667743 0.8032714 … 0.17344844 0.95890194;;; 0.3218662 0.2729864 … 0.089969635 0.20348519; 0.73572713 0.5664705 … 0.588519 0.17863005; … ; 0.6173055 0.31093842 … 0.15364754 0.069250286; 0.05654168 0.7284537 … 0.23924196 0.38993198;;; 0.52271116 0.26793134 … 0.6828395 0.6037737; 0.3697217 0.42009896 … 0.44278055 0.8712644; … ; 0.98830336 0.58787394 … 0.17054576 0.5939276; 0.12487197 0.6478382 … 0.112715185 0.7805136;;; 0.73549676 0.31779546 … 0.84229475 0.0052493215; 0.23535657 0.56617683 … 0.7865545 0.66356707; … ; 0.09083426 0.07119304 … 0.22900683 0.15822649; 0.021619916 0.6723977 … 0.82193846 0.99692017;;; 0.58983916 0.76869166 … 0.25612795 0.6429209; 0.64345515 0.5530285 … 0.007190883 0.2756011; … ; 0.7205214 0.440423 … 0.06352472 0.4563014; 0.6496832 0.78338593 … 0.27621824 0.85176206]), nothing, 20, 100, 5, nothing, nothing)
dl.input_dim
20

The Batch struct

Batch is a struct whose functor acts on an instance of DataLoader to produce a sequence of training samples for training for one epoch.

The Constructor

The constructor for Batch is called with:

  • batch_size::Int
  • seq_length::Int (optional)
  • prediction_window::Int (optional)

The first one of these arguments is required; it indicates the number of training samples in a batch. If we deal with time series data then we can additionally supply a sequence length and a prediction window as input arguments to Batch. These indicate the number of input vectors and the number of output vectors.

The functor

An instance of Batch can be called on an instance of DataLoader to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. The output of applying batch::Batch to dl::DataLoader is a tuple of vectors of tuples. Each of these tuples contains two integers: the first is the time index and the second one is the parameter index.

matrix_data = rand(Float32, 2, 10)
+dl = DataLoader(SymplecticSnapshotTensor)
DataLoader{Float32, @NamedTuple{q::Array{Float32, 3}, p::Array{Float32, 3}}, Nothing, :TimeSeries}((q = Float32[0.20179188 0.035817206 … 0.67239285 0.23499113; 0.78551984 0.68971205 … 0.14649713 0.644054; … ; 0.8369193 0.3282665 … 0.59413654 0.19929636; 0.4518789 0.8461747 … 0.65936357 0.910529;;; 0.6816204 0.8312971 … 0.5655682 0.39162058; 0.73700756 0.6043024 … 0.5945279 0.38146853; … ; 0.9267082 0.8980464 … 0.7353773 0.0696488; 0.9958879 0.43105602 … 0.31615728 0.3899536;;; 0.24031478 0.38514966 … 0.25906086 0.43808413; 0.4536457 0.89918923 … 0.42050612 0.5595634; … ; 0.6395411 0.39858943 … 0.39727676 0.519886; 0.5671472 0.27734232 … 0.98940325 0.56534976;;; 0.59748656 0.90232295 … 0.08945525 0.6116041; 0.005614519 0.1757384 … 0.4170062 0.5038106; … ; 0.29899383 0.5852866 … 0.41701317 0.5748584; 0.25108534 0.69047844 … 0.14569855 0.55655384;;; 0.71069247 0.8214506 … 0.7832521 0.8684449; 0.5335912 0.13672042 … 0.6743857 0.91588897; … ; 0.56801677 0.13889539 … 0.7678544 0.8505472; 0.48185366 0.55619437 … 0.58553785 0.3864385], p = Float32[0.7733558 0.7919344 … 0.3560174 0.62236935; 0.74159324 0.70769215 … 0.38506925 0.6031601; … ; 0.83040255 0.560536 … 0.32543802 0.927745; 0.534915 0.11107159 … 0.49384755 0.16928482;;; 0.9781217 0.16058743 … 0.22721452 0.26094443; 0.7046325 0.8730274 … 0.7478503 0.3627321; … ; 0.34608942 0.23993802 … 0.4994654 0.8429688; 0.32908416 0.6250298 … 0.3999623 0.96006054;;; 0.07347268 0.81248146 … 0.30096197 0.13181859; 0.5886079 0.93334925 … 0.8336328 0.7395555; … ; 0.8867444 0.994447 … 0.74361056 0.04716432; 0.1593607 0.5563105 … 0.20140874 0.6070136;;; 0.68009937 0.7911615 … 0.027392745 0.6020984; 0.88526326 0.4613757 … 0.7842136 0.14825207; … ; 0.3249318 0.34767777 … 0.61749214 0.48749298; 0.004522443 0.6348213 … 0.96371275 0.7197439;;; 0.75690556 0.16966146 … 0.6374611 0.4944222; 0.699607 0.05892378 … 0.90446764 0.50296575; … ; 0.5883538 0.4221211 … 0.20364654 0.7515538; 0.8664315 0.2870798 … 0.14026004 0.4805572]), nothing, 20, 100, 5, nothing, nothing)
dl.input_dim
20

The Batch struct

Batch is a struct whose functor acts on an instance of DataLoader to produce a sequence of training samples for training for one epoch.

The Constructor

The constructor for Batch is called with:

  • batch_size::Int
  • seq_length::Int (optional)
  • prediction_window::Int (optional)

The first one of these arguments is required; it indicates the number of training samples in a batch. If we deal with time series data then we can additionally supply a sequence length and a prediction window as input arguments to Batch. These indicate the number of input vectors and the number of output vectors.
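A brief sketch of such a constructor call; the positional order batch_size, seq_length, prediction_window is assumed from the list above:

using GeometricMachineLearning

# batches of 16 samples, with 5 input vectors and 3 output vectors per sample (assumed positional order)
batch = Batch(16, 5, 3)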

The functor

An instance of Batch can be called on an instance of DataLoader to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. The output of applying batch::Batch to dl::DataLoader is a tuple of vectors of tuples. Each of these tuples contains two integers: the first is the time index and the second one is the parameter index.

matrix_data = rand(Float32, 2, 10)
 dl = DataLoader(matrix_data; autoencoder = true)
 
 batch = Batch(3)
-batch(dl)
([(1, 9), (1, 5), (1, 7)], [(1, 10), (1, 6), (1, 1)], [(1, 2), (1, 3), (1, 4)], [(1, 8)])

This also works if the data are in $qp$ form:

qp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))
+batch(dl)
([(1, 5), (1, 10), (1, 1)], [(1, 7), (1, 4), (1, 6)], [(1, 9), (1, 2), (1, 8)], [(1, 3)])

This also works if the data are in $qp$ form:

qp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))
 dl = DataLoader(qp_data; autoencoder = true)
 
 batch = Batch(3)
-batch(dl)
([(1, 1), (1, 7), (1, 4)], [(1, 8), (1, 2), (1, 5)], [(1, 10), (1, 6), (1, 3)], [(1, 9)])

In those two examples the autoencoder keyword was set to true (the default). This is why the first index was always 1. This changes if we set autoencoder = false:

qp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))
+batch(dl)
([(1, 9), (1, 10), (1, 5)], [(1, 1), (1, 7), (1, 4)], [(1, 8), (1, 3), (1, 2)], [(1, 6)])

In those two examples the autoencoder keyword was set to true (the default). This is why the first index was always 1. This changes if we set autoencoder = false:

qp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))
 dl = DataLoader(qp_data; autoencoder = false) # false is default
 
 batch = Batch(3)
-batch(dl)
([(9, 1), (4, 1), (7, 1)], [(2, 1), (1, 1), (5, 1)], [(8, 1), (3, 1), (6, 1)])

Specifically the routines do the following:

  1. $\mathtt{n\_indices}\leftarrow \mathtt{n\_params}\lor\mathtt{input\_time\_steps},$
  2. $\mathtt{indices} \leftarrow \mathtt{shuffle}(\mathtt{1:\mathtt{n\_indices}}),$
  3. $\mathcal{I}_i \leftarrow \mathtt{indices[(i - 1)} \cdot \mathtt{batch\_size} + 1 \mathtt{:} i \cdot \mathtt{batch\_size]}\text{ for }i=1, \ldots, (\mathrm{last} -1),$
  4. $\mathcal{I}_\mathtt{last} \leftarrow \mathtt{indices[}(\mathtt{n\_batches} - 1) \cdot \mathtt{batch\_size} + 1\mathtt{:end]}.$

Note that the routines are implemented in such a way that no two indices appear double.

Sampling from a tensor

We can also sample tensor data.

qp_data = (q = rand(Float32, 2, 20, 3), p = rand(Float32, 2, 20, 3))
+batch(dl)
([(6, 1), (1, 1), (3, 1)], [(5, 1), (7, 1), (9, 1)], [(8, 1), (4, 1), (2, 1)])

Specifically the routines do the following:

  1. $\mathtt{n\_indices}\leftarrow \mathtt{n\_params}\lor\mathtt{input\_time\_steps},$
  2. $\mathtt{indices} \leftarrow \mathtt{shuffle}(\mathtt{1:\mathtt{n\_indices}}),$
  3. $\mathcal{I}_i \leftarrow \mathtt{indices[(i - 1)} \cdot \mathtt{batch\_size} + 1 \mathtt{:} i \cdot \mathtt{batch\_size]}\text{ for }i=1, \ldots, (\mathrm{last} -1),$
  4. $\mathcal{I}_\mathtt{last} \leftarrow \mathtt{indices[}(\mathtt{n\_batches} - 1) \cdot \mathtt{batch\_size} + 1\mathtt{:end]}.$

Note that the routines are implemented in such a way that no two indices appear double.
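This can be verified with a short sketch: for the 10-column matrix from above, every column index should be drawn exactly once per epoch.

using GeometricMachineLearning
import Random

Random.seed!(123)

matrix_data = rand(Float32, 2, 10)
dl = DataLoader(matrix_data; autoencoder = true)
batch = Batch(3)

# collect all (time index, parameter index) pairs drawn during one epoch
indices = vcat(batch(dl)...)

sort([index[2] for index in indices]) == 1:10 # every column appears exactly once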

Sampling from a tensor

We can also sample tensor data.

qp_data = (q = rand(Float32, 2, 20, 3), p = rand(Float32, 2, 20, 3))
 dl = DataLoader(qp_data)
 
 # also specify sequence length here
 batch = Batch(4, 5)
batch(dl)
([(4, 1), (3, 1), (2, 1), (11, 1)], [(7, 1), (9, 1), (5, 1), (10, 1)], [(6, 1), (8, 1), (1, 1), (4, 3)], [(3, 3), (2, 3), (11, 3), (7, 3)], [(9, 3), (5, 3), (10, 3), (6, 3)], [(8, 3), (1, 3), (4, 2), (3, 2)], [(2, 2), (11, 2), (7, 2), (9, 2)], [(5, 2), (10, 2), (6, 2), (8, 2)], [(1, 2)])

Sampling from a tensor is done the following way ($\mathcal{I}_i$ again denotes the batch indices for the $i$-th batch):

  1. $\mathtt{time\_indices} \leftarrow \mathtt{shuffle}(\mathtt{1:}(\mathtt{input\_time\_steps} - \mathtt{seq\_length} - \mathtt{prediction\_window})),$
  2. $\mathtt{parameter\_indices} \leftarrow \mathtt{shuffle}(\mathtt{1:n\_params}),$
  3. $\mathtt{complete\_indices} \leftarrow \mathtt{product}(\mathtt{time\_indices}, \mathtt{parameter\_indices}),$
  4. $\mathcal{I}_i \leftarrow \mathtt{complete\_indices[}(i - 1) \cdot \mathtt{batch\_size} + 1 : i \cdot \mathtt{batch\_size]}\text{ for }i=1, \ldots, (\mathrm{last} -1),$
  5. $\mathcal{I}_\mathrm{last} \leftarrow \mathtt{complete\_indices[}(\mathrm{last} - 1) \cdot \mathtt{batch\_size} + 1\mathtt{:end]}.$

This algorithm can be visualized the following way (here batch_size = 4):

Here the sampling is performed over the second axis (the time step dimension) and the third axis (the parameter dimension). Whereas each block has thickness 1 in the $x$ direction (i.e. pertains to a single parameter), its length in the $y$ direction is seq_length. In total we sample as many such blocks as the batch size prescribes. By construction those blocks are never the same throughout a training epoch but may intersect each other!
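
The five-step tensor sampling described above can likewise be sketched in Julia (an illustration only; tensor_batch_indices is a hypothetical helper following the notation above):

using Random: shuffle

function tensor_batch_indices(input_time_steps, n_params, seq_length, prediction_window, batch_size)
    time_indices = shuffle(1:(input_time_steps - seq_length - prediction_window))   # step 1
    parameter_indices = shuffle(1:n_params)                                         # step 2
    complete_indices = [(t, p) for p in parameter_indices for t in time_indices]    # step 3
    n_batches = ceil(Int, length(complete_indices) / batch_size)
    [complete_indices[(i - 1) * batch_size + 1:min(i * batch_size, length(complete_indices))] for i in 1:n_batches]  # steps 4 and 5
end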

diff --git a/latest/data_loader/snapshot_matrix/index.html b/latest/data_loader/snapshot_matrix/index.html index 52e347d4a..46d45d04b 100644 --- a/latest/data_loader/snapshot_matrix/index.html +++ b/latest/data_loader/snapshot_matrix/index.html @@ -1,8 +1,8 @@ -Snapshot matrix & tensor · GeometricMachineLearning.jl

Snapshot matrix

The snapshot matrix stores solutions of the high-dimensional ODE (obtained from discretizing a PDE). This is then used to construct reduced bases in a data-driven way. So (for a single parameter[1]) the snapshot matrix takes the following form:

\[M = \left[\begin{array}{c:c:c:c} \hat{u}_1(t_0) & \hat{u}_1(t_1) & \quad\ldots\quad & \hat{u}_1(t_f) \\ \hat{u}_2(t_0) & \hat{u}_2(t_1) & \ldots & \hat{u}_2(t_f) \\ \hat{u}_3(t_0) & \hat{u}_3(t_1) & \ldots & \hat{u}_3(t_f) \\ \ldots & \ldots & \ldots & \ldots \\ \hat{u}_{2N}(t_0) & \hat{u}_{2N}(t_1) & \ldots & \hat{u}_{2N}(t_f) \\ \end{array}\right].\]

In the above example we store a matrix whose first axis is the system dimension (i.e. a state is an element of $\mathbb{R}^{2n}$) and the second dimension gives the time step.

The starting point for using the snapshot matrix as data for a machine learning model is that all the columns of $M$ live on a lower-dimensional solution manifold and we can use techniques such as POD and autoencoders to find this solution manifold. We also note that the second axis of $M$ does not necessarily indicate time but can also represent various parameters (including initial conditions). The second axis in the DataLoader struct is therefore saved in the field n_params.

Snapshot tensor

The snapshot tensor fulfills the same role as the snapshot matrix but has a third axis that describes different initial parameters (such as different initial conditions).

When drawing training samples from the snapshot tensor we also need to specify a sequence length (as an argument to the Batch struct). When sampling a batch from the snapshot tensor we sample over the starting point of the time interval (which is of length seq_length) and the third axis of the tensor (the parameters). The total number of batches in this case is $\lceil\mathtt{(dl.input\_time\_steps - batch.seq\_length) * dl.n\_params / batch.batch\_size}\rceil$.
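
As a quick illustration of this count (a sketch with arbitrary values; n_batches is an ad-hoc helper, not a library function):

n_batches(input_time_steps, seq_length, n_params, batch_size) =
    ceil(Int, (input_time_steps - seq_length) * n_params / batch_size)

n_batches(100, 10, 4, 32)  # 12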

  • 1If we deal with a parametrized PDE then there are two stages at which the snapshot matrix has to be processed: the offline stage and the online stage.
diff --git a/latest/index.html b/latest/index.html index 09b22c7d2..54a046733 100644 --- a/latest/index.html +++ b/latest/index.html @@ -1,2 +1,2 @@ -Home · GeometricMachineLearning.jl

    Geometric Machine Learning

    GeometricMachineLearning.jl implements various scientific machine learning models that aim at learning dynamical systems with geometric structure, such as Hamiltonian (symplectic) or Lagrangian (variational) systems.

    Installation

    GeometricMachineLearning.jl and all of its dependencies can be installed via the Julia REPL by typing

    ]add GeometricMachineLearning

    Architectures

    There are several architectures tailored towards problems in scientific machine learning implemented in GeometricMachineLearning.

    Manifolds

    GeometricMachineLearning supports putting neural network weights on manifolds. These include:

      Special Neural Network Layer

Many layers have been adapted in order to be used for problems in scientific machine learning, including:

      Tutorials

      Tutorials for using GeometricMachineLearning are:

      Reduced Order Modeling

A short description of the key concepts in reduced order modeling (where GeometricMachineLearning can be used) is given in:

      diff --git a/latest/layers/attention_layer/index.html b/latest/layers/attention_layer/index.html index 43190cd44..d26efbb3a 100644 --- a/latest/layers/attention_layer/index.html +++ b/latest/layers/attention_layer/index.html @@ -1,10 +1,10 @@ -Attention · GeometricMachineLearning.jl

      The Attention Layer

      The attention mechanism was originally developed for image and natural language processing (NLP) tasks. It is motivated by the need to handle time series data in an efficient way[1]. Its essential idea is to compute correlations between vectors in input sequences. I.e. given sequences

      \[(z_q^{(1)}, z_q^{(2)}, \ldots, z_q^{(T)}) \text{ and } (z_p^{(1)}, z_p^{(2)}, \ldots, z_p^{(T)}),\]

      an attention mechanism computes pair-wise correlations between all combinations of two input vectors from these sequences. In [16] "additive" attention is used to compute such correlations:

      \[(z_q, z_k) \mapsto v^T\sigma(Wz_q + Uz_k), \]

      where $z_q, z_k \in \mathbb{R}^d$ are elements of the input sequences. The learnable parameters are $W, U \in \mathbb{R}^{n\times{}d}$ and $v \in \mathbb{R}^n$.

However multiplicative attention (see e.g. [17]) is more straightforward to interpret and cheaper to handle computationally:

      \[(z_q, z_k) \mapsto z_q^TWz_k,\]

      where $W \in \mathbb{R}^{d\times{}d}$ is a learnable weight matrix with respect to which correlations are computed as scalar products. Regardless of the type of attention used, they all try to compute correlations among input sequences on whose basis further computation is performed. Given two input sequences $Z_q = (z_q^{(1)}, \ldots, z_q^{(T)})$ and $Z_k = (z_k^{(1)}, \ldots, z_k^{(T)})$, we can arrange the various correlations into a correlation matrix $C\in\mathbb{R}^{T\times{}T}$ with entries $[C]_{ij} = \mathtt{attention}(z_q^{(i)}, z_k^{(j)})$. In the case of multiplicative attention this matrix is just $C = Z^TWZ$.

      Reweighting of the input sequence

      In GeometricMachineLearning we always compute self-attention, meaning that the two input sequences $Z_q$ and $Z_k$ are the same, i.e. $Z = Z_q = Z_k$.[2]

      This is then used to reweight the columns in the input sequence $Z$. For this we first apply a nonlinearity $\sigma$ onto $C$ and then multiply $\sigma(C)$ onto $Z$ from the right, i.e. the output of the attention layer is $Z\sigma(C)$. So we perform the following mappings:

      \[Z \xrightarrow{\mathrm{correlations}} C(Z) =: C \xrightarrow{\sigma} \sigma(C) \xrightarrow{\text{right multiplication}} Z \sigma(C).\]

After the right multiplication the output is of the following form:

      \[ [\sum_{i=1}^Tp^{(1)}_iz^{(i)}, \ldots, \sum_{i=1}^Tp^{(T)}_iz^{(i)}],\]

      for $p^{(i)} = [\sigma(C)]_{\bullet{}i}$. What is learned during training are $T$ different linear combinations of the input vectors, where the coefficients $p^{(i)}_j$ in these linear combinations depend on the input $Z$ nonlinearly.

      Volume-Preserving Attention

The attention layer (and the activation function $\sigma$ defined for it) in GeometricMachineLearning was specifically designed for data coming from physical systems that can be described through a divergence-free or a symplectic vector field. Traditionally the nonlinearity in the attention mechanism is a softmax[3] (see [17]) and the self-attention layer performs the following mapping:

      \[Z := [z^{(1)}, \ldots, z^{(T)}] \mapsto Z\mathrm{softmax}(Z^TWZ).\]

      The softmax activation acts vector-wise, i.e. if we supply it with a matrix $C$ as input it returns:

      \[\mathrm{softmax}(C) = [\mathrm{softmax}(c_{\bullet{}1}), \ldots, \mathrm{softmax}(c_{\bullet{}T})].\]

      The output of a softmax is a probability vector (also called stochastic vector) and the matrix $P = [p^{(1)}, \ldots, p^{(T)}]$, where each column is a probability vector, is sometimes referred to as a stochastic matrix (see [18]). This attention mechanism finds application in transformer neural networks [17]. The problem with this matrix from a geometric point of view is that all the columns are independent of each other and the nonlinear transformation could in theory produce a stochastic matrix for which all columns are identical and thus lead to a loss of information. So the softmax activation function is inherently non-geometric.

      Besides the traditional attention mechanism GeometricMachineLearning therefore also has a volume-preserving transformation that fulfills a similar role. There are two approaches implemented to realize similar transformations. Both of them however utilize the Cayley transform to produce orthogonal matrices $\sigma(C)$ instead of stochastic matrices. For an orthogonal matrix $\Sigma$ we have $\Sigma^T\Sigma = \mathbb{I}$, so all the columns are linearly independent which is not necessarily true for a stochastic matrix $P$. The following explains how this new activation function is implemented.

      The Cayley transform

      The Cayley transform maps from skew-symmetric matrices to orthonormal matrices[4]. It takes the form:

      \[\mathrm{Cayley}: A \mapsto (\mathbb{I} - A)(\mathbb{I} + A)^{-1}.\]

We can easily check that $\mathrm{Cayley}(A)$ is orthogonal if $A$ is skew-symmetric. For this consider a curve $\varepsilon \mapsto A(\varepsilon)\in\mathcal{S}_\mathrm{skew}$ with $A(0) = A$ and $A'(0) = B$. Then we have:

      \[\frac{\delta\mathrm{Cayley}}{\delta{}A} = \frac{d}{d\varepsilon}|_{\varepsilon=0} \mathrm{Cayley}(A(\varepsilon))^T \mathrm{Cayley}(A(\varepsilon)) = \mathbb{O}.\]
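This property is easy to verify numerically; the following lines are a quick standalone check (an illustration only, not part of the library):

using LinearAlgebra

cayley(M) = (I - M) / (I + M)     # (I - M)(I + M)⁻¹

B = rand(4, 4)
A = B - B'                        # a skew-symmetric matrix
Σ = cayley(A)
Σ' * Σ ≈ Matrix(I, 4, 4)          # true up to floating-point error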

      In order to use the Cayley transform as an activation function we further need a mapping from the input $Z$ to a skew-symmetric matrix. This is realized in two ways in GeometricMachineLearning: via a scalar-product with a skew-symmetric weighting and via a scalar-product with an arbitrary weighting.

      First approach: scalar products with a skew-symmetric weighting

      For this the attention layer is modified in the following way:

      \[Z := [z^{(1)}, \ldots, z^{(T)}] \mapsto Z\sigma(Z^TAZ),\]

      where $\sigma(C)=\mathrm{Cayley}(C)$ and $A$ is a skew-symmetric matrix that is learnable, i.e. the parameters of the attention layer are stored in $A$.
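A minimal sketch of this first approach (illustration only; the library layer stores $A$ as a learnable parameter and handles batches of sequences):

using LinearAlgebra

d, T = 4, 3
Z = rand(d, T)                    # input sequence as a d × T matrix
W = rand(d, d)
A = W - W'                        # skew-symmetric weighting
C = Z' * A * Z                    # correlation matrix, again skew-symmetric
cayley(M) = (I - M) / (I + M)
output = Z * cayley(C)            # reweighting with an orthogonal matrix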

      Second approach: scalar products with an arbitrary weighting

For this approach we compute correlations between the input vectors with an arbitrary weighting and then arrange them into a skew-symmetric matrix. The correlations we consider here are based on:

      \[(z^{(2)})^TAz^{(1)}, (z^{(3)})^TAz^{(1)}, \ldots, (z^{(d)})^TAz^{(1)}, (z^{(3)})^TAz^{(2)}, \ldots, (z^{(d)})^TAz^{(2)}, \ldots, (z^{(d)})^TAz^{(d-1)}.\]

So in total we consider correlations $(z^{(i)})^TAz^{(j)}$ for which $i > j$. We now arrange these correlations into a skew-symmetric matrix:

\[C = \begin{bmatrix} 0 & -(z^{(2)})^TAz^{(1)} & -(z^{(3)})^TAz^{(1)} & \ldots & -(z^{(d)})^TAz^{(1)} \\ (z^{(2)})^TAz^{(1)} & 0 & -(z^{(3)})^TAz^{(2)} & \ldots & -(z^{(d)})^TAz^{(2)} \\ \ldots & \ldots & \ldots & \ldots & \ldots \\ (z^{(d)})^TAz^{(1)} & (z^{(d)})^TAz^{(2)} & (z^{(d)})^TAz^{(3)} & \ldots & 0 \end{bmatrix}.\]

      This correlation matrix can now again be used as an input for the Cayley transform to produce an orthogonal matrix.

      How is structure preserved?

      In order to discuss how structure is preserved we first have to define what structure we mean precisely. This structure is strongly inspired by traditional multi-step methods (see [19]). We now define what volume preservation means for the product space $\mathbb{R}^{d}\times\cdots\times\mathbb{R}^{d}\equiv\times_\text{$T$ times}\mathbb{R}^{d}$.

      Consider an isomorphism $\hat{}: \times_\text{($T$ times)}\mathbb{R}^{d}\stackrel{\approx}{\longrightarrow}\mathbb{R}^{dT}$. Specifically, this isomorphism takes the form:

\[Z = \left[\begin{array}{cccc} z_1^{(1)} & z_1^{(2)} & \quad\cdots\quad & z_1^{(T)} \\ z_2^{(1)} & z_2^{(2)} & \cdots & z_2^{(T)} \\ \cdots & \cdots & \cdots & \cdots \\ z_d^{(1)} & z_d^{(2)} & \cdots & z_d^{(T)} \end{array}\right] \mapsto \left[\begin{array}{c} z_1^{(1)} \\ z_1^{(2)} \\ \ldots \\ z_1^{(T)} \\ z_2^{(1)} \\ \ldots \\ z_d^{(T)} \end{array}\right] =: \hat{Z}.\]

In these vectorized coordinates the attention layer acts through the block-diagonal matrix $\tilde{\Lambda}(Z)$:

\[\hat{Z} \mapsto \tilde{\Lambda}(Z)\hat{Z} = \begin{pmatrix} \Lambda(Z) & \mathbb{O} & \cdots & \mathbb{O} \\ \mathbb{O} & \Lambda(Z) & \cdots & \mathbb{O} \\ \cdots & \cdots & \ddots & \cdots \\ \mathbb{O} & \mathbb{O} & \cdots & \Lambda(Z) \\ \end{pmatrix} \left[\begin{array}{c} z_1^{(1)} \\ z_1^{(2)} \\ \ldots \\ z_1^{(T)} \\ z_2^{(1)} \\ \ldots \\ z_d^{(T)} \end{array}\right] .\]

$\tilde{\Lambda}(Z)$ in the equation above is easily shown to be an orthogonal matrix.

      Historical Note

      Attention was used before, but always in connection with recurrent neural networks (see [20] and [16]).

      References

      [16]
      D. Bahdanau, K. Cho and Y. Bengio. Neural machine translation by jointly learning to align and translate, arXiv preprint arXiv:1409.0473 (2014).
      [20]
      M.-T. Luong, H. Pham and C. D. Manning. Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015).
      • 1Recurrent neural networks have the same motivation.
      • 2Multihead attention also falls into this category. Here the input $Z$ is multiplied from the left with several projection matrices $P^Q_i$ and $P^K_i$, where $i$ indicates the head. For each head we then compute a correlation matrix $(P^Q_i Z)^T(P^K Z)$.
      • 3The softmax acts on the matrix $C$ in a vector-wise manner, i.e. it operates on each column of the input matrix $C = [c^{(1)}, \ldots, c^{(T)}]$. The result is a sequence of probability vectors $[p^{(1)}, \ldots, p^{(T)}]$ for which $\sum_{i=1}^Tp^{(j)}_i=1\quad\forall{}j\in\{1,\dots,T\}.$
      • 4A matrix $A$ is skew-symmetric if $A = -A^T$ and a matrix $B$ is orthonormal if $B^TB = \mathbb{I}$. The orthonormal matrices form a Lie group, i.e. the set of orthonormal matrices can be endowed with the structure of a differential manifold and this set also satisfies the group axioms. The corresponding Lie algebra are the skew-symmetric matrices and the Cayley transform is a so-called retraction in this case. For more details consult e.g. [7] and [10].
      diff --git a/latest/layers/linear_symplectic_attention/index.html b/latest/layers/linear_symplectic_attention/index.html index ba00f031b..54abb9d98 100644 --- a/latest/layers/linear_symplectic_attention/index.html +++ b/latest/layers/linear_symplectic_attention/index.html @@ -1,2 +1,2 @@ -Linear Symplectic Attention · GeometricMachineLearning.jl

      Linear Symplectic Attention

      The attention layer introduced here is an extension of the Sympnet gradient layer to the setting where we deal with time series data. We first have to define a notion of symplecticity for multi-step methods.

      This definition is essentially taken from [21, 22] and similar to the definition of volume-preservation in [23].

      Definition

A multi-step method $\times_T\mathbb{R}^{2n}\to\times_T\mathbb{R}^{2n}$ is called symplectic if it preserves the symplectic product structure.

      The symplectic product structure is the following skew-symmetric non-degenerate bilinear form:

      \[\mathbb{J}([z^{(1)}, \ldots, z^{(T)}], [\tilde{z}^{(1)}, \ldots, \tilde{z}^{(T)}]) := \sum_{i=1}^T (z^{(i)})^T\tilde{z}^{(i)}.\]

In order to construct a symplectic attention mechanism we extend the principle of the SympNet gradient layer, i.e. we construct scalar functions that only depend on $[q^{(1)}, \ldots, q^{(T)}]$ or $[p^{(1)}, \ldots, p^{(T)}]$. The specific choice we make here is the following:

\[F(q^{(1)}, \ldots, q^{(T)}) = \frac{1}{2}\mathrm{Tr}(QAQ^T),\]

      where $Q := [q^{(1)}, \ldots, q^{(T)}]$. We therefore have for the gradient:

      \[\nabla_Qf = \frac{1}{2}Q(A + A^T) =: Q\bar{A},\]

where $\bar{A} := \frac{1}{2}(A + A^T)$ is a symmetric matrix. So the map performs:

      \[[q^{(1)}, \ldots, q^{(T)}] \mapsto \left[ \sum_{i=1}^Ta_{1i}q^{(i)}, \ldots, \sum_{i=1}^Ta_{Ti}q^{(i)} \right].\]
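
The gradient formula can be verified numerically with a few lines of Julia (a standalone sketch, not library code):

using LinearAlgebra

T, d = 4, 3
Q = rand(d, T); A = rand(T, T)
F(Q) = tr(Q * A * Q') / 2
∇F = Q * (A + A') / 2                            # = Q * Ā

V = rand(d, T); ε = 1e-6                         # finite-difference check in direction V
abs((F(Q + ε * V) - F(Q)) / ε - dot(∇F, V)) < 1e-4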

      Library Functions

      GeometricMachineLearning.LinearSymplecticAttentionType

      Implements the linear symplectic attention layers. Analogous to GradientLayer it performs mappings that only change the $Q$ or the $P$ part. For more information see LinearSymplecticAttentionQ and LinearSymplecticAttentionP.

      Constructor

      For the constructors simply call

      LinearSymplecticAttentionQ(sys_dim, seq_length)

      or

      LinearSymplecticAttentionP(sys_dim, seq_length)

      where sys_dim is the system dimension and seq_length is the sequence length.

      source
      diff --git a/latest/layers/multihead_attention_layer/index.html b/latest/layers/multihead_attention_layer/index.html index 98d5705d6..4180eea95 100644 --- a/latest/layers/multihead_attention_layer/index.html +++ b/latest/layers/multihead_attention_layer/index.html @@ -1,2 +1,2 @@ -Multihead Attention · GeometricMachineLearning.jl

      Multihead Attention

In order to arrive from the attention layer at the multihead attention layer we have to make a few modifications.

Note that these neural networks were originally developed for natural language processing (NLP) tasks and the terminology used here bears some resemblance to that field. The input to a multihead attention layer typically comprises three components:

      1. Values $V\in\mathbb{R}^{n\times{}T}$: a matrix whose columns are value vectors,
      2. Queries $Q\in\mathbb{R}^{n\times{}T}$: a matrix whose columns are query vectors,
      3. Keys $K\in\mathbb{R}^{n\times{}T}$: a matrix whose columns are key vectors.

      Regular attention performs the following operation:

      \[\mathrm{Attention}(Q,K,V) = V\mathrm{softmax}(\frac{K^TQ}{\sqrt{n}}),\]

where $n$ is the dimension of the vectors in $V$, $Q$ and $K$. The softmax activation function here acts column-wise, so it can be seen as a transformation $\mathrm{softmax}:\mathbb{R}^{T}\to\mathbb{R}^T$ with $[\mathrm{softmax}(v)]_i = e^{v_i}/\left(\sum_{j=1}^Te^{v_j}\right)$. The $K^TQ$ term is a similarity matrix between the keys and the queries.
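
A compact sketch of this operation in plain Julia (illustration only; softmax, colwise_softmax and attention are ad-hoc helper names, not library functions):

softmax(v) = exp.(v) ./ sum(exp.(v))
colwise_softmax(C) = reduce(hcat, [softmax(c) for c in eachcol(C)])
attention(Q, K, V) = V * colwise_softmax(K' * Q / sqrt(size(V, 1)))   # V softmax(KᵀQ / √n)

n, T = 4, 3
Q, K, V = rand(n, T), rand(n, T), rand(n, T)
attention(Q, K, V)   # an n × T matrix of reweighted values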

      The transformer contains a self-attention mechanism, i.e. takes an input $X$ and then transforms it linearly to $V$, $Q$ and $K$, i.e. $V = P^VX$, $Q = P^QX$ and $K = P^KX$. What distinguishes the multihead attention layer from the singlehead attention layer, is that there is not just one $P^V$, $P^Q$ and $P^K$, but there are several: one for each head of the multihead attention layer. After computing the individual values, queries and vectors, and after applying the softmax, the outputs are then concatenated together in order to obtain again an array that is of the same size as the input array:

      Here the various $P$ matrices can be interpreted as being projections onto lower-dimensional subspaces, hence the designation by the letter $P$. Because of this interpretation as projection matrices onto smaller spaces that should capture features in the input data it makes sense to constrain these elements to be part of the Stiefel manifold.

      Computing Correlations in the Multihead-Attention Layer

The attention mechanism describes a reweighting of the "values" $V_i$ based on correlations between the "keys" $K_i$ and the "queries" $Q_i$. First note the structure of these matrices: they are all collections of $T$ $(N\div\mathtt{n\_heads})$-dimensional vectors, i.e. $V_i=[v_i^{(1)}, \ldots, v_i^{(T)}], K_i=[k_i^{(1)}, \ldots, k_i^{(T)}], Q_i=[q_i^{(1)}, \ldots, q_i^{(T)}]$. Those vectors have been obtained by applying the respective projection matrices onto the original input $I_i\in\mathbb{R}^{N\times{}T}$.

      When performing the reweighting of the columns of $V_i$ we first compute the correlations between the vectors in $K_i$ and in $Q_i$ and store the results in a correlation matrix $C_i$:

      \[ [C_i]_{mn} = \left(k_i^{(m)}\right)^Tq_i^{(n)}.\]

The columns of this correlation matrix are then rescaled with a softmax function, obtaining a matrix of probability vectors $\mathcal{P}_i$:

      \[ [\mathcal{P}_i]_{\bullet{}n} = \mathrm{softmax}([C_i]_{\bullet{}n}).\]

Finally the matrix $\mathcal{P}_i$ is multiplied onto $V_i$ from the right, resulting in $T$ convex combinations of the $T$ vectors $v_i^{(m)}$ with $m=1,\ldots,T$:

\[ V_i\mathcal{P}_i = \left[\sum_{m=1}^{T}[\mathcal{P}_i]_{m,1}v_i^{(m)}, \ldots, \sum_{m=1}^{T}[\mathcal{P}_i]_{m,T}v_i^{(m)}\right].\]

      With this we can now give a better interpretation of what the projection matrices $W_i^V$, $W_i^K$ and $W_i^Q$ should do: they map the original data to lower-dimensional subspaces. We then compute correlations between the representation in the $K$ and in the $Q$ basis and use this correlation to perform a convex reweighting of the vectors in the $V$ basis. These reweighted values are then fed into a standard feedforward neural network.

      Because the main task of the $W_i^V$, $W_i^K$ and $W_i^Q$ matrices here is for them to find bases, it makes sense to constrain them onto the Stiefel manifold; they do not and should not have the maximum possible generality.

      Library Functions

      GeometricMachineLearning.MultiHeadAttentionType

MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors based on correlations within those data.

      Constructor

      Takes input arguments:

      • dim::Int: The system dimension
      • n_heads::Int: The number of heads.
      • Stiefel::Bool=true (keyword argument): whether the weights should be put on the Stiefel manifold.
      • retraction::AbstractRetraction (keyword argument): what kind of retraction should be used. By default this is the geodesic retraction.
      • add_connection::Bool=true (keyword argument): determines if the input should be added to the output for the final result.
      source

      References

      [17]
      A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).
      diff --git a/latest/layers/sympnet_gradient/index.html b/latest/layers/sympnet_gradient/index.html index d57e73d30..d9c2625a1 100644 --- a/latest/layers/sympnet_gradient/index.html +++ b/latest/layers/sympnet_gradient/index.html @@ -1,5 +1,5 @@ -Sympnet Gradient Layers · GeometricMachineLearning.jl

      SympNet Gradient Layer

      The Sympnet gradient layer (called GradientLayer in GeometricMachineLearning) is based on the following theorem:

      Theorem

Given a symplectic vector space $\mathbb{R}^{2n}$ with coordinates $q_1, \ldots, q_n, p_1, \ldots, p_n$ and a function $f:\mathbb{R}^n\to\mathbb{R}$ that only acts on the $q$ part, the map $(q, p) \mapsto (q, p + \nabla_qf)$ is symplectic. A similar statement holds if $f$ only acts on the $p$ part.

Proving this is straightforward by looking at the Jacobian of the mapping:

      \[ \begin{pmatrix} \mathbb{I} & \mathbb{O} \\ \nabla_q^2f & \mathbb{I} \end{pmatrix},\]

      where $\nabla_q^2f$ is the Hessian of $f$. This matrix is symmetric and for any symmetric matrix $A$ we have that:

\[ \begin{pmatrix} \mathbb{I} & A \\ \mathbb{O} & \mathbb{I} \end{pmatrix} \begin{pmatrix} \mathbb{O} & \mathbb{I} \\ -\mathbb{I} & \mathbb{O} \end{pmatrix} \begin{pmatrix} \mathbb{I} & \mathbb{O} \\ A & \mathbb{I} \end{pmatrix} = \begin{pmatrix} \mathbb{O} & \mathbb{I} \\ -\mathbb{I} & \mathbb{O} \end{pmatrix},\]

which is exactly the symplecticity condition for the Jacobian of the map.

      If we deal with GSympNets this function $f$ is

      \[ f(q) = a^T \Sigma(Kq + b),\]

      where $a, b\in\mathbb{R}^m$, $K\in\mathbb{R}^{m\times{}n}$ and $\Sigma$ is the antiderivative of some common activation function $\sigma$. We routinely refer to $m$ as the upscaling dimension in GeometricMachineLearning. Computing the gradient of $f$ gives:

\[ [\nabla_qf]_k = \sum_{i=1}^m a_i \sigma\left(\sum_{j=1}^nk_{ij}q_j + b_i\right)k_{ik} = [K^T \left(a \odot \sigma(Kq + b)\right)]_k,\]

      where $\odot$ is the element-wise product, i.e. $[a\odot{}v]_k = a_kv_k$.
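
A standalone sketch of one such gradient layer acting on a pair $(q, p)$ (for illustration only; GradientLayer in GeometricMachineLearning implements this with its own types):

# upscaling dimension m, system dimension n (values are arbitrary)
n, m = 2, 5
K, a, b = rand(m, n), rand(m), rand(m)
σ(x) = tanh(x)

∇qf(q) = K' * (a .* σ.(K * q .+ b))            # = Kᵀ (a ⊙ σ(Kq + b))
gradient_layer_p((q, p)) = (q, p + ∇qf(q))     # only the p-part changes

q, p = rand(n), rand(n)
gradient_layer_p((q, p))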

      diff --git a/latest/layers/volume_preserving_feedforward/index.html b/latest/layers/volume_preserving_feedforward/index.html index 5d12115f9..68407bc2d 100644 --- a/latest/layers/volume_preserving_feedforward/index.html +++ b/latest/layers/volume_preserving_feedforward/index.html @@ -1,5 +1,5 @@ -Volume-Preserving Layers · GeometricMachineLearning.jl

      Volume-Preserving Feedforward Layer

Volume-preserving feedforward layers are a special type of ResNet layer for which we restrict the weight matrices to be of a particular form, i.e. each layer computes:

      \[x \mapsto x + \sigma(Ax + b),\]

      where $\sigma$ is a nonlinearity, $A$ is the weight and $b$ is the bias. The matrix $A$ is either a lower-triangular matrix $L$ or an upper-triangular matrix $U$[1]. The lower triangular matrix is of the form (the upper-triangular layer is simply the transpose of the lower triangular):

\[L = \begin{pmatrix} 0 & 0 & \cdots & 0 \\ a_{21} & \ddots & & \vdots \\ \vdots & \ddots & \ddots & \vdots \\ a_{n1} & \cdots & a_{n(n-1)} & 0 \end{pmatrix}.\]

The Jacobian of a layer of this form is of the type:

\[J = \begin{pmatrix} 1 & 0 & \cdots & 0 \\ b_{21} & \ddots & & \vdots \\ \vdots & \ddots & \ddots & \vdots \\ b_{n1} & \cdots & b_{n(n-1)} & 1 \end{pmatrix},\]

      and the determinant of $J$ is 1, i.e. the map is volume-preserving.
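
A standalone numerical illustration of this volume-preservation property (a sketch only; the library implements the triangular weights with its own LowerTriangular and UpperTriangular types):

using LinearAlgebra

n = 4
L = [i > j ? randn() : 0.0 for i in 1:n, j in 1:n]   # strictly lower-triangular weight
b = randn(n)
σ(x) = tanh(x)
layer(x) = x + σ.(L * x .+ b)

x = randn(n)
layer(x)                                              # apply the layer once
J = I + Diagonal(1 .- tanh.(L * x .+ b) .^ 2) * L     # Jacobian: I + diag(σ′(Lx + b)) L, unit lower-triangular
det(J) ≈ 1.0                                          # volume preservation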

      Library Functions

      GeometricMachineLearning.VolumePreservingFeedForwardLayerType

      Super-type of VolumePreservingLowerLayer and VolumePreservingUpperLayer. The layers do the following:

      \[x \mapsto \begin{cases} \sigma(Lx + b) & \text{where $L$ is }\mathtt{LowerTriangular} \\ \sigma(Ux + b) & \text{where $U$ is }\mathtt{UpperTriangular}. \end{cases}\]

The functor can be applied to a vector, a matrix or a tensor.

      Constructor

      The constructors are called with:

      • sys_dim::Int: the system dimension.
      • activation=tanh: the activation function.
      • include_bias::Bool=true (keyword argument): specifies whether a bias should be used.
      source
      • 1Implemented as LowerTriangular and UpperTriangular in GeometricMachineLearning.
      diff --git a/latest/library/index.html b/latest/library/index.html index 81bd54139..b7f8cb065 100644 --- a/latest/library/index.html +++ b/latest/library/index.html @@ -1,38 +1,109 @@ -Library · GeometricMachineLearning.jl

      GeometricMachineLearning Library Functions

      GeometricMachineLearning.AbstractRetractionType

      AbstractRetraction is a type that comprises all retraction methods for manifolds. For every manifold layer one has to specify a retraction method that takes the layer and elements of the (global) tangent space.

      source
      GeometricMachineLearning.AdamOptimizerWithDecayType

      Defines the Adam Optimizer with weight decay.

      Constructors

      The default constructor takes as input:

      • n_epochs::Int
      • η₁: the learning rate at the start
      • η₂: the learning rate at the end
      • ρ₁: the decay parameter for the first moment
      • ρ₂: the decay parameter for the second moment
      • δ: the safety parameter
      • T (keyword argument): the type.

      The second constructor is called with:

      • n_epochs::Int
      • T

      ... the rest are keyword arguments

      source
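
For instance, a call consistent with the first constructor above might look like this (the values are arbitrary):

opt_method = AdamOptimizerWithDecay(100, 1e-3, 1e-5, 0.9, 0.99, 1e-8)
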
      GeometricMachineLearning.AutoEncoderType

      The autoencoder architecture

      An autoencoder [39] is a neural network consisting of an encoder $\Psi^e$ and a decoder $\Psi^d$. In the simplest case they are trained on some data set $\mathcal{D}$ to reduce the following error:

      \[||\Psi^d\circ\Psi^e(\mathcal{D}) - \mathcal{D}||,\]

      which we call the reconstruction error or autoencoder error (see the docs for AutoEncoderLoss) and $||\cdot||$ is some norm.

      Implementation details.

      Abstract AutoEncoder type. If a custom <:AutoEncoder architecture is implemented it should have the fields full_dim, reduced_dim, n_encoder_blocks and n_decoder_blocks. Further the routines encoder, decoder, encoder_parameters and decoder_parameters should be extended.

      source
      GeometricMachineLearning.AutoEncoderLossType

      This loss should always be used together with a neural network of type AutoEncoder (and it is also the default for training such a network).

      It simply computes:

      \[\mathtt{AutoEncoderLoss}(nn\mathtt{::Loss}, input) = ||nn(input) - input||.\]

      source
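
A minimal sketch of this reconstruction error (illustration only; here nn stands for any callable reconstruction map and autoencoder_loss is an ad-hoc name):

using LinearAlgebra: norm

autoencoder_loss(nn, input) = norm(nn(input) - input)

autoencoder_loss(x -> 0.9 .* x, rand(4, 10))   # toy example
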
      GeometricMachineLearning.BFGSCacheType

      The cache for the BFGS optimizer.

      It stores an array for the previous time step B and the inverse of the Hessian matrix H.

      It is important to note that setting up this cache already requires a derivative! This is not the case for the other optimizers.

      source
      GeometricMachineLearning.BFGSDummyCacheType

In order to initialize BFGSCache we first need gradient information. This is why we initially have this BFGSDummyCache until gradient information is available.

      NOTE: we may not need this.

      source
      GeometricMachineLearning.BatchType

      Batch is a struct whose functor acts on an instance of DataLoader to produce a sequence of training samples for training for one epoch.

      The Constructor

      The constructor for Batch is called with:

      • batch_size::Int
      • seq_length::Int (optional)
      • prediction_window::Int (optional)

The first one of these arguments is required; it indicates the number of training samples in a batch. If we deal with time series data then we can additionally supply a sequence length and a prediction window as input arguments to Batch. These indicate the number of input vectors and the number of output vectors.

      The functor

An instance of Batch can be called on an instance of DataLoader to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. The output of applying batch::Batch to dl::DataLoader is a tuple of vectors of tuples. Each of these tuples contains two integers: the first is the time index and the second one is the parameter index.

      source
      GeometricMachineLearning.ClassificationType

      Classification Layer that takes a matrix as an input and returns a vector that is used for MNIST classification.

      It has the following arguments:

      • M: input dimension
      • N: output dimension
      • activation: the activation function

      And the following optional argument:

      • average: If this is set to true, then the output is computed as $\frac{1}{N}\sum_{i=1}^N[input]_{\bullet{}i}$. If set to false (the default) it picks the last column of the input.
      source
      GeometricMachineLearning.ClassificationTransformerType

      This is a transformer neural network for classification purposes. At the moment this is only used for training on MNIST, but can in theory be used for any classification problem.

      It has to be called with a DataLoader that stores an input and an output tensor. The optional arguments are:

      • n_heads: The number of heads in the MultiHeadAttention (mha) layers. Default: 7.
      • n_layers: The number of transformer layers. Default: 16.
      • activation: The activation function. Default: softmax.
• Stiefel: Whether the matrices in the mha layers are on the Stiefel manifold.
      • add_connection: Whether the input is appended to the output of the mha layer. (skip connection)
      source
      GeometricMachineLearning.DataLoaderType

      Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient.

      Constructor

      The data loader can be called with various inputs:

      • A single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).
      • A single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
      • A single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.
      • A tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are $n_p$ matrices (first input argument) and $n_p$ integers (second input argument).
      • A NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors.
      • An EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.

      When we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.

      Fields of DataLoader

      The fields of the DataLoader struct are the following:

      • input: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters.
      • output: The tensor that contains the output (supervised learning) - this may be of type Nothing if the constructor is only called with one tensor (unsupervised learning).
      • input_dim: The dimension of the system, i.e. what is taken as input by a regular neural network.
      • input_time_steps: The length of the entire time series (length of the second axis).
      • n_params: The number of parameters that are present in the data set (length of third axis)
      • output_dim: The dimension of the output tensor (first axis). If output is of type Nothing, then this is also of type Nothing.
      • output_time_steps: The size of the second axis of the output tensor. If output is of type Nothing, then this is also of type Nothing.

      The input and output fields of DataLoader

      Even though the arguments to the Constructor may be vectors or matrices, internally DataLoader always stores tensors.

      source
      GeometricMachineLearning.DataLoaderMethod

      Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient.

      Constructor

      The data loader can be called with various inputs:

      • A single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).
      • A single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
      • A single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.
      • A tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are $n_p$ matrices (first input argument) and $n_p$ integers (second input argument).
      • A NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors.
      • An EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.

      When we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.

      source
      GeometricMachineLearning.GSympNetType

      GSympNet is called with a single input argument, the system dimension, or with an instance of DataLoader. Optional input arguments are:

      • upscaling_dimension::Int: The upscaling dimension of the gradient layer. See the documentation for GradientLayerQ and GradientLayerP for further explanation. The default is 2*dim.
      • n_layers::Int: The number of layers (i.e. the total number of GradientLayerQ and GradientLayerP). The default is 2.
      • activation: The activation function that is applied. By default this is tanh.
      • init_upper::Bool: Initialize the gradient layer so that it first modifies the $q$-component. The default is true.
      source
      GeometricMachineLearning.GlobalSectionType

      This implements global sections for the Stiefel manifold and the Symplectic Stiefel manifold.

In practice this is implemented using Householder reflections, with the auxiliary column vectors given by the standard basis vectors $e_i$ (a one in the $i$-th spot and zeros elsewhere) for $i$ in $(n+1)$ to $N$, or with random columns.

TODO: consider dividing the output in the check functions by $n$.

TODO: implement a general global section Tₓ𝔐 → G×𝔤 (think about random initialization).

      source
      GeometricMachineLearning.AdamOptimizerWithDecayType

      Defines the Adam Optimizer with weight decay.

      Constructors

      The default constructor takes as input:

      • n_epochs::Int
      • η₁: the learning rate at the start
      • η₂: the learning rate at the end
      • ρ₁: the decay parameter for the first moment
      • ρ₂: the decay parameter for the second moment
      • δ: the safety parameter
      • T (keyword argument): the type.

      The second constructor is called with:

      • n_epochs::Int
      • T

      ... the rest are keyword arguments

      source
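The following is a minimal sketch (not taken from the package documentation) of how the second constructor listed above can be called; the decay parameters are assumed to take the documented defaults.

using GeometricMachineLearning
# construct the method for 100 epochs with Float32 precision;
# all decay parameters take their documented default values
method = AdamOptimizerWithDecay(100, Float32)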
      GeometricMachineLearning.AutoEncoderType

      The autoencoder architecture

      An autoencoder [41] is a neural network consisting of an encoder $\Psi^e$ and a decoder $\Psi^d$. In the simplest case they are trained on some data set $\mathcal{D}$ to reduce the following error:

      \[||\Psi^d\circ\Psi^e(\mathcal{D}) - \mathcal{D}||,\]

      which we call the reconstruction error or autoencoder error (see the docs for AutoEncoderLoss) and $||\cdot||$ is some norm.

      Implementation details.

      Abstract AutoEncoder type. If a custom <:AutoEncoder architecture is implemented it should have the fields full_dim, reduced_dim, n_encoder_blocks and n_decoder_blocks. Further the routines encoder, decoder, encoder_parameters and decoder_parameters should be extended.

      source
      GeometricMachineLearning.AutoEncoderLossType

      This loss should always be used together with a neural network of type AutoEncoder (and it is also the default for training such a network).

      It simply computes:

      \[\mathtt{AutoEncoderLoss}(nn\mathtt{::Loss}, input) = ||nn(input) - input||.\]

      source
      GeometricMachineLearning.BFGSCacheType

      The cache for the BFGS optimizer.

      It stores an array for the previous time step B and the inverse of the Hessian matrix H.

      It is important to note that setting up this cache already requires a derivative! This is not the case for the other optimizers.

      source
      GeometricMachineLearning.BFGSDummyCacheType

In order to initialize BFGSCache we first need gradient information. This is why we initially have this BFGSDummyCache until gradient information is available.

      NOTE: we may not need this.

      source
      GeometricMachineLearning.BatchType

      Batch is a struct whose functor acts on an instance of DataLoader to produce a sequence of training samples for training for one epoch.

      The Constructor

      The constructor for Batch is called with:

      • batch_size::Int
      • seq_length::Int (optional)
      • prediction_window::Int (optional)

The first one of these arguments is required; it indicates the number of training samples in a batch. If we deal with time series data we can additionally supply a sequence length and a prediction window as input arguments to Batch. These indicate the number of input vectors and the number of output vectors.

      The functor

An instance of Batch can be called on an instance of DataLoader to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. The output of applying batch::Batch to dl::DataLoader is a tuple of vectors of integers. Each of these vectors contains two integers: the first is the time index and the second one is the parameter index.

      source
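A minimal usage sketch (not from the original docs), assuming the positional argument order batch_size, seq_length, prediction_window given above and a DataLoader built from a tensor of placeholder data:

using GeometricMachineLearning
data = rand(Float32, 2, 100, 5)   # (system dimension, time steps, parameters)
dl = DataLoader(data)             # interpreted as an integration problem
batch = Batch(16, 4, 2)           # batch size 16, sequence length 4, prediction window 2
batches = batch(dl)               # index pairs (time index, parameter index) for one epoch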
      GeometricMachineLearning.ClassificationType

      Classification Layer that takes a matrix as an input and returns a vector that is used for MNIST classification.

      It has the following arguments:

      • M: input dimension
      • N: output dimension
      • activation: the activation function

      And the following optional argument:

      • average: If this is set to true, then the output is computed as $\frac{1}{N}\sum_{i=1}^N[input]_{\bullet{}i}$. If set to false (the default) it picks the last column of the input.
      source
      GeometricMachineLearning.ClassificationTransformerType

      This is a transformer neural network for classification purposes. At the moment this is only used for training on MNIST, but can in theory be used for any classification problem.

      It has to be called with a DataLoader that stores an input and an output tensor. The optional arguments are:

      • n_heads: The number of heads in the MultiHeadAttention (mha) layers. Default: 7.
      • n_layers: The number of transformer layers. Default: 16.
      • activation: The activation function. Default: softmax.
• Stiefel: Whether the matrices in the mha layers are on the Stiefel manifold.
      • add_connection: Whether the input is appended to the output of the mha layer. (skip connection)
      source
      GeometricMachineLearning.DataLoaderType

      Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient.

      Constructor

      The data loader can be called with various inputs:

      • A single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).
      • A single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
      • A single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.
      • A tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are $n_p$ matrices (first input argument) and $n_p$ integers (second input argument).
      • A NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors.
      • An EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.

      When we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.

      Fields of DataLoader

      The fields of the DataLoader struct are the following:

      • input: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters.
      • output: The tensor that contains the output (supervised learning) - this may be of type Nothing if the constructor is only called with one tensor (unsupervised learning).
      • input_dim: The dimension of the system, i.e. what is taken as input by a regular neural network.
      • input_time_steps: The length of the entire time series (length of the second axis).
      • n_params: The number of parameters that are present in the data set (length of third axis)
      • output_dim: The dimension of the output tensor (first axis). If output is of type Nothing, then this is also of type Nothing.
      • output_time_steps: The size of the second axis of the output tensor. If output is of type Nothing, then this is also of type Nothing.

      The input and output fields of DataLoader

      Even though the arguments to the Constructor may be vectors or matrices, internally DataLoader always stores tensors.

      source
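A short sketch of two of the constructor cases described above (illustrative only; the random data are placeholders):

using GeometricMachineLearning
M = rand(Float32, 10, 250)        # matrix: autoencoder interpretation
dl_ae = DataLoader(M)             # dl_ae.input_dim should equal 10
T = rand(Float32, 10, 100, 20)    # tensor: (system dim, time steps, parameters)
dl_int = DataLoader(T)            # integration-problem interpretation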
      GeometricMachineLearning.DataLoaderMethod

      Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient.

      Constructor

      The data loader can be called with various inputs:

      • A single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).
      • A single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
      • A single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.
      • A tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are $n_p$ matrices (first input argument) and $n_p$ integers (second input argument).
      • A NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors.
      • An EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.

      When we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.

      source
      GeometricMachineLearning.GSympNetType

      GSympNet is called with a single input argument, the system dimension, or with an instance of DataLoader. Optional input arguments are:

      • upscaling_dimension::Int: The upscaling dimension of the gradient layer. See the documentation for GradientLayerQ and GradientLayerP for further explanation. The default is 2*dim.
      • n_layers::Int: The number of layers (i.e. the total number of GradientLayerQ and GradientLayerP). The default is 2.
      • activation: The activation function that is applied. By default this is tanh.
      • init_upper::Bool: Initialize the gradient layer so that it first modifies the $q$-component. The default is true.
      source
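A minimal construction sketch using the optional arguments listed above (the values are placeholders):

using GeometricMachineLearning
arch = GSympNet(2; upscaling_dimension = 10, n_layers = 4, activation = tanh, init_upper = true)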
      GeometricMachineLearning.GlobalSectionType
      GlobalSection(Y::AbstractMatrix)

      Construct a global section for Y.

      A global section $\lambda$ is a mapping from a homogeneous space $\mathcal{M}$ to the corresponding Lie group $G$ such that

\[\lambda(Y)E = Y.\]

      Also see apply_section and global_rep.

      Implementation

      For an implementation of GlobalSection for a custom array (especially manifolds), the function global_section has to be generalized.

      source
      GeometricMachineLearning.GradientLayerPType

The gradient layer that changes the $p$ component. It is of the form:

\[\begin{bmatrix} \mathbb{I} & \mathbb{O} \\ \nabla{}V & \mathbb{I} \end{bmatrix},\]

with $V(q) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}q_j+b_i)$, where $\Sigma$ is the antiderivative of the activation function $\sigma$ (one-layer neural network). We refer to $M$ as the upscaling dimension. Such layers are by construction symplectic.

      source
      GeometricMachineLearning.GradientLayerQType

The gradient layer that changes the $q$ component. It is of the form:

\[\begin{bmatrix} \mathbb{I} & \nabla{}V \\ \mathbb{O} & \mathbb{I} \end{bmatrix},\]

with $V(p) = \sum_{i=1}^Ma_i\Sigma(\sum_jk_{ij}p_j+b_i)$, where $\Sigma$ is the antiderivative of the activation function $\sigma$ (one-layer neural network). We refer to $M$ as the upscaling dimension. Such layers are by construction symplectic.

      source
      GeometricMachineLearning.HRedSysType

      HRedSys computes the reconstructed dynamics in the full system based on the reduced one. Optionally it can be compared to the FOM solution.

      It can be called using the following constructor: HRedSys(N, n; encoder, decoder, v_full, f_full, v_reduced, f_reduced, parameters, tspan, tstep, ics, projection_error) where

      • encoder: a function $\mathbb{R}^{2N}\mapsto{}\mathbb{R}^{2n}$
      • decoder: a (differentiable) function $\mathbb{R}^{2n}\mapsto\mathbb{R}^{2N}$
      • v_full: a (differentiable) mapping defined the same way as in GeometricIntegrators.
      • f_full: a (differentiable) mapping defined the same way as in GeometricIntegrators.
      • v_reduced: a (differentiable) mapping defined the same way as in GeometricIntegrators.
      • f_reduced: a (differentiable) mapping defined the same way as in GeometricIntegrators.
• parameters: a NamedTuple that parametrizes the vector fields (the same for the full and the reduced vector field)
      • tspan: a tuple (t₀, tₗ) that specifies start and end point of the time interval over which integration is performed.
      • tstep: the time step
      • ics: the initial condition for the big system.
      • projection_error: the error $||M - \mathcal{R}\circ\mathcal{P}(M)||$ where $M$ is the snapshot matrix; $\mathcal{P}$ and $\mathcal{R}$ are the reduction and reconstruction respectively.
      source
      GeometricMachineLearning.LASympNetType

      LASympNet is called with a single input argument, the system dimension, or with an instance of DataLoader. Optional input arguments are:

      • depth::Int: The number of linear layers that are applied. The default is 5.
      • nhidden::Int: The number of hidden layers (i.e. layers that are not input or output layers). The default is 2.
      • activation: The activation function that is applied. By default this is tanh.
      • init_upper_linear::Bool: Initialize the linear layer so that it first modifies the $q$-component. The default is true.
      • init_upper_act::Bool: Initialize the activation layer so that it first modifies the $q$-component. The default is true.
      source
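A minimal construction sketch using the optional arguments listed above (the values are chosen for illustration):

using GeometricMachineLearning
arch = LASympNet(2; depth = 3, nhidden = 2, activation = tanh)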
      GeometricMachineLearning.LinearLayerPMethod

      Equivalent to a left multiplication by the matrix:

\[\begin{pmatrix} \mathbb{I} & \mathbb{O} \\ B & \mathbb{I} \end{pmatrix},\]

where $B$ is a SymmetricMatrix.

      source
      GeometricMachineLearning.GrassmannLieAlgHorMatrixType
      GrassmannLieAlgHorMatrix(B::AbstractMatrix{T}, N::Integer, n::Integer) where T

      Build an instance of GrassmannLieAlgHorMatrix based on an arbitrary matrix B of size $(N-n)\times{}n$.

      GrassmannLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is: $\pi:S \to SE/\sim$ where

      \[E = \begin{pmatrix} \mathbb{I}_{n} \\ \mathbb{O}_{(N-n)\times{}n} \end{pmatrix},\]

      and the equivalence relation is

\[V_1 \sim V_2 \iff \exists A\in\mathcal{S}_\mathrm{skew}(n) \text{ such that } V_2 = V_1 + \begin{pmatrix} A \\ \mathbb{O} \end{pmatrix}\]

An element of GrassmannLieAlgHorMatrix takes the form:

\[\begin{pmatrix} \bar{\mathbb{O}} & B^T \\ B & \mathbb{O} \end{pmatrix},\]

      where $\bar{\mathbb{O}}\in\mathbb{R}^{n\times{}n}$ and $\mathbb{O}\in\mathbb{R}^{(N - n)\times{}n}.$

      source
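A minimal sketch of the constructor described above, with $N = 5$ and $n = 2$ (so that B has size $(N-n)\times{}n = 3\times2$); the random input is a placeholder:

using GeometricMachineLearning
B = rand(3, 2)                        # (N - n) × n block
GrassmannLieAlgHorMatrix(B, 5, 2)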
      GeometricMachineLearning.GrassmannLieAlgHorMatrixMethod
      GrassmannLieAlgHorMatrix(D::AbstractMatrix, n::Integer)

      Take a big matrix as input and build an instance of GrassmannLieAlgHorMatrix belonging to the GrassmannManifold $Gr(n, N)$ where $N$ is the number of rows of D.

      If the constructor is called with a big $N\times{}N$ matrix, then the projection is performed the following way:

\[\begin{pmatrix} A & B_1 \\ B_2 & D \end{pmatrix} \mapsto \begin{pmatrix} \bar{\mathbb{O}} & -B_2^T \\ B_2 & \mathbb{O} \end{pmatrix}.\]

      This can also be seen as the operation:

      \[D \mapsto \Omega(E, DE - EE^TDE),\]

      where $\Omega$ is the horizontal lift GeometricMachineLearning.Ω.

      source
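A minimal sketch of the projection constructor described above, applied to a random $5\times{}5$ placeholder matrix:

using GeometricMachineLearning
D = rand(5, 5)                        # a "big" N × N matrix
GrassmannLieAlgHorMatrix(D, 2)        # horizontal component for Gr(2, 5)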
      GeometricMachineLearning.LinearSymplecticAttentionType

      Implements the linear symplectic attention layers. Analogous to GradientLayer it performs mappings that only change the $Q$ or the $P$ part. For more information see LinearSymplecticAttentionQ and LinearSymplecticAttentionP.

      Constructor

      For the constructors simply call

      LinearSymplecticAttentionQ(sys_dim, seq_length)

      or

      LinearSymplecticAttentionP(sys_dim, seq_length)

      where sys_dim is the system dimension and seq_length is the sequence length.

      source
      GeometricMachineLearning.LinearSymplecticTransformerType

      Realizes the linear Symplectic Transformer.

      Constructor:

      The constructor is called with the following arguments

      1. dim::Int: System dimension
      2. seq_length::Int: Number of time steps that the transformer considers.

      Optional keyword arguments:

      • n_sympnet::Int=2: The number of sympnet layers in the transformer.
      • upscaling_dimension::Int=2*dim: The upscaling that is done by the gradient layer.
      • L::Int=1: The number of transformer units.
      • activation=tanh: The activation function for the SympNet layers.
      • init_upper::Bool=true: Specifies if the first layer is a $Q$-type layer (init_upper=true) or if it is a $P$-type layer (init_upper=false).
      source
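A minimal construction sketch based on the arguments listed above (dim must be even for a symplectic system; the keyword values are placeholders):

using GeometricMachineLearning
arch = LinearSymplecticTransformer(4, 3; n_sympnet = 2, upscaling_dimension = 8, L = 1, activation = tanh)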
      GeometricMachineLearning.LowerTriangularType

A lower-triangular matrix is an $n\times{}n$ matrix that has zeros on the diagonal and in the upper-triangular part.

The data are stored in a vector $S$ similarly to SkewSymMatrix.

The struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension $n$ for $A\in\mathbb{R}^{n\times{}n}$.

      source
      GeometricMachineLearning.MultiHeadAttentionType

MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors based on correlations within those data.

      Constructor

      Takes input arguments:

      • dim::Int: The system dimension
      • n_heads::Int: The number of heads.
      • Stiefel::Bool=true (keyword argument): whether the weights should be put on the Stiefel manifold.
      • retraction::AbstractRetraction (keyword argument): what kind of retraction should be used. By default this is the geodesic retraction.
      • add_connection::Bool=true (keyword argument): determines if the input should be added to the output for the final result.
      source
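A minimal construction sketch following the constructor description above (the dimension should be divisible by the number of heads; the values are placeholders):

using GeometricMachineLearning
mha = MultiHeadAttention(64, 8; Stiefel = true, add_connection = true)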
      GeometricMachineLearning.OptimizerType

      Optimizer struct that stores the 'method' (i.e. Adam with corresponding hyperparameters), the cache and the optimization step.

      It takes as input an optimization method and the parameters of a network.

      For technical reasons we first specify an OptimizerMethod that stores all the hyperparameters of the optimizer.

      source
      GeometricMachineLearning.OptimizerMethod

A functor for Optimizer. It is called with:

• nn::NeuralNetwork
• dl::DataLoader
• batch::Batch
• n_epochs::Int
• loss

      The last argument is a function through which Zygote differentiates. This argument is optional; if it is not supplied GeometricMachineLearning defaults to an appropriate loss for the DataLoader.

      source
      GeometricMachineLearning.PSDArchType

      The architecture

Proper symplectic decomposition (PSD) can be seen as a SymplecticAutoencoder for which the decoder and the encoder are both PSD-like matrices (see the docs for PSDLayer).

      Training

      For optimizing the parameters in this architecture no neural network training is necessary (see the docs for solve!).

      The constructor

      The constructor only takes two arguments as input:

      • full_dim::Integer
      • reduced_dim::Integer
      source
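A minimal construction sketch based on the two constructor arguments listed above (the dimensions are placeholders and should be even for a symplectic system):

using GeometricMachineLearning
psd = PSDArch(100, 10)                # full dimension 100, reduced dimension 10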
      GeometricMachineLearning.PSDLayerType

      This is a PSD-like layer used for symplectic autoencoders. One layer has the following shape:

      \[A = \begin{bmatrix} \Phi & \mathbb{O} \\ \mathbb{O} & \Phi \end{bmatrix},\]

      where $\Phi$ is an element of the Stiefel manifold $St(n, N)$.

      The constructor of PSDLayer is called by PSDLayer(M, N; retraction=retraction):

      • M is the input dimension.
      • N is the output dimension.
      • retraction is an instance of a struct with supertype AbstractRetraction. The only options at the moment are Geodesic() and Cayley().
      source
      GeometricMachineLearning.ResNetType

      A ResNet is a neural network that realizes a mapping of the form: $x = \mathcal{NN}(x) + x$, so the input is again added to the output (a so-called add connection). In GeometricMachineLearning the specific ResNet that we use consists of a series of simple ResNetLayers.

      source
      GeometricMachineLearning.SkewSymMatrixType

      A SkewSymMatrix is a matrix $A$ s.t. $A^T = -A$.

If the constructor is called with a matrix as input it returns a skew-symmetric matrix via the projection $A \mapsto \frac{1}{2}(A - A^T)$. This is a projection defined via the canonical metric $\mathbb{R}^{n\times{}n}\times\mathbb{R}^{n\times{}n}\to\mathbb{R}, (A,B) \mapsto \mathrm{Tr}(A^TB)$.

The first index is the row index, the second one the column index.

The struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension $n$ for $A\in\mathbb{R}^{n\times{}n}$.

      source
      GeometricMachineLearning.StandardTransformerIntegratorType

      The regular transformer used as an integrator (multi-step method).

      The constructor is called with one argument:

      • sys_dim::Int

      The following are keyword arguments:

      • transformer_dim::Int: the default is transformer_dim = sys_dim.
      • n_blocks::Int: The default is 1.
• n_heads::Int: the number of heads in the multihead attention layer (default is n_heads = sys_dim)
• L::Int the number of transformer blocks (default is L = 2).
• upscaling_activation: by default identity
• resnet_activation: by default tanh
• add_connection::Bool=true: if the input should be added to the output.
      source
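A minimal construction sketch using the keyword arguments listed above (the values are placeholders; transformer_dim should be divisible by n_heads):

using GeometricMachineLearning
arch = StandardTransformerIntegrator(4; transformer_dim = 8, n_heads = 4, L = 2, resnet_activation = tanh)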
      GeometricMachineLearning.StiefelLieAlgHorMatrixType

StiefelLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is $\pi:S \to SE$ where

\[E = \begin{pmatrix} \mathbb{I}_{n} \\ \mathbb{O}_{(N-n)\times{}n} \end{pmatrix}.\]

The matrix $E$ is implemented under StiefelProjection in GeometricMachineLearning.

An element of StiefelLieAlgHorMatrix takes the form:

\[\begin{pmatrix} A & B^T \\ B & \mathbb{O} \end{pmatrix},\]

where $A$ is skew-symmetric (this is SkewSymMatrix in GeometricMachineLearning).

      source
      GeometricMachineLearning.LowerTriangularType
      LowerTriangular(S::AbstractVector, n::Int)

      Build a lower-triangular matrix from a vector.

A lower-triangular matrix is an $n\times{}n$ matrix that has zeros on the diagonal and in the upper-triangular part.

The data are stored in a vector $S$ similarly to other matrices. See UpperTriangular, SkewSymMatrix and SymmetricMatrix.

The struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension $n$ for $A\in\mathbb{R}^{n\times{}n}$.

Examples

using GeometricMachineLearning
S = [1, 2, 3, 4, 5, 6]
LowerTriangular(S, 4)

# output

4×4 LowerTriangular{Int64, Vector{Int64}}:
 0  0  0  0
 1  0  0  0
 2  3  0  0
 4  5  6  0
      source
      GeometricMachineLearning.LowerTriangularMethod
      LowerTriangular(A::AbstractMatrix)

      Build a lower-triangular matrix from a matrix.

      This is done by taking the lower left of that matrix.

      Examples

using GeometricMachineLearning
M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
LowerTriangular(M)

# output

4×4 LowerTriangular{Int64, Vector{Int64}}:
  0   0   0  0
  5   0   0  0
  9  10   0  0
 13  14  15  0
      source
      GeometricMachineLearning.SkewSymMatrixType
      SkewSymMatrix(S::AbstractVector, n::Integer)

      Instantiate a skew-symmetric matrix with information stored in vector S.

      A skew-symmetric matrix $A$ is a matrix $A^T = -A$.

      Internally the struct saves a vector $S$ of size $n(n-1)\div2$. The conversion is done the following way:

\[[A]_{ij} = \begin{cases} 0 & \text{if $i=j$} \\ S[( (i-2) (i-1) ) \div 2 + j] & \text{if $i>j$}\\ -S[( (j-2) (j-1) ) \div 2 + i] & \text{else}. \end{cases}\]

Also see SymmetricMatrix, LowerTriangular and UpperTriangular.

Examples

using GeometricMachineLearning
S = [1, 2, 3, 4, 5, 6]
SkewSymMatrix(S, 4)

# output

4×4 SkewSymMatrix{Int64, Vector{Int64}}:
 0  -1  -2  -4
 1   0  -3  -5
 2   3   0  -6
 4   5   6   0
      source
      GeometricMachineLearning.SkewSymMatrixMethod
      SkewSymMatrix(A::AbstractMatrix)

      Perform 0.5 * (A - A') and store the matrix in an efficient way (as a vector with $n(n-1)/2$ entries).

      If the constructor is called with a matrix as input it returns a skew-symmetric matrix via the projection:

      \[A \mapsto \frac{1}{2}(A - A^T).\]

      Examples

using GeometricMachineLearning
M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
SkewSymMatrix(M)

# output

4×4 SkewSymMatrix{Float64, Vector{Float64}}:
 0.0  -1.5  -3.0  -4.5
 1.5   0.0  -1.5  -3.0
 3.0   1.5   0.0  -1.5
 4.5   3.0   1.5   0.0

      Extend help

      Note that the constructor is designed in such a way that it always returns matrices of type SkewSymMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.

      If the user wishes to allocate a matrix SkewSymMatrix{<:Integer} the constructor SkewSymMatrix(::AbstractVector, n::Integer) has to be called.

      source
      GeometricMachineLearning.StiefelLieAlgHorMatrixType
      StiefelLieAlgHorMatrix(A::SkewSymMatrix{T}, B::AbstractMatrix{T}, N::Integer, n::Integer) where T

      Build an instance of StiefelLieAlgHorMatrix based on a skew-symmetric matrix A and an arbitrary matrix B.

      StiefelLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is: $\pi:S \to SE$ where

      \[E = \begin{pmatrix} \mathbb{I}_{n} \\ \mathbb{O}_{(N-n)\times{}n} \end{pmatrix}.\]

      The matrix (E) is implemented under StiefelProjection in GeometricMachineLearning.

An element of StiefelLieAlgHorMatrix takes the form:

\[\begin{pmatrix} A & B^T \\ B & \mathbb{O} \end{pmatrix},\]

where $A$ is skew-symmetric (this is SkewSymMatrix in GeometricMachineLearning).

      Also see GrassmannLieAlgHorMatrix.

      source
      GeometricMachineLearning.StiefelLieAlgHorMatrixMethod
      StiefelLieAlgHorMatrix(D::AbstractMatrix, n::Integer)

      Take a big matrix as input and build an instance of StiefelLieAlgHorMatrix belonging to the StiefelManifold $St(n, N)$ where $N$ is the number of rows of D.

      If the constructor is called with a big $N\times{}N$ matrix, then the projection is performed the following way:

\[\begin{pmatrix} A & B_1 \\ B_2 & D \end{pmatrix} \mapsto \begin{pmatrix} \mathrm{skew}(A) & -B_2^T \\ B_2 & \mathbb{O} \end{pmatrix}.\]

The operation $\mathrm{skew}:\mathbb{R}^{n\times{}n}\to\mathcal{S}_\mathrm{skew}(n)$ is the skew-symmetrization operation. This is equivalent to calling the constructor of SkewSymMatrix with an $n\times{}n$ matrix.

This can also be seen as the operation:

\[D \mapsto \Omega(E, DE) = \mathrm{skew}\left(2 \left(\mathbb{I} - \frac{1}{2} E E^T \right) DE E^T\right).\]

Also see GeometricMachineLearning.Ω.

      source
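A minimal sketch of the two constructors described above, with $N = 5$ and $n = 2$ (the random inputs are placeholders):

using GeometricMachineLearning
A = SkewSymMatrix(rand(1), 2)         # n × n skew-symmetric block (vector of length n(n-1)÷2)
B = rand(3, 2)                        # (N - n) × n block
StiefelLieAlgHorMatrix(A, B, 5, 2)    # direct constructor
D = rand(5, 5)                        # a "big" N × N matrix
StiefelLieAlgHorMatrix(D, 2)          # projection onto the horizontal component for St(2, 5)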
      GeometricMachineLearning.StiefelManifoldType

      An implementation of the Stiefel manifold [7]. The Stiefel manifold is the collection of all matrices $Y\in\mathbb{R}^{N\times{}n}$ whose columns are orthonormal, i.e.

      \[ St(n, N) = \{Y: Y^TY = \mathbb{I}_n \}.\]

The Stiefel manifold can be shown to have manifold structure (as the name suggests) and this is heavily used in GeometricMachineLearning. It is further a compact space. More information can be found in the docstrings for rgrad(::StiefelManifold, ::AbstractMatrix) and metric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix).

      source
      GeometricMachineLearning.StiefelProjectionType

      An array that essentially does vcat(I(n), zeros(N-n, n)) with GPU support. It has three inner constructors. The first one is called with the following arguments:

      1. backend: backends as supported by KernelAbstractions.
      2. T::Type
      3. N::Integer
      4. n::Integer

      The second constructor is called by supplying a matrix as input. The constructor will then extract the backend, the type and the dimensions of that matrix.

      The third constructor is called by supplying an instance of StiefelLieAlgHorMatrix.

      Technically this should be a subtype of StiefelManifold.

      source
      GeometricMachineLearning.SymmetricMatrixType

      A SymmetricMatrix $A$ is a matrix $A^T = A$.

      This is a projection defined via the canonical metric $(A,B) \mapsto \mathrm{tr}(A^TB)$.

      Internally the struct saves a vector $S$ of size $n(n+1)\div2$. The conversion is done the following way:

\[[A]_{ij} = \begin{cases} S[( (i-1) i ) \div 2 + j] & \text{if $i\geq{}j$}\\ S[( (j-1) j ) \div 2 + i] & \text{else}. \end{cases}\]

      So $S$ stores a string of vectors taken from $A$: $S = [\tilde{a}_1, \tilde{a}_2, \ldots, \tilde{a}_n]$ with $\tilde{a}_i = [[A]_{i1},[A]_{i2},\ldots,[A]_{ii}]$.

      Constructor

      If the constructor is called with a matrix as input it returns a symmetric matrix via the projection:

      \[A \mapsto \frac{1}{2}(A + A^T).\]

      It can also be called with two arguments S::AbstractVector and n::Integer where length(S) == n * (n + 1) ÷ 2 has to be true.

      source
      GeometricMachineLearning.SymplecticPotentialType

      SymplecticPotential(n)

Returns the canonical symplectic matrix of size $2n \times 2n$:

\[\begin{pmatrix} \mathbb{O} & \mathbb{I} \\ -\mathbb{I} & \mathbb{O} \end{pmatrix}.\]

      source
      GeometricMachineLearning.StiefelProjectionType
      StiefelProjection(backend, T, N, n)

      Make a matrix of the form $\begin{bmatrix} \mathbb{I} & \mathbb{O} \end{bmatrix}^T$ for a specific backend and data type.

      An array that essentially does vcat(I(n), zeros(N-n, n)) with GPU support.

      Extend help

      Technically this should be a subtype of StiefelManifold.

      source
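A minimal sketch of the constructor StiefelProjection(backend, T, N, n) described above; CPU() is assumed to be the backend type provided by KernelAbstractions:

using GeometricMachineLearning
using KernelAbstractions: CPU                 # backend, as supported by KernelAbstractions
E = StiefelProjection(CPU(), Float64, 5, 2)   # behaves like vcat(I(2), zeros(3, 2))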
      GeometricMachineLearning.StiefelProjectionMethod
      StiefelProjection(A::AbstractMatrix)

      Extract necessary information from A and build an instance of StiefelProjection.

Necessary information here refers to the backend, the data type and the size of the matrix.

      source
      GeometricMachineLearning.StiefelProjectionMethod
      StiefelProjection(B::StiefelLieAlgHorMatrix)

      Extract necessary information from B and build an instance of StiefelProjection.

Necessary information here refers to the backend, the data type and the size of the matrix.

      The size is queried through B.N and B.n.

      source
      GeometricMachineLearning.SymmetricMatrixType
      SymmetricMatrix(S::AbstractVector, n::Integer)

      Instantiate a symmetric matrix with information stored in vector S.

      A SymmetricMatrix $A$ is a matrix $A^T = A$.

      Internally the struct saves a vector $S$ of size $n(n+1)\div2$. The conversion is done the following way:

\[[A]_{ij} = \begin{cases} S[( (i-1) i ) \div 2 + j] & \text{if $i\geq{}j$}\\ S[( (j-1) j ) \div 2 + i] & \text{else}. \end{cases}\]

      So $S$ stores a string of vectors taken from $A$: $S = [\tilde{a}_1, \tilde{a}_2, \ldots, \tilde{a}_n]$ with $\tilde{a}_i = [[A]_{i1},[A]_{i2},\ldots,[A]_{ii}]$.

      Also see SkewSymMatrix, LowerTriangular and UpperTriangular.

      Examples

using GeometricMachineLearning
S = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
SymmetricMatrix(S, 4)

# output

4×4 SymmetricMatrix{Int64, Vector{Int64}}:
 1  2  4   7
 2  3  5   8
 4  5  6   9
 7  8  9  10
      source
      GeometricMachineLearning.SymmetricMatrixMethod
      SymmetricMatrix(A::AbstractMatrix)

      Perform 0.5 * (A + A') and store the matrix in an efficient way (as a vector with $n(n+1)/2$ entries).

      If the constructor is called with a matrix as input it returns a symmetric matrix via the projection:

      \[A \mapsto \frac{1}{2}(A + A^T).\]

      Examples

using GeometricMachineLearning
M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
SymmetricMatrix(M)

# output

4×4 SymmetricMatrix{Float64, Vector{Float64}}:
 1.0   3.5   6.0   8.5
 3.5   6.0   8.5  11.0
 6.0   8.5  11.0  13.5
 8.5  11.0  13.5  16.0

      Extend help

      Note that the constructor is designed in such a way that it always returns matrices of type SymmetricMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.

      If the user wishes to allocate a matrix SymmetricMatrix{<:Integer} the constructor SymmetricMatrix(::AbstractVector, n::Integer) has to be called.

      source
      GeometricMachineLearning.SympNetType

      The SympNet type encompasses GSympNets and LASympNets. SympNets are universal approximators of symplectic flows, i.e. maps $\varphi:\mathbb{R}^{2n}\to\mathbb{R}^{2n}$ for which $(\nabla\varphi)^T\mathbb{J}\nabla\varphi = \mathbb{J}$ holds.

      source
      GeometricMachineLearning.SymplecticAutoencoderType

      The architecture

      The symplectic autoencoder architecture was introduced in [42]. Like any other autoencoder it consists of an encoder $\Psi^e:\mathbb{R}^{2N}\to\mathbb{R}^{2n}$ and a decoder $\Psi^d:\mathbb{R}^{2n}\to\mathbb{R}^{2N}$ with $n\ll{}N$. These satisfy the following properties:

      \[\nabla_z\Psi^e\mathbb{J}_{2N}(\nabla_z\Psi^e\mathbb{J}_{2N})^T = \mathbb{J}_{2n} \text{ and } (\nabla_\xi\Psi^d)^T\mathbb{J}_{2N}\nabla_\xi\Psi^d = \mathbb{J}_{2n}.\]

      Because the decoder has this particular property, the reduced system can be described by the Hamiltonian $H\circ\Psi^d$:

      \[\mathbb{J}_{2n}\nabla_\xi(H\circ\Psi^d) = \mathbb{J}_{2n}(\nabla_\xi\Psi^d)^T\nabla_{\Psi^d(\xi)}H = \mathbb{J}_{2n}(\nabla_\xi\Psi^d)^T\mathbb{J}_{2N}^T\mathbb{J}_{2N}\nabla_{\Psi^d(\xi)}H = (\nabla_\xi\Psi^d)^+X_H(\Psi^d(\xi)),\]

      where $(\nabla_\xi\Psi^d)^+$ is the pseudoinverse of $\nabla_\xi\Psi^d$ (for more details see the docs on the AutoEncoder type).

      The constructor

      The constructor is called with

      • full_dim::Integer
      • reduced_dim::Integer
      • n_encoder_layers::Integer = 4 (keyword argument)
      • n_encoder_blocks::Integer = 2 (keyword argument)
      • n_decoder_layers::Integer = 1 (keyword argument)
      • n_decoder_blocks::Integer = 3 (keyword argument)
      • sympnet_upscale::Integer = 5 (keyword argument)
      • activation = tanh (keyword argument)
      • encoder_init_q::Bool = true (keyword argument)
      • decoder_init_q::Bool = true (keyword argument)
      source
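A minimal construction sketch using the constructor arguments listed above (the dimensions and keyword values are placeholders):

using GeometricMachineLearning
arch = SymplecticAutoencoder(100, 10; n_encoder_blocks = 2, n_decoder_blocks = 3, sympnet_upscale = 5, activation = tanh)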
      GeometricMachineLearning.UpperTriangularType

An upper-triangular matrix is an $n\times{}n$ matrix that has zeros on the diagonal and in the lower-triangular part.

The data are stored in a vector $S$ similarly to SkewSymMatrix.

The struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension $n$ for $A\in\mathbb{R}^{n\times{}n}$.

      source
      GeometricMachineLearning.VolumePreservingAttentionType

      Volume-preserving attention (single head attention)

      Drawbacks:

      • the super fast activation is only implemented for sequence lengths of 2, 3, 4 and 5.
      • other sequence lengths only work on CPU for now (lu decomposition has to be implemented to work for tensors in parallel).

      Constructor

      The constructor is called with:

      • dim::Int: The system dimension
      • seq_length::Int: The sequence length to be considered. The default is zero, i.e. arbitrary sequence lengths; this works for all sequence lengths but doesn't apply the super-fast activation.
• skew_sym::Bool (keyword argument): specifies if the weight matrix is skew-symmetric or arbitrary (default is false).

      Functor

      Applying a layer of type VolumePreservingAttention does the following:

• First we perform the operation $X \mapsto X^T A X =: C$, where $X\in\mathbb{R}^{N\times\mathtt{seq\_length}}$ is a matrix containing a time series and $A$ is the skew-symmetric matrix associated with the layer.
      • In a second step we compute the Cayley transform of $C$; $\Lambda = \mathrm{Cayley}(C)$.
      • The output of the layer is then $X\Lambda$.
      source
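A minimal construction sketch, assuming dim and seq_length are passed positionally as listed above:

using GeometricMachineLearning
layer = VolumePreservingAttention(4, 3; skew_sym = true)   # system dimension 4, sequence length 3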
      GeometricMachineLearning.VolumePreservingFeedForwardType

      Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer.

      Constructor

      The constructor is called with the following arguments:

      • sys_dim::Int: The system dimension.
      • n_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.
      • n_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
      • activation: The activation function for the nonlinear layers in a block.
      • init_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper.
      source
      GeometricMachineLearning.VolumePreservingFeedForwardLayerType

      Super-type of VolumePreservingLowerLayer and VolumePreservingUpperLayer. The layers do the following:

      \[x \mapsto \begin{cases} \sigma(Lx + b) & \text{where $L$ is }\mathtt{LowerTriangular} \\ \sigma(Ux + b) & \text{where $U$ is }\mathtt{UpperTriangular}. \end{cases}\]

The functor can be applied to a vector, a matrix or a tensor.

      Constructor

      The constructors are called with:

      • sys_dim::Int: the system dimension.
      • activation=tanh: the activation function.
      • include_bias::Bool=true (keyword argument): specifies whether a bias should be used.
      source
      GeometricMachineLearning.VolumePreservingTransformerType

      The volume-preserving transformer with the Cayley activation function and built-in upscaling.

      Constructor

      The arguments for the constructor are:

      1. sys_dim::Int
      2. seq_length::Int: The sequence length of the data fed into the transformer.

The following are keyword arguments:

      • n_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.
      • n_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
      • L::Int=1: The number of transformer units.
      • activation=tanh: The activation function.
      • init_upper::Bool=false: Specifies if the network first acts on the $q$ component.
• skew_sym::Bool=false: specifies if the weight matrix is skew-symmetric or arbitrary.
      source
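A minimal construction sketch using the arguments and keyword arguments listed above (the values are placeholders):

using GeometricMachineLearning
arch = VolumePreservingTransformer(4, 3; n_blocks = 2, n_linear = 1, L = 1, activation = tanh, skew_sym = false)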
      AbstractNeuralNetworks.update!Method

Optimization for an entire neural network with BFGS. What is different in this case is that we still have to initialize the cache.

If o.step == 1, then we initialize the cache.

      source
      Base.iterateMethod

This function computes a trajectory for a Transformer that has already been trained, for evaluation purposes.

      It takes as input:

      • nn: a NeuralNetwork (that has been trained).
      • ics: initial conditions (a matrix in $\mathbb{R}^{2n\times\mathtt{seq\_length}}$ or NamedTuple of two matrices in $\mathbb{R}^{n\times\mathtt{seq\_length}}$)
      • n_points::Int=100 (keyword argument): The number of steps for which we run the prediction.
      • prediction_window::Int=size(ics.q, 2): The prediction window (i.e. the number of steps we predict into the future) is equal to the sequence length (i.e. the number of input time steps) by default.
      source
      Base.iterateMethod

This function computes a trajectory for a SympNet that has already been trained, for evaluation purposes.

      It takes as input:

      • nn: a NeuralNetwork (that has been trained).
      • ics: initial conditions (a NamedTuple of two vectors)
      source
      Base.randMethod
      rand(backend::KernelAbstractions.Backend, manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold)

      Draw random elements for a specific device.

      Examples

      using GeometricMachineLearning
import Random
Random.seed!(123)

N, n = 5, 3
rand(CPU(), StiefelManifold{Float32}, N, n)
      source
      GeometricMachineLearning.UpperTriangularType
UpperTriangular(S::AbstractVector, n::Int)

Build an upper-triangular matrix from a vector.

An upper-triangular matrix is here an $n\times{}n$ matrix that has zeros on the diagonal and in the lower triangle (see the example below).

      The data are stored in a vector $S$ similarly to other matrices. See LowerTriangular, SkewSymMatrix and SymmetricMatrix.

The struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension $n$ for $A\in\mathbb{R}^{n\times{}n}$.

      Examples

      using GeometricMachineLearning
      +S = [1, 2, 3, 4, 5, 6]
      +UpperTriangular(S, 4)
      +
      +# output
      +
      +4×4 UpperTriangular{Int64, Vector{Int64}}:
      + 0  1  2  4
      + 0  0  3  5
      + 0  0  0  6
      + 0  0  0  0
      source
      GeometricMachineLearning.UpperTriangularMethod
      UpperTriangular(A::AbstractMatrix)

Build an upper-triangular matrix from a matrix.

This is done by taking the upper right of that matrix.

      Examples

      using GeometricMachineLearning
      +M = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]
      +UpperTriangular(M)
      +
      +# output
      +
      +4×4 UpperTriangular{Int64, Vector{Int64}}:
      + 0  2  3   4
      + 0  0  7   8
      + 0  0  0  12
      + 0  0  0   0
      source
      GeometricMachineLearning.VolumePreservingAttentionType

      Volume-preserving attention (single head attention)

      Drawbacks:

      • the super fast activation is only implemented for sequence lengths of 2, 3, 4 and 5.
      • other sequence lengths only work on CPU for now (lu decomposition has to be implemented to work for tensors in parallel).

      Constructor

      The constructor is called with:

      • dim::Int: The system dimension
      • seq_length::Int: The sequence length to be considered. The default is zero, i.e. arbitrary sequence lengths; this works for all sequence lengths but doesn't apply the super-fast activation.
• skew_sym::Bool (keyword argument): specifies whether the weight matrix is skew-symmetric or arbitrary (default is false).

      Functor

      Applying a layer of type VolumePreservingAttention does the following:

• First we perform the operation $X \mapsto X^T A X =: C$, where $X\in\mathbb{R}^{N\times\mathtt{seq\_length}}$ is a matrix containing time series data (one column per time step) and $A$ is the skew-symmetric matrix associated with the layer.
      • In a second step we compute the Cayley transform of $C$; $\Lambda = \mathrm{Cayley}(C)$.
• The output of the layer is then $X\Lambda$. A plain-Julia sketch of these three steps is given below.
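The three steps can be mimicked with plain LinearAlgebra as follows (a minimal sketch with made-up X and A; the Cayley transform is written out in one common convention rather than taken from the package):

using LinearAlgebra

N, seq_length = 4, 3
X = randn(N, seq_length)                    # columns are the time steps of one series
A = let B = randn(N, N); (B - B') / 2 end   # a skew-symmetric weight matrix

C = X' * A * X                              # step 1: C = XᵀAX
Λ = (I - C) \ (I + C)                       # step 2: a Cayley transform of C
output = X * Λ                              # step 3: reweighting of the time steps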
      source
      GeometricMachineLearning.VolumePreservingFeedForwardType

      Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer.

      Constructor

      The constructor is called with the following arguments:

      • sys_dim::Int: The system dimension.
      • n_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.
      • n_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
      • activation: The activation function for the nonlinear layers in a block.
      • init_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper.
      source
      GeometricMachineLearning.VolumePreservingFeedForwardLayerType

      Super-type of VolumePreservingLowerLayer and VolumePreservingUpperLayer. The layers do the following:

      \[x \mapsto \begin{cases} \sigma(Lx + b) & \text{where $L$ is }\mathtt{LowerTriangular} \\ \sigma(Ux + b) & \text{where $U$ is }\mathtt{UpperTriangular}. \end{cases}\]

The functor can be applied to a vector, a matrix or a tensor.

      Constructor

      The constructors are called with:

      • sys_dim::Int: the system dimension.
      • activation=tanh: the activation function.
      • include_bias::Bool=true (keyword argument): specifies whether a bias should be used.
      source
      GeometricMachineLearning.VolumePreservingTransformerType

      The volume-preserving transformer with the Cayley activation function and built-in upscaling.

      Constructor

      The arguments for the constructor are:

      1. sys_dim::Int
      2. seq_length::Int: The sequence length of the data fed into the transformer.

The following are keyword arguments:

      • n_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.
      • n_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.
      • L::Int=1: The number of transformer units.
      • activation=tanh: The activation function.
      • init_upper::Bool=false: Specifies if the network first acts on the $q$ component.
• skew_sym::Bool=false: specifies whether the weight matrix is skew-symmetric or arbitrary.
      source
      AbstractNeuralNetworks.update!Method

Optimization for an entire neural network with BFGS. What is different in this case is that we still have to initialize the cache.

      If o.step == 1, then we initialize the cache

      source
      Base.iterateMethod

This function computes a trajectory for a Transformer that has already been trained, for validation purposes.

      It takes as input:

      • nn: a NeuralNetwork (that has been trained).
      • ics: initial conditions (a matrix in $\mathbb{R}^{2n\times\mathtt{seq\_length}}$ or NamedTuple of two matrices in $\mathbb{R}^{n\times\mathtt{seq\_length}}$)
      • n_points::Int=100 (keyword argument): The number of steps for which we run the prediction.
      • prediction_window::Int=size(ics.q, 2): The prediction window (i.e. the number of steps we predict into the future) is equal to the sequence length (i.e. the number of input time steps) by default.
      source
      Base.iterateMethod

This function computes a trajectory for a SympNet that has already been trained, for validation purposes.

      It takes as input:

      • nn: a NeuralNetwork (that has been trained).
      • ics: initial conditions (a NamedTuple of two vectors)
      source
      Base.randMethod
rand(backend::KernelAbstractions.Backend, manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold

      Draw random elements for a specific device.

      Examples

      using GeometricMachineLearning
      +using GeometricMachineLearning: _round # hide
       import Random
       Random.seed!(123)
       
       N, n = 5, 3
      -rand(CPU(), StiefelManifold{Float32}, N, n)
      +Y = rand(CPU(), StiefelManifold{Float32}, N, n)
      +_round(Y; digits = 5) # hide
       
       # output
       
       5×3 StiefelManifold{Float32, Matrix{Float32}}:
      - -0.275746    0.329913   0.772753
      - -0.624851   -0.332242  -0.0685991
      - -0.693326    0.36724   -0.189882
      - -0.0929493  -0.731446   0.460639
      -  0.210203    0.333008   0.387173

      Random elements of the manifold can also be allocated on GPU, via e.g. ...

      rand(CUDABackend(), StiefelManifold{Float32}, N, n)

      ... for drawing elements on a CUDA device.

      source
      Base.randMethod
      rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold

      Draw random elements from the Stiefel and the Grassmann manifold.

      Because both of these manifolds are compact spaces we can sample them uniformly [8].

      Examples

      When we call ...

      using GeometricMachineLearning
import Random
Random.seed!(123)

N, n = 5, 3
rand(StiefelManifold{Float32}, N, n)

      Random elements of the manifold can also be allocated on GPU, via e.g. ...

      rand(CUDABackend(), StiefelManifold{Float32}, N, n)

      ... for drawing elements on a CUDA device.

      source
      Base.randMethod
      rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold

      Draw random elements from the Stiefel and the Grassmann manifold.

      Because both of these manifolds are compact spaces we can sample them uniformly [8].

      Examples

      When we call ...

      using GeometricMachineLearning
      +using GeometricMachineLearning: _round # hide
       import Random
       Random.seed!(123)
       
       N, n = 5, 3
      -rand(StiefelManifold{Float32}, N, n)
      +Y = rand(StiefelManifold{Float32}, N, n)
      +_round(Y; digits = 5) # hide
       
       # output
       
       5×3 StiefelManifold{Float32, Matrix{Float32}}:
      - -0.275746    0.329913   0.772753
      - -0.624851   -0.332242  -0.0685991
      - -0.693326    0.36724   -0.189882
      - -0.0929493  -0.731446   0.460639
      -  0.210203    0.333008   0.387173

... the sampling is done by first allocating a random matrix of size $N\times{}n$ via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output is then the first n columns of the Q matrix.
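The procedure just described can be written out in a few lines of plain Julia (a sketch that mirrors the description, not the package internals):

using LinearAlgebra
import Random
Random.seed!(123)

N, n = 5, 3
A = randn(Float32, N, n)       # random Gaussian matrix
Q, R = qr(A)                   # QR decomposition based on Householder reflections
Y = Matrix(Q)[:, 1:n]          # the first n columns form the sampled Stiefel element
Y' * Y ≈ I                     # sanity check: the columns are orthonormal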

      source
      Base.vecMethod

      If vec is applied onto Triangular, then the output is the associated vector.

      source
      Base.vecMethod

      If vec is applied onto SkewSymMatrix, then the output is the associated vector.

      source
      GeometricMachineLearning.GradientFunction

This is an old constructor and will be deprecated. For change_q=true it is equivalent to GradientLayerQ; for change_q=false it is equivalent to GradientLayerP.

      If full_grad=false then ActivationLayer is called

      source
      GeometricMachineLearning.TransformerMethod

      The architecture for a "transformer encoder" is essentially taken from arXiv:2010.11929, but with the difference that no layer normalization is employed. This is because we still need to find a generalization of layer normalization to manifolds.

      The transformer is called with the following inputs:

      • dim: the dimension of the transformer
      • n_heads: the number of heads
      • L: the number of transformer blocks

      In addition we have the following optional arguments:

      • activation: the activation function used for the ResNet (tanh by default)
      • Stiefel::Bool: if the matrices $P^V$, $P^Q$ and $P^K$ should live on a manifold (false by default)
      • retraction: which retraction should be used (Geodesic() by default)
• add_connection::Bool: if the input should be added to the output after the MultiHeadAttention layer is used (true by default)
      • use_bias::Bool: If the ResNet should use a bias (true by default)
      source
      GeometricMachineLearning.accuracyMethod

      Computes the accuracy (as opposed to the loss) of a neural network classifier.

      It takes as input:

      • model::Chain
      • ps: parameters of the network
      • dl::DataLoader
      source
      GeometricMachineLearning.apply_layer_to_nt_and_return_arrayMethod

      This function is used in the wrappers where the input to the SympNet layers is not a NamedTuple (as it should be) but an AbstractArray.

      It converts the Array to a NamedTuple (via assign_q_and_p), then calls the SympNet routine(s) and converts back to an AbstractArray (with vcat).

      source
      GeometricMachineLearning.assign_batch_kernel!Method

      Takes as input a batch tensor (to which the data are assigned), the whole data tensor and two vectors params and time_steps that include the specific parameters and time steps we want to assign.

Note that this assigns sequential data, e.g. for being processed by a transformer.

      source
      GeometricMachineLearning.assign_output_estimateMethod

      The function assign_output_estimate is closely related to the transformer. It takes the last prediction_window columns of the output and uses them for the final prediction. i.e.

\[\mathbb{R}^{N\times{}T}\to\mathbb{R}^{N\times\mathtt{pw}}, \begin{bmatrix} z^{(1)}_1 & \cdots & z^{(T)}_1 \\ \cdots & \cdots & \cdots \\ z^{(1)}_n & \cdots & z^{(T)}_n \end{bmatrix} \mapsto \begin{bmatrix} z^{(T - \mathtt{pw})}_1 & \cdots & z^{(T)}_1 \\ \cdots & \cdots & \cdots \\ z^{(T - \mathtt{pw})}_n & \cdots & z^{(T)}_n \end{bmatrix}\]

... the sampling is done by first allocating a random matrix of size $N\times{}n$ via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output is then the first n columns of the Q matrix.

      source
      Base.vecMethod

      If vec is applied onto Triangular, then the output is the associated vector.

      source
      Base.vecMethod

      If vec is applied onto SkewSymMatrix, then the output is the associated vector.

      source
      GeometricMachineLearning.GradientFunction

This is an old constructor and will be deprecated. For change_q=true it is equivalent to GradientLayerQ; for change_q=false it is equivalent to GradientLayerP.

      If full_grad=false then ActivationLayer is called

      source
      GeometricMachineLearning.TransformerMethod

      The architecture for a "transformer encoder" is essentially taken from arXiv:2010.11929, but with the difference that no layer normalization is employed. This is because we still need to find a generalization of layer normalization to manifolds.

      The transformer is called with the following inputs:

      • dim: the dimension of the transformer
      • n_heads: the number of heads
      • L: the number of transformer blocks

      In addition we have the following optional arguments:

      • activation: the activation function used for the ResNet (tanh by default)
      • Stiefel::Bool: if the matrices $P^V$, $P^Q$ and $P^K$ should live on a manifold (false by default)
      • retraction: which retraction should be used (Geodesic() by default)
• add_connection::Bool: if the input should be added to the output after the MultiHeadAttention layer is used (true by default)
      • use_bias::Bool: If the ResNet should use a bias (true by default)
      source
      GeometricMachineLearning.accuracyMethod

      Computes the accuracy (as opposed to the loss) of a neural network classifier.

      It takes as input:

      • model::Chain
      • ps: parameters of the network
      • dl::DataLoader
      source
      GeometricMachineLearning.apply_layer_to_nt_and_return_arrayMethod

      This function is used in the wrappers where the input to the SympNet layers is not a NamedTuple (as it should be) but an AbstractArray.

      It converts the Array to a NamedTuple (via assign_q_and_p), then calls the SympNet routine(s) and converts back to an AbstractArray (with vcat).

      source
      GeometricMachineLearning.apply_sectionMethod
      apply_section(λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT <: StiefelManifold{T}}

      Apply λY to Y₂.

      Mathematically this is the group action of the element $\lambda{}Y\in{}G$ on the element $Y_2$ of the homogeneous space $\mathcal{M}$.

      Internally it calls the inplace version apply_section!.

      source
      GeometricMachineLearning.assign_batch_kernel!Method

      Takes as input a batch tensor (to which the data are assigned), the whole data tensor and two vectors params and time_steps that include the specific parameters and time steps we want to assign.

Note that this assigns sequential data, e.g. for being processed by a transformer.

      source
      GeometricMachineLearning.assign_output_estimateMethod

      The function assign_output_estimate is closely related to the transformer. It takes the last prediction_window columns of the output and uses them for the final prediction. i.e.

\[\mathbb{R}^{N\times{}T}\to\mathbb{R}^{N\times\mathtt{pw}}, \begin{bmatrix} z^{(1)}_1 & \cdots & z^{(T)}_1 \\ \cdots & \cdots & \cdots \\ z^{(1)}_n & \cdots & z^{(T)}_n \end{bmatrix} \mapsto \begin{bmatrix} z^{(T - \mathtt{pw})}_1 & \cdots & z^{(T)}_1 \\ \cdots & \cdots & \cdots \\ z^{(T - \mathtt{pw})}_n & \cdots & z^{(T)}_n \end{bmatrix}\]

      source
      GeometricMachineLearning.assign_q_and_pMethod

Allocates two new arrays q and p whose first dimension is half of that of the input x. This halved dimension has to be supplied as the second argument N.

      The output is a Tuple containing q and p.

      source
      GeometricMachineLearning.build_v_reducedMethod

      Builds the reduced vector field based on the full vector field for a Hamiltonian system. We derive the reduced vector field via the reduced Hamiltonian: $\tilde{H} := H\circ\Psi^\mathrm{dec}$. We then get

      \[\mathbb{J}_{2n}\nabla_\xi\tilde{H} = \mathbb{J}_{2n}(\nabla\Psi^\mathrm{dec})^T\mathbb{J}_{2N}^T\mathbb{J}_{2N}\nabla_z{}H = \mathbb{J}_{2n}(\nabla\Psi^\mathrm{dec})^T\mathbb{J}_{2N}^T \begin{pmatrix} v(z) \\ f(z) \end{pmatrix} = \begin{pmatrix} - (\nabla_p\Psi_q)^Tf(z) + (\nabla_p\Psi_p)^Tv(z) \\ (\nabla_q\Psi_q)^Tf(z) - (\nabla_q\Psi_p)^Tv(z) \end{pmatrix}.\]

      source
      GeometricMachineLearning.geodesicMethod

      The geodesic map for the manifolds. It takes as input an element $x$ of $\mathcal{M}$ and an element of $T_x\mathcal{M}$ and returns $\mathtt{geodesic}(x, v_x) = \exp(v_x).$ For example:

      Y = rand(StiefelManifold{Float64}, N, n)
Δ = rgrad(Y, rand(N, n))
geodesic(Y, Δ)

      source
      GeometricMachineLearning.assign_q_and_pMethod

Allocates two new arrays q and p whose first dimension is half of that of the input x. This halved dimension has to be supplied as the second argument N.

      The output is a Tuple containing q and p.

      source
      GeometricMachineLearning.build_v_reducedMethod

      Builds the reduced vector field based on the full vector field for a Hamiltonian system. We derive the reduced vector field via the reduced Hamiltonian: $\tilde{H} := H\circ\Psi^\mathrm{dec}$. We then get

      \[\mathbb{J}_{2n}\nabla_\xi\tilde{H} = \mathbb{J}_{2n}(\nabla\Psi^\mathrm{dec})^T\mathbb{J}_{2N}^T\mathbb{J}_{2N}\nabla_z{}H = \mathbb{J}_{2n}(\nabla\Psi^\mathrm{dec})^T\mathbb{J}_{2N}^T \begin{pmatrix} v(z) \\ f(z) \end{pmatrix} = \begin{pmatrix} - (\nabla_p\Psi_q)^Tf(z) + (\nabla_p\Psi_p)^Tv(z) \\ (\nabla_q\Psi_q)^Tf(z) - (\nabla_q\Psi_p)^Tv(z) \end{pmatrix}.\]

      source
      GeometricMachineLearning.geodesicMethod
      geodesic(Y::Manifold, Δ)

Take as input an element Y of a manifold and a tangent vector Δ in the corresponding tangent space and compute the geodesic (exponential map).

      In different notation: take as input an element $x$ of $\mathcal{M}$ and an element of $T_x\mathcal{M}$ and return $\mathtt{geodesic}(x, v_x) = \exp(v_x).$ For example:

      Y = rand(StiefelManifold{Float64}, N, n)
       Δ = rgrad(Y, rand(N, n))
      -geodesic(Y, Δ)

      See the docstring for $rgrad$ for details on this function.

      source
      GeometricMachineLearning.init_optimizer_cacheMethod

      Wrapper for the functions setup_adam_cache, setup_momentum_cache, setup_gradient_cache, setup_bfgs_cache. These appear outside of optimizer_caches.jl because the OptimizerMethods first have to be defined.

      source
      GeometricMachineLearning.metricMethod

      Implements the canonical Riemannian metric for the Stiefel manifold:

      \[g_Y: (\Delta_1, \Delta_2) \mapsto \mathrm{tr}(\Delta_1^T(\mathbb{I} - \frac{1}{2}YY^T)\Delta_2).\]

      It is called with:

      • Y::StiefelManifold
      • Δ₁::AbstractMatrix
      • Δ₂::AbstractMatrix
      source
      GeometricMachineLearning.onehotbatchMethod

      One-hot-batch encoding of a vector of integers: $input\in\{0,1,\ldots,9\}^\ell$. The output is a tensor of shape $10\times1\times\ell$.

      \[0 \mapsto \begin{bmatrix} 1 & 0 & \ldots & 0 \end{bmatrix}.\]

      In more abstract terms: $i \mapsto e_i$.
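A sketch of such an encoding, written from the description above (the helper name my_onehotbatch is made up and details may differ from the package's onehotbatch):

function my_onehotbatch(input::AbstractVector{<:Integer})
    ℓ = length(input)
    out = zeros(Int, 10, 1, ℓ)
    for (k, i) in enumerate(input)
        out[i + 1, 1, k] = 1   # digit i becomes the unit vector e_{i+1}
    end
    out
end

my_onehotbatch([0, 3, 9])      # returns a 10×1×3 tensor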

      source
      GeometricMachineLearning.optimization_step!Method

      Optimization for a single layer.

      inputs:

      • o::Optimizer
      • d::Union{AbstractExplicitLayer, AbstractExplicitCell}
      • ps::NamedTuple: the parameters
      • C::NamedTuple: NamedTuple of the caches
      • dx::NamedTuple: NamedTuple of the derivatives (output of AD routine)

      ps, C and dx must have the same keys.

      source
      GeometricMachineLearning.optimize_for_one_epoch!Method

      Optimize for an entire epoch. For this you have to supply:

      • an instance of the optimizer.
      • the neural network model
      • the parameters of the model
      • the data (in form of DataLoader)
• an instance of Batch that contains batch_size (and optionally seq_length)

      With the optional argument:

      • the loss, which takes the model, the parameters ps and an instance of DataLoader as input.

      The output of optimize_for_one_epoch! is the average loss over all batches of the epoch:

      \[output = \frac{1}{\mathtt{steps\_per\_epoch}}\sum_{t=1}^\mathtt{steps\_per\_epoch}loss(\theta^{(t-1)}).\]

      This is done because any reverse differentiation routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of zygote: loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps) (if the loss only depends on the parameters).
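Schematically, the epoch loop therefore does something like the following (a sketch with hypothetical names loss, ps and batches; the real routine also uses the pullback to update the parameters):

using Zygote

function average_loss_over_epoch(loss, ps, batches)
    total = 0.0
    for batch in batches
        loss_value, pullback = Zygote.pullback(p -> loss(p, batch), ps)
        # the pullback would be used for the optimization step here
        total += loss_value
    end
    total / length(batches)
end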

      source
      GeometricMachineLearning.rgradMethod

      Computes the Riemannian gradient for the Stiefel manifold given an element $Y\in{}St(N,n)$ and a matrix $\nabla{}L\in\mathbb{R}^{N\times{}n}$ (the Euclidean gradient). It computes the Riemannian gradient with respect to the canonical metric (see the documentation for the function metric for an explanation of this). The precise form of the mapping is:

      \[\mathtt{rgrad}(Y, \nabla{}L) \mapsto \nabla{}L - Y(\nabla{}L)^TY\]

      It is called with inputs:

      • Y::StiefelManifold
      • e_grad::AbstractMatrix: i.e. the Euclidean gradient (what was called $\nabla{}L$) above.
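The mapping above can be checked directly with plain matrices (a small sketch with made-up inputs):

Y  = [1.0 0.0; 0.0 1.0; 0.0 0.0; 0.0 0.0]   # an element of St(4, 2)
∇L = [1.0 2.0; 3.0 4.0; 5.0 6.0; 7.0 8.0]   # a Euclidean gradient
rg = ∇L - Y * ∇L' * Y                       # rgrad(Y, ∇L) = ∇L - Y(∇L)ᵀY
Y' * rg                                     # this product is skew-symmetric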
      source
      GeometricMachineLearning.split_and_flattenMethod

      split_and_flatten takes a tensor as input and produces another one as output (essentially rearranges the input data in an intricate way) so that it can easily be processed with a transformer.

      The optional arguments are:

      • patch_length: by default this is 7.
      • number_of_patches: by default this is 16.
      source
      GeometricMachineLearning.tensor_mat_skew_sym_assignMethod

      Takes as input:

      • Z::AbstractArray{T, 3}: A tensor that stores a bunch of time series.
      • A::AbstractMatrix: A matrix that is used to perform various scalar products.

      For one of these time series the function performs the following computation:

      \[ (z^{(i)}, z^{(j)}) \mapsto (z^{(i)})^TAz^{(j)} \text{ for } i > j.\]

This results in $n(n-1)\div2$ scalar products. These scalar products are written into a lower-triangular matrix and the final output of the function is a tensor of these lower-triangular matrices.
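For a single time series the computation can be sketched as follows (the helper name is made up; the package version operates on whole tensors and in parallel):

function skew_sym_assign_one_series(Z::AbstractMatrix, A::AbstractMatrix)
    n = size(Z, 2)                        # number of time steps in the series
    C = zeros(eltype(Z), n, n)
    for i in 2:n, j in 1:(i - 1)
        C[i, j] = Z[:, i]' * A * Z[:, j]  # scalar products for i > j, stored lower-triangularly
    end
    C
end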

      source
      GeometricMachineLearning.train!Function
      train!(...)

Perform training of a neural network on data using a given training method.

      Different ways of use:

      train!(neuralnetwork, data, optimizer = GradientOptimizer(1e-2), training_method; nruns = 1000, batch_size = default(data, type), showprogress = false )

      Arguments

• neuralnetwork::LuxNeuralNetwork : the neural network using the LuxBackend
• data : the data (see TrainingData)
• optimizer = GradientOptimizer: the optimization method (see Optimizer)
• training_method : specifies the loss function used
• nruns : the number of iterations through the training process (with a default value)
• batch_size : the size of the data batch used for each step
      source
      GeometricMachineLearning.train!Method
      train!(neuralnetwork, data, optimizer, training_method; nruns = 1000, batch_size, showprogress = false )

      Arguments

• neuralnetwork::LuxNeuralNetwork : the neural network using the LuxBackend
• data::AbstractTrainingData : the data
      source
      GeometricMachineLearning.ΩMethod

      Implements the canonical horizontal lift for the Stiefel manifold:

      \[ (\mathbb{I} - \frac{1}{2}YY^T)\Delta{}Y^T - Y\Delta^T(\mathbb{I} - \frac{1}{2}YY^T).\]

      Internally this performs

      SkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')

      to save memory.

      source
      GeometricMachineLearning.global_repMethod
      global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:GrassmannManifold{T}}

Express Δ (an element of the tangent space at Y) as an instance of GrassmannLieAlgHorMatrix.

      The method global_rep for GrassmannManifold is similar to that for StiefelManifold.

      Examples

      using GeometricMachineLearning
      +using GeometricMachineLearning: _round
      +import Random 
      +
      +Random.seed!(123)
      +
      +Y = rand(GrassmannManifold, 6, 3)
      +Δ = rgrad(Y, randn(6, 3))
      +λY = GlobalSection(Y)
      +
      +_round(global_rep(λY, Δ); digits = 3)
      +
      +# output
      +
      +6×6 GrassmannLieAlgHorMatrix{Float64, Matrix{Float64}}:
      +  0.0     0.0     0.0     0.981  -2.058   0.4
      +  0.0     0.0     0.0    -0.424   0.733  -0.919
      +  0.0     0.0     0.0    -1.815   1.409   1.085
      + -0.981   0.424   1.815   0.0     0.0     0.0
      +  2.058  -0.733  -1.409   0.0     0.0     0.0
      + -0.4     0.919  -1.085   0.0     0.0     0.0
      source
      GeometricMachineLearning.global_repMethod
      global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:StiefelManifold{T}}

Express Δ (an element of the tangent space at Y) as an instance of StiefelLieAlgHorMatrix.

      This maps an element from $T_Y\mathcal{M}$ to an element of $\mathfrak{g}^\mathrm{hor}$.

These two spaces are isomorphic, and the isomorphism is established through $\lambda(Y)\in{}G$ via:

      \[T_Y\mathcal{M} \to \mathfrak{g}^{\mathrm{hor}}, \Delta \mapsto \lambda(Y)^{-1}\Omega(Y, \Delta)\lambda(Y).\]

      Also see GeometricMachineLearning.Ω.

      Examples

      using GeometricMachineLearning
      +using GeometricMachineLearning: _round
      +import Random 
      +
      +Random.seed!(123)
      +
      +Y = rand(StiefelManifold, 6, 3)
      +Δ = rgrad(Y, randn(6, 3))
      +λY = GlobalSection(Y)
      +
      +_round(global_rep(λY, Δ); digits = 3)
      +
      +# output
      +
      +6×6 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, Matrix{Float64}}:
      +  0.0     0.679   1.925   0.981  -2.058   0.4
      + -0.679   0.0     0.298  -0.424   0.733  -0.919
      + -1.925  -0.298   0.0    -1.815   1.409   1.085
      + -0.981   0.424   1.815   0.0     0.0     0.0
      +  2.058  -0.733  -1.409   0.0     0.0     0.0
      + -0.4     0.919  -1.085   0.0     0.0     0.0

      Implementation

      The function global_rep does in fact not perform the entire map $\lambda(Y)^{-1}\Omega(Y, \Delta)\lambda(Y)$ but only

      \[\Delta \mapsto \mathrm{skew}(Y^T\Delta),\]

      to get the small skew-symmetric matrix and

      \[\Delta \mapsto (\lambda(Y)_{[1:N, n:N]}^T \Delta)_{[1:(N-n), 1:n]},\]

      for the arbitrary matrix.

      source
      GeometricMachineLearning.global_sectionMethod
      global_section(Y::StiefelManifold)

      Compute a matrix of size $N\times(N-n)$ whose columns are orthogonal to the columns in Y.

      This matrix is also called $Y_\perp$ [6, 10, 11].

      Examples

      using GeometricMachineLearning
      +using GeometricMachineLearning: global_section
      +import Random
      +
      +Random.seed!(123)
      +
      +Y = StiefelManifold([1. 0.; 0. 1.; 0. 0.; 0. 0.])
      +
      +round.(Matrix(global_section(Y)); digits = 3)
      +
      +# output
      +
      +4×2 Matrix{Float64}:
      + 0.0    -0.0
      + 0.0     0.0
      + 0.936  -0.353
      + 0.353   0.936

      Further note that we convert the QRCompactWYQ object to a Matrix before we display it.

      Implementation

      The implementation is done with a QR decomposition (LinearAlgebra.qr!). Internally we do:

      A = randn(N, N - n) # or the gpu equivalent
      +A = A - Y.A * (Y.A' * A)
      +qr!(A).Q
      source
      GeometricMachineLearning.init_optimizer_cacheMethod

      Wrapper for the functions setup_adam_cache, setup_momentum_cache, setup_gradient_cache, setup_bfgs_cache. These appear outside of optimizer_caches.jl because the OptimizerMethods first have to be defined.

      source
      GeometricMachineLearning.mat_tensor_mul!Method
      mat_tensor_mul!(C::AbstractArray{T, 3}, A::LowerTriangular{T}, B::AbstractArray{T, 3}) where T

      Multiply the lower-triangular matrix A onto the tensor B from the left and store the result in C.

      Also checks the bounds of the input arrays.

      This performs an efficient multiplication based on the special structure of the lower-triangular matrix A.

      source
      GeometricMachineLearning.mat_tensor_mul!Method
      mat_tensor_mul!(C::AbstractArray{T, 3}, A::SkewSymMatrix{T}, B::AbstractArray{T, 3}) where T

Multiply the skew-symmetric matrix A onto the tensor B from the left and store the result in C.

      Also checks the bounds of the input arrays.

      This performs an efficient multiplication based on the special structure of the skew-symmetric matrix A.

      source
      GeometricMachineLearning.mat_tensor_mul!Method
      mat_tensor_mul!(C::AbstractArray{T, 3}, A::SymmetricMatrix{T}, B::AbstractArray{T, 3}) where T

      Multiply the symmetric matrix A onto the tensor B from the left and store the result in C.

      Also checks the bounds of the input arrays.

      This performs an efficient multiplication based on the special structure of the symmetric matrix A.

      source
      GeometricMachineLearning.mat_tensor_mul!Method
      mat_tensor_mul!(C::AbstractArray{T, 3}, A::UpperTriangular{T}, B::AbstractArray{T, 3}) where T

      Multiply the upper-triangular matrix A onto the tensor B from the left and store the result in C.

      Also checks the bounds of the input arrays.

      This performs an efficient multiplication based on the special structure of the upper-triangular matrix A.

      source
      GeometricMachineLearning.mat_tensor_mulMethod
      mat_tensor_mul(A::AbstractMatrix{T}, B::AbstractArray{T, 3}) where T

Multiply the matrix A onto the tensor B from the left.

      Internally this calls the inplace version mat_tensor_mul!.

      Examples

      using GeometricMachineLearning: mat_tensor_mul
      +
      +B = [1 1 1; 1 1 1; 1 1 1;;; 2 2 2; 2 2 2; 2 2 2]
      +A = [3 0 0; 0 2 0; 0 0 1]
      +
      +mat_tensor_mul(A, B)
      +
      +# output
      +
      +3×3×2 Array{Int64, 3}:
      +[:, :, 1] =
      + 3  3  3
      + 2  2  2
      + 1  1  1
      +
      +[:, :, 2] =
      + 6  6  6
      + 4  4  4
      + 2  2  2
      source
      GeometricMachineLearning.metricMethod
      metric(Y::GrassmannManifold, Δ₁::AbstractMatrix, Δ₂::AbstractMatrix)

      Compute the metric for vectors Δ₁ and Δ₂ at Y.

      The representation of the Grassmann manifold is realized as a quotient space of the Stiefel manifold.

      The metric for the Grassmann manifold is:

      \[g^{Gr}_Y(\Delta_1, \Delta_2) = g^{St}_Y(\Delta_1, \Delta_2) = \mathrm{Tr}(\Delta_1^T (\mathbb{I} - Y Y^T) \Delta_2) = \mathrm{Tr}(\Delta_1^T \Delta_2).\]
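For horizontal vectors (i.e. $Y^T\Delta = 0$) the two expressions indeed coincide; a quick numerical check in plain Julia with made-up matrices:

using LinearAlgebra

Y  = [1.0 0.0; 0.0 1.0; 0.0 0.0; 0.0 0.0]   # representative of an element of Gr(2, 4)
Δ₁ = [0.0 0.0; 0.0 0.0; 1.0 2.0; 3.0 4.0]   # horizontal: Y' * Δ₁ is zero
Δ₂ = [0.0 0.0; 0.0 0.0; 5.0 6.0; 7.0 8.0]

tr(Δ₁' * (I - Y * Y') * Δ₂) ≈ tr(Δ₁' * Δ₂)  # true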

      source
      GeometricMachineLearning.metricMethod

      Implements the canonical Riemannian metric for the Stiefel manifold:

      \[g_Y: (\Delta_1, \Delta_2) \mapsto \mathrm{tr}(\Delta_1^T(\mathbb{I} - \frac{1}{2}YY^T)\Delta_2).\]

      It is called with:

      • Y::StiefelManifold
      • Δ₁::AbstractMatrix
      • Δ₂::AbstractMatrix
      source
      GeometricMachineLearning.onehotbatchMethod

      One-hot-batch encoding of a vector of integers: $input\in\{0,1,\ldots,9\}^\ell$. The output is a tensor of shape $10\times1\times\ell$.

      \[0 \mapsto \begin{bmatrix} 1 & 0 & \ldots & 0 \end{bmatrix}.\]

      In more abstract terms: $i \mapsto e_i$.

      source
      GeometricMachineLearning.optimization_step!Method

      Optimization for a single layer.

      inputs:

      • o::Optimizer
      • d::Union{AbstractExplicitLayer, AbstractExplicitCell}
      • ps::NamedTuple: the parameters
      • C::NamedTuple: NamedTuple of the caches
      • dx::NamedTuple: NamedTuple of the derivatives (output of AD routine)

      ps, C and dx must have the same keys.

      source
      GeometricMachineLearning.optimize_for_one_epoch!Method

      Optimize for an entire epoch. For this you have to supply:

      • an instance of the optimizer.
      • the neural network model
      • the parameters of the model
      • the data (in form of DataLoader)
• an instance of Batch that contains batch_size (and optionally seq_length)

      With the optional argument:

      • the loss, which takes the model, the parameters ps and an instance of DataLoader as input.

      The output of optimize_for_one_epoch! is the average loss over all batches of the epoch:

      \[output = \frac{1}{\mathtt{steps\_per\_epoch}}\sum_{t=1}^\mathtt{steps\_per\_epoch}loss(\theta^{(t-1)}).\]

      This is done because any reverse differentiation routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of zygote: loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps) (if the loss only depends on the parameters).

      source
      GeometricMachineLearning.rgradMethod
      rgrad(Y::GrassmannManifold, e_grad::AbstractMatrix)

      Compute the Riemannian gradient at $Y\in{}Gr(n, N)$.

These gradients have the property that they are orthogonal to the space spanned by $Y$.

      The precise form of the mapping is:

      \[\mathtt{rgrad}(Y, \nabla{}L) \mapsto \nabla{}L - YY^T\nabla{}L\]

      Note the property $Y^T\mathrm{rgrad}(Y, \nabla{}L) = \mathbb{O}.$

      Also see rgrad(::StiefelManifold, ::AbstractMatrix).

      Examples

      using GeometricMachineLearning
      +
      +Y = GrassmannManifold([1 0 ; 0 1 ; 0 0; 0 0])
      +Δ = [1 2; 3 4; 5 6; 7 8]
      +rgrad(Y, Δ)
      +
      +# output
      +
      +4×2 Matrix{Int64}:
      + 0  0
      + 0  0
      + 5  6
      + 7  8
      source
      GeometricMachineLearning.rgradMethod
      rgrad(Y::StiefelManifold, e_grad::AbstractMatrix)

      Compute the Riemannian gradient for the Stiefel manifold at $Y\in{}St(N,n)$ based on $\nabla{}L\in\mathbb{R}^{N\times{}n}$ (the Euclidean gradient).

      The function computes the Riemannian gradient with respect to the canonical metric.

      The precise form of the mapping is:

      \[\mathtt{rgrad}(Y, \nabla{}L) \mapsto \nabla{}L - Y(\nabla{}L)^TY\]

      Note the property $Y^T\mathrm{rgrad}(Y, \nabla{}L)\in\mathcal{S}_\mathrm{skew}(n).$

      Examples

      using GeometricMachineLearning
      +
      +Y = StiefelManifold([1 0 ; 0 1 ; 0 0; 0 0])
      +Δ = [1 2; 3 4; 5 6; 7 8]
      +rgrad(Y, Δ)
      +
      +# output
      +
      +4×2 Matrix{Int64}:
      + 0  -1
      + 1   0
      + 5   6
      + 7   8
      source
      GeometricMachineLearning.split_and_flattenMethod

      split_and_flatten takes a tensor as input and produces another one as output (essentially rearranges the input data in an intricate way) so that it can easily be processed with a transformer.

      The optional arguments are:

      • patch_length: by default this is 7.
      • number_of_patches: by default this is 16.
      source
      GeometricMachineLearning.tensor_mat_mul!Method
tensor_mat_mul!(C::AbstractArray{T, 3}, B::AbstractArray{T, 3}, A::SymmetricMatrix{T}) where T

      Multiply the symmetric matrix A onto the tensor B from the right and store the result in C.

      Also checks the bounds of the input arrays.

      This performs an efficient multiplication based on the special structure of the symmetric matrix A.

      source
      GeometricMachineLearning.tensor_mat_mulMethod
      tensor_mat_mul(A::AbstractArray{T, 3}, B::AbstractArray{T}) where T

Multiply the matrix B onto the tensor A from the right.

      Internally this calls the inplace version tensor_mat_mul!.

      Examples

      using GeometricMachineLearning: tensor_mat_mul
      +
      +A = [1 1 1; 1 1 1; 1 1 1;;; 2 2 2; 2 2 2; 2 2 2]
      +B = [3 0 0; 0 2 0; 0 0 1]
      +
      +tensor_mat_mul(A, B)
      +
      +# output
      +
      +3×3×2 Array{Int64, 3}:
      +[:, :, 1] =
      + 3  2  1
      + 3  2  1
      + 3  2  1
      +
      +[:, :, 2] =
      + 6  4  2
      + 6  4  2
      + 6  4  2
      source
      GeometricMachineLearning.tensor_mat_skew_sym_assignMethod

      Takes as input:

      • Z::AbstractArray{T, 3}: A tensor that stores a bunch of time series.
      • A::AbstractMatrix: A matrix that is used to perform various scalar products.

      For one of these time series the function performs the following computation:

      \[ (z^{(i)}, z^{(j)}) \mapsto (z^{(i)})^TAz^{(j)} \text{ for } i > j.\]

This results in $n(n-1)\div2$ scalar products. These scalar products are written into a lower-triangular matrix and the final output of the function is a tensor of these lower-triangular matrices.

      source
      GeometricMachineLearning.train!Function
      train!(...)

Perform training of a neural network on data using a given training method.

      Different ways of use:

      train!(neuralnetwork, data, optimizer = GradientOptimizer(1e-2), training_method; nruns = 1000, batch_size = default(data, type), showprogress = false )

      Arguments

• neuralnetwork::LuxNeuralNetwork : the neural network using the LuxBackend
• data : the data (see TrainingData)
• optimizer = GradientOptimizer: the optimization method (see Optimizer)
• training_method : specifies the loss function used
• nruns : the number of iterations through the training process (with a default value)
• batch_size : the size of the data batch used for each step
      source
      GeometricMachineLearning.train!Method
      train!(neuralnetwork, data, optimizer, training_method; nruns = 1000, batch_size, showprogress = false )

      Arguments

• neuralnetwork::LuxNeuralNetwork : the neural network using the LuxBackend
• data::AbstractTrainingData : the data
      source
      GeometricMachineLearning.ΩMethod
      Ω(Y::GrassmannManifold{T}, Δ::AbstractMatrix{T}) where T

      Perform the canonical horizontal lift for the Grassmann manifold:

\[ \Delta \mapsto \Omega^{St}(Y, \Delta),\]

      where $\Omega^{St}$ is the canonical horizontal lift for the Stiefel manifold.

      using GeometricMachineLearning
      +E = GrassmannManifold(StiefelProjection(5, 2))
      +Δ = [0. 0.; 0. 0.; 2. 3.; 4. 5.; 6. 7.]
      +GeometricMachineLearning.Ω(E, Δ)
      +
      +# output
      +
      +5×5 SkewSymMatrix{Float64, Vector{Float64}}:
      + 0.0  -0.0  -2.0  -4.0  -6.0
      + 0.0   0.0  -3.0  -5.0  -7.0
      + 2.0   3.0   0.0  -0.0  -0.0
      + 4.0   5.0   0.0   0.0  -0.0
      + 6.0   7.0   0.0   0.0   0.0
      source
      GeometricMachineLearning.ΩMethod
      Ω(Y::StiefelManifold{T}, Δ::AbstractMatrix{T}) where T

      Perform canonical horizontal lift for the Stiefel manifold:

      \[ \Delta \mapsto (\mathbb{I} - \frac{1}{2}YY^T)\Delta{}Y^T - Y\Delta^T(\mathbb{I} - \frac{1}{2}YY^T).\]

      Internally this performs

      SkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')

      to save memory.

      Examples

      using GeometricMachineLearning
      +E = StiefelManifold(StiefelProjection(5, 2))
      +Δ = [0. -1.; 1. 0.; 2. 3.; 4. 5.; 6. 7.]
      +GeometricMachineLearning.Ω(E, Δ)
      +
      +# output
      +
      +5×5 SkewSymMatrix{Float64, Vector{Float64}}:
      + 0.0  -1.0  -2.0  -4.0  -6.0
      + 1.0   0.0  -3.0  -5.0  -7.0
      + 2.0   3.0   0.0  -0.0  -0.0
      + 4.0   5.0   0.0   0.0  -0.0
      + 6.0   7.0   0.0   0.0   0.0

      Note that the output of Ω is a skew-symmetric matrix, i.e. an element of $\mathfrak{g}$.

      source
      diff --git a/latest/manifolds/basic_topology/index.html b/latest/manifolds/basic_topology/index.html index 48f4cb9a6..efe7c3b51 100644 --- a/latest/manifolds/basic_topology/index.html +++ b/latest/manifolds/basic_topology/index.html @@ -1,2 +1,2 @@ -Concepts from General Topology · GeometricMachineLearning.jl

      Basic Concepts from General Topology

      On this page we discuss basic notions of topology that are necessary to define manifolds and work with them. Here we largely omit concrete examples and only define concepts that are necessary for defining a manifold[1], namely the properties of being Hausdorff and second countable. For a detailed discussion of the theory and for a wide range of examples that illustrate the theory see e.g. [1]. The here-presented concepts are also (rudimentarily) covered in most differential geometry books such as [2, 3].

We now start by giving all the definitions, theorems and corresponding proofs that are needed to define manifolds. Every manifold is a topological space, which is why we give this definition first:

      Definition

      A topological space is a set $\mathcal{M}$ for which we define a collection of subsets of $\mathcal{M}$, which we denote by $\mathcal{T}$ and call the open subsets. $\mathcal{T}$ further has to satisfy the following three conditions:

      1. The empty set and $\mathcal{M}$ belong to $\mathcal{T}$.
      2. Any union of an arbitrary number of elements of $\mathcal{T}$ again belongs to $\mathcal{T}$.
      3. Any intersection of a finite number of elements of $\mathcal{T}$ again belongs to $\mathcal{T}$.

      So an arbitrary union of open sets is again open and a finite intersection of open sets is again open.

      Based on this definition of a topological space we can now define what it means to be Hausdorff:

      Definition

      A topological space $\mathcal{M}$ is said to be Hausdorff if for any two points $x,y\in\mathcal{M}$ we can find two open sets $U_x,U_y\in\mathcal{T}$ s.t. $x\in{}U_x, y\in{}U_y$ and $U_x\cap{}U_y=\{\}$.

      We now give the second definition that we need for defining manifolds, that of second countability:

      Definition

      A topological space $\mathcal{M}$ is said to be second-countable if we can find a countable subcollection of $\mathcal{T}$ called $\mathcal{U}$ s.t. $\forall{}U\in\mathcal{T}$ and $x\in{}U$ we can find an element $V\in\mathcal{U}$ for which $x\in{}V\subset{}U$.

      We now give a few definitions and results that are needed for the inverse function theorem which is essential for practical applications of manifold theory. We start with the definition of continuity:

      Definition

      A mapping $f$ between topological spaces $\mathcal{M}$ and $\mathcal{N}$ is called continuous if the preimage of every open set is again an open set, i.e. if $f^{-1}\{U\}\in\mathcal{T}$ for $U$ open in $\mathcal{N}$ and $\mathcal{T}$ the topology on $\mathcal{M}$.

      Continuity can also be formulated in terms of closed sets instead of doing it with open sets. The definition of closed sets is given below:

      Definition

      A closed set of a topological space $\mathcal{M}$ is one whose complement is an open set, i.e. $F$ is closed if $F^c\in\mathcal{T}$, where the superscript ${}^c$ indicates the complement. For closed sets we thus have the following three properties:

      1. The empty set and $\mathcal{M}$ are closed sets.
      2. Any union of a finite number of closed sets is again closed.
      3. Any intersection of an arbitrary number of closed sets is again closed.

      So a finite union of closed sets is again closed and an arbitrary intersection of closed sets is again closed.

      We now give an equivalent definition of continuity:

      Theorem

      The definition of continuity is equivalent to the following, second definition: $f:\mathcal{M}\to\mathcal{N}$ is continuous if $f^{-1}\{F\}\subset\mathcal{M}$ is a closed set for each closed set $F\subset\mathcal{N}$.

      Proof

First assume that $f$ is continuous according to the first definition but not according to the second. Then there is a closed set $F\subset\mathcal{N}$ for which $f^{-1}\{F\}$ is not closed. But $F^c$ is open, so $f^{-1}\{F^c\} = \{x\in\mathcal{M}:f(x)\not\in{}F\} = (f^{-1}\{F\})^c$ is open by the first definition, which means $f^{-1}\{F\}$ is closed, a contradiction. The implication of the first definition under assumption of the second can be shown analogously.

      The next theorem makes the rather abstract definition of closed sets more concrete; this definition is especially important for many practical proofs:

      Theorem

      The property of a set $F$ being closed is equivalent to the following statement: If a point $y$ is such that for every open set $U$ containing it we have $U\cap{}F\ne\{\}$ then this point is contained in $F$.

      Proof

We first prove that if a set is closed then the statement holds. Consider a closed set $F$ and a point $y\not\in{}F$ s.t. every open set containing $y$ has nonempty intersection with $F$. But the complement $F^c$ also is such a set, which is a clear contradiction. Now assume the above statement for a set $F$ and further assume $F$ is not closed. Its complement $F^c$ is thus not open. Now consider the interior of this set: $\mathrm{int}(F^c):=\cup\{U:U\subset{}F^c\text{ and $U$ open}\}$, i.e. the biggest open set contained within $F^c$. Hence there must be a point $y$ which is in $F^c$ but is not in its interior, else $F^c$ would be equal to its interior, i.e. would be open. Since $y\not\in{}F$, the assumed statement gives us an open set $U$ that contains $y$ but has empty intersection with $F$, i.e. $U\subset{}F^c$. But then $U\subset\mathrm{int}(F^c)$ and hence $y\in\mathrm{int}(F^c)$. A contradiction.

      Next we define open covers, a concept that is very important in developing a theory of manifolds:

      Definition

An open cover of a topological space $\mathcal{M}$ is a (not necessarily countable) collection of open sets $\{U_i\}_{i\in\mathcal{I}}$ s.t. their union contains $\mathcal{M}$. A finite open cover is a finite collection of open sets that cover $\mathcal{M}$. We say that an open cover is reducible to a finite cover if we can find a finite number of elements in the open cover whose union still contains $\mathcal{M}$.

      And connected to this definition we state what it means for a topological space to be compact. This is a rather strong property that some of the manifolds treated in here have, for example the Stiefel manifold.

      Definition

      A topological space $\mathcal{M}$ is called compact if every open cover is reducible to a finite cover.

      A very important result from general topology is that continuous functions preserve compactness[2]:

      Theorem

Consider a continuous function $f:\mathcal{M}\to\mathcal{N}$ and a compact set $K\subset\mathcal{M}$. Then $f(K)$ is also compact.

      Proof

Consider an open cover of $f(K)$: $\{U_i\}_{i\in\mathcal{I}}$. Then $\{f^{-1}\{U_i\}\}_{i\in\mathcal{I}}$ is an open cover of $K$ and hence reducible to a finite cover $\{f^{-1}\{U_i\}\}_{i\in\{i_1,\ldots,i_n\}}$. But then $\{U_i\}_{i\in\{i_1,\ldots,i_n\}}$ also covers $f(K)$.

      Moreover compactness is a property that is inherited by closed subspaces:

      Theorem

      A closed subset of a compact space is compact.

      Proof

Call the closed set $F$ and consider an open cover of this set: $\{U_i\}_{i\in\mathcal{I}}$. Then this open cover combined with $F^c$ is an open cover for the entire compact space, hence reducible to a finite cover.

      Theorem

      A compact subset of a Hausdorff space is closed.

      Proof

Consider a compact subset $K$. If $K$ is not closed, then there has to be a point $y\not\in{}K$ s.t. every open set containing $y$ intersects $K$. Because the surrounding space is Hausdorff we can, for every $z\in{}K$, find open sets $U_z\ni{}z$ and $U_{z,y}\ni{}y$ with $U_z\cap{}U_{z,y}=\{\}$. The open cover $\{U_z\}_{z\in{}K}$ is then reducible to a finite cover $\{U_z\}_{z\in\{z_1, \ldots, z_n\}}$. The intersection $\cap_{z\in\{z_1, \ldots, z_n\}}U_{z,y}$ is then an open set that contains $y$ but has no intersection with $K$. A contradiction.

We will use this last theorem in proving the inverse function theorem:

      Theorem

      If $\mathcal{M}$ is compact and $\mathcal{N}$ is Hausdorff, then the inverse of a continuous function $f:\mathcal{M}\to\mathcal{N}$ is again continuous, i.e. $f(V)$ is an open set in $\mathcal{N}$ for $V\in\mathcal{T}$.

      Proof

We can equivalently show that every closed set is mapped to a closed set. Consider a closed set $K\subset\mathcal{M}$; it is compact because closed subsets of compact spaces are compact. Its image $f(K)$ is then again compact and hence closed because $\mathcal{N}$ is Hausdorff.

      References

      [1]
      S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).
      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      [3]
      S. I. Richard L. Bishop. Tensor Analysis on Manifolds (Dover Publications, 1980).
      • 1Some authors (see e.g. [2]) do not require these properties. But since they constitute very weak restrictions and are always satisfied by the manifolds relevant for our purposes we require them here.
      • 2We also say that compactness is a topological property [1].
      +Concepts from General Topology · GeometricMachineLearning.jl

      Basic Concepts from General Topology

      On this page we discuss basic notions of topology that are necessary to define manifolds and work with them. Here we largely omit concrete examples and only define concepts that are necessary for defining a manifold[1], namely the properties of being Hausdorff and second countable. For a detailed discussion of the theory and for a wide range of examples that illustrate the theory see e.g. [1]. The here-presented concepts are also (rudimentarily) covered in most differential geometry books such as [2, 3].

      We now start by giving all the definitions, theorem and corresponding proofs that are needed to define manifolds. Every manifold is a topological space which is why we give this definition first:

      Definition

      A topological space is a set $\mathcal{M}$ for which we define a collection of subsets of $\mathcal{M}$, which we denote by $\mathcal{T}$ and call the open subsets. $\mathcal{T}$ further has to satisfy the following three conditions:

      1. The empty set and $\mathcal{M}$ belong to $\mathcal{T}$.
      2. Any union of an arbitrary number of elements of $\mathcal{T}$ again belongs to $\mathcal{T}$.
      3. Any intersection of a finite number of elements of $\mathcal{T}$ again belongs to $\mathcal{T}$.

      So an arbitrary union of open sets is again open and a finite intersection of open sets is again open.

      Based on this definition of a topological space we can now define what it means to be Hausdorff:

      Definition

      A topological space $\mathcal{M}$ is said to be Hausdorff if for any two points $x,y\in\mathcal{M}$ we can find two open sets $U_x,U_y\in\mathcal{T}$ s.t. $x\in{}U_x, y\in{}U_y$ and $U_x\cap{}U_y=\{\}$.

      We now give the second definition that we need for defining manifolds, that of second countability:

      Definition

      A topological space $\mathcal{M}$ is said to be second-countable if we can find a countable subcollection of $\mathcal{T}$ called $\mathcal{U}$ s.t. $\forall{}U\in\mathcal{T}$ and $x\in{}U$ we can find an element $V\in\mathcal{U}$ for which $x\in{}V\subset{}U$.

      We now give a few definitions and results that are needed for the inverse function theorem which is essential for practical applications of manifold theory. We start with the definition of continuity:

      Definition

      A mapping $f$ between topological spaces $\mathcal{M}$ and $\mathcal{N}$ is called continuous if the preimage of every open set is again an open set, i.e. if $f^{-1}\{U\}\in\mathcal{T}$ for $U$ open in $\mathcal{N}$ and $\mathcal{T}$ the topology on $\mathcal{M}$.

      Continuity can also be formulated in terms of closed sets instead of doing it with open sets. The definition of closed sets is given below:

      Definition

      A closed set of a topological space $\mathcal{M}$ is one whose complement is an open set, i.e. $F$ is closed if $F^c\in\mathcal{T}$, where the superscript ${}^c$ indicates the complement. For closed sets we thus have the following three properties:

      1. The empty set and $\mathcal{M}$ are closed sets.
      2. Any union of a finite number of closed sets is again closed.
      3. Any intersection of an arbitrary number of closed sets is again closed.

      So a finite union of closed sets is again closed and an arbitrary intersection of closed sets is again closed.

      We now give an equivalent definition of continuity:

      Theorem

      The definition of continuity is equivalent to the following, second definition: $f:\mathcal{M}\to\mathcal{N}$ is continuous if $f^{-1}\{F\}\subset\mathcal{M}$ is a closed set for each closed set $F\subset\mathcal{N}$.

      Proof

      First assume that $f$ is continuous according to the first definition and not to the second. Then $f^{-1}\{F\}$ is not closed but $f^{-1}\{F^c\}$ is open. But $f^{-1}\{F^c\} = \{x\in\mathcal{M}:f(x)\not\in\mathcal{N}\} = (f^{-1}\{F\})^c$ cannot be open, else $f^{-1}\{F\}$ would be closed. The implication of the first definition under assumption of the second can be shown analogously.

      The next theorem makes the rather abstract definition of closed sets more concrete; this definition is especially important for many practical proofs:

      Theorem

      The property of a set $F$ being closed is equivalent to the following statement: If a point $y$ is such that for every open set $U$ containing it we have $U\cap{}F\ne\{\}$ then this point is contained in $F$.

      Proof

      We first proof that if a set is closed then the statement holds. Consider a closed set $F$ and a point $y\not\in{}F$ s.t. every open set containing $y$ has nonempty intersection with $F$. But the complement $F^c$ also is such a set, which is a clear contradiction. Now assume the above statement for a set $F$ and further assume $F$ is not closed. Its complement $F^c$ is thus not open. Now consider the interior of this set: $\mathrm{int}(F^c):=\cup\{U:U\subset{}F^c\text{ and $U$ open}\}$, i.e. the biggest open set contained within $F^c$. Hence there must be a point $y$ which is in $F^c$ but is not in its interior, else $F^c$ would be equal to its interior, i.e. would be open. We further must be able to find an open set $U$ that contains $y$ but is also contained in $F^c$, else $y$ would be an element of $F$. A contradiction.

      Next we define open covers, a concept that is very important in developing a theory of manifolds:

      Definition

      An open cover of a topological space $\mathcal{M}$ is a (not necessarily countable) collection of open sets $\{U_i\}_{i\mathcal{I}}$ s.t. their union contains $\mathcal{M}$. A finite open cover is a finite collection of open sets that cover $\mathcal{M}$. We say that an open cover is reducible to a finite cover if we can find a finite number of elements in the open cover whose union still contains $\mathcal{M}$.

      And connected to this definition we state what it means for a topological space to be compact. This is a rather strong property that some of the manifolds treated in here have, for example the Stiefel manifold.

      Definition

      A topological space $\mathcal{M}$ is called compact if every open cover is reducible to a finite cover.

      A very important result from general topology is that continuous functions preserve compactness[2]:

      Theorem

      Consider a continuous function $f:\mathcal{M}\to\mathcal{N}$ and a compact set $K\in\mathcal{M}$. Then $f(K)$ is also compact.

      Proof

      Consider an open cover of $f(K)$: $\{U_i\}_{i\in\mathcal{I}}$. Then $\{f^{-1}\{U_i\}\}_{i\in\mathcal{I}}$ is an open cover of $K$ and hence reducible to a finite cover $\{f^{-1}\{U_i\}\}_{i\in\{i_1,\ldots,i_n\}}$. But then $\{{U_i\}_{i\in\{i_1,\ldots,i_n}}$ also covers $f(K)$.

      Moreover compactness is a property that is inherited by closed subspaces:

      Theorem

      A closed subset of a compact space is compact.

      Proof

      Call the closed set $F$ and consider an open cover of this set: $\{U\}_{i\in\mathcal{I}}$. Then this open cover combined with $F^c$ is an open cover for the entire compact space, hence reducible to a finite cover.

      Theorem

      A compact subset of a Hausdorff space is closed.

      Proof

Consider a compact subset $K$. If $K$ is not closed, then there has to be a point $y\not\in{}K$ s.t. every open set containing $y$ intersects $K$. Because the surrounding space is Hausdorff we can find, for every $z\in{}K$, open sets $U_z\ni{}z$ and $U_{z,y}\ni{}y$ with $U_z\cap{}U_{z,y}=\{\}$. The open cover $\{U_z\}_{z\in{}K}$ is then reducible to a finite cover $\{U_z\}_{z\in\{z_1, \ldots, z_n\}}$. The intersection $\cap_{z\in\{z_1, \ldots, z_n\}}U_{z,y}$ is then an open set that contains $y$ but does not intersect any of the $U_{z_i}$, and hence has no intersection with $K$. A contradiction.

This last theorem we will use in proving the inverse function theorem:

      Theorem

If $\mathcal{M}$ is compact and $\mathcal{N}$ is Hausdorff, then the inverse of a continuous bijection $f:\mathcal{M}\to\mathcal{N}$ is again continuous, i.e. $f(V)$ is an open set in $\mathcal{N}$ for every $V\in\mathcal{T}$.

      Proof

We can equivalently show that every closed set is mapped to a closed set. So consider a closed set $K\subset\mathcal{M}$. It is compact because it is a closed subset of the compact space $\mathcal{M}$. Its image $f(K)$ is then again compact and hence closed because $\mathcal{N}$ is Hausdorff.

      References

      [1]
      S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).
      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      [3]
R. L. Bishop and S. I. Goldberg. Tensor Analysis on Manifolds (Dover Publications, 1980).
      • 1Some authors (see e.g. [2]) do not require these properties. But since they constitute very weak restrictions and are always satisfied by the manifolds relevant for our purposes we require them here.
      • 2We also say that compactness is a topological property [1].
      diff --git a/latest/manifolds/existence_and_uniqueness_theorem/index.html b/latest/manifolds/existence_and_uniqueness_theorem/index.html index bac3c739e..42c10a271 100644 --- a/latest/manifolds/existence_and_uniqueness_theorem/index.html +++ b/latest/manifolds/existence_and_uniqueness_theorem/index.html @@ -1,7 +1,7 @@ -Differential Equations and the EAU theorem · GeometricMachineLearning.jl

      The Existence-And-Uniqueness Theorem

The existence-and-uniqueness theorem, also known as the Picard-Lindelöf theorem, Picard's existence theorem or the Cauchy-Lipschitz theorem, establishes the existence and uniqueness of solutions of ODEs. Here we state the existence-and-uniqueness theorem for vector fields on manifolds; the classical ODE setting on a vector space is a special case of this. Its proof relies on the Banach fixed-point theorem[1].

      Theorem (Existence-And-Uniqueness Theorem)

Let $X$ be a vector field on the manifold $\mathcal{M}$ that is differentiable at $x$. Then we can find an $\epsilon>0$ and a unique curve $\gamma:(-\epsilon, \epsilon)\to\mathcal{M}$ such that $\gamma(0) = x$ and $\gamma'(t) = X(\gamma(t))$.

      Proof

      We consider a ball around a point $x\in\mathcal{M}$ with radius $r$ that we pick such that the ball $B(x, r)$ fits into the $U$ of some coordinate chart $\varphi_U$; we further use $X$ and $\varphi'\circ{}X\circ\varphi^{-1}$ interchangeably in this proof. We then define $L := \mathrm{sup}_{y,z\in{}B(x,r)}|X(y) - X(z)|/|y - z|.$ Note that this $L$ is always finite because $X$ is bounded and differentiable. We now define the map $\Gamma: C^\infty((-\epsilon, \epsilon), \mathbb{R}^n)\to{}C^\infty((-\epsilon, \epsilon), \mathbb{R}^n)$ (for some $\epsilon$ that we do not yet fix) as

      \[\Gamma\gamma(t) = x + \int_0^tX(\gamma(s))ds,\]

      i.e. $\Gamma$ maps $C^\infty$ curves through $x$ into $C^\infty$ curves through $x$. We further have with the norm $||\gamma||_\infty = \mathrm{sup}_{t \in (-\epsilon, \epsilon)}|\gamma(t)|$:

      \[\begin{aligned} +Differential Equations and the EAU theorem · GeometricMachineLearning.jl

      The Existence-And-Uniqueness Theorem

The existence-and-uniqueness theorem, also known as the Picard-Lindelöf theorem, Picard's existence theorem or the Cauchy-Lipschitz theorem, establishes the existence and uniqueness of solutions of ODEs. Here we state the existence-and-uniqueness theorem for vector fields on manifolds; the classical ODE setting on a vector space is a special case of this. Its proof relies on the Banach fixed-point theorem[1].

      Theorem (Existence-And-Uniqueness Theorem)

Let $X$ be a vector field on the manifold $\mathcal{M}$ that is differentiable at $x$. Then we can find an $\epsilon>0$ and a unique curve $\gamma:(-\epsilon, \epsilon)\to\mathcal{M}$ such that $\gamma(0) = x$ and $\gamma'(t) = X(\gamma(t))$.

      Proof

      We consider a ball around a point $x\in\mathcal{M}$ with radius $r$ that we pick such that the ball $B(x, r)$ fits into the $U$ of some coordinate chart $\varphi_U$; we further use $X$ and $\varphi'\circ{}X\circ\varphi^{-1}$ interchangeably in this proof. We then define $L := \mathrm{sup}_{y,z\in{}B(x,r)}|X(y) - X(z)|/|y - z|.$ Note that this $L$ is always finite because $X$ is bounded and differentiable. We now define the map $\Gamma: C^\infty((-\epsilon, \epsilon), \mathbb{R}^n)\to{}C^\infty((-\epsilon, \epsilon), \mathbb{R}^n)$ (for some $\epsilon$ that we do not yet fix) as

      \[\Gamma\gamma(t) = x + \int_0^tX(\gamma(s))ds,\]

      i.e. $\Gamma$ maps $C^\infty$ curves through $x$ into $C^\infty$ curves through $x$. We further have with the norm $||\gamma||_\infty = \mathrm{sup}_{t \in (-\epsilon, \epsilon)}|\gamma(t)|$:

      \[\begin{aligned} ||\Gamma(\gamma_1 - \gamma_2)||_\infty & = \mathrm{sup}_{t \in (-\epsilon, \epsilon)}\left| \int_0^t (X(\gamma_1(s)) - X(\gamma_2(s)))ds \right| \\ & \leq \mathrm{sup}_{t \in (-\epsilon, \epsilon)}\int_0^t | X(\gamma_1(s)) - X(\gamma_2(s)) | ds \\ & \leq \mathrm{sup}_{t \in (-\epsilon, \epsilon)}\int_0^t L |\gamma_1(s) - \gamma_2(s)| ds \\ & \leq \epsilon{}L \cdot \mathrm{sup}_{t \in (-\epsilon, \epsilon)}|\gamma_1(t) - \gamma_2(t)|, -\end{aligned}\]

and we see that $\Gamma$ is a contractive mapping if we pick $\epsilon$ small enough; we can hence apply the fixed-point theorem. So there has to exist a $C^\infty$ curve through $x$, which we call $\gamma^*$, such that

\[\gamma^*(t) = x + \int_0^tX(\gamma^*(s))ds,\]

and this $\gamma^*$ is the curve we were looking for. Its uniqueness is guaranteed by the fixed-point theorem.

      For all the problems we discuss here we can extend the integral curves of $X$ from the finite interval $(-\epsilon, \epsilon)$ to all of $\mathbb{R}$. The solution $\gamma$ we call an integral curve or flow of the vector field (ODE).

      Time-Dependent Vector Fields

We proved the theorem above for a time-independent vector field $X$, but it also holds for time-dependent vector fields, i.e. for mappings of the form:

\[X: [0,T]\times\mathcal{M}\to{}T\mathcal{M}.\]

      The proof for this case proceeds analogously to the case of the time-independent vector field; to apply the proof we simply have to extend the vector field to (here written for a specific coordinate chart $\varphi_U$):

\[\bar{X}: [0, T]\times\mathbb{R}^n\to{}\mathbb{R}^{n+1},\, (t, x_1, \ldots, x_n) \mapsto (1, X(t, x_1, \ldots, x_n)).\]

      More details on this can be found in e.g. [2]. For GeometricMachineLearning time-dependent vector fields are important because many of the optimizers we are using (such as the Adam optimizer) can be seen as approximating the flow of a time-dependent vector field.

      Reference

      [4]
      S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).
      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      • 1It has to be noted that the proof given here is not entirely self-contained. The proof of the fundamental theorem of calculus, i.e. the proof of the existence of an antiderivative of a continuous function [4], is omitted for example.
      +\end{aligned}\]

and we see that $\Gamma$ is a contractive mapping if we pick $\epsilon$ small enough; we can hence apply the fixed-point theorem. So there has to exist a $C^\infty$ curve through $x$, which we call $\gamma^*$, such that

\[\gamma^*(t) = x + \int_0^tX(\gamma^*(s))ds,\]

and this $\gamma^*$ is the curve we were looking for. Its uniqueness is guaranteed by the fixed-point theorem.

      For all the problems we discuss here we can extend the integral curves of $X$ from the finite interval $(-\epsilon, \epsilon)$ to all of $\mathbb{R}$. The solution $\gamma$ we call an integral curve or flow of the vector field (ODE).
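To make the role of the contraction $\Gamma$ concrete, the following minimal sketch (plain Julia, not part of GeometricMachineLearning; all names are made up for illustration) performs the Picard iteration for the scalar ODE $\gamma'(t) = \gamma(t)$ with $\gamma(0) = 1$ on a small interval and compares the fixed point with the exact flow $t\mapsto\exp(t)$:

function picard_iteration(x, ts, n_iterations)
    Δt = step(ts)
    γ = fill(float(x), length(ts))       # initial guess: the constant curve through x
    for _ in 1:n_iterations
        γ = x .+ cumsum(γ) .* Δt         # Γγ(t) = x + ∫₀ᵗ X(γ(s)) ds with X(γ) = γ and a crude Riemann sum
    end
    γ
end

ts = range(0, 0.1; length = 101)
maximum(abs.(picard_iteration(1.0, ts, 10) .- exp.(ts)))   # of the order of Δt: the fixed point approximates exp(t)

Since $\epsilon{}L = 0.1$ here, every application of $\Gamma$ shrinks the distance to the fixed point by roughly a factor of ten, so very few iterations are needed.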

      Time-Dependent Vector Fields

We proved the theorem above for a time-independent vector field $X$, but it also holds for time-dependent vector fields, i.e. for mappings of the form:

\[X: [0,T]\times\mathcal{M}\to{}T\mathcal{M}.\]

      The proof for this case proceeds analogously to the case of the time-independent vector field; to apply the proof we simply have to extend the vector field to (here written for a specific coordinate chart $\varphi_U$):

\[\bar{X}: [0, T]\times\mathbb{R}^n\to{}\mathbb{R}^{n+1},\, (t, x_1, \ldots, x_n) \mapsto (1, X(t, x_1, \ldots, x_n)).\]

      More details on this can be found in e.g. [2]. For GeometricMachineLearning time-dependent vector fields are important because many of the optimizers we are using (such as the Adam optimizer) can be seen as approximating the flow of a time-dependent vector field.

      Reference

      [4]
      S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).
      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      • 1It has to be noted that the proof given here is not entirely self-contained. The proof of the fundamental theorem of calculus, i.e. the proof of the existence of an antiderivative of a continuous function [4], is omitted for example.
      diff --git a/latest/manifolds/homogeneous_spaces/index.html b/latest/manifolds/homogeneous_spaces/index.html index 263ab6a28..8e5044750 100644 --- a/latest/manifolds/homogeneous_spaces/index.html +++ b/latest/manifolds/homogeneous_spaces/index.html @@ -1,12 +1,12 @@ -Homogeneous Spaces · GeometricMachineLearning.jl

      Homogeneous Spaces

Homogeneous spaces are very important in GeometricMachineLearning as we can generalize existing neural network optimizers from vector spaces to such homogeneous spaces. They are intricately linked to the notion of a Lie Group and its Lie Algebra[1].

      Definition

      A homogeneous space is a manifold $\mathcal{M}$ on which a Lie group $G$ acts transitively, i.e.

      \[\forall X,Y\in\mathcal{M} \exists{}A\in{}G\text{ s.t. }AX = Y.\]

Now fix a distinguished element $E\in\mathcal{M}$; we will refer to this as the canonical element. We can also establish an isomorphism between $\mathcal{M}$ and the quotient space $G/\sim$ with the equivalence relation:

      \[A_1 \sim A_2 \iff A_1E = A_2E.\]

      Note that this is independent of the chosen $E$.

The tangent spaces of $\mathcal{M}$ are of the form $T_Y\mathcal{M} = \mathfrak{g}\cdot{}Y$, i.e. they can be fully described through the Lie algebra $\mathfrak{g}$ of $G$. Based on this we can perform a splitting of $\mathfrak{g}$ into two parts:

      Definition

A splitting of the Lie algebra $\mathfrak{g}$ at an element $Y$ of a homogeneous space is a decomposition into a vertical and a horizontal component, denoted by $\mathfrak{g} = \mathfrak{g}^{\mathrm{ver},Y} \oplus \mathfrak{g}^{\mathrm{hor},Y}$, such that

      1. The vertical component $\mathfrak{g}^{\mathrm{ver},Y}$ is the kernel of the map $\mathfrak{g}\to{}T_Y\mathcal{M}, V \mapsto VY$, i.e. $\mathfrak{g}^{\mathrm{ver},Y} = \{V\in\mathfrak{g}:VY = 0\}.$
      2. The horizontal component $\mathfrak{g}^{\mathrm{hor},Y}$ is the orthogonal complement of $\mathfrak{g}^{\mathrm{ver},Y}$ in $\mathfrak{g}$. It is isomorphic to $T_Y\mathcal{M}$.

      We will refer to the mapping from $T_Y\mathcal{M}$ to $\mathfrak{g}^{\mathrm{hor}, Y}$ by $\Omega$. We will give explicit examples of $\Omega$ below. If we have now defined a metric $\langle\cdot,\cdot\rangle$ on $\mathfrak{g}$, then this induces a Riemannian metric on $\mathcal{M}$:

      \[g_Y(\Delta_1, \Delta_2) = \langle\Omega(Y,\Delta_1),\Omega(Y,\Delta_2)\rangle\text{ for $\Delta_1,\Delta_2\in{}T_Y\mathcal{M}$.}\]

Two examples of homogeneous spaces implemented in GeometricMachineLearning are the Stiefel and the Grassmann manifold. The Lie group $SO(N)$ acts transitively on both of these manifolds, i.e. turns them into homogeneous spaces. The Lie algebra of $SO(N)$ is the set of skew-symmetric matrices $\mathfrak{so}(N):=\{V\in\mathbb{R}^{N\times{}N}:V^T + V = 0\}$ and the canonical metric associated with it is simply $(V_1,V_2)\mapsto\frac{1}{2}\mathrm{Tr}(V_1^TV_2)$.

      The Stiefel Manifold

      The Stiefel manifold $St(n, N)$ is the space of all orthonormal frames in $\mathbb{R}^{N\times{}n}$, i.e. matrices $Y\in\mathbb{R}^{N\times{}n}$ s.t. $Y^TY = \mathbb{I}_n$. It can also be seen as $SO(N)$ modulo an equivalence relation: $A\sim{}B\iff{}AE = BE$ for

      \[E = \begin{bmatrix} +Homogeneous Spaces · GeometricMachineLearning.jl

      Homogeneous Spaces

Homogeneous spaces are very important in GeometricMachineLearning as we can generalize existing neural network optimizers from vector spaces to such homogeneous spaces. They are intricately linked to the notion of a Lie Group and its Lie Algebra[1].

      Definition

      A homogeneous space is a manifold $\mathcal{M}$ on which a Lie group $G$ acts transitively, i.e.

      \[\forall X,Y\in\mathcal{M} \exists{}A\in{}G\text{ s.t. }AX = Y.\]

Now fix a distinguished element $E\in\mathcal{M}$; we will refer to this as the canonical element. We can also establish an isomorphism between $\mathcal{M}$ and the quotient space $G/\sim$ with the equivalence relation:

      \[A_1 \sim A_2 \iff A_1E = A_2E.\]

      Note that this is independent of the chosen $E$.

The tangent spaces of $\mathcal{M}$ are of the form $T_Y\mathcal{M} = \mathfrak{g}\cdot{}Y$, i.e. they can be fully described through the Lie algebra $\mathfrak{g}$ of $G$. Based on this we can perform a splitting of $\mathfrak{g}$ into two parts:

      Definition

A splitting of the Lie algebra $\mathfrak{g}$ at an element $Y$ of a homogeneous space is a decomposition into a vertical and a horizontal component, denoted by $\mathfrak{g} = \mathfrak{g}^{\mathrm{ver},Y} \oplus \mathfrak{g}^{\mathrm{hor},Y}$, such that

      1. The vertical component $\mathfrak{g}^{\mathrm{ver},Y}$ is the kernel of the map $\mathfrak{g}\to{}T_Y\mathcal{M}, V \mapsto VY$, i.e. $\mathfrak{g}^{\mathrm{ver},Y} = \{V\in\mathfrak{g}:VY = 0\}.$
      2. The horizontal component $\mathfrak{g}^{\mathrm{hor},Y}$ is the orthogonal complement of $\mathfrak{g}^{\mathrm{ver},Y}$ in $\mathfrak{g}$. It is isomorphic to $T_Y\mathcal{M}$.

      We will refer to the mapping from $T_Y\mathcal{M}$ to $\mathfrak{g}^{\mathrm{hor}, Y}$ by $\Omega$. We will give explicit examples of $\Omega$ below. If we have now defined a metric $\langle\cdot,\cdot\rangle$ on $\mathfrak{g}$, then this induces a Riemannian metric on $\mathcal{M}$:

      \[g_Y(\Delta_1, \Delta_2) = \langle\Omega(Y,\Delta_1),\Omega(Y,\Delta_2)\rangle\text{ for $\Delta_1,\Delta_2\in{}T_Y\mathcal{M}$.}\]

Two examples of homogeneous spaces implemented in GeometricMachineLearning are the Stiefel and the Grassmann manifold. The Lie group $SO(N)$ acts transitively on both of these manifolds, i.e. turns them into homogeneous spaces. The Lie algebra of $SO(N)$ is the set of skew-symmetric matrices $\mathfrak{so}(N):=\{V\in\mathbb{R}^{N\times{}N}:V^T + V = 0\}$ and the canonical metric associated with it is simply $(V_1,V_2)\mapsto\frac{1}{2}\mathrm{Tr}(V_1^TV_2)$.
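As a quick illustration, the following sketch builds elements of $\mathfrak{so}(N)$ with plain LinearAlgebra (independent of the SkewSymMatrix type that GeometricMachineLearning provides) and evaluates the canonical metric:

using LinearAlgebra: tr

N = 4
V₁ = randn(N, N); V₁ = (V₁ - V₁') / 2    # an element of 𝔰𝔬(N)
V₂ = randn(N, N); V₂ = (V₂ - V₂') / 2
V₁ == -V₁'                               # skew-symmetry: V₁ᵀ + V₁ = 0
tr(V₁' * V₂) / 2                         # the canonical metric ⟨V₁, V₂⟩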

      The Stiefel Manifold

      The Stiefel manifold $St(n, N)$ is the space of all orthonormal frames in $\mathbb{R}^{N\times{}n}$, i.e. matrices $Y\in\mathbb{R}^{N\times{}n}$ s.t. $Y^TY = \mathbb{I}_n$. It can also be seen as $SO(N)$ modulo an equivalence relation: $A\sim{}B\iff{}AE = BE$ for

      \[E = \begin{bmatrix} \mathbb{I}_n \\ \mathbb{O} \end{bmatrix}\in{}St(n, N),\]

      which is the canonical element of the Stiefel manifold. In words: the first $n$ columns of $A$ and $B$ are the same. We also use this principle to draw random elements from the Stiefel manifold.

      Example

Drawing random elements from the Stiefel (and the Grassmann) manifold is done by first calling randn(N, n) (i.e. drawing from a normal distribution) and then performing a $QR$ decomposition. We then take the first $n$ columns of the $Q$ matrix to be an element of the Stiefel manifold.
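The following is a minimal sketch of this sampling procedure using only Julia's LinearAlgebra standard library; the package's own rand methods (shown further below) produce StiefelManifold and GrassmannManifold objects instead of plain matrices:

using LinearAlgebra

N, n = 5, 3
A = randn(Float32, N, n)      # draw the entries from a normal distribution
Q, R = qr(A)                  # QR decomposition (Householder reflections)
Y = Matrix(Q)[:, 1:n]         # the first n columns of Q
Y' * Y ≈ I(n)                 # the columns are orthonormal, so Y represents an element of St(n, N)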

      The tangent space to the element $Y\in{}St(n,N)$ can be determined by considering $C^\infty$ curves on $SO(N)$ through $\mathbb{I}$ which we write $t\mapsto{}A(t)$. Because $SO(N)$ acts transitively on $St(n, N)$ each $C^\infty$ curve on $St(n, N)$ through $Y$ can be written as $A(t)Y$ and we get:

      \[T_YSt(n,N)=\{BY : B\in\mathfrak{g}\} = \{\Delta\in\mathbb{R}^{N\times{}n}: \Delta^TY + Y^T\Delta = \mathbb{O}\},\]

      where the last equality can be established through the isomorphism

\[\Omega: T_YSt(n, N) \to \mathfrak{g}^{\mathrm{hor}, Y}, \Delta \mapsto (\mathbb{I} - \frac{1}{2}YY^T)\Delta{}Y^T - Y\Delta^T(\mathbb{I} - \frac{1}{2}YY^T).\]

      That this is an isomorphism can be easily checked:

\[ \Omega(\Delta)Y = (\mathbb{I} - \frac{1}{2}YY^T)\Delta - \frac{1}{2}Y\Delta^TY = \Delta,\]

where the last equality uses the tangent space condition $\Delta^TY = -Y^T\Delta$.
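Before turning to the implementation, here is a small sanity check (a plain-LinearAlgebra sketch, not part of the package) that matrices of the form $BY$ with $B\in\mathfrak{so}(N)$ indeed satisfy the tangent space condition $\Delta^TY + Y^T\Delta = \mathbb{O}$:

using LinearAlgebra

N, n = 5, 3
Y = Matrix(qr(randn(N, n)).Q)[:, 1:n]    # an element of St(n, N)
B = randn(N, N); B = B - B'              # B ∈ 𝔰𝔬(N)
Δ = B * Y                                # a tangent vector at Y
norm(Δ' * Y + Y' * Δ) < 1e-12            # numerically zero: Δ ∈ T_Y St(n, N)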

      The isomorphism is also implemented in GeometricMachineLearning:

      using GeometricMachineLearning
       
       Y = rand(StiefelManifold{Float32}, 5, 3)
       Δ = rgrad(Y, rand(Float32, 5, 3))
      -GeometricMachineLearning.Ω(Y, Δ) * Y ≈ Δ
      true

      The function rgrad is introduced below.

      The Riemannian Gradient for the Stiefel Manifold

      We defined the Riemannian gradient to be a vector field $\mathrm{grad}^gL$ such that it is compatible with the Riemannian metric in some sense; the definition we gave relied on an explicit coordinate chart. We can also express the Riemannian gradient for matrix manifolds by not relying on an explicit coordinate representation (which would be computationally expensive) [6].

      Definition

      Given a Riemannian matrix manifold $\mathcal{M}$ we define the Riemannian gradient of $L:\mathcal{M}\to\mathbb{R}$ at $Y$, called $\mathrm{grad}_YL\in{}T_Y\mathcal{M}$, as the unique element of $T_Y\mathcal{M}$ such that for any other $\Delta\in{}T_Y\mathcal{M}$ we have

      \[\mathrm{Tr}((\nabla{}L)^T\Delta) = g_Y(\mathrm{grad}_YL, \Delta),\]

      where Tr indicates the usual matrix trace.

      For the Stiefel manifold the Riemannian gradient is given by:

      \[ \mathrm{grad}_YL = \nabla_YL - Y(\nabla_YL)^TY =: \mathtt{rgrad}(Y, \nabla_YL),\]

      where $\nabla_YL$ refers to the Euclidean gradient, i.e.

      \[ [\nabla_YL]_{ij} = \frac{\partial{}L}{\partial{}y_{ij}}.\]

      The Euclidean gradient $\nabla{}L$ can in practice be obtained with an AD routine. We then use the function rgrad to map $\nabla_YL$ from $\mathbb{R}^{N\times{}n}$ to $T_YSt(n,N)$. We can check that this mapping indeed maps to the Riemannian gradient

      using GeometricMachineLearning
      +GeometricMachineLearning.Ω(Y, Δ) * Y.A ≈ Δ
      true

      The function rgrad is introduced below.

      The Riemannian Gradient for the Stiefel Manifold

      We defined the Riemannian gradient to be a vector field $\mathrm{grad}^gL$ such that it is compatible with the Riemannian metric in some sense; the definition we gave relied on an explicit coordinate chart. We can also express the Riemannian gradient for matrix manifolds by not relying on an explicit coordinate representation (which would be computationally expensive) [6].

      Definition

      Given a Riemannian matrix manifold $\mathcal{M}$ we define the Riemannian gradient of $L:\mathcal{M}\to\mathbb{R}$ at $Y$, called $\mathrm{grad}_YL\in{}T_Y\mathcal{M}$, as the unique element of $T_Y\mathcal{M}$ such that for any other $\Delta\in{}T_Y\mathcal{M}$ we have

      \[\mathrm{Tr}((\nabla{}L)^T\Delta) = g_Y(\mathrm{grad}_YL, \Delta),\]

      where Tr indicates the usual matrix trace.

      For the Stiefel manifold the Riemannian gradient is given by:

      \[ \mathrm{grad}_YL = \nabla_YL - Y(\nabla_YL)^TY =: \mathtt{rgrad}(Y, \nabla_YL),\]

      where $\nabla_YL$ refers to the Euclidean gradient, i.e.

      \[ [\nabla_YL]_{ij} = \frac{\partial{}L}{\partial{}y_{ij}}.\]

      The Euclidean gradient $\nabla{}L$ can in practice be obtained with an AD routine. We then use the function rgrad to map $\nabla_YL$ from $\mathbb{R}^{N\times{}n}$ to $T_YSt(n,N)$. We can check that this mapping indeed maps to the Riemannian gradient

      using GeometricMachineLearning
       using LinearAlgebra: tr
       
       Y = rand(StiefelManifold{Float32}, 5, 3)
      @@ -17,22 +17,72 @@
       metric(Y, gradL, Δ) ≈ tr(∇L' * Δ)
      true

      The Grassmann Manifold

      The Grassmann manifold is closely related to the Stiefel manifold, and an element of the Grassmann manifold can be represented through an element of the Stiefel manifold (but not vice-versa). An element of the Grassmann manifold $G(n,N)$ is a vector subspace $\subset\mathbb{R}^N$ of dimension $n$. Each such subspace (i.e. element of the Grassmann manifold) can be represented by a full-rank matrix $A\in\mathbb{R}^{N\times{}n}$ and we identify two elements with the following equivalence relation:

\[ A_1 \sim A_2 \iff \exists{}C\in\mathbb{R}^{n\times{}n}\text{ non-singular s.t. }A_1C = A_2.\]

The resulting manifold is of dimension $n(N-n)$. One can find a parametrization of the manifold the following way: Because the matrix $Y$ has full rank, it has $n$ linearly independent rows: rows $i_1, \ldots, i_n$. For simplicity assume that $i_1 = 1, i_2=2, \ldots, i_n=n$ and call the $n\times{}n$ matrix made up of these rows $C$. Then the mapping to the coordinate chart is $Y\mapsto{}YC^{-1}$; the first $n$ rows of $YC^{-1}$ are the identity and the remaining $N-n$ rows are the coordinates.
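The following sketch illustrates this normalization for a random full-rank representative (plain LinearAlgebra; purely illustrative):

using LinearAlgebra

N, n = 5, 3
Y = randn(N, n)          # a full-rank representative of the subspace span(Y)
C = Y[1:n, 1:n]          # the n×n matrix built from the first n rows
Ŷ = Y / C                # Y * inv(C): another representative of the same subspace
Ŷ[1:n, 1:n] ≈ I(n)       # the first n rows are normalized to the identity ...
Ŷ[n+1:N, 1:n]            # ... and the remaining N - n rows are the local coordinates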

      We can also define the Grassmann manifold based on the Stiefel manifold since elements of the Stiefel manifold are already full-rank matrices. In this case we have the following equivalence relation (for $Y_1, Y_2\in{}St(n,N)$):

      \[ Y_1 \sim Y_2 \iff \exists{}C\in{}SO(n)\text{ s.t. }Y_1C = Y_2.\]

      In GeometricMachineLearning elements of the Grassmann manifold are drawn the same way as elements of the Stiefel manifold:

      using GeometricMachineLearning
       
       rand(GrassmannManifold{Float32}, 5, 3)
      5×3 GrassmannManifold{Float32, Matrix{Float32}}:
      - -0.435526    0.117112  -0.665815
      -  0.0495673  -0.611143   0.439047
      -  0.538415    0.121593  -0.095711
      - -0.52331     0.514674   0.587843
      - -0.494081   -0.577161  -0.0959633

      The Riemannian Gradient of the Grassmann Manifold

      Obtaining the Riemannian Gradient for the Grassmann manifold is slightly more difficult than it is in the case of the Stiefel manifold [6]. Since the Grassmann manifold can be obtained from the Stiefel manifold through an equivalence relation, we can however use this as a starting point.

      Theorem

      The Riemannian gradient of a function $L$ defined on the Grassmann manifold can be written as

      \[\mathrm{grad}_\mathcal{Y}^{Gr}L = \nabla_Y{}L - YY^T\nabla_YL,\]

where $\nabla_Y{}L$ is again the Euclidean gradient.

      Proof

      In a first step we identify charts on the Grassmann manifold to make dealing with it easier. For this consider the following open cover of the Grassmann manifold.

      \[\{\mathcal{U}_W\}_{W\in{}St(n, N)} \quad\text{where}\quad \mathcal{U}_W = \{\mathrm{span}(Y):\mathrm{det}(W^TY)\neq0\}.\]

      We can find a canonical bijective mapping from the set $\mathcal{U}_W$ to the set $\mathcal{S}_W := \{Y\in\mathbb{R}^{N\times{}n}:W^TY=\mathbb{I}_n\}$:

      \[\sigma_W: \mathcal{U}_W \to \mathcal{S}_W,\, \mathcal{Y}=\mathrm{span}(Y)\mapsto{}Y(W^TY)^{-1} =: \hat{Y}.\]

      That $\sigma_W$ is well-defined is easy to see: Consider $YC$ with $C\in\mathbb{R}^{n\times{}n}$ non-singular. Then $YC(W^TYC)^{-1}=Y(W^TY)^{-1} = \hat{Y}$. With this isomorphism we can also find a representation of elements of the tangent space:

      \[T_\mathcal{Y}\sigma_W: T_\mathcal{Y}Gr(n,N)\to{}T_{\hat{Y}}\mathcal{S}_W.\]

      We give an explicit representation of this isomorphism; because the map $\sigma_W$ does not care about the representation of $\mathrm{span}(Y)$ we can perform the variations in $St(n,N)$. We write the variations as $Y(t)\in{}St(n,N)$ for $t\in(-\varepsilon,\varepsilon)$. We also set $Y(0) = Y$ and hence

      \[\frac{d}{dt}Y(t)(W^TY(t))^{-1} = (\dot{Y}(0) - Y(W^TY)^{-1}W^T\dot{Y}(0))(W^TY)^{-1},\]

where $\dot{Y}(0)\in{}T_YSt(n,N)$. Also note that we have $T_\mathcal{Y}\mathcal{U}_W = T_\mathcal{Y}Gr(n,N)$ because $\mathcal{U}_W$ is an open subset of $Gr(n,N)$. We thus can identify the tangent space $T_\mathcal{Y}Gr(n,N)$ with the following set:

\[T_{\hat{Y}}\mathcal{S}_W = \{(\Delta - Y(W^TY)^{-1}W^T\Delta)(W^TY)^{-1}: Y\in{}St(n,N)\text{ s.t. }\mathrm{span}(Y)=\mathcal{Y}\text{ and }\Delta\in{}T_YSt(n,N)\}.\]

      Further note that we can pick any element $W$ to construct the charts for a neighborhood around the point $\mathcal{Y}\in{}Gr(n,N)$ as long as we have $\mathrm{det}(W^TY)\neq0$ for $\mathrm{span}(Y)=\mathcal{Y}$. We hence take $W=Y$ and get the identification:

      \[T_\mathcal{Y}Gr(n,N) \equiv \{\Delta - YY^T\Delta: Y\in{}St(n,N)\text{ s.t. }\mathrm{span}(Y)=\mathcal{Y}\text{ and }\Delta\in{}T_YSt(n,N)\},\]

      which is very easy to handle computationally (we simply store and change the matrix $Y$ that represents an element of the Grassmann manifold). The Riemannian gradient is then

      \[\mathrm{grad}_\mathcal{Y}^{Gr}L = \mathrm{grad}_Y^{St}L - YY^T\mathrm{grad}_Y^{St}L = \nabla_Y{}L - YY^T\nabla_YL,\]

      where $\mathrm{grad}^{St}_YL$ is the Riemannian gradient of the Stiefel manifold at $Y$. We proved our assertion.

      Library Functions

      GeometricMachineLearning.StiefelManifoldType

      An implementation of the Stiefel manifold [7]. The Stiefel manifold is the collection of all matrices $Y\in\mathbb{R}^{N\times{}n}$ whose columns are orthonormal, i.e.

      \[ St(n, N) = \{Y: Y^TY = \mathbb{I}_n \}.\]

The Stiefel manifold can be shown to have manifold structure (as the name suggests) and this is heavily used in GeometricMachineLearning. It is further a compact space. More information can be found in the docstrings for rgrad(::StiefelManifold, ::AbstractMatrix) and metric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix).

      source
      Base.randMethod
      rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold

      Draw random elements from the Stiefel and the Grassmann manifold.

      Because both of these manifolds are compact spaces we can sample them uniformly [8].

      Examples

      When we call ...

      using GeometricMachineLearning
      + -0.157512    0.185112  -0.216194
      +  0.0595288  -0.523402  -0.567974
      + -0.379467    0.669637  -0.545434
      +  0.908048    0.323416  -0.259806
      + -0.0556817  -0.372523  -0.51543

      The Riemannian Gradient of the Grassmann Manifold

      Obtaining the Riemannian Gradient for the Grassmann manifold is slightly more difficult than it is in the case of the Stiefel manifold [6]. Since the Grassmann manifold can be obtained from the Stiefel manifold through an equivalence relation, we can however use this as a starting point.

      Theorem

      The Riemannian gradient of a function $L$ defined on the Grassmann manifold can be written as

      \[\mathrm{grad}_\mathcal{Y}^{Gr}L \simeq \nabla_Y{}L - YY^T\nabla_YL,\]

where $\nabla_Y{}L$ is again the Euclidean gradient.

      Proof

      In a first step we identify charts on the Grassmann manifold to make dealing with it easier. For this consider the following open cover of the Grassmann manifold.

      \[\{\mathcal{U}_W\}_{W\in{}St(n, N)} \quad\text{where}\quad \mathcal{U}_W = \{\mathrm{span}(Y):\mathrm{det}(W^TY)\neq0\}.\]

      We can find a canonical bijective mapping from the set $\mathcal{U}_W$ to the set $\mathcal{S}_W := \{Y\in\mathbb{R}^{N\times{}n}:W^TY=\mathbb{I}_n\}$:

      \[\sigma_W: \mathcal{U}_W \to \mathcal{S}_W,\, \mathcal{Y}=\mathrm{span}(Y)\mapsto{}Y(W^TY)^{-1} =: \hat{Y}.\]

      That $\sigma_W$ is well-defined is easy to see: Consider $YC$ with $C\in\mathbb{R}^{n\times{}n}$ non-singular. Then $YC(W^TYC)^{-1}=Y(W^TY)^{-1} = \hat{Y}$. With this isomorphism we can also find a representation of elements of the tangent space:

      \[T_\mathcal{Y}\sigma_W: T_\mathcal{Y}Gr(n,N)\to{}T_{\hat{Y}}\mathcal{S}_W.\]

      We give an explicit representation of this isomorphism; because the map $\sigma_W$ does not care about the representation of $\mathrm{span}(Y)$ we can perform the variations in $St(n,N)$. We write the variations as $Y(t)\in{}St(n,N)$ for $t\in(-\varepsilon,\varepsilon)$. We also set $Y(0) = Y$ and hence

      \[\frac{d}{dt}Y(t)(W^TY(t))^{-1} = (\dot{Y}(0) - Y(W^TY)^{-1}W^T\dot{Y}(0))(W^TY)^{-1},\]

where $\dot{Y}(0)\in{}T_YSt(n,N)$. Also note that we have $T_\mathcal{Y}\mathcal{U}_W = T_\mathcal{Y}Gr(n,N)$ because $\mathcal{U}_W$ is an open subset of $Gr(n,N)$. We thus can identify the tangent space $T_\mathcal{Y}Gr(n,N)$ with the following set:

\[T_{\hat{Y}}\mathcal{S}_W = \{(\Delta - Y(W^TY)^{-1}W^T\Delta)(W^TY)^{-1}: Y\in{}St(n,N)\text{ s.t. }\mathrm{span}(Y)=\mathcal{Y}\text{ and }\Delta\in{}T_YSt(n,N)\}.\]

      Further note that we can pick any element $W$ to construct the charts for a neighborhood around the point $\mathcal{Y}\in{}Gr(n,N)$ as long as we have $\mathrm{det}(W^TY)\neq0$ for $\mathrm{span}(Y)=\mathcal{Y}$. We hence take $W=Y$ and get the identification:

      \[T_\mathcal{Y}Gr(n,N) \equiv \{\Delta - YY^T\Delta: Y\in{}St(n,N)\text{ s.t. }\mathrm{span}(Y)=\mathcal{Y}\text{ and }\Delta\in{}T_YSt(n,N)\},\]

      which is very easy to handle computationally (we simply store and change the matrix $Y$ that represents an element of the Grassmann manifold). The Riemannian gradient is then

      \[\mathrm{grad}_\mathcal{Y}^{Gr}L = \mathrm{grad}_Y^{St}L - YY^T\mathrm{grad}_Y^{St}L = \nabla_Y{}L - YY^T\nabla_YL,\]

      where $\mathrm{grad}^{St}_YL$ is the Riemannian gradient of the Stiefel manifold at $Y$. We proved our assertion.

      Library Functions

      GeometricMachineLearning.StiefelManifoldType

      An implementation of the Stiefel manifold [7]. The Stiefel manifold is the collection of all matrices $Y\in\mathbb{R}^{N\times{}n}$ whose columns are orthonormal, i.e.

      \[ St(n, N) = \{Y: Y^TY = \mathbb{I}_n \}.\]

The Stiefel manifold can be shown to have manifold structure (as the name suggests) and this is heavily used in GeometricMachineLearning. It is further a compact space. More information can be found in the docstrings for rgrad(::StiefelManifold, ::AbstractMatrix) and metric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix).

      source
      Base.randMethod
      rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold

      Draw random elements from the Stiefel and the Grassmann manifold.

      Because both of these manifolds are compact spaces we can sample them uniformly [8].

      Examples

      When we call ...

      using GeometricMachineLearning
      +using GeometricMachineLearning: _round # hide
       import Random
       Random.seed!(123)
       
       N, n = 5, 3
      -rand(StiefelManifold{Float32}, N, n)
      +Y = rand(StiefelManifold{Float32}, N, n)
      +_round(Y; digits = 5) # hide
       
       # output
       
       5×3 StiefelManifold{Float32, Matrix{Float32}}:
      - -0.275746    0.329913   0.772753
      - -0.624851   -0.332242  -0.0685991
      - -0.693326    0.36724   -0.189882
      - -0.0929493  -0.731446   0.460639
      -  0.210203    0.333008   0.387173

... the sampling is done by first allocating a random matrix of size $N\times{}n$ via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output then consists of the first n columns of the Q matrix.

      source
      GeometricMachineLearning.rgradMethod

Computes the Riemannian gradient for the Stiefel manifold given an element $Y\in{}St(n,N)$ and a matrix $\nabla{}L\in\mathbb{R}^{N\times{}n}$ (the Euclidean gradient). It computes the Riemannian gradient with respect to the canonical metric (see the documentation for the function metric for an explanation of this). The precise form of the mapping is:

      \[\mathtt{rgrad}(Y, \nabla{}L) \mapsto \nabla{}L - Y(\nabla{}L)^TY\]

      It is called with inputs:

      • Y::StiefelManifold
      • e_grad::AbstractMatrix: i.e. the Euclidean gradient (what was called $\nabla{}L$) above.
      source
      GeometricMachineLearning.metricMethod

      Implements the canonical Riemannian metric for the Stiefel manifold:

      \[g_Y: (\Delta_1, \Delta_2) \mapsto \mathrm{tr}(\Delta_1^T(\mathbb{I} - \frac{1}{2}YY^T)\Delta_2).\]

      It is called with:

      • Y::StiefelManifold
      • Δ₁::AbstractMatrix
      • Δ₂::AbstractMatrix
      source
      GeometricMachineLearning.ΩMethod

      Implements the canonical horizontal lift for the Stiefel manifold:

      \[ (\mathbb{I} - \frac{1}{2}YY^T)\Delta{}Y^T - Y\Delta^T(\mathbb{I} - \frac{1}{2}YY^T).\]

      Internally this performs

      SkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')

      to save memory.

      source

      References

      [6]
      P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).
      [41]
      T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).
      [42]
      T. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).
      • 1Recall that a Lie group is a manifold that also has group structure. We say that a Lie group $G$ acts on a manifold $\mathcal{M}$ if there is a map $G\times\mathcal{M} \to \mathcal{M}$ such that $(ab)x = a(bx)$ for $a,b\in{}G$ and $x\in\mathcal{M}$. For us the Lie algebra belonging to a Lie group, denoted by $\mathfrak{g}$, is the tangent space to the identity element $T_\mathbb{I}G$.
+ -0.27575   0.32991   0.77275
+ -0.62485  -0.33224  -0.0686
+ -0.69333   0.36724  -0.18988
+ -0.09295  -0.73145   0.46064
+  0.2102    0.33301   0.38717

... the sampling is done by first allocating a random matrix of size $N\times{}n$ via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output then consists of the first n columns of the Q matrix.

      source
      GeometricMachineLearning.rgradMethod
      rgrad(Y::StiefelManifold, e_grad::AbstractMatrix)

Compute the Riemannian gradient for the Stiefel manifold at $Y\in{}St(n,N)$ based on $\nabla{}L\in\mathbb{R}^{N\times{}n}$ (the Euclidean gradient).

      The function computes the Riemannian gradient with respect to the canonical metric.

      The precise form of the mapping is:

      \[\mathtt{rgrad}(Y, \nabla{}L) \mapsto \nabla{}L - Y(\nabla{}L)^TY\]

      Note the property $Y^T\mathrm{rgrad}(Y, \nabla{}L)\in\mathcal{S}_\mathrm{skew}(n).$

      Examples

      using GeometricMachineLearning
      +
      +Y = StiefelManifold([1 0 ; 0 1 ; 0 0; 0 0])
      +Δ = [1 2; 3 4; 5 6; 7 8]
      +rgrad(Y, Δ)
      +
      +# output
      +
      +4×2 Matrix{Int64}:
      + 0  -1
      + 1   0
      + 5   6
      + 7   8
      source
      GeometricMachineLearning.rgradMethod
      rgrad(Y::GrassmannManifold, e_grad::AbstractMatrix)

      Compute the Riemannian gradient at $Y\in{}Gr(n, N)$.

These gradients have the property that they are orthogonal to the space spanned by $Y$.

      The precise form of the mapping is:

      \[\mathtt{rgrad}(Y, \nabla{}L) \mapsto \nabla{}L - YY^T\nabla{}L\]

      Note the property $Y^T\mathrm{rgrad}(Y, \nabla{}L) = \mathbb{O}.$

      Also see rgrad(::StiefelManifold, ::AbstractMatrix).

      Examples

      using GeometricMachineLearning
      +
      +Y = GrassmannManifold([1 0 ; 0 1 ; 0 0; 0 0])
      +Δ = [1 2; 3 4; 5 6; 7 8]
      +rgrad(Y, Δ)
      +
      +# output
      +
      +4×2 Matrix{Int64}:
      + 0  0
      + 0  0
      + 5  6
      + 7  8
      source
      GeometricMachineLearning.metricMethod

      Implements the canonical Riemannian metric for the Stiefel manifold:

      \[g_Y: (\Delta_1, \Delta_2) \mapsto \mathrm{tr}(\Delta_1^T(\mathbb{I} - \frac{1}{2}YY^T)\Delta_2).\]

      It is called with:

      • Y::StiefelManifold
      • Δ₁::AbstractMatrix
      • Δ₂::AbstractMatrix
      source
      GeometricMachineLearning.metricMethod
      metric(Y::GrassmannManifold, Δ₁::AbstractMatrix, Δ₂::AbstractMatrix)

      Compute the metric for vectors Δ₁ and Δ₂ at Y.

      The representation of the Grassmann manifold is realized as a quotient space of the Stiefel manifold.

      The metric for the Grassmann manifold is:

      \[g^{Gr}_Y(\Delta_1, \Delta_2) = g^{St}_Y(\Delta_1, \Delta_2) = \mathrm{Tr}(\Delta_1^T (\mathbb{I} - Y Y^T) \Delta_2) = \mathrm{Tr}(\Delta_1^T \Delta_2).\]

      source
      GeometricMachineLearning.ΩMethod
      Ω(Y::StiefelManifold{T}, Δ::AbstractMatrix{T}) where T

      Perform canonical horizontal lift for the Stiefel manifold:

      \[ \Delta \mapsto (\mathbb{I} - \frac{1}{2}YY^T)\Delta{}Y^T - Y\Delta^T(\mathbb{I} - \frac{1}{2}YY^T).\]

      Internally this performs

      SkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')

      to save memory.

      Examples

      using GeometricMachineLearning
      +E = StiefelManifold(StiefelProjection(5, 2))
      +Δ = [0. -1.; 1. 0.; 2. 3.; 4. 5.; 6. 7.]
      +GeometricMachineLearning.Ω(E, Δ)
      +
      +# output
      +
      +5×5 SkewSymMatrix{Float64, Vector{Float64}}:
      + 0.0  -1.0  -2.0  -4.0  -6.0
      + 1.0   0.0  -3.0  -5.0  -7.0
      + 2.0   3.0   0.0  -0.0  -0.0
      + 4.0   5.0   0.0   0.0  -0.0
      + 6.0   7.0   0.0   0.0   0.0

      Note that the output of Ω is a skew-symmetric matrix, i.e. an element of $\mathfrak{g}$.

      source
      GeometricMachineLearning.ΩMethod
      Ω(Y::GrassmannManifold{T}, Δ::AbstractMatrix{T}) where T

      Perform the canonical horizontal lift for the Grassmann manifold:

      \[ \Delta \mapsto \Omega^{St}(Y, Δ),\]

      where $\Omega^{St}$ is the canonical horizontal lift for the Stiefel manifold.

      using GeometricMachineLearning
      +E = GrassmannManifold(StiefelProjection(5, 2))
      +Δ = [0. 0.; 0. 0.; 2. 3.; 4. 5.; 6. 7.]
      +GeometricMachineLearning.Ω(E, Δ)
      +
      +# output
      +
      +5×5 SkewSymMatrix{Float64, Vector{Float64}}:
      + 0.0  -0.0  -2.0  -4.0  -6.0
      + 0.0   0.0  -3.0  -5.0  -7.0
      + 2.0   3.0   0.0  -0.0  -0.0
      + 4.0   5.0   0.0   0.0  -0.0
      + 6.0   7.0   0.0   0.0   0.0
      source

      References

      [6]
      P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).
      [43]
      T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).
      [44]
      T. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).
      • 1Recall that a Lie group is a manifold that also has group structure. We say that a Lie group $G$ acts on a manifold $\mathcal{M}$ if there is a map $G\times\mathcal{M} \to \mathcal{M}$ such that $(ab)x = a(bx)$ for $a,b\in{}G$ and $x\in\mathcal{M}$. For us the Lie algebra belonging to a Lie group, denoted by $\mathfrak{g}$, is the tangent space to the identity element $T_\mathbb{I}G$.
      diff --git a/latest/manifolds/inverse_function_theorem/index.html b/latest/manifolds/inverse_function_theorem/index.html index 0386b5f7c..388e911b0 100644 --- a/latest/manifolds/inverse_function_theorem/index.html +++ b/latest/manifolds/inverse_function_theorem/index.html @@ -1,5 +1,5 @@ -Foundations of Differential Manifolds · GeometricMachineLearning.jl

      Foundational Theorem for Differential Manifolds

Here we state and prove all the theorems necessary to define differentiable manifolds. All these theorems (including proofs) can be found in e.g. [2].

      The Fixed-Point Theorem

      The fixed-point theorem will be used in the proof of the inverse function theorem below and the existence-and-uniqueness theorem.

      Theorem (Banach Fixed-Point Theorem)

      A function $f:U \to U$ defined on an open subset $U$ of a complete metric vector space $\mathcal{V} \supset U$ that is contractive, i.e. $|f(z) - f(y)| \leq q|z - y|$ with $q < 1$, has a unique fixed point $y^*$ such that $f(y^*) = y^*$. Further $y^*$ can be found by taking any $y\in{}U$ through $y^* = \lim_{m\to\infty}f^m(y)$.

      Proof

Fix a point $y\in{}U$. We prove that the sequence $(f^m(y))_{m\in\mathbb{N}}$ is Cauchy and because $\mathcal{V}$ is a complete metric space, the limit of this sequence exists. Take $\tilde{m} > m$ and we have

      \[\begin{aligned} +Foundations of Differential Manifolds · GeometricMachineLearning.jl

      Foundational Theorem for Differential Manifolds

Here we state and prove all the theorems necessary to define differentiable manifolds. All these theorems (including proofs) can be found in e.g. [2].

      The Fixed-Point Theorem

      The fixed-point theorem will be used in the proof of the inverse function theorem below and the existence-and-uniqueness theorem.

      Theorem (Banach Fixed-Point Theorem)

      A function $f:U \to U$ defined on an open subset $U$ of a complete metric vector space $\mathcal{V} \supset U$ that is contractive, i.e. $|f(z) - f(y)| \leq q|z - y|$ with $q < 1$, has a unique fixed point $y^*$ such that $f(y^*) = y^*$. Further $y^*$ can be found by taking any $y\in{}U$ through $y^* = \lim_{m\to\infty}f^m(y)$.

      Proof

Fix a point $y\in{}U$. We prove that the sequence $(f^m(y))_{m\in\mathbb{N}}$ is Cauchy and because $\mathcal{V}$ is a complete metric space, the limit of this sequence exists. Take $\tilde{m} > m$ and we have

      \[\begin{aligned} |f^{\tilde{m}}(y) - f^m(y)| & \leq \sum_{i = m}^{\tilde{m} - 1}|f^{i+1}(y) - f^{i}(y)| \\ & \leq \sum_{i = m}^{\tilde{m} - 1}q^i|f(y) - y| \\ & \leq \sum_{i = m}^\infty{}q^i|f(y) - y| = (f(y) - y)\left( \frac{q}{1 - q} - \sum_{i = 1}^{m-1}q^i \right)\\ @@ -8,4 +8,4 @@ |H(\xi+\eta) - H(\xi) - F'(z)^{-1}\eta| & = |h - F'(x)^{-1}\xi| = |h - F'(z)^{-1}(F(z + h) - \xi)| \\ & \leq ||F'(z)^{-1}||\cdot|F'(z)h - F(z + h) + \xi| \\ & \leq ||F'(z)^{-1}||\cdot|h|\cdot\left| F'(z)\frac{h}{|h|} - \frac{F(z + h) - \xi}{|h|} \right|, -\end{aligned}\]

and the rightmost expression is bounded because of the mean value theorem: $|F(z + h) - F(z)| \leq \sup_{0<t<1}|h| \cdot ||F'(z + th)||$.

      The Implicit Function Theorem

      This theorem is a direct consequence of the inverse function theorem.

      Theorem (Implicit Function Theorem)

      Given a function $f:\mathbb{R}^{n+m}\to\mathbb{R}^n$ whose derivative at $x\in\mathbb{R}^{n+m}$ has full rank, we can find a map $h:U\to\mathbb{R}^{n+m}$ for a neighborhood $U\ni(f(x), x_{n+1}, \ldots, x_{n+m})$ such that $f\circ{}h$ is a projection onto the first factor, i.e. $f(h(x_1, \ldots, x_{n+m})) = (x_1, \ldots, x_n).$

      Proof

Consider the map $(x_1, \ldots, x_{n+m}) \mapsto (f(x), x_{n+1}, \ldots, x_{n+m})$. Its derivative is clearly of full rank if $f'(x)$ is of full rank, so the map is invertible in a neighborhood of $(f(x), x_{n+1}, \ldots, x_{n+m})$. We call this inverse map $h$. We then see that $f\circ{}h$ is a projection.

The implicit function theorem will be used to prove the preimage theorem, which we use as a basis to construct all the manifolds in GeometricMachineLearning.

      References

      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      +\end{aligned}\]

and the rightmost expression is bounded because of the mean value theorem: $|F(z + h) - F(z)| \leq \sup_{0<t<1}|h| \cdot ||F'(z + th)||$.

      The Implicit Function Theorem

      This theorem is a direct consequence of the inverse function theorem.

      Theorem (Implicit Function Theorem)

      Given a function $f:\mathbb{R}^{n+m}\to\mathbb{R}^n$ whose derivative at $x\in\mathbb{R}^{n+m}$ has full rank, we can find a map $h:U\to\mathbb{R}^{n+m}$ for a neighborhood $U\ni(f(x), x_{n+1}, \ldots, x_{n+m})$ such that $f\circ{}h$ is a projection onto the first factor, i.e. $f(h(x_1, \ldots, x_{n+m})) = (x_1, \ldots, x_n).$

      Proof

Consider the map $(x_1, \ldots, x_{n+m}) \mapsto (f(x), x_{n+1}, \ldots, x_{n+m})$. Its derivative is clearly of full rank if $f'(x)$ is of full rank, so the map is invertible in a neighborhood of $(f(x), x_{n+1}, \ldots, x_{n+m})$. We call this inverse map $h$. We then see that $f\circ{}h$ is a projection.
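As a simple illustration consider $f:\mathbb{R}^2\to\mathbb{R},\, f(x_1, x_2) = x_1 + x_2^2$, whose derivative $(1, 2x_2)$ has full rank everywhere. The map $h(x_1, x_2) = (x_1 - x_2^2, x_2)$ then satisfies $f(h(x_1, x_2)) = x_1$, i.e. $f\circ{}h$ is a projection onto the first factor.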

The implicit function theorem will be used to prove the preimage theorem, which we use as a basis to construct all the manifolds in GeometricMachineLearning.

      References

      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      diff --git a/latest/manifolds/manifolds/index.html b/latest/manifolds/manifolds/index.html index 879e2ccbe..2ad04e6b7 100644 --- a/latest/manifolds/manifolds/index.html +++ b/latest/manifolds/manifolds/index.html @@ -1,3 +1,3 @@ -General Theory on Manifolds · GeometricMachineLearning.jl

      (Matrix) Manifolds

Manifolds are topological spaces that locally look like vector spaces. In the following we restrict ourselves to finite-dimensional smooth[1] manifolds. In this section we routinely denote points on a manifold by lower case letters like $x, y$ and $z$ if we speak about general properties and by upper case letters like $A$ and $B$ if we talk about specific examples of matrix manifolds. All manifolds that can be used to build neural networks in GeometricMachineLearning, such as the Stiefel manifold and the Grassmann manifold, are matrix manifolds.

      Definition

      A finite-dimensional smooth manifold of dimension $n$ is a second-countable Hausdorff space $\mathcal{M}$ for which $\forall{}x\in\mathcal{M}$ we can find a neighborhood $U$ that contains $x$ and a corresponding homeomorphism $\varphi_U:U\cong{}W\subset\mathbb{R}^n$ where $W$ is an open subset. The homeomorphisms $\varphi_U$ are referred to as coordinate charts. If two such coordinate charts overlap, i.e. if $U_1\cap{}U_2\neq\{\}$, then the map $\varphi_{U_2}^{-1}\circ\varphi_{U_1}$ has to be $C^\infty$.

One example of a manifold that is also important for GeometricMachineLearning is the Lie group[2] of orthonormal matrices $SO(N)$. Before we can prove that $SO(N)$ is a manifold we first need the preimage theorem.

      The Preimage Theorem

      Before we can state the preimage theorem we need another definition[3]:

      Definition

Consider a smooth mapping $g: \mathcal{M}\to\mathcal{N}$ from one manifold to another. A point $y\in\mathcal{N}$ is called a regular value of $g$ if $\forall{}x\in{}g^{-1}\{y\}$ the map $T_xg:T_x\mathcal{M}\to{}T_{y}\mathcal{N}$ is surjective.

      We now state the preimage theorem:

      Theorem (Preimage Theorem)

Consider a smooth map $g:\mathcal{M}\to\mathcal{N}$ from one manifold to another (we assume the dimensions of the two manifolds to be $m+n$ and $m$ respectively). Then the preimage of a regular value $y\in\mathcal{N}$ of $g$ is a submanifold of $\mathcal{M}$. Furthermore the codimension of $g^{-1}\{y\}$ is equal to the dimension of $\mathcal{N}$ and the tangent space $T_x(g^{-1}\{y\})$ is equal to the kernel of $T_xg$.

      Proof

Because $\mathcal{N}$ has manifold structure we can find a chart $\varphi_U:U\to\mathbb{R}^m$ for some neighborhood $U$ that contains $y$. We further consider a point $A\in{}g^{-1}\{y\}$ and a chart around it $\psi_V:V\to\mathbb{R}^{m+n}$. By the implicit function theorem we can then find a mapping $h$ that turns $\varphi_U\circ{}g\circ\psi_V^{-1}$ into a projection $(x_1, \ldots, x_{n+m}) \mapsto (x_{n+1}, \ldots, x_{n+m})$. We now consider the neighborhood $V_1\times\{0\} = \psi_V(V \cap g^{-1}\{y\})$ for $\psi_V(V) = V_1\times{}V_2$ with the parametrization $(x_1, \ldots, x_n) \mapsto \psi_V^{-1}(x_1, \ldots, x_n, 0, \ldots, 0).$ As this map is also smooth by the implicit function theorem, this proves our assertion.

      Example

      The group $SO(N)$ is a Lie group (i.e. has manifold structure).

      Proof

The vector space $\mathbb{R}^{N\times{}N}$ clearly has manifold structure. The group $SO(N)$ is a level set of the mapping $g:\mathbb{R}^{N\times{}N}\to\mathcal{S}(N), A\mapsto{}A^TA - \mathbb{I}$; more precisely it is the connected component of $g^{-1}\{\mathbb{O}\}$ that contains $\mathbb{I}$. We still need to prove that $\mathbb{O}$ is a regular value of $g$, i.e. that for $A\in{}SO(N)$ the mapping $T_Ag$ is surjective. This means that for every $B\in\mathcal{S}(N)$ and $A\in{}SO(N)$ there exists a $C\in\mathbb{R}^{N\times{}N}$ s.t. $C^TA + A^TC = B$. The element $C=\frac{1}{2}AB\in\mathbb{R}^{N\times{}N}$ satisfies this property.

Similarly we can also prove:

      Example

      The sphere $S^n:=\{x\in\mathbb{R}^{n+1}: x^Tx = 1\}$ is a manifold of dimension $n$.

      Proof

Take $g(x) = x^Tx - 1$ and proceed as in the case of $SO(N)$.

Note that both these manifolds, $SO(N)$ and $S^n$, are matrix manifolds, i.e. an element of $\mathcal{M}$ can be written as an element of $\mathbb{R}^{N\times{}N}$ in the first case and $\mathbb{R}^{(n+1)\times{}1}$ in the second case. The additional conditions we impose on these manifolds are $A^TA = \mathbb{I}$ in the first case and $x^Tx = 1$ in the second case. Both of these manifolds belong to the category of Stiefel manifolds.

      Tangent Spaces

      A tangent space can be seen as the collection of all possible velocities a curve can take at a point on a manifold. For this consider a manifold $\mathcal{M}$ and a point $x$ on it and the collection of $C^\infty$ curves through $x$:

      Definition

      A mapping $\gamma:(-\epsilon, \epsilon)\to\mathcal{M}$ that is $C^\infty$ and for which we have $\gamma(0) = x$ is called a $C^\infty$ curve through $x$.

      The tangent space of $\mathcal{M}$ at $x$ is the collection of the first derivatives of all $\gamma$:

      Definition

The tangent space of $\mathcal{M}$ at $x$ is the collection of all $C^\infty$ curves through $x$ modulo the equivalence relation $\gamma_1 \sim \gamma_2 \iff \gamma_1'(0) = \gamma_2'(0)$. It is denoted by $T_x\mathcal{M}$.

As is customary we write $[\gamma]$ for the equivalence class of $\gamma$ and this is by definition equivalent to $\gamma'(0)$. The tangent space $T_x\mathcal{M}$ can be shown to be homeomorphic[4] to $\mathbb{R}^n$ where $n$ is the dimension of the manifold $\mathcal{M}$. If the homeomorphism is constructed through the coordinate chart $(\varphi, U)$ we call it $\varphi'(x)$ or simply[5] $\varphi'$. If we are given a map $g:\mathcal{M}\to\mathcal{N}$ we further define $T_xg = (\varphi')^{-1}\circ(\varphi\circ{}g\circ\psi^{-1})'\circ{}\psi'$, i.e. a smooth map between two manifolds $\mathcal{M}$ and $\mathcal{N}$ induces a smooth map between the tangent spaces $T_x\mathcal{M}$ and $T_{g(x)}\mathcal{N}$.

      We want to demonstrate this principle of constructing the tangent space from curves through the example of $S^2$. We consider the following curves:

1. $\gamma_1(t) = \begin{pmatrix} 0 \\ \sin(t) \\ \cos(t) \end{pmatrix},$
2. $\gamma_2(t) = \begin{pmatrix} \sin(t) \\ 0 \\ \cos(t) \end{pmatrix},$
3. $\gamma_3(t) = \begin{pmatrix} \exp(-t^2/2)\,t\,\sin(t) \\ \exp(-t^2/2)\,t\,\cos(t) \\ \sqrt{1 - t^2\exp(-t^2)} \end{pmatrix}.$

      We now plot the manifold $S^2$, the three curves described above and the associated tangent vectors (visualized as arrows). Note that the tangent vectors induced by $\gamma_1$ and $\gamma_3$ are the same; for these curves we have $\gamma_1 \sim \gamma_3$ and the tangent vectors of those two curves coincide:

      The tangent space $T_x\mathcal{M}$ for

      \[x = \begin{pmatrix}0 \\ 0 \\ 1 \end{pmatrix} -\]

      is also shown.
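The claim that $\gamma_1$ and $\gamma_3$ define the same tangent vector can also be checked numerically with a simple finite-difference sketch (plain Julia, purely illustrative):

γ₁(t) = [0., sin(t), cos(t)]
γ₃(t) = [exp(-t^2 / 2) * t * sin(t), exp(-t^2 / 2) * t * cos(t), sqrt(1 - t^2 * exp(-t^2))]
h = 1e-6
v₁ = (γ₁(h) - γ₁(-h)) / (2h)    # central difference ≈ γ₁'(0) = (0, 1, 0)
v₃ = (γ₃(h) - γ₃(-h)) / (2h)    # central difference ≈ γ₃'(0)
v₁ ≈ v₃                          # both curves define the same tangent vector at x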

      Vector Fields

      A time-independent vector field[6] is an object that specifies a velocity for every point on a domain. We first give the definition of a vector field on the vector space $\mathbb{R}^n$ and limit ourselves here to $C^\infty$ vector fields:

      Definition

      A vector field on $\mathbb{R}^n$ is a smooth map $X:\mathbb{R}^n\to\mathbb{R}^n$.

      The definition of a vector field on a manifold is not much more complicated:

      Definition

      A vector field on $\mathcal{M}$ is a map $X$ defined on $\mathcal{M}$ such that $X(x)\in{}T_x\mathcal{M}$ and $\varphi'\circ{}X\circ(\varphi)^{-1}$ is smooth for any coordinate chart $(\varphi, U)$ that contains $x$.

      In the section on the existence-and-uniqueness theorem we show that every vector field has a unique solution given an initial condition; i.e. given a point $x\in\mathcal{M}$ and a vector field $X$ we can find a curve $\gamma$ such that $\gamma(0) = x$ and $\gamma'(t) = X(\gamma(t))$ for all $t$ in some interval $(-\epsilon, \epsilon)$.

      The Tangent Bundle

      To each manifold $\mathcal{M}$ we can associate another manifold which we call the tangent bundle and denote by $T\mathcal{M}$. The points on this manifold are:

      \[T\mathcal{M} = \{ (x, v_x): x\in\mathcal{M},\, v_x\in{}T_x\mathcal{M} \}.\]

Coordinate charts on this manifold can be constructed in a straightforward manner; for every coordinate chart $\varphi_U$ the map $\varphi_U'(x)$ gives a homeomorphism between $T_x\mathcal{M}$ and $\mathbb{R}^n$ for any $x\in{}U$. We can then find a neighborhood of any point $(x, v_x)$ by taking $\pi^{-1}(U) = \{(x, v_x): x\in{}U, v_x\in{}T_x\mathcal{M}\}$, where $\pi:T\mathcal{M}\to\mathcal{M},\, (x, v_x)\mapsto{}x$ is the canonical projection, and this neighborhood is isomorphic to $\mathbb{R}^{2n}$ via $(x, v_x) \mapsto (\varphi_U(x), \varphi'(x)v_x)$. The geodesic spray is an important vector field defined on $T\mathcal{M}$.

      Library Functions

      References

      [16]
      P.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).
      • 1Smooth here means $C^\infty$.
• 2Lie groups are manifolds that also have a group structure, i.e. there is an operation $\mathcal{M}\times\mathcal{M}\to\mathcal{M},(a,b)\mapsto{}ab$ s.t. $(ab)c = a(bc)$ and there exists a neutral element $e\in\mathcal{M}$ s.t. $ae = a$ $\forall{}a\in\mathcal{M}$ as well as (for every $a$) an inverse element $a^{-1}$ s.t. $a(a^{-1}) = e$. The neutral element $e$ we refer to as $\mathbb{I}$ when dealing with matrix manifolds.
• 3In this definition we use the notation $T_xg$. This will be explained below. For now we will interpret $T_xg$ simply as $(\varphi_U\circ{}g\circ\psi_V^{-1})'$ where $\varphi_U$ is a coordinate chart around $y = g(x)$ and $\psi_V$ is a coordinate chart around $x$.
• 4Note that we have not formally defined addition for $T_x\mathcal{M}$. This can be done through the definition $[\gamma] + [\beta] = [\alpha]$ where $\alpha$ is any $C^\infty$ curve through $x$ that satisfies $\alpha'(0) = \beta'(0) + \gamma'(0)$. Note that we can always find such an $\alpha$ by the existence and uniqueness theorem.
      • 5We will further discuss this when we introduce the tangent bundle.
      • 6Also called ordinary differential equation (ODE).
      +General Theory on Manifolds · GeometricMachineLearning.jl

      (Matrix) Manifolds

Manifolds are topological spaces that locally look like vector spaces. In the following we restrict ourselves to finite-dimensional smooth[1] manifolds. In this section we routinely denote points on a manifold by lower case letters like $x, y$ and $z$ if we speak about general properties and by upper case letters like $A$ and $B$ if we talk about specific examples of matrix manifolds. All manifolds that can be used to build neural networks in GeometricMachineLearning, such as the Stiefel manifold and the Grassmann manifold, are matrix manifolds.

      Definition

      A finite-dimensional smooth manifold of dimension $n$ is a second-countable Hausdorff space $\mathcal{M}$ for which $\forall{}x\in\mathcal{M}$ we can find a neighborhood $U$ that contains $x$ and a corresponding homeomorphism $\varphi_U:U\cong{}W\subset\mathbb{R}^n$ where $W$ is an open subset. The homeomorphisms $\varphi_U$ are referred to as coordinate charts. If two such coordinate charts overlap, i.e. if $U_1\cap{}U_2\neq\{\}$, then the map $\varphi_{U_2}^{-1}\circ\varphi_{U_1}$ has to be $C^\infty$.

One example of a manifold that is also important for GeometricMachineLearning is the Lie group[2] of orthonormal matrices $SO(N)$. Before we can prove that $SO(N)$ is a manifold we first need the preimage theorem.

      The Preimage Theorem

      Before we can state the preimage theorem we need another definition[3]:

      Definition

Consider a smooth mapping $g: \mathcal{M}\to\mathcal{N}$ from one manifold to another. A point $y\in\mathcal{N}$ is called a regular value of $g$ if $\forall{}x\in{}g^{-1}\{y\}$ the map $T_xg:T_x\mathcal{M}\to{}T_{y}\mathcal{N}$ is surjective.

      We now state the preimage theorem:

      Theorem (Preimage Theorem)

Consider a smooth map $g:\mathcal{M}\to\mathcal{N}$ from one manifold to another (we assume the dimensions of the two manifolds to be $m+n$ and $m$ respectively). Then the preimage of a regular value $y\in\mathcal{N}$ is a submanifold of $\mathcal{M}$. Furthermore the codimension of $g^{-1}\{y\}$ is equal to the dimension of $\mathcal{N}$ and the tangent space $T_x(g^{-1}\{y\})$ is equal to the kernel of $T_xg$.

      Proof

Because $\mathcal{N}$ has manifold structure we can find a chart $\varphi_U:U\to\mathbb{R}^m$ for some neighborhood $U$ that contains $y$. We further consider a point $A\in{}g^{-1}\{y\}$ and a chart around it $\psi_V:V\to\mathbb{R}^{m+n}$. By the implicit function theorem we can then find a mapping $h$ that turns $\varphi_U\circ{}g\circ\psi_V^{-1}$ into a projection $(x_1, \ldots, x_{n+m}) \mapsto (x_{n+1}, \ldots, x_{n+m})$. We now consider the neighborhood $V_1\times\{0\} = \psi_V(V \cap g^{-1}\{y\})$ for $\psi_V(V) = V_1\times{}V_2$ with the coordinate chart $(x_1, \ldots, x_n) \mapsto \psi_V(x_1, \ldots, x_n, 0, \ldots, 0).$ As this map is also smooth by the implicit function theorem this proves our assertion.

      Example

      The group $SO(N)$ is a Lie group (i.e. has manifold structure).

      Proof

The vector space $\mathbb{R}^{N\times{}N}$ clearly has manifold structure. The group $SO(N)$ is equivalent to one of the level sets of the mapping $g:\mathbb{R}^{N\times{}N}\to\mathcal{S}(N), A\mapsto{}A^TA$, i.e. it is the component of $g^{-1}\{\mathbb{I}\}$ that contains $\mathbb{I}$. We still need to prove that $\mathbb{I}$ is a regular value of $g$, i.e. that for $A\in{}SO(N)$ the mapping $T_Ag$ is surjective. This means that $\forall{}B\in\mathcal{S}(N)$ and $A\in{}SO(N)$ $\exists{}C\in\mathbb{R}^{N\times{}N}$ s.t. $C^TA + A^TC = B$. The element $C=\frac{1}{2}AB\in\mathbb{R}^{N\times{}N}$ satisfies this property.
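This argument can be checked numerically with a small sketch (plain Julia; only the LinearAlgebra standard library is used and $N = 4$ is an arbitrary choice):

using LinearAlgebra

N = 4
A = Matrix(qr(randn(N, N)).Q)   # a random orthogonal matrix, so A'A = I
B = Symmetric(randn(N, N))      # an arbitrary symmetric matrix
C = A * B / 2                   # the candidate preimage C = AB/2
C' * A + A' * C ≈ B             # true: T_Ag applied to C recovers B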

Similarly we can also prove:

      Example

      The sphere $S^n:=\{x\in\mathbb{R}^{n+1}: x^Tx = 1\}$ is a manifold of dimension $n$.

      Proof

Take $g(x) = x^Tx - 1$ and proceed as in the case of $SO(N)$.

Note that both these manifolds, $SO(N)$ and $S^n$, are matrix manifolds, i.e. an element of $\mathcal{M}$ can be written as an element of $\mathbb{R}^{N\times{}N}$ in the first case and $\mathbb{R}^{(n+1)\times{}1}$ in the second case. The additional conditions we impose on these manifolds are $A^TA = \mathbb{I}$ in the first case and $x^Tx = 1$ in the second case. Both of these manifolds belong to the category of Stiefel manifolds.

      Tangent Spaces

      A tangent space can be seen as the collection of all possible velocities a curve can take at a point on a manifold. For this consider a manifold $\mathcal{M}$ and a point $x$ on it and the collection of $C^\infty$ curves through $x$:

      Definition

      A mapping $\gamma:(-\epsilon, \epsilon)\to\mathcal{M}$ that is $C^\infty$ and for which we have $\gamma(0) = x$ is called a $C^\infty$ curve through $x$.

      The tangent space of $\mathcal{M}$ at $x$ is the collection of the first derivatives of all $\gamma$:

      Definition

The tangent space of $\mathcal{M}$ at $x$ is the collection of all $C^\infty$ curves at $x$ modulo the equivalence relation $\gamma_1 \sim \gamma_2 \iff \gamma_1'(0) = \gamma_2'(0)$. It is denoted by $T_x\mathcal{M}$.

As is customary we write $[\gamma]$ for the equivalence class of $\gamma$ and this is by definition equivalent to $\gamma'(0)$. The tangent space $T_x\mathcal{M}$ can be shown to be homeomorphic[4] to $\mathbb{R}^n$ where $n$ is the dimension of the manifold $\mathcal{M}$. If the homeomorphism is constructed through the coordinate chart $(\varphi, U)$ we call it $\varphi'(x)$ or simply[5] $\varphi'$. If we are given a map $g:\mathcal{M}\to\mathcal{N}$ we further define $T_xg = (\varphi')^{-1}\circ(\varphi\circ{}g\circ\psi^{-1})'\circ{}\psi'$, i.e. a smooth map between two manifolds $\mathcal{M}$ and $\mathcal{N}$ induces a smooth map between the tangent spaces $T_x\mathcal{M}$ and $T_{g(x)}\mathcal{N}$.

      We want to demonstrate this principle of constructing the tangent space from curves through the example of $S^2$. We consider the following curves:

1. $\gamma_1(t) = \begin{pmatrix} 0 \\ \sin(t) \\ \cos(t) \end{pmatrix},$
2. $\gamma_2(t) = \begin{pmatrix} \sin(t) \\ 0 \\ \cos(t) \end{pmatrix},$
3. $\gamma_3(t) = \begin{pmatrix} \exp(-t ^ 2 / 2) t \sin(t) \\ \exp(-t ^ 2 / 2) t \cos(t) \\ \sqrt{1 - t ^ 2 \exp(-t^2)} \end{pmatrix}.$

We now plot the manifold $S^2$, the three curves described above and the associated tangent vectors (visualized as arrows). Note that the tangent vectors induced by $\gamma_1$ and $\gamma_3$ coincide, i.e. for these curves we have $\gamma_1 \sim \gamma_3$:

      The tangent space $T_x\mathcal{M}$ for

\[x = \begin{pmatrix}0 \\ 0 \\ 1 \end{pmatrix}\]

      is also shown.
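The derivatives at $t = 0$ can also be checked with a simple finite-difference sketch (plain Julia, nothing package-specific is assumed):

γ₁(t) = [0, sin(t), cos(t)]
γ₂(t) = [sin(t), 0, cos(t)]
γ₃(t) = [exp(-t^2 / 2) * t * sin(t), exp(-t^2 / 2) * t * cos(t), sqrt(1 - t^2 * exp(-t^2))]

derivative(γ; h = 1e-6) = (γ(h) - γ(-h)) / (2h)   # central difference approximation of γ'(0)

derivative(γ₁) ≈ derivative(γ₃)   # true:  γ₁ ∼ γ₃, both give ≈ [0, 1, 0]
derivative(γ₁) ≈ derivative(γ₂)   # false: γ₂ gives ≈ [1, 0, 0]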

      Vector Fields

      A time-independent vector field[6] is an object that specifies a velocity for every point on a domain. We first give the definition of a vector field on the vector space $\mathbb{R}^n$ and limit ourselves here to $C^\infty$ vector fields:

      Definition

      A vector field on $\mathbb{R}^n$ is a smooth map $X:\mathbb{R}^n\to\mathbb{R}^n$.

      The definition of a vector field on a manifold is not much more complicated:

      Definition

      A vector field on $\mathcal{M}$ is a map $X$ defined on $\mathcal{M}$ such that $X(x)\in{}T_x\mathcal{M}$ and $\varphi'\circ{}X\circ(\varphi)^{-1}$ is smooth for any coordinate chart $(\varphi, U)$ that contains $x$.

In the section on the existence-and-uniqueness theorem we show that every vector field admits a unique integral curve through a given initial condition; i.e. given a point $x\in\mathcal{M}$ and a vector field $X$ we can find a unique curve $\gamma$ such that $\gamma(0) = x$ and $\gamma'(t) = X(\gamma(t))$ for all $t$ in some interval $(-\epsilon, \epsilon)$.
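As a small sketch of this statement (plain Julia; the rotational vector field on $\mathbb{R}^2$ serves as a placeholder example), an explicit Euler loop approximates the integral curve through $(1, 0)$, which is $t \mapsto (\cos(t), \sin(t))$:

X(x) = [-x[2], x[1]]    # a vector field on ℝ²
x = [1.0, 0.0]          # initial condition γ(0)
h = 1e-3                # time step
for _ in 1:1000         # integrate up to t = 1 with explicit Euler
    x .+= h * X(x)
end
x                       # ≈ [cos(1), sin(1)] up to the discretization error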

      The Tangent Bundle

      To each manifold $\mathcal{M}$ we can associate another manifold which we call the tangent bundle and denote by $T\mathcal{M}$. The points on this manifold are:

      \[T\mathcal{M} = \{ (x, v_x): x\in\mathcal{M},\, v_x\in{}T_x\mathcal{M} \}.\]

Coordinate charts on this manifold can be constructed in a straightforward manner; for every coordinate chart $\varphi_U$ the map $\varphi_U'(x)$ gives a homeomorphism between $T_x\mathcal{M}$ and $\mathbb{R}^n$ for any $x\in{}U$. We can then find a neighborhood of any point $(x, v_x)$ by taking $\pi^{-1}(U) = \{(x, v_x): x\in{}U, v_x\in{}T_x\mathcal{M}\}$ and this neighborhood is homeomorphic to $\mathbb{R}^{2n}$ via $(x, v_x) \mapsto (\varphi_U(x), \varphi_U'(x)v_x)$. The geodesic spray is an important vector field defined on $T\mathcal{M}$.

      Library Functions

      References

      [10]
      P.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).
      • 1Smooth here means $C^\infty$.
• 2Lie groups are manifolds that also have a group structure, i.e. there is an operation $\mathcal{M}\times\mathcal{M}\to\mathcal{M},(a,b)\mapsto{}ab$ s.t. $(ab)c = a(bc)$ and there exists a neutral element $e\in\mathcal{M}$ s.t. $ae = a$ $\forall{}a\in\mathcal{M}$, as well as (for every $a$) an inverse element $a^{-1}$ s.t. $a(a^{-1}) = e$. The neutral element $e$ we refer to as $\mathbb{I}$ when dealing with matrix manifolds.
• 3In this definition we use the notation $T_xg$. This will be explained below. For now we will interpret $T_xg$ simply as $(\varphi_U\circ{}g\circ\psi_V^{-1})'$ where $\varphi_U$ is a coordinate chart around $y = g(x)$ and $\psi_V$ is a coordinate chart around $x$.
• 4Note that we have not formally defined addition for $T_x\mathcal{M}$. This can be done through the definition $[\gamma] + [\beta] = [\alpha]$ where $\alpha$ is any $C^\infty$ curve through $x$ that satisfies $\alpha'(0) = \beta'(0) + \gamma'(0)$. Note that we can always find such an $\alpha$ by the existence and uniqueness theorem.
      • 5We will further discuss this when we introduce the tangent bundle.
      • 6Also called ordinary differential equation (ODE).
      diff --git a/latest/manifolds/metric_and_vector_spaces/index.html b/latest/manifolds/metric_and_vector_spaces/index.html index 7f741d361..4be0e1224 100644 --- a/latest/manifolds/metric_and_vector_spaces/index.html +++ b/latest/manifolds/metric_and_vector_spaces/index.html @@ -1,5 +1,5 @@ -Metric and Vector Spaces · GeometricMachineLearning.jl

      (Topological) Metric Spaces

A metric space is a certain kind of topological space whose topology is induced by a metric.

      Definition

      A metric on a topological space $\mathcal{M}$ is a mapping $d:\mathcal{M}\times\mathcal{M}\to\mathbb{R}$ such that the following three conditions hold:

1. $d(x, y) = 0 \iff x = y$ for every $x,y\in\mathcal{M}$, i.e. the distance between two points is zero if and only if they are the same,
      2. $d(x, y) = d(y, x)$,
      3. $d(x, z) \leq d(x, y) + d(y, z)$.

      The second condition is referred to as symmetry and the third condition is referred to as the triangle inequality.

      We give some examples of metric spaces that are relevant for us:

      Example

      The real line $\mathbb{R}$ with the metric defined by the absolute distance between two points: $d(x, y) = |y - x|$.

      Example

      The vector space $\mathbb{R}^n$ with the Euclidean distance $d_2(x, y) = \sqrt{\sum_{i=1}^n (x_i - y_i)^2}$.

      Example

      The space of continuous functions $\mathcal{C} = \{f:(-\epsilon, \epsilon)\to\mathbb{R}^n\}$ with the metric $d_\infty(f_1, f_2) = \mathrm{sup}_{t\in(-\epsilon, \epsilon)}|f_1(t) - f_2(t)|.$

      Proof

      We have to show the triangle inequality:

      \[\begin{aligned} +Metric and Vector Spaces · GeometricMachineLearning.jl

      (Topological) Metric Spaces

A metric space is a certain kind of topological space whose topology is induced by a metric.

      Definition

      A metric on a topological space $\mathcal{M}$ is a mapping $d:\mathcal{M}\times\mathcal{M}\to\mathbb{R}$ such that the following three conditions hold:

1. $d(x, y) = 0 \iff x = y$ for every $x,y\in\mathcal{M}$, i.e. the distance between two points is zero if and only if they are the same,
      2. $d(x, y) = d(y, x)$,
      3. $d(x, z) \leq d(x, y) + d(y, z)$.

      The second condition is referred to as symmetry and the third condition is referred to as the triangle inequality.

      We give some examples of metric spaces that are relevant for us:

      Example

      The real line $\mathbb{R}$ with the metric defined by the absolute distance between two points: $d(x, y) = |y - x|$.

      Example

      The vector space $\mathbb{R}^n$ with the Euclidean distance $d_2(x, y) = \sqrt{\sum_{i=1}^n (x_i - y_i)^2}$.

      Example

      The space of continuous functions $\mathcal{C} = \{f:(-\epsilon, \epsilon)\to\mathbb{R}^n\}$ with the metric $d_\infty(f_1, f_2) = \mathrm{sup}_{t\in(-\epsilon, \epsilon)}|f_1(t) - f_2(t)|.$

      Proof

      We have to show the triangle inequality:

\[\begin{aligned} d_\infty(f_1, f_3) = \mathrm{sup}_{t\in(-\epsilon, \epsilon)}|f_1(t) - f_3(t)| & \leq \mathrm{sup}_{t\in(-\epsilon, \epsilon)}(|f_1(t) - f_2(t)| + |f_2(t) - f_3(t)|) \\ & \leq \mathrm{sup}_{t\in(-\epsilon, \epsilon)}|f_1(t) - f_2(t)| + \mathrm{sup}_{t\in(-\epsilon, \epsilon)}|f_2(t) - f_3(t)|. -\end{aligned}\]

      This shows that $d_\infty$ is indeed a metric.

      Example

      Any Riemannian manifold is a metric space.

This last example shows that metric spaces need not be vector spaces: there are spaces for which we can define a metric, but not the addition of two elements. This will be discussed in more detail in the section on Riemannian manifolds.

      Complete Metric Spaces

      To define complete metric spaces we first need the definition of a Cauchy sequence.

      Definition

A Cauchy sequence is a sequence $(a_n)_{n\in\mathbb{N}}$ for which, given any $\epsilon>0$, we can find an integer $N$ such that $d(a_n, a_m) < \epsilon$ for all $n, m \geq N$.

      Now we can give the definition of a complete metric space:

      Definition

      A complete metric space is one for which every Cauchy sequence converges.

      Completeness of the real numbers is most often seen as an axiom and therefore stated without proof. This also implies completeness of $\mathbb{R}^n$ [4].

      (Topological) Vector Spaces

      Vector Spaces are, like metric spaces, topological spaces which we endow with additional structure.

      Definition

A vector space $\mathcal{V}$ is a topological space for which we define an operation called addition and denoted by $+$ and an operation called scalar multiplication (by elements of $\mathbb{R}$) denoted by $x \mapsto ax$ for $x\in\mathcal{V}$ and $a\in\mathbb{R}$, for which the following hold for all $x, y, z\in\mathcal{V}$ and $a, b\in\mathbb{R}$:

      1. $x + (y + z) = (x + y) + z,$
      2. $x + y = y + x,$
      3. $\exists 0 \in \mathcal{V}$ such that $x + 0 = x,$
4. $\exists -x \in \mathcal{V}$ such that $x + (-x) = 0,$
5. $a(bx) = (ab)x,$
      6. $1x = x$ for $1\in\mathbb{R},$
      7. $a(x + y) = ax + ay,$
      8. $(a + b)x = ax + bx.$

The first law is known as associativity, the second one as commutativity and the last two as distributivity.

      The topological spaces $\mathbb{R}$ and $\mathbb{R}^{n}$ are (almost) trivially vector spaces. The same is true for many function spaces. One of the special aspects of GeometricMachineLearning is that it can deal with spaces that are not vector spaces, but manifolds. All vector spaces are however manifolds.

      [4]
      S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).
      +\end{aligned}\]

This shows that $d_\infty$ is indeed a metric.
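A small numerical sketch (plain Julia; the functions and the sampling grid are arbitrary placeholders) illustrating the triangle inequality for $d_\infty$:

ts = range(-0.9, 0.9; length = 1000)             # sample points in (-ϵ, ϵ) with ϵ = 1
d_sup(f, g) = maximum(abs.(f.(ts) .- g.(ts)))    # discrete stand-in for d_∞
f₁, f₂, f₃ = sin, cos, exp
d_sup(f₁, f₃) ≤ d_sup(f₁, f₂) + d_sup(f₂, f₃)    # true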

      Example

      Any Riemannian manifold is a metric space.

This last example shows that metric spaces need not be vector spaces: there are spaces for which we can define a metric, but not the addition of two elements. This will be discussed in more detail in the section on Riemannian manifolds.

      Complete Metric Spaces

      To define complete metric spaces we first need the definition of a Cauchy sequence.

      Definition

A Cauchy sequence is a sequence $(a_n)_{n\in\mathbb{N}}$ for which, given any $\epsilon>0$, we can find an integer $N$ such that $d(a_n, a_m) < \epsilon$ for all $n, m \geq N$.

      Now we can give the definition of a complete metric space:

      Definition

      A complete metric space is one for which every Cauchy sequence converges.

      Completeness of the real numbers is most often seen as an axiom and therefore stated without proof. This also implies completeness of $\mathbb{R}^n$ [4].

      (Topological) Vector Spaces

      Vector Spaces are, like metric spaces, topological spaces which we endow with additional structure.

      Definition

A vector space $\mathcal{V}$ is a topological space for which we define an operation called addition and denoted by $+$ and an operation called scalar multiplication (by elements of $\mathbb{R}$) denoted by $x \mapsto ax$ for $x\in\mathcal{V}$ and $a\in\mathbb{R}$, for which the following hold for all $x, y, z\in\mathcal{V}$ and $a, b\in\mathbb{R}$:

      1. $x + (y + z) = (x + y) + z,$
      2. $x + y = y + x,$
      3. $\exists 0 \in \mathcal{V}$ such that $x + 0 = x,$
4. $\exists -x \in \mathcal{V}$ such that $x + (-x) = 0,$
5. $a(bx) = (ab)x,$
      6. $1x = x$ for $1\in\mathbb{R},$
      7. $a(x + y) = ax + ay,$
      8. $(a + b)x = ax + bx.$

The first law is known as associativity, the second one as commutativity and the last two as distributivity.

      The topological spaces $\mathbb{R}$ and $\mathbb{R}^{n}$ are (almost) trivially vector spaces. The same is true for many function spaces. One of the special aspects of GeometricMachineLearning is that it can deal with spaces that are not vector spaces, but manifolds. All vector spaces are however manifolds.

      [4]
      S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).
      diff --git a/latest/manifolds/riemannian_manifolds/index.html b/latest/manifolds/riemannian_manifolds/index.html index 3ec971f6d..067e55bb8 100644 --- a/latest/manifolds/riemannian_manifolds/index.html +++ b/latest/manifolds/riemannian_manifolds/index.html @@ -1,5 +1,5 @@ -Riemannian Manifolds · GeometricMachineLearning.jl

      Riemannian Manifolds

      A Riemannian manifold is a manifold $\mathcal{M}$ that we endow with a mapping $g$ that smoothly[1] assigns a metric $g_x$ to each tangent space $T_x\mathcal{M}$. By a slight abuse of notation we will also refer to this $g$ as a metric.

      After having defined a metric $g$ we can associate a length to each curve $\gamma:[0, t] \to \mathcal{M}$ through:

      \[L(\gamma) = \int_0^t \sqrt{g_{\gamma(s)}(\gamma'(s), \gamma'(s))}ds.\]

      This $L$ turns $\mathcal{M}$ into a metric space:

      Definition

      The metric on a Riemannian manifold $\mathcal{M}$ is

      \[d(x, y) = \mathrm{inf}_{\text{$\gamma(0) = x$ and $\gamma(t) = y$}}L(\gamma),\]

      where $t$ can be chosen arbitrarily.

      If a curve is minimal with respect to the function $L$ we call it the shortest curve or a geodesic. So we say that a curve $\gamma:[0, t]\to\mathcal{M}$ is a geodesic if there is no shorter curve that can connect two points in $\gamma([0, t])$, i.e.

      \[d(\gamma(t_i), \gamma(t_f)) = \int_{t_i}^{t_f}\sqrt{g_{\gamma(s)}(\gamma'(s), \gamma'(s))}ds,\]

      for any $t_i, t_f\in[0, t]$.

      An important result of Riemannian geometry states that there exists a vector field $X$ on $T\mathcal{M}$, called the geodesic spray, whose integral curves are derivatives of geodesics.

      Geodesic Sprays and the Exponential Map

      To every Riemannian manifold we can naturally associate a vector field called the geodesic spray or geodesic equation. For our purposes it is enough to state that this vector field is unique and well-defined [5].

      The important property of the geodesic spray is

      Theorem

      Given an initial point $x$ and an initial velocity $v_x$, an integral curve for the geodesic spray is of the form $t \mapsto (\gamma_{v_x}(t), \gamma_{v_x}'(t))$ where $\gamma_{v_x}$ is a geodesic. We further have the property that the integral curve for the geodesic spray for an initial point $x$ and an initial velocity $\eta\cdot{}v_x$ (where $\eta$ is a scalar) is of the form $t \mapsto (\gamma_{\eta\cdot{}v_x}(t), \gamma_{\eta\cdot{}v_x}'(t)) = (\gamma_{v_x}(\eta\cdot{}t), \eta\cdot\gamma_{v_x}'(\eta\cdot{}t)).$

      It is therefore customary to introduce the exponential map $\exp:T_x\mathcal{M}\to\mathcal{M}$ as

      \[\exp(v_x) := \gamma_{v_x}(1),\]

      and we see that $\gamma_{v_x}(t) = \exp(t\cdot{}v_x)$. In GeometricMachineLearning we denote the exponential map by geodesic to avoid confusion with the matrix exponential map[2]:

      \[ \mathtt{geodesic}(x, v_x) \equiv \exp(v_x).\]

      We give an example here:

      using GeometricMachineLearning
      +Riemannian Manifolds · GeometricMachineLearning.jl

      Riemannian Manifolds

      A Riemannian manifold is a manifold $\mathcal{M}$ that we endow with a mapping $g$ that smoothly[1] assigns a metric $g_x$ to each tangent space $T_x\mathcal{M}$. By a slight abuse of notation we will also refer to this $g$ as a metric.

      After having defined a metric $g$ we can associate a length to each curve $\gamma:[0, t] \to \mathcal{M}$ through:

      \[L(\gamma) = \int_0^t \sqrt{g_{\gamma(s)}(\gamma'(s), \gamma'(s))}ds.\]

      This $L$ turns $\mathcal{M}$ into a metric space:

      Definition

      The metric on a Riemannian manifold $\mathcal{M}$ is

      \[d(x, y) = \mathrm{inf}_{\text{$\gamma(0) = x$ and $\gamma(t) = y$}}L(\gamma),\]

      where $t$ can be chosen arbitrarily.

      If a curve is minimal with respect to the function $L$ we call it the shortest curve or a geodesic. So we say that a curve $\gamma:[0, t]\to\mathcal{M}$ is a geodesic if there is no shorter curve that can connect two points in $\gamma([0, t])$, i.e.

      \[d(\gamma(t_i), \gamma(t_f)) = \int_{t_i}^{t_f}\sqrt{g_{\gamma(s)}(\gamma'(s), \gamma'(s))}ds,\]

      for any $t_i, t_f\in[0, t]$.

      An important result of Riemannian geometry states that there exists a vector field $X$ on $T\mathcal{M}$, called the geodesic spray, whose integral curves are derivatives of geodesics.

      Geodesic Sprays and the Exponential Map

      To every Riemannian manifold we can naturally associate a vector field called the geodesic spray or geodesic equation. For our purposes it is enough to state that this vector field is unique and well-defined [5].

      The important property of the geodesic spray is

      Theorem

      Given an initial point $x$ and an initial velocity $v_x$, an integral curve for the geodesic spray is of the form $t \mapsto (\gamma_{v_x}(t), \gamma_{v_x}'(t))$ where $\gamma_{v_x}$ is a geodesic. We further have the property that the integral curve for the geodesic spray for an initial point $x$ and an initial velocity $\eta\cdot{}v_x$ (where $\eta$ is a scalar) is of the form $t \mapsto (\gamma_{\eta\cdot{}v_x}(t), \gamma_{\eta\cdot{}v_x}'(t)) = (\gamma_{v_x}(\eta\cdot{}t), \eta\cdot\gamma_{v_x}'(\eta\cdot{}t)).$

      It is therefore customary to introduce the exponential map $\exp:T_x\mathcal{M}\to\mathcal{M}$ as

      \[\exp(v_x) := \gamma_{v_x}(1),\]

      and we see that $\gamma_{v_x}(t) = \exp(t\cdot{}v_x)$. In GeometricMachineLearning we denote the exponential map by geodesic to avoid confusion with the matrix exponential map[2]:

      \[ \mathtt{geodesic}(x, v_x) \equiv \exp(v_x).\]

      We give an example here:

      using GeometricMachineLearning
       
       Y = rand(StiefelManifold, 3, 1)
       
      @@ -25,6 +25,6 @@
               color = mred, markersize = 5)
       end
       
      -fig
      Example block output

So a geodesic can be seen as the equivalent of a straight line on a manifold. Also note that we drew a random element from StiefelManifold here and not from $S^2$. This is because Stiefel manifolds are more general spaces than $S^n$ and also comprise them.

      The Riemannian Gradient

The Riemannian gradient of a function $L:\mathcal{M}\to\mathbb{R}$ is a vector field[3] $\mathrm{grad}^gL$ (or simply $\mathrm{grad}L$) for which we have

      \[ g_x(\mathrm{grad}_x^gL, v_x) = (\nabla_{\varphi_U(x)}(L\circ\varphi_U^{-1}))^T \varphi_U'(v_x), \]

      where

      \[ \nabla_xf = \begin{pmatrix} \frac{\partial{}f}{\partial{}x_1} \\ \cdots \\ \frac{\partial{}f}{\partial{}x_n} \end{pmatrix},\]

      is the Euclidean gradient. By the non-degeneracy of $g$ the Riemannian gradient always exists [3]. We will give specific examples of this when discussing the Stiefel manifold and the Grassmann manifold.

      Gradient Flows and Riemannian Optimization

      In GeometricMachineLearning we can include weights in neural networks that are part of a manifold. Training such neural networks amounts to Riemannian optimization and hence solving the gradient flow equation. The gradient flow equation is given by

      \[X(x) = - \mathrm{grad}_xL.\]

Solving this gradient flow equation will then lead us to a local minimum on $\mathcal{M}$. This will be elaborated on when talking about optimizers. In practice we cannot solve the gradient flow equation directly and have to rely on approximations. The most straightforward approximation (and one that serves as a basis for all the optimization algorithms in GeometricMachineLearning) is to take the point $(x, X(x))$ as an initial condition for the geodesic spray and then solve the ODE for a small time step. We will call this a retraction.

      Library Functions

      GeometricMachineLearning.geodesicMethod

      The geodesic map for the manifolds. It takes as input an element $x$ of $\mathcal{M}$ and an element of $T_x\mathcal{M}$ and returns $\mathtt{geodesic}(x, v_x) = \exp(v_x).$ For example:

      Y = rand(StiefelManifold{Float64}, N, n)
      +fig
      Example block output

So a geodesic can be seen as the equivalent of a straight line on a manifold. Also note that we drew a random element from StiefelManifold here and not from $S^2$. This is because Stiefel manifolds are more general spaces than $S^n$ and also comprise them.

      The Riemannian Gradient

The Riemannian gradient of a function $L:\mathcal{M}\to\mathbb{R}$ is a vector field[3] $\mathrm{grad}^gL$ (or simply $\mathrm{grad}L$) for which we have

      \[ g_x(\mathrm{grad}_x^gL, v_x) = (\nabla_{\varphi_U(x)}(L\circ\varphi_U^{-1}))^T \varphi_U'(v_x), \]

      where

      \[ \nabla_xf = \begin{pmatrix} \frac{\partial{}f}{\partial{}x_1} \\ \cdots \\ \frac{\partial{}f}{\partial{}x_n} \end{pmatrix},\]

      is the Euclidean gradient. By the non-degeneracy of $g$ the Riemannian gradient always exists [3]. We will give specific examples of this when discussing the Stiefel manifold and the Grassmann manifold.

      Gradient Flows and Riemannian Optimization

      In GeometricMachineLearning we can include weights in neural networks that are part of a manifold. Training such neural networks amounts to Riemannian optimization and hence solving the gradient flow equation. The gradient flow equation is given by

      \[X(x) = - \mathrm{grad}_xL.\]

Solving this gradient flow equation will then lead us to a local minimum on $\mathcal{M}$. This will be elaborated on when talking about optimizers. In practice we cannot solve the gradient flow equation directly and have to rely on approximations. The most straightforward approximation (and one that serves as a basis for all the optimization algorithms in GeometricMachineLearning) is to take the point $(x, X(x))$ as an initial condition for the geodesic spray and then solve the ODE for a small time step. We will call this a retraction.
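A minimal sketch of one such update step is shown here (the Euclidean loss gradient is a random placeholder; rgrad and geodesic are the library functions documented below, and we assume the tangent vector returned by rgrad can be scaled like an ordinary array):

using GeometricMachineLearning

Y  = rand(StiefelManifold{Float64}, 5, 2)   # a weight on the Stiefel manifold
∇L = rand(5, 2)                             # placeholder for the Euclidean gradient of a loss
h  = 0.1                                    # step size (learning rate)
Δ  = -h * rgrad(Y, ∇L)                      # Riemannian gradient scaled by -h
Y₂ = geodesic(Y, Δ)                         # retract back onto the manifold: one optimization step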

      Library Functions

      GeometricMachineLearning.geodesicMethod
      geodesic(Y::Manifold, Δ)

      Take as input an element of a manifold Y and a tangent vector in Δ in the corresponding tangent space and compute the geodesic (exponential map).

      In different notation: take as input an element $x$ of $\mathcal{M}$ and an element of $T_x\mathcal{M}$ and return $\mathtt{geodesic}(x, v_x) = \exp(v_x).$ For example:

      Y = rand(StiefelManifold{Float64}, N, n)
       Δ = rgrad(Y, rand(N, n))
      -geodesic(Y, Δ)

      See the docstring for $rgrad$ for details on this function.

      source

      References

      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      [5]
      M. P. Do Carmo and J. Flaherty Francis. Riemannian geometry. Vol. 2 (Springer, 1992).
      • 1Smooth here refers to the fact that $g:\mathcal{M}\to\text{(Space of Metrics)}$ has to be a smooth map. But in order to discuss this in detail we would have to define a topology on the space of metrics. A more detailed discussion can be found in [2, 3, 5].
      • 2The Riemannian exponential map and the matrix exponential map coincide for many matrix Lie groups.
      • 3We also write $\mathrm{grad}^gL(x) = \mathrm{grad}^g_xL.$
      +geodesic(Y, Δ)

      See the docstring for rgrad for details on this function.

      source

      References

      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      [5]
      M. P. Do Carmo and J. Flaherty Francis. Riemannian geometry. Vol. 2 (Springer, 1992).
      • 1Smooth here refers to the fact that $g:\mathcal{M}\to\text{(Space of Metrics)}$ has to be a smooth map. But in order to discuss this in detail we would have to define a topology on the space of metrics. A more detailed discussion can be found in [2, 3, 5].
      • 2The Riemannian exponential map and the matrix exponential map coincide for many matrix Lie groups.
      • 3We also write $\mathrm{grad}^gL(x) = \mathrm{grad}^g_xL.$
diff --git a/latest/objects.inv index 1ba8407c596ae6e5b0bb21ec907d2956bb3a43a3..b919cfc9786dd73dd3364a964d067ff47af866ac 100644 GIT binary patch (binary delta omitted)
diff --git a/latest/optimizers/adam_optimizer/index.html index dda78b5ed..cbd3c8623 100644 --- a/latest/optimizers/adam_optimizer/index.html +++ b/latest/optimizers/adam_optimizer/index.html @@ -1,2 +1,2 @@ -Adam Optimizer · GeometricMachineLearning.jl

      The Adam Optimizer

The Adam Optimizer is one of the most widely used (if not the most widely used) neural network optimizers. Like most modern neural network optimizers it contains a cache that is updated based on first-order gradient information and then, in a second step, the cache is used to compute a velocity estimate for updating the neural network weights.

      Here we first describe the Adam algorithm for the case where all the weights are on a vector space and then show how to generalize this to the case where the weights are on a manifold.

      All weights on a vector space

      The cache of the Adam optimizer consists of first and second moments. The first moments $B_1$ store linear information about the current and previous gradients, and the second moments $B_2$ store quadratic information about current and previous gradients (all computed from a first-order gradient).

      If all the weights are on a vector space, then we directly compute updates for $B_1$ and $B_2$:

      1. \[B_1 \gets ((\rho_1 - \rho_1^t)/(1 - \rho_1^t))\cdot{}B_1 + (1 - \rho_1)/(1 - \rho_1^t)\cdot{}\nabla{}L,\]

2. \[B_2 \gets ((\rho_2 - \rho_2^t)/(1 - \rho_2^t))\cdot{}B_2 + (1 - \rho_2)/(1 - \rho_2^t)\cdot\nabla{}L\odot\nabla{}L,\]

        where $\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n$ is the Hadamard product: $[a\odot{}b]_i = a_ib_i$. $\rho_1$ and $\rho_2$ are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from (Goodfellow et al., 2016, page 301). After having updated the cache (i.e. $B_1$ and $B_2$) we compute a velocity (step 3) with which the parameters $Y_t$ are then updated (step 4).

      3. \[W_t\gets -\eta{}B_1/\sqrt{B_2 + \delta},\]

      4. \[Y_{t+1} \gets Y_t + W_t,\]

      Here $\eta$ (with default 0.01) is the learning rate and $\delta$ (with default $3\cdot10^{-7}$) is a small constant that is added for stability. The division, square root and addition in step 3 are performed element-wise.

      Weights on manifolds

      The problem with generalizing Adam to manifolds is that the Hadamard product $\odot$ as well as the other element-wise operations ($/$, $\sqrt{}$ and $+$ in step 3 above) lack a clear geometric interpretation. In GeometricMachineLearning we get around this issue by utilizing a so-called global tangent space representation.

      References

      • Goodfellow I, Bengio Y, Courville A. Deep learning[M]. MIT press, 2016.
      [39]
      I. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).
      +Adam Optimizer · GeometricMachineLearning.jl

      The Adam Optimizer

The Adam Optimizer is one of the most widely used (if not the most widely used) neural network optimizers. Like most modern neural network optimizers it contains a cache that is updated based on first-order gradient information and then, in a second step, the cache is used to compute a velocity estimate for updating the neural network weights.

      Here we first describe the Adam algorithm for the case where all the weights are on a vector space and then show how to generalize this to the case where the weights are on a manifold.

      All weights on a vector space

      The cache of the Adam optimizer consists of first and second moments. The first moments $B_1$ store linear information about the current and previous gradients, and the second moments $B_2$ store quadratic information about current and previous gradients (all computed from a first-order gradient).

      If all the weights are on a vector space, then we directly compute updates for $B_1$ and $B_2$:

      1. \[B_1 \gets ((\rho_1 - \rho_1^t)/(1 - \rho_1^t))\cdot{}B_1 + (1 - \rho_1)/(1 - \rho_1^t)\cdot{}\nabla{}L,\]

2. \[B_2 \gets ((\rho_2 - \rho_2^t)/(1 - \rho_2^t))\cdot{}B_2 + (1 - \rho_2)/(1 - \rho_2^t)\cdot\nabla{}L\odot\nabla{}L,\]

where $\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n$ is the Hadamard product: $[a\odot{}b]_i = a_ib_i$. $\rho_1$ and $\rho_2$ are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from [41, page 301]. After having updated the cache (i.e. $B_1$ and $B_2$) we compute a velocity (step 3) with which the parameters $Y_t$ are then updated (step 4).

      3. \[W_t\gets -\eta{}B_1/\sqrt{B_2 + \delta},\]

      4. \[Y_{t+1} \gets Y_t + W_t,\]

      Here $\eta$ (with default 0.01) is the learning rate and $\delta$ (with default $3\cdot10^{-7}$) is a small constant that is added for stability. The division, square root and addition in step 3 are performed element-wise.
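The four steps above can be sketched in a few lines of plain Julia (this illustrates the formulas only and is not the implementation used in GeometricMachineLearning; all names are made up for the example):

function adam_step!(θ, ∇L, B₁, B₂, t; ρ₁ = 0.9, ρ₂ = 0.99, η = 0.01, δ = 3e-7)
    @. B₁ = ((ρ₁ - ρ₁^t) / (1 - ρ₁^t)) * B₁ + ((1 - ρ₁) / (1 - ρ₁^t)) * ∇L        # step 1
    @. B₂ = ((ρ₂ - ρ₂^t) / (1 - ρ₂^t)) * B₂ + ((1 - ρ₂) / (1 - ρ₂^t)) * ∇L * ∇L   # step 2
    W = @. -η * B₁ / sqrt(B₂ + δ)                                                 # step 3: velocity
    θ .+= W                                                                       # step 4: update
    return θ
end

θ, B₁, B₂ = zeros(2), zeros(2), zeros(2)
adam_step!(θ, [0.5, -1.0], B₁, B₂, 1)   # one step; subsequent calls increment t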

      Weights on manifolds

      The problem with generalizing Adam to manifolds is that the Hadamard product $\odot$ as well as the other element-wise operations ($/$, $\sqrt{}$ and $+$ in step 3 above) lack a clear geometric interpretation. In GeometricMachineLearning we get around this issue by utilizing a so-called global tangent space representation.

      References

      [41]
      I. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).
      diff --git a/latest/optimizers/bfgs_optimizer/index.html b/latest/optimizers/bfgs_optimizer/index.html index cf376acf2..97c4c79bd 100644 --- a/latest/optimizers/bfgs_optimizer/index.html +++ b/latest/optimizers/bfgs_optimizer/index.html @@ -1,8 +1,8 @@ -BFGS Optimizer · GeometricMachineLearning.jl

      The BFGS Algorithm

The presentation shown here is largely taken from chapters 3 and 6 of reference [12] with a derivation based on an online comment. The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second-order optimizer that can also be used to train a neural network.

      It is a version of a quasi-Newton method and is therefore especially suited for convex problems. As is the case with any other (quasi-)Newton method the BFGS algorithm approximates the objective with a quadratic function in each optimization step:

      \[m_k(x) = f(x_k) + (\nabla_{x_k}f)^T(x - x_k) + \frac{1}{2}(x - x_k)^TB_k(x - x_k),\]

      where $B_k$ is referred to as the approximate Hessian. We further require $B_k$ to be symmetric and positive definite. Differentiating the above expression and setting the derivative to zero gives us:

      \[\nabla_xm_k = \nabla_{x_k}f + B_k(x - x_k) = 0,\]

      or written differently:

      \[x - x_k = -B_k^{-1}\nabla_{x_k}f.\]

      This value we will from now on call $p_k := x - x_k$ and refer to as the search direction. The new iterate then is:

      \[x_{k+1} = x_k + \alpha_kp_k,\]

      where $\alpha_k$ is the step length. Techniques that describe how to pick an appropriate $\alpha_k$ are called line-search methods and are discussed below. First we discuss what requirements we impose on $B_k$. A first reasonable condition would be to require the gradient of $m_k$ to be equal to that of $f$ at the points $x_{k-1}$ and $x_k$:

      \[\begin{aligned} +BFGS Optimizer · GeometricMachineLearning.jl

      The BFGS Algorithm

The presentation shown here is largely taken from chapters 3 and 6 of reference [15] with a derivation based on an online comment. The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second-order optimizer that can also be used to train a neural network.

      It is a version of a quasi-Newton method and is therefore especially suited for convex problems. As is the case with any other (quasi-)Newton method the BFGS algorithm approximates the objective with a quadratic function in each optimization step:

      \[m_k(x) = f(x_k) + (\nabla_{x_k}f)^T(x - x_k) + \frac{1}{2}(x - x_k)^TB_k(x - x_k),\]

      where $B_k$ is referred to as the approximate Hessian. We further require $B_k$ to be symmetric and positive definite. Differentiating the above expression and setting the derivative to zero gives us:

      \[\nabla_xm_k = \nabla_{x_k}f + B_k(x - x_k) = 0,\]

      or written differently:

      \[x - x_k = -B_k^{-1}\nabla_{x_k}f.\]

      This value we will from now on call $p_k := x - x_k$ and refer to as the search direction. The new iterate then is:

      \[x_{k+1} = x_k + \alpha_kp_k,\]

      where $\alpha_k$ is the step length. Techniques that describe how to pick an appropriate $\alpha_k$ are called line-search methods and are discussed below. First we discuss what requirements we impose on $B_k$. A first reasonable condition would be to require the gradient of $m_k$ to be equal to that of $f$ at the points $x_{k-1}$ and $x_k$:

\[\begin{aligned} \nabla_{x_k}m_k & = \nabla_{x_k}f + B_k(x_k - x_k) & \overset{!}{=} \nabla_{x_k}f \text{ and } \\ \nabla_{x_{k-1}}m_k & = \nabla_{x_k}f + B_k(x_{k-1} - x_k) & \overset{!}{=} \nabla_{x_{k-1}}f. -\end{aligned}\]

      The first one of these conditions is of course automatically satisfied. The second one can be rewritten as:

\[B_k(x_k - x_{k-1}) \overset{!}{=} \nabla_{x_k}f - \nabla_{x_{k-1}}f. \]

      The following notations are often used:

      \[s_{k-1} := \alpha_{k-1}p_{k-1} := x_{k} - x_{k-1} \text{ and } y_{k-1} := \nabla_{x_k}f - \nabla_{x_{k-1}}f. \]

The condition mentioned above then becomes:

      \[B_ks_{k-1} \overset{!}{=} y_{k-1},\]

and we call it the secant equation. A second condition we impose on $B_k$ is that it has to be positive-definite at point $s_{k-1}$:

      \[s_{k-1}^Ty_{k-1} > 0.\]

This is referred to as the curvature condition. If we impose the Wolfe conditions, the curvature condition holds automatically. The Wolfe conditions are stated with respect to the parameter $\alpha_k$.

      The Wolfe conditions are:

      1. $f(x_k+\alpha{}p_k)\leq{}f(x_k) + c_1\alpha(\nabla_{x_k}f)^Tp_k$ for $c_1\in(0,1)$.
      2. $(\nabla_{(x_k + \alpha_kp_k)}f)^Tp_k \geq c_2(\nabla_{x_k}f)^Tp_k$ for $c_2\in(c_1,1)$.

A possible choice for $c_1$ and $c_2$ is $10^{-4}$ and $0.9$ (see [12]). The two Wolfe conditions above are called the sufficient decrease condition and the curvature condition respectively. Note that the second Wolfe condition (the curvature condition) is stronger than the one mentioned before, under the assumption that the first Wolfe condition is true:

      \[(\nabla_{x_k}f)^Tp_{k-1} - c_2(\nabla_{x_{k-1}}f)^Tp_{k-1} = y_{k-1}^Tp_{k-1} + (1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1} \geq 0,\]

      and the second term in this expression is $(1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1}\geq\frac{1-c_2}{c_1\alpha_{k-1}}(f(x_k) - f(x_{k-1}))$, which is negative.

      In order to pick the ideal $B_k$ we solve the following problem:

      \[\begin{aligned} +\end{aligned}\]

      The first one of these conditions is of course automatically satisfied. The second one can be rewritten as:

\[B_k(x_k - x_{k-1}) \overset{!}{=} \nabla_{x_k}f - \nabla_{x_{k-1}}f. \]

      The following notations are often used:

      \[s_{k-1} := \alpha_{k-1}p_{k-1} := x_{k} - x_{k-1} \text{ and } y_{k-1} := \nabla_{x_k}f - \nabla_{x_{k-1}}f. \]

The condition mentioned above then becomes:

      \[B_ks_{k-1} \overset{!}{=} y_{k-1},\]

and we call it the secant equation. A second condition we impose on $B_k$ is that it has to be positive-definite at point $s_{k-1}$:

      \[s_{k-1}^Ty_{k-1} > 0.\]

This is referred to as the curvature condition. If we impose the Wolfe conditions, the curvature condition holds automatically. The Wolfe conditions are stated with respect to the parameter $\alpha_k$.

      The Wolfe conditions are:

      1. $f(x_k+\alpha{}p_k)\leq{}f(x_k) + c_1\alpha(\nabla_{x_k}f)^Tp_k$ for $c_1\in(0,1)$.
      2. $(\nabla_{(x_k + \alpha_kp_k)}f)^Tp_k \geq c_2(\nabla_{x_k}f)^Tp_k$ for $c_2\in(c_1,1)$.

A possible choice for $c_1$ and $c_2$ is $10^{-4}$ and $0.9$ (see [15]). The two Wolfe conditions above are called the sufficient decrease condition and the curvature condition respectively. Note that the second Wolfe condition (the curvature condition) is stronger than the one mentioned before, under the assumption that the first Wolfe condition is true:

      \[(\nabla_{x_k}f)^Tp_{k-1} - c_2(\nabla_{x_{k-1}}f)^Tp_{k-1} = y_{k-1}^Tp_{k-1} + (1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1} \geq 0,\]

      and the second term in this expression is $(1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1}\geq\frac{1-c_2}{c_1\alpha_{k-1}}(f(x_k) - f(x_{k-1}))$, which is negative.

      In order to pick the ideal $B_k$ we solve the following problem:

      \[\begin{aligned} \min_B & ||B - B_{k-1}||_W \\ \text{s.t.} & B = B^T\text{ and }Bs_{k-1}=y_{k-1}, \end{aligned}\]

      where the first condition is symmetry and the second one is the secant equation. For the norm $||\cdot||_W$ we pick the weighted Frobenius norm:

      \[||A||_W := ||W^{1/2}AW^{1/2}||_F,\]

      where $||\cdot||_F$ is the usual Frobenius norm[1] and the matrix $W=\tilde{B}_{k-1}$ is the inverse of the average Hessian:

      \[\tilde{B}_{k-1} = \int_0^1 \nabla^2f(x_{k-1} + \tau\alpha_{k-1}p_{k-1})d\tau.\]

      In order to find the ideal $B_k$ under the conditions described above, we introduce some notation:

      • $\tilde{B}_{k-1} := W^{1/2}B_{k-1}W^{1/2}$,
      • $\tilde{B} := W^{1/2}BW^{1/2}$,
      • $\tilde{y}_{k-1} := W^{1/2}y_{k-1}$,
      • $\tilde{s}_{k-1} := W^{-1/2}s_{k-1}$.

      With this notation we can rewrite the problem of finding $B_k$ as:

      \[\begin{aligned} @@ -14,4 +14,4 @@ u^T\tilde{B}_{k-1}u - 1 & u^T\tilde{B}_{k-1}u \\ u_\perp^T\tilde{B}_{k-1}u & u_\perp^T(\tilde{B}_{k-1}-\tilde{B}_k)u_\perp \end{bmatrix}. -\end{aligned}\]

      By a property of the Frobenius norm:

      \[||\tilde{B}_{k-1} - \tilde{B}||^2_F = (u^T\tilde{B}_{k-1} -1)^2 + ||u^T\tilde{B}_{k-1}u_\perp||_F^2 + ||u_\perp^T\tilde{B}_{k-1}u||_F^2 + ||u_\perp^T(\tilde{B}_{k-1} - \tilde{B})u_\perp||_F^2.\]

      We see that $\tilde{B}$ only appears in the last term, which should therefore be made zero. This then gives:

      \[\tilde{B} = U\begin{bmatrix} 1 & 0 \\ 0 & u^T_\perp\tilde{B}_{k-1}u_\perp \end{bmatrix} = uu^T + (\mathbb{I}-uu^T)\tilde{B}_{k-1}(\mathbb{I}-uu^T).\]

      If we now map back to the original coordinate system, the ideal solution for $B_k$ is:

\[B_k = (\mathbb{I} - \frac{1}{y_{k-1}^Ts_{k-1}}y_{k-1}s_{k-1}^T)B_{k-1}(\mathbb{I} - \frac{1}{y_{k-1}^Ts_{k-1}}s_{k-1}y_{k-1}^T) + \frac{1}{y_{k-1}^Ts_{k-1}}y_{k-1}y_{k-1}^T.\]

      What we need in practice however is not $B_k$, but its inverse $H_k$. This is because we need to find $s_{k-1}$ based on $y_{k-1}$. To get $H_k$ based on the expression for $B_k$ above we can use the Sherman-Morrison-Woodbury formula[3] to obtain:

      \[H_{k} = H_{k-1} - \frac{H_{k-1}y_{k-1}y_{k-1}^TH_{k-1}}{y_{k-1}^TH_{k-1}y_{k-1}} + \frac{s_{k-1}s_{k-1}^T}{y_{k-1}^Ts_{k-1}}.\]

      TODO: Example where this works well!

      References

      [12]
      J. N. Stephen J. Wright. Numerical optimization (Springer Science+Business Media, 2006).
      • 1The Frobenius norm is $||A||_F^2 = \sum_{i,j}a_{ij}^2$.
      • 2So we must have $u^Tu_\perp=0$ and further $u_\perp^Tu_\perp=\mathbb{I}$.
• 3The Sherman-Morrison-Woodbury formula states $(A + UCV)^{-1} = A^{-1} - A^{-1}U(C^{-1} + VA^{-1}U)^{-1}VA^{-1}$.
      +\end{aligned}\]

      By a property of the Frobenius norm:

      \[||\tilde{B}_{k-1} - \tilde{B}||^2_F = (u^T\tilde{B}_{k-1} -1)^2 + ||u^T\tilde{B}_{k-1}u_\perp||_F^2 + ||u_\perp^T\tilde{B}_{k-1}u||_F^2 + ||u_\perp^T(\tilde{B}_{k-1} - \tilde{B})u_\perp||_F^2.\]

      We see that $\tilde{B}$ only appears in the last term, which should therefore be made zero. This then gives:

      \[\tilde{B} = U\begin{bmatrix} 1 & 0 \\ 0 & u^T_\perp\tilde{B}_{k-1}u_\perp \end{bmatrix} = uu^T + (\mathbb{I}-uu^T)\tilde{B}_{k-1}(\mathbb{I}-uu^T).\]

      If we now map back to the original coordinate system, the ideal solution for $B_k$ is:

\[B_k = (\mathbb{I} - \frac{1}{y_{k-1}^Ts_{k-1}}y_{k-1}s_{k-1}^T)B_{k-1}(\mathbb{I} - \frac{1}{y_{k-1}^Ts_{k-1}}s_{k-1}y_{k-1}^T) + \frac{1}{y_{k-1}^Ts_{k-1}}y_{k-1}y_{k-1}^T.\]

What we need in practice, however, is not $B_k$ but its inverse $H_k$, since the new search direction is obtained by applying the inverse of $B_k$ to the gradient. To get $H_k$ from the expression for $B_k$ above we can use the Sherman-Morrison-Woodbury formula[3] to obtain:

      \[H_{k} = H_{k-1} - \frac{H_{k-1}y_{k-1}y_{k-1}^TH_{k-1}}{y_{k-1}^TH_{k-1}y_{k-1}} + \frac{s_{k-1}s_{k-1}^T}{y_{k-1}^Ts_{k-1}}.\]

      TODO: Example where this works well!
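A minimal plain-Julia sketch of the $H_k$ update above, applied to a quadratic toy problem with exact line search (a stand-alone illustration, not the optimizer implemented in GeometricMachineLearning):

```julia
using LinearAlgebra

# Quadratic toy problem f(x) = ½xᵀQx - bᵀx with unique minimizer x* = Q \ b.
function inverse_hessian_demo(Q, b; iterations = size(Q, 1))
    ∇f(x) = Q * x - b
    x = zero(b)
    H = Matrix{Float64}(I, length(b), length(b))   # initial guess for the inverse Hessian
    for _ in 1:iterations
        g = ∇f(x)
        norm(g) < 1e-12 && break
        p = -H * g                          # quasi-Newton search direction
        α = dot(g, H * g) / dot(p, Q * p)   # exact line search (possible for quadratics)
        s = α * p                           # step s_{k-1}
        y = ∇f(x + s) - g                   # gradient difference y_{k-1} (= Q * s here)
        x = x + s
        # update from the text: H ← H - H y yᵀ H / (yᵀ H y) + s sᵀ / (yᵀ s)
        H = H - (H * y) * (H * y)' / dot(y, H * y) + s * s' / dot(y, s)
    end
    return x, H
end

x, H = inverse_hessian_demo([4.0 1.0; 1.0 3.0], [1.0, 2.0])
# x ≈ Q \ b and H ≈ inv(Q) after size(Q, 1) steps (in exact arithmetic).
```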

      References

      [15]
J. Nocedal and S. J. Wright. Numerical optimization (Springer Science+Business Media, 2006).
      • 1The Frobenius norm is $||A||_F^2 = \sum_{i,j}a_{ij}^2$.
      • 2So we must have $u^Tu_\perp=0$ and further $u_\perp^Tu_\perp=\mathbb{I}$.
• 3The Sherman-Morrison-Woodbury formula states $(A + UCV)^{-1} = A^{-1} - A^{-1}U(C^{-1} + VA^{-1}U)^{-1}VA^{-1}$.
      diff --git a/latest/optimizers/general_optimization/index.html b/latest/optimizers/general_optimization/index.html index 36e1dfb28..6a179c8d9 100644 --- a/latest/optimizers/general_optimization/index.html +++ b/latest/optimizers/general_optimization/index.html @@ -1,2 +1,2 @@ -General Optimization · GeometricMachineLearning.jl

      Optimization for Neural Networks

      Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the gradient flow equation:

      \[\dot{\theta} = -\nabla_\theta{}L,\]

by means of an Euler time-stepping scheme:

      \[\theta^{t+1} = \theta^{t} - h\nabla_{\theta^{t}}L,\]

where $h$ (the time step of the Euler scheme) is referred to as the learning rate.

This equation can easily be generalized to manifolds by replacing the Euclidean gradient $\nabla_{\theta^{t}}L$ with the Riemannian gradient $\mathrm{grad}_{\theta^{t}}L$, and the addition of $-h\nabla_{\theta^{t}}L$ with a retraction along $-h\mathrm{grad}_{\theta^{t}}L$.

      +General Optimization · GeometricMachineLearning.jl

      Optimization for Neural Networks

      Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the gradient flow equation:

      \[\dot{\theta} = -\nabla_\theta{}L,\]

by means of an Euler time-stepping scheme:

      \[\theta^{t+1} = \theta^{t} - h\nabla_{\theta^{t}}L,\]

where $h$ (the time step of the Euler scheme) is referred to as the learning rate.

This equation can easily be generalized to manifolds by replacing the Euclidean gradient $\nabla_{\theta^{t}}L$ with the Riemannian gradient $\mathrm{grad}_{\theta^{t}}L$, and the addition of $-h\nabla_{\theta^{t}}L$ with a retraction along $-h\mathrm{grad}_{\theta^{t}}L$.
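A minimal sketch of one such update step in plain Julia (hypothetical helper, not the optimizer API of the package):

```julia
# One explicit Euler step of the gradient flow, i.e. one step of vanilla gradient descent.
# `gradL` is assumed to return ∇_θL and `h` is the learning rate.
euler_step(θ, gradL, h) = θ - h * gradL(θ)

# Example with L(θ) = ½‖θ‖², for which ∇_θL = θ:
θ = euler_step([1.0, 2.0], identity, 0.1)   # == [0.9, 1.8]
```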

      diff --git a/latest/optimizers/manifold_related/cayley/index.html b/latest/optimizers/manifold_related/cayley/index.html index b25e8a4d8..08d64db8b 100644 --- a/latest/optimizers/manifold_related/cayley/index.html +++ b/latest/optimizers/manifold_related/cayley/index.html @@ -1,5 +1,5 @@ -Cayley Retraction · GeometricMachineLearning.jl

      The Cayley Retraction

The Cayley transformation is one of the most popular retractions. For several matrix Lie groups it is a mapping from the Lie algebra $\mathfrak{g}$ onto the Lie group $G$. The Cayley retraction reads:

      \[ \mathrm{Cayley}(C) = \left(\mathbb{I} -\frac{1}{2}C\right)^{-1}\left(\mathbb{I} +\frac{1}{2}C\right).\]

This is easily checked to be a retraction, i.e. $\mathrm{Cayley}(\mathbb{O}) = \mathbb{I}$ and $\frac{\partial}{\partial{}t}\Big|_{t=0}\mathrm{Cayley}(tC) = C$.

      What we need in practice is not the computation of the Cayley transform of an arbitrary matrix, but the Cayley transform of an element of $\mathfrak{g}^\mathrm{hor}$, the global tangent space representation.

      The elements of $\mathfrak{g}^\mathrm{hor}$ can be written as:

      \[C = \begin{bmatrix} +Cayley Retraction · GeometricMachineLearning.jl

      The Cayley Retraction

The Cayley transformation is one of the most popular retractions. For several matrix Lie groups it is a mapping from the Lie algebra $\mathfrak{g}$ onto the Lie group $G$. The Cayley retraction reads:

      \[ \mathrm{Cayley}(C) = \left(\mathbb{I} -\frac{1}{2}C\right)^{-1}\left(\mathbb{I} +\frac{1}{2}C\right).\]

This is easily checked to be a retraction, i.e. $\mathrm{Cayley}(\mathbb{O}) = \mathbb{I}$ and $\frac{\partial}{\partial{}t}\Big|_{t=0}\mathrm{Cayley}(tC) = C$.

      What we need in practice is not the computation of the Cayley transform of an arbitrary matrix, but the Cayley transform of an element of $\mathfrak{g}^\mathrm{hor}$, the global tangent space representation.

      The elements of $\mathfrak{g}^\mathrm{hor}$ can be written as:

      \[C = \begin{bmatrix} A & -B^T \\ B & \mathbb{O} \end{bmatrix} = \begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix} \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T \end{bmatrix},\]

      where the second expression exploits the sparse structure of the array, i.e. it is a multiplication of a $N\times2n$ with a $2n\times{}N$ matrix. We can hence use the Sherman-Morrison-Woodbury formula to obtain:

      \[(\mathbb{I} - \frac{1}{2}UV)^{-1} = \mathbb{I} + \frac{1}{2}U(\mathbb{I} - \frac{1}{2}VU)^{-1}V\]

      So what we have to invert is the term

      \[\mathbb{I} - \frac{1}{2}\begin{bmatrix} \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T \end{bmatrix}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} = @@ -10,4 +10,4 @@ \begin{bmatrix} \mathbb{I} \\ \frac{1}{2}A \end{bmatrix} + \begin{bmatrix} \frac{1}{2}A \\ \frac{1}{4}A^2 - \frac{1}{2}B^TB \end{bmatrix} \right) - \right)\]

Note that for computational reasons we compute $\mathrm{Cayley}(C)E$ instead of just the Cayley transform (see the section on retractions).

      + \right)\]

Note that for computational reasons we compute $\mathrm{Cayley}(C)E$ instead of just the Cayley transform (see the section on retractions).
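A minimal sketch of the Cayley transform itself in plain Julia (dense arithmetic only; the package instead exploits the sparse structure described above and computes $\mathrm{Cayley}(C)E$ directly):

```julia
using LinearAlgebra

# Dense Cayley transform of a square matrix C, as in the formula above.
cayley(C) = (I - C / 2) \ (I + C / 2)

# For skew-symmetric C the result is orthogonal and Cayley(0) = I:
A = rand(4, 4); C = A - A'
norm(cayley(C)' * cayley(C) - I) < 1e-12   # orthogonality up to rounding
norm(cayley(zero(C)) - I) < 1e-12          # retraction property at zero
```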

      diff --git a/latest/optimizers/manifold_related/geodesic/index.html b/latest/optimizers/manifold_related/geodesic/index.html index 1b45b346c..5d95fd3ae 100644 --- a/latest/optimizers/manifold_related/geodesic/index.html +++ b/latest/optimizers/manifold_related/geodesic/index.html @@ -1,2 +1,2 @@ -Geodesic Retraction · GeometricMachineLearning.jl

      Geodesic Retraction

      General retractions are approximations of the exponential map. In GeometricMachineLearning we can, instead of using an approximation, solve the geodesic equation exactly (up to numerical error) by specifying Geodesic() as the argument of layers that have manifold weights.

      +Geodesic Retraction · GeometricMachineLearning.jl

      Geodesic Retraction

      General retractions are approximations of the exponential map. In GeometricMachineLearning we can, instead of using an approximation, solve the geodesic equation exactly (up to numerical error) by specifying Geodesic() as the argument of layers that have manifold weights.

      diff --git a/latest/optimizers/manifold_related/global_sections/index.html b/latest/optimizers/manifold_related/global_sections/index.html index 283f03ed5..2bff2a449 100644 --- a/latest/optimizers/manifold_related/global_sections/index.html +++ b/latest/optimizers/manifold_related/global_sections/index.html @@ -1,5 +1,5 @@ -Global Sections · GeometricMachineLearning.jl

      Global Sections

Global sections are needed for the generalization of Adam and other optimizers to homogeneous spaces. They are necessary to perform the two mappings represented by horizontal and vertical red lines in the section on the general optimizer framework.

      Computing the global section

In differential geometry a section is always associated with some bundle; in our case this bundle is $\pi:G\to\mathcal{M},A\mapsto{}AE$. A section is a mapping $\lambda:\mathcal{M}\to{}G$ for which $\pi$ is a left inverse, i.e. $\pi\circ\lambda = \mathrm{id}$.

      For the Stiefel manifold $St(n, N)\subset\mathbb{R}^{N\times{}n}$ we compute the global section the following way:

      1. Start with an element $Y\in{}St(n,N)$,
      2. Draw a random matrix $A\in\mathbb{R}^{N\times{}(N-n)}$,
      3. Remove the subspace spanned by $Y$ from the range of $A$: $A\gets{}A-YY^TA$
      4. Compute a QR decomposition of $A$ and take as section $\lambda(Y) = [Y, Q_{[1:N, 1:(N-n)]}] =: [Y, \bar{\lambda}]$.

      It is easy to check that $\lambda(Y)\in{}G=SO(N)$.

      In GeometricMachineLearning, GlobalSection takes an element of $Y\in{}St(n,N)\equiv$StiefelManifold{T} and returns an instance of GlobalSection{T, StiefelManifold{T}}. The application $O(N)\times{}St(n,N)\to{}St(n,N)$ is done with the functions apply_section! and apply_section.

      Computing the global tangent space representation based on a global section

      The output of the horizontal lift $\Omega$ is an element of $\mathfrak{g}^{\mathrm{hor},Y}$. For this mapping $\Omega(Y, B{}Y) = B$ if $B\in\mathfrak{g}^{\mathrm{hor},Y}$, i.e. there is no information loss and no projection is performed. We can map the $B\in\mathfrak{g}^{\mathrm{hor},Y}$ to $\mathfrak{g}^\mathrm{hor}$ with $B\mapsto{}\lambda(Y)^{-1}B\lambda(Y)$.

      The function global_rep performs both mappings at once[1], i.e. it takes an instance of GlobalSection and an element of $T_YSt(n,N)$, and then returns an element of $\frak{g}^\mathrm{hor}\equiv$StiefelLieAlgHorMatrix.

      In practice we use the following:

      \[\begin{aligned} +Global Sections · GeometricMachineLearning.jl

      Global Sections

Global sections are needed for the generalization of Adam and other optimizers to homogeneous spaces. They are necessary to perform the two mappings represented by horizontal and vertical red lines in the section on the general optimizer framework.

      Computing the global section

In differential geometry a section is always associated with some bundle; in our case this bundle is $\pi:G\to\mathcal{M},A\mapsto{}AE$. A section is a mapping $\lambda:\mathcal{M}\to{}G$ for which $\pi$ is a left inverse, i.e. $\pi\circ\lambda = \mathrm{id}$.

      For the Stiefel manifold $St(n, N)\subset\mathbb{R}^{N\times{}n}$ we compute the global section the following way:

      1. Start with an element $Y\in{}St(n,N)$,
      2. Draw a random matrix $A\in\mathbb{R}^{N\times{}(N-n)}$,
      3. Remove the subspace spanned by $Y$ from the range of $A$: $A\gets{}A-YY^TA$
      4. Compute a QR decomposition of $A$ and take as section $\lambda(Y) = [Y, Q_{[1:N, 1:(N-n)]}] =: [Y, \bar{\lambda}]$.

      It is easy to check that $\lambda(Y)\in{}G=SO(N)$.
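A minimal plain-Julia sketch of the four steps above (an illustration with a hypothetical helper name, not the GlobalSection constructor of the package):

```julia
using LinearAlgebra

# Complete Y ∈ St(n, N) to a matrix [Y, λ̄] with orthonormal columns.
function global_section_sketch(Y::AbstractMatrix)
    N, n = size(Y)                  # step 1: start with Y
    A = randn(N, N - n)             # step 2: draw a random matrix
    A = A - Y * (Y' * A)            # step 3: remove the subspace spanned by Y
    Q = Matrix(qr(A).Q)             # step 4: QR decomposition
    return hcat(Y, Q[:, 1:(N - n)])
end

# For Y with orthonormal columns, λ = global_section_sketch(Y) satisfies
# λ' * λ ≈ I and λ[:, 1:size(Y, 2)] == Y.
```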

      In GeometricMachineLearning, GlobalSection takes an element of $Y\in{}St(n,N)\equiv$StiefelManifold{T} and returns an instance of GlobalSection{T, StiefelManifold{T}}. The application $O(N)\times{}St(n,N)\to{}St(n,N)$ is done with the functions apply_section! and apply_section.

      Computing the global tangent space representation based on a global section

      The output of the horizontal lift $\Omega$ is an element of $\mathfrak{g}^{\mathrm{hor},Y}$. For this mapping $\Omega(Y, B{}Y) = B$ if $B\in\mathfrak{g}^{\mathrm{hor},Y}$, i.e. there is no information loss and no projection is performed. We can map the $B\in\mathfrak{g}^{\mathrm{hor},Y}$ to $\mathfrak{g}^\mathrm{hor}$ with $B\mapsto{}\lambda(Y)^{-1}B\lambda(Y)$.

      The function global_rep performs both mappings at once[1], i.e. it takes an instance of GlobalSection and an element of $T_YSt(n,N)$, and then returns an element of $\frak{g}^\mathrm{hor}\equiv$StiefelLieAlgHorMatrix.

      In practice we use the following:

      \[\begin{aligned} \lambda(Y)^T\Omega(Y,\Delta)\lambda(Y) & = \lambda(Y)^T[(\mathbb{I} - \frac{1}{2}YY^T)\Delta{}Y^T - Y\Delta^T(\mathbb{I} - \frac{1}{2}YY^T)]\lambda(Y) \\ & = \lambda(Y)^T[(\mathbb{I} - \frac{1}{2}YY^T)\Delta{}E^T - Y\Delta^T(\lambda(Y) - \frac{1}{2}YE^T)] \\ & = \lambda(Y)^T\Delta{}E^T - \frac{1}{2}EY^T\Delta{}E^T - E\Delta^T\lambda(Y) + \frac{1}{2}E\Delta^TYE^T \\ @@ -7,4 +7,4 @@ & = \begin{bmatrix} Y^T\Delta{}E^T \\ \bar{\lambda}\Delta{}E^T \end{bmatrix} + E\Delta^TYE^T - \begin{bmatrix}E\Delta^TY & E\Delta^T\bar{\lambda} \end{bmatrix} \\ & = EY^T\Delta{}E^T + E\Delta^TYE^T - E\Delta^TYE^T + \begin{bmatrix} \mathbb{O} \\ \bar{\lambda}\Delta{}E^T \end{bmatrix} - \begin{bmatrix} \mathbb{O} & E\Delta^T\bar{\lambda} \end{bmatrix} \\ & = EY^T\Delta{}E^T + \begin{bmatrix} \mathbb{O} \\ \bar{\lambda}\Delta{}E^T \end{bmatrix} - \begin{bmatrix} \mathbb{O} & E\Delta^T\bar{\lambda} \end{bmatrix}, -\end{aligned}\]

      meaning that for an element of the horizontal component of the Lie algebra $\mathfrak{g}^\mathrm{hor}$ we store $A=Y^T\Delta$ and $B=\bar{\lambda}^T\Delta$.

      Optimization

      The output of global_rep is then used for all the optimization steps.

      References

      [41]
      T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).
      • 1For computational reasons.
      +\end{aligned}\]

      meaning that for an element of the horizontal component of the Lie algebra $\mathfrak{g}^\mathrm{hor}$ we store $A=Y^T\Delta$ and $B=\bar{\lambda}^T\Delta$.
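A plain-Julia sketch of these two stored blocks (not the global_rep function of the package, which returns a StiefelLieAlgHorMatrix):

```julia
# Blocks stored for an element of the horizontal component, as derived above.
# Y: Stiefel element, Δ: tangent vector at Y, λbar: the non-Y columns of λ(Y).
hor_blocks(Y, Δ, λbar) = (A = Y' * Δ, B = λbar' * Δ)
```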

      Optimization

      The output of global_rep is then used for all the optimization steps.

      References

      [43]
      T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).
      • 1For computational reasons.
      diff --git a/latest/optimizers/manifold_related/horizontal_lift/index.html b/latest/optimizers/manifold_related/horizontal_lift/index.html index c068208ef..ecfd67515 100644 --- a/latest/optimizers/manifold_related/horizontal_lift/index.html +++ b/latest/optimizers/manifold_related/horizontal_lift/index.html @@ -1,2 +1,2 @@ -Horizontal Lift · GeometricMachineLearning.jl

      The Horizontal Lift

      For each element $Y\in\mathcal{M}$ we can perform a splitting $\mathfrak{g} = \mathfrak{g}^{\mathrm{hor}, Y}\oplus\mathfrak{g}^{\mathrm{ver}, Y}$, where the two subspaces are the horizontal and the vertical component of $\mathfrak{g}$ at $Y$ respectively. For homogeneous spaces: $T_Y\mathcal{M} = \mathfrak{g}\cdot{}Y$, i.e. every tangent space to $\mathcal{M}$ can be expressed through the application of the Lie algebra to the relevant element. The vertical component consists of those elements of $\mathfrak{g}$ which are mapped to the zero element of $T_Y\mathcal{M}$, i.e.

      \[\mathfrak{g}^{\mathrm{ver}, Y} := \mathrm{ker}(\mathfrak{g}\to{}T_Y\mathcal{M}).\]

      The orthogonal complement[1] of $\mathfrak{g}^{\mathrm{ver}, Y}$ is the horizontal component and is referred to by $\mathfrak{g}^{\mathrm{hor}, Y}$. This is naturally isomorphic to $T_Y\mathcal{M}$. For the Stiefel manifold the horizontal lift has the simple form:

\[\Omega(Y, V) = \left(\mathbb{I} - \frac{1}{2}YY^T\right)VY^T - YV^T(\mathbb{I} - \frac{1}{2}YY^T).\]

      If the element $Y$ is the distinct element $E$, then the elements of $\mathfrak{g}^{\mathrm{hor},E}$ take a particularly simple form, see Global Tangent Space for a description of this.

• 1The orthogonal complement is taken with respect to a metric defined on $\mathfrak{g}$. For the case of $G=SO(N)$ and $\mathfrak{g}=\mathfrak{so}(N) = \{A:A+A^T =0\}$ this metric can be chosen as $(A_1,A_2)\mapsto{}\frac{1}{2}\mathrm{Tr}(A_1^TA_2)$.
      +Horizontal Lift · GeometricMachineLearning.jl

      The Horizontal Lift

      For each element $Y\in\mathcal{M}$ we can perform a splitting $\mathfrak{g} = \mathfrak{g}^{\mathrm{hor}, Y}\oplus\mathfrak{g}^{\mathrm{ver}, Y}$, where the two subspaces are the horizontal and the vertical component of $\mathfrak{g}$ at $Y$ respectively. For homogeneous spaces: $T_Y\mathcal{M} = \mathfrak{g}\cdot{}Y$, i.e. every tangent space to $\mathcal{M}$ can be expressed through the application of the Lie algebra to the relevant element. The vertical component consists of those elements of $\mathfrak{g}$ which are mapped to the zero element of $T_Y\mathcal{M}$, i.e.

      \[\mathfrak{g}^{\mathrm{ver}, Y} := \mathrm{ker}(\mathfrak{g}\to{}T_Y\mathcal{M}).\]

      The orthogonal complement[1] of $\mathfrak{g}^{\mathrm{ver}, Y}$ is the horizontal component and is referred to by $\mathfrak{g}^{\mathrm{hor}, Y}$. This is naturally isomorphic to $T_Y\mathcal{M}$. For the Stiefel manifold the horizontal lift has the simple form:

\[\Omega(Y, V) = \left(\mathbb{I} - \frac{1}{2}YY^T\right)VY^T - YV^T(\mathbb{I} - \frac{1}{2}YY^T).\]
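A minimal plain-Julia sketch of this formula (hypothetical helper name):

```julia
using LinearAlgebra

# Horizontal lift Ω(Y, V) for the Stiefel manifold, as in the formula above.
function horizontal_lift_sketch(Y::AbstractMatrix, V::AbstractMatrix)
    P = I - Y * Y' / 2
    return P * V * Y' - Y * V' * P   # skew-symmetric, i.e. an element of 𝔰𝔬(N)
end
```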

      If the element $Y$ is the distinct element $E$, then the elements of $\mathfrak{g}^{\mathrm{hor},E}$ take a particularly simple form, see Global Tangent Space for a description of this.

• 1The orthogonal complement is taken with respect to a metric defined on $\mathfrak{g}$. For the case of $G=SO(N)$ and $\mathfrak{g}=\mathfrak{so}(N) = \{A:A+A^T =0\}$ this metric can be chosen as $(A_1,A_2)\mapsto{}\frac{1}{2}\mathrm{Tr}(A_1^TA_2)$.
      diff --git a/latest/optimizers/manifold_related/retractions/index.html b/latest/optimizers/manifold_related/retractions/index.html index 9f7ed3045..7ee36be5e 100644 --- a/latest/optimizers/manifold_related/retractions/index.html +++ b/latest/optimizers/manifold_related/retractions/index.html @@ -1,2 +1,2 @@ -Retractions · GeometricMachineLearning.jl

      Retractions

      Classical Definition

Classically, retractions are defined as smooth maps

      \[R: T\mathcal{M}\to\mathcal{M}:(x,v)\mapsto{}R_x(v)\]

      such that each curve $c(t) := R_x(tv)$ satisfies $c(0) = x$ and $c'(0) = v$.

      In GeometricMachineLearning

      Retractions are a map from the horizontal component of the Lie algebra $\mathfrak{g}^\mathrm{hor}$ to the respective manifold.

      For optimization in neural networks (almost always first order) we solve a gradient flow equation

      \[\dot{W} = -\mathrm{grad}_WL, \]

      where $\mathrm{grad}_WL$ is the Riemannian gradient of the loss function $L$ evaluated at position $W$.

      If we deal with Euclidean spaces (vector spaces), then the Riemannian gradient is just the result of an AD routine and the solution of the equation above can be approximated with $W^{t+1} \gets W^t - \eta\nabla_{W^t}L$, where $\eta$ is the learning rate.

      For manifolds, after we obtained the Riemannian gradient (see e.g. the section on Stiefel manifold), we have to solve a geodesic equation. This is a canonical ODE associated with any Riemannian manifold.

      The general theory of Riemannian manifolds is rather complicated, but for the neural networks treated in GeometricMachineLearning, we only rely on optimization of matrix Lie groups and homogeneous spaces, which is much simpler.

      For Lie groups each tangent space is isomorphic to its Lie algebra $\mathfrak{g}\equiv{}T_\mathbb{I}G$. The geodesic map from $\mathfrak{g}$ to $G$, for matrix Lie groups with bi-invariant Riemannian metric like $SO(N)$, is simply the application of the matrix exponential $\exp$. Alternatively this can be replaced by the Cayley transform (see (Absil et al, 2008).)

      Starting from this basic map $\exp:\mathfrak{g}\to{}G$ we can build mappings for more complicated cases:

      1. General tangent space to a Lie group $T_AG$: The geodesic map for an element $V\in{}T_AG$ is simply $A\exp(A^{-1}V)$.

      2. Special tangent space to a homogeneous space $T_E\mathcal{M}$: For $V=BE\in{}T_E\mathcal{M}$ the exponential map is simply $\exp(B)E$.

      3. General tangent space to a homogeneous space $T_Y\mathcal{M}$ with $Y = AE$: For $\Delta=ABE\in{}T_Y\mathcal{M}$ the exponential map is simply $A\exp(B)E$. This is the general case which we deal with.

      The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function retraction in GeometricMachineLearning performs $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}$, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with apply_section and represented through the red vertical line in the diagram on the general optimizer framework.

      Word of caution

The Lie group corresponding to the Stiefel manifold, $SO(N)$, has a bi-invariant Riemannian metric associated with it: $(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)$. For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult (see (Bendokat et al, 2021)).

      References

      • Absil P A, Mahony R, Sepulchre R. Optimization algorithms on matrix manifolds[M]. Princeton University Press, 2008.

      • Bendokat T, Zimmermann R. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications[J]. arXiv preprint arXiv:2108.12447, 2021.

      • O'Neill, Barrett. Semi-Riemannian geometry with applications to relativity. Academic press, 1983.

      +Retractions · GeometricMachineLearning.jl

      Retractions

      Classical Definition

Classically, retractions are defined as smooth maps

      \[R: T\mathcal{M}\to\mathcal{M}:(x,v)\mapsto{}R_x(v)\]

      such that each curve $c(t) := R_x(tv)$ satisfies $c(0) = x$ and $c'(0) = v$.

      In GeometricMachineLearning

      Retractions are a map from the horizontal component of the Lie algebra $\mathfrak{g}^\mathrm{hor}$ to the respective manifold.

      For optimization in neural networks (almost always first order) we solve a gradient flow equation

      \[\dot{W} = -\mathrm{grad}_WL, \]

      where $\mathrm{grad}_WL$ is the Riemannian gradient of the loss function $L$ evaluated at position $W$.

      If we deal with Euclidean spaces (vector spaces), then the Riemannian gradient is just the result of an AD routine and the solution of the equation above can be approximated with $W^{t+1} \gets W^t - \eta\nabla_{W^t}L$, where $\eta$ is the learning rate.

      For manifolds, after we obtained the Riemannian gradient (see e.g. the section on Stiefel manifold), we have to solve a geodesic equation. This is a canonical ODE associated with any Riemannian manifold.

      The general theory of Riemannian manifolds is rather complicated, but for the neural networks treated in GeometricMachineLearning, we only rely on optimization of matrix Lie groups and homogeneous spaces, which is much simpler.

      For Lie groups each tangent space is isomorphic to its Lie algebra $\mathfrak{g}\equiv{}T_\mathbb{I}G$. The geodesic map from $\mathfrak{g}$ to $G$, for matrix Lie groups with bi-invariant Riemannian metric like $SO(N)$, is simply the application of the matrix exponential $\exp$. Alternatively this can be replaced by the Cayley transform (see (Absil et al, 2008).)

      Starting from this basic map $\exp:\mathfrak{g}\to{}G$ we can build mappings for more complicated cases:

      1. General tangent space to a Lie group $T_AG$: The geodesic map for an element $V\in{}T_AG$ is simply $A\exp(A^{-1}V)$.

      2. Special tangent space to a homogeneous space $T_E\mathcal{M}$: For $V=BE\in{}T_E\mathcal{M}$ the exponential map is simply $\exp(B)E$.

      3. General tangent space to a homogeneous space $T_Y\mathcal{M}$ with $Y = AE$: For $\Delta=ABE\in{}T_Y\mathcal{M}$ the exponential map is simply $A\exp(B)E$. This is the general case which we deal with.

      The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function retraction in GeometricMachineLearning performs $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}$, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with apply_section and represented through the red vertical line in the diagram on the general optimizer framework.
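A minimal sketch of this second point, i.e. the map $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}, B\mapsto\exp(B)E$ for the Stiefel manifold, in plain Julia (hypothetical helper, not the retraction function of the package):

```julia
using LinearAlgebra

# Geodesic retraction at the distinct element E = [I; 0] of the Stiefel manifold.
function retract_at_E_sketch(B::AbstractMatrix, n::Integer)
    N = size(B, 1)
    E = [Matrix{eltype(B)}(I, n, n); zeros(eltype(B), N - n, n)]
    return exp(B) * E    # matrix exponential from LinearAlgebra
end
# For skew-symmetric B the columns of the result are orthonormal, i.e. exp(B)E ∈ St(n, N).
```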

      Word of caution

The Lie group corresponding to the Stiefel manifold, $SO(N)$, has a bi-invariant Riemannian metric associated with it: $(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)$. For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult (see (Bendokat et al, 2021)).

      References

      • Absil P A, Mahony R, Sepulchre R. Optimization algorithms on matrix manifolds[M]. Princeton University Press, 2008.

      • Bendokat T, Zimmermann R. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications[J]. arXiv preprint arXiv:2108.12447, 2021.

      • O'Neill, Barrett. Semi-Riemannian geometry with applications to relativity. Academic press, 1983.

      diff --git a/latest/pullbacks/computation_of_pullbacks/index.html b/latest/pullbacks/computation_of_pullbacks/index.html index 6a30d4963..caa04c038 100644 --- a/latest/pullbacks/computation_of_pullbacks/index.html +++ b/latest/pullbacks/computation_of_pullbacks/index.html @@ -1,9 +1,9 @@ -Pullbacks · GeometricMachineLearning.jl

      Pullbacks and Automatic Differentiation

      Automatic Differentiation is an important part of modern machine learning libraries. It is essentially a tool to compute the gradient of a loss function with respect to its input arguments.

      How to Compute Pullbacks

      GeometricMachineLearning has many pullbacks for custom array types and other operations implemented. The need for this essentially comes from the fact that we cannot trivially differentiate custom GPU kernels at the moment[1].

      What is a pullback?

      Here we first explain the principle of a pullback with the example of a vector-valued function. The generalization to matrices and higher-order tensors is straight-forward.

      The pullback of a vector-valued function $f:\mathbb{R}^{n}\to\mathbb{R}^m$ can be interpreted as the sensitivities in the input space $\mathbb{R}^n$ with respect to variations in the output space $\mathbb{R}^m$ via the function $f$:

      \[\left[\mathrm{pullback}(f)[a\in\mathbb{R}^n, db\in\mathbb{R}^m]\right]_i = \sum_{j=1}^m\frac{\partial{}f_j}{\partial{}a_i}db_j.\]

This principle can easily be generalized to matrices. For this consider the function $g:\mathbb{R}^{n_1\times{}n_2}\to\mathbb{R}^{m_1\times{}m_2}$. For this case we have:

\[\left[\mathrm{pullback}(g)[A\in\mathbb{R}^{n_1\times{}n_2}, dB\in\mathbb{R}^{m_1\times{}m_2}]\right]_{(i_1, i_2)} = \sum_{j_1=1}^{m_1}\sum_{j_2=1}^{m_2}\frac{\partial{}g_{(j_1, j_2)}}{\partial{}a_{(i_1, i_2)}}db_{(j_1, j_2)}.\]

      The generalization to higher-order tensors is again straight-forward.

      Illustrative example

      Consider the matrix inverse $\mathrm{inv}: \mathbb{R}^{n\times{}n}\to\mathbb{R}^{n\times{}n}$ as an example. This fits into the above framework where $inv$ is a matrix-valued function from $\mathbb{R}^{n\times{}n}$ to $\mathbb{R}^{n\times{}n}$. We here write $B := A^{-1} = \mathrm{inv}(A)$. We thus have to compute:

      \[\left[\mathrm{pullback}(\mathrm{inv})[A\in\mathbb{R}^{n\times{}n}, dB\in\mathbb{R}^{n\times{}n}]\right]_{(i, j)} = \sum_{k=1}^{n}\sum_{\ell=1}^{n}\frac{\partial{}b_{k, \ell}}{\partial{}a_{i, j}}db_{k, \ell}.\]

      For a matrix $A$ that depends on a parameter $\varepsilon$ we have that:

\[\frac{\partial}{\partial\varepsilon}B = -B\left( \frac{\partial}{\partial\varepsilon}A \right) B.\]

      This can easily be checked:

      \[\mathbb{O} = \frac{\partial}{\partial\varepsilon}\mathbb{I} = \frac{\partial}{\partial\varepsilon}(AB) = A\frac{\partial}{\partial\varepsilon}B + \left(\frac{\partial}{\partial\varepsilon}A\right)B.\]

      We can then write:

      \[\begin{aligned} +Pullbacks · GeometricMachineLearning.jl

      Pullbacks and Automatic Differentiation

      Automatic Differentiation is an important part of modern machine learning libraries. It is essentially a tool to compute the gradient of a loss function with respect to its input arguments.

      How to Compute Pullbacks

      GeometricMachineLearning has many pullbacks for custom array types and other operations implemented. The need for this essentially comes from the fact that we cannot trivially differentiate custom GPU kernels at the moment[1].

      What is a pullback?

      Here we first explain the principle of a pullback with the example of a vector-valued function. The generalization to matrices and higher-order tensors is straight-forward.

      The pullback of a vector-valued function $f:\mathbb{R}^{n}\to\mathbb{R}^m$ can be interpreted as the sensitivities in the input space $\mathbb{R}^n$ with respect to variations in the output space $\mathbb{R}^m$ via the function $f$:

      \[\left[\mathrm{pullback}(f)[a\in\mathbb{R}^n, db\in\mathbb{R}^m]\right]_i = \sum_{j=1}^m\frac{\partial{}f_j}{\partial{}a_i}db_j.\]

This principle can easily be generalized to matrices. For this consider the function $g:\mathbb{R}^{n_1\times{}n_2}\to\mathbb{R}^{m_1\times{}m_2}$. For this case we have:

\[\left[\mathrm{pullback}(g)[A\in\mathbb{R}^{n_1\times{}n_2}, dB\in\mathbb{R}^{m_1\times{}m_2}]\right]_{(i_1, i_2)} = \sum_{j_1=1}^{m_1}\sum_{j_2=1}^{m_2}\frac{\partial{}g_{(j_1, j_2)}}{\partial{}a_{(i_1, i_2)}}db_{(j_1, j_2)}.\]

      The generalization to higher-order tensors is again straight-forward.
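A minimal sketch of this definition for a concrete vector-valued function (plain Julia, independent of any AD library):

```julia
# f: ℝ² → ℝ² with f(a) = [a₁a₂, a₁ + a₂]; its pullback contracts db with the
# Jacobian: [pullback(f)[a, db]]ᵢ = Σⱼ (∂fⱼ/∂aᵢ) dbⱼ, i.e. da = J(a)ᵀ db.
f(a) = [a[1] * a[2], a[1] + a[2]]
jacobian_f(a) = [a[2] a[1]; 1.0 1.0]
pullback_f(a, db) = jacobian_f(a)' * db

pullback_f([2.0, 3.0], [1.0, 0.0])   # == [3.0, 2.0]: sensitivities of f₁ w.r.t. a₁, a₂
```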

      Illustrative example

      Consider the matrix inverse $\mathrm{inv}: \mathbb{R}^{n\times{}n}\to\mathbb{R}^{n\times{}n}$ as an example. This fits into the above framework where $inv$ is a matrix-valued function from $\mathbb{R}^{n\times{}n}$ to $\mathbb{R}^{n\times{}n}$. We here write $B := A^{-1} = \mathrm{inv}(A)$. We thus have to compute:

      \[\left[\mathrm{pullback}(\mathrm{inv})[A\in\mathbb{R}^{n\times{}n}, dB\in\mathbb{R}^{n\times{}n}]\right]_{(i, j)} = \sum_{k=1}^{n}\sum_{\ell=1}^{n}\frac{\partial{}b_{k, \ell}}{\partial{}a_{i, j}}db_{k, \ell}.\]

      For a matrix $A$ that depends on a parameter $\varepsilon$ we have that:

\[\frac{\partial}{\partial\varepsilon}B = -B\left( \frac{\partial}{\partial\varepsilon}A \right) B.\]

      This can easily be checked:

      \[\mathbb{O} = \frac{\partial}{\partial\varepsilon}\mathbb{I} = \frac{\partial}{\partial\varepsilon}(AB) = A\frac{\partial}{\partial\varepsilon}B + \left(\frac{\partial}{\partial\varepsilon}A\right)B.\]

      We can then write:

      \[\begin{aligned} \sum_{k,\ell}\left( \frac{\partial}{\partial{}a_{ij}} b_{k\ell} \right) db_{k\ell} & = \sum_{k\ell}\left[ \frac{\partial}{\partial{}a_{ij}} B \right]_{k\ell} db_{k,\ell} \\ & = - \sum_{k,\ell}\left[B \left(\frac{\partial}{\partial{}a_{ij}} A\right) B \right]_{k\ell} db_{k\ell} \\ & = - \sum_{k,\ell,m,n}b_{km} \left(\frac{\partial{}a_{mn}}{\partial{}a_{ij}}\right) b_{n\ell} db_{k\ell} \\ & = - \sum_{k,\ell,m,n}b_{km} \delta_{im}\delta_{jn} b_{n\ell} db_{k\ell} \\ & = - \sum_{k,\ell}b_{ki} b_{j\ell} db_{k\ell} \\ & \equiv - B^T\cdot{}dB\cdot{}B^T. -\end{aligned}\]

      Motivation from a differential-geometric perspective

      The notions of a pullback in automatic differentiation and differential geometry are closely related (see e.g. [10] and [11]). In both cases we want to compute, based on a mapping $f:\mathcal{V}\to\mathcal{W}, a \mapsto f(a) =: b$, a map of differentials $db \mapsto da$. In the differential geometry case $db$ and $da$ are part of the associated cotangent spaces, i.e. $db\in{}T^*_b\mathcal{W}$ and $da\in{}T^*_a\mathcal{V}$; in AD we (mostly) deal with spaces of arrays, i.e. vector spaces, which means that $db\in\mathcal{W}$ and $da\in\mathcal{V}$.

      [10]
      M. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).
      [11]
      J. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. Advances in Neural Information Processing Systems 33, 10809–10819 (2020).
      • 1This will change once we switch to Enzyme (see [9]), but the package is still in its infancy.
      +\end{aligned}\]

      Motivation from a differential-geometric perspective

      The notions of a pullback in automatic differentiation and differential geometry are closely related (see e.g. [13] and [14]). In both cases we want to compute, based on a mapping $f:\mathcal{V}\to\mathcal{W}, a \mapsto f(a) =: b$, a map of differentials $db \mapsto da$. In the differential geometry case $db$ and $da$ are part of the associated cotangent spaces, i.e. $db\in{}T^*_b\mathcal{W}$ and $da\in{}T^*_a\mathcal{V}$; in AD we (mostly) deal with spaces of arrays, i.e. vector spaces, which means that $db\in\mathcal{W}$ and $da\in\mathcal{V}$.

      [13]
      M. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).
      [14]
      J. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. Advances in Neural Information Processing Systems 33, 10809–10819 (2020).
      • 1This will change once we switch to Enzyme (see [12]), but the package is still in its infancy.
      diff --git a/latest/reduced_order_modeling/autoencoder/index.html b/latest/reduced_order_modeling/autoencoder/index.html index c6d2260f2..c437f1028 100644 --- a/latest/reduced_order_modeling/autoencoder/index.html +++ b/latest/reduced_order_modeling/autoencoder/index.html @@ -1,2 +1,2 @@ -POD and Autoencoders · GeometricMachineLearning.jl

      Reduced Order modeling and Autoencoders

      Reduced order modeling is a data-driven technique that exploits the structure of parametric PDEs to make solving those PDEs easier.

Consider a parametric PDE written in the form: $F(z(\mu);\mu)=0$ where $z(\mu)$ evolves on an infinite-dimensional Hilbert space $V$.

      In modeling any PDE we have to choose a discretization (particle discretization, finite element method, ...) of $V$, which will be denoted by $V_h$.

      Solution manifold

      To any parametric PDE we associate a solution manifold:

      \[\mathcal{M} = \{z(\mu):F(z(\mu);\mu)=0, \mu\in\mathbb{P}\}.\]

      In the image above a 2-dimensional solution manifold is visualized as a sub-manifold in 3-dimensional space. In general the embedding space is an infinite-dimensional function space.

      As an example of this consider the 1-dimensional wave equation:

      \[\partial_{tt}^2q(t,\xi;\mu) = \mu^2\partial_{\xi\xi}^2q(t,\xi;\mu)\text{ on }I\times\Omega,\]

      where $I = (0,1)$ and $\Omega=(-1/2,1/2)$. As initial condition for the first derivative we have $\partial_tq(0,\xi;\mu) = -\mu\partial_\xi{}q_0(\xi;\mu)$ and furthermore $q(t,\xi;\mu)=0$ on the boundary (i.e. $\xi\in\{-1/2,1/2\}$).

      The solution manifold is a 1-dimensional submanifold:

      \[\mathcal{M} = \{(t, \xi)\mapsto{}q(t,\xi;\mu)=q_0(\xi-\mu{}t;\mu):\mu\in\mathbb{P}\subset\mathbb{R}\}.\]

If we provide an initial condition $q_0$, a parameter instance $\mu$ and a time $t$, then $\xi\mapsto{}q(t,\xi;\mu)$ will be the momentary solution. If we consider the time evolution of $q(t,\xi;\mu)$, then it evolves on a two-dimensional submanifold $\bar{\mathcal{M}} := \{\xi\mapsto{}q(t,\xi;\mu):t\in{}I,\mu\in\mathbb{P}\}$.

      General workflow

      In reduced order modeling we aim to construct a mapping to a space that is close to this solution manifold. This is done through the following steps:

      1. Discretize the PDE.

      2. Solve the discretized PDE for a certain set of parameter instances $\mu\in\mathbb{P}$.

      3. Build a reduced basis with the data obtained from having solved the discretized PDE. This step consists of finding two mappings: the reduction $\mathcal{P}$ and the reconstruction $\mathcal{R}$.

      The third step can be done with various machine learning (ML) techniques. Traditionally the most popular of these has been Proper orthogonal decomposition (POD), but in recent years autoencoders have also become a popular alternative (see (Fresca et al, 2021)).

      References

      [25]
      S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).
      +POD and Autoencoders · GeometricMachineLearning.jl

      Reduced Order modeling and Autoencoders

      Reduced order modeling is a data-driven technique that exploits the structure of parametric PDEs to make solving those PDEs easier.

Consider a parametric PDE written in the form: $F(z(\mu);\mu)=0$ where $z(\mu)$ evolves on an infinite-dimensional Hilbert space $V$.

      In modeling any PDE we have to choose a discretization (particle discretization, finite element method, ...) of $V$, which will be denoted by $V_h$.

      Solution manifold

      To any parametric PDE we associate a solution manifold:

      \[\mathcal{M} = \{z(\mu):F(z(\mu);\mu)=0, \mu\in\mathbb{P}\}.\]

      In the image above a 2-dimensional solution manifold is visualized as a sub-manifold in 3-dimensional space. In general the embedding space is an infinite-dimensional function space.

      As an example of this consider the 1-dimensional wave equation:

      \[\partial_{tt}^2q(t,\xi;\mu) = \mu^2\partial_{\xi\xi}^2q(t,\xi;\mu)\text{ on }I\times\Omega,\]

      where $I = (0,1)$ and $\Omega=(-1/2,1/2)$. As initial condition for the first derivative we have $\partial_tq(0,\xi;\mu) = -\mu\partial_\xi{}q_0(\xi;\mu)$ and furthermore $q(t,\xi;\mu)=0$ on the boundary (i.e. $\xi\in\{-1/2,1/2\}$).

      The solution manifold is a 1-dimensional submanifold:

      \[\mathcal{M} = \{(t, \xi)\mapsto{}q(t,\xi;\mu)=q_0(\xi-\mu{}t;\mu):\mu\in\mathbb{P}\subset\mathbb{R}\}.\]

If we provide an initial condition $q_0$, a parameter instance $\mu$ and a time $t$, then $\xi\mapsto{}q(t,\xi;\mu)$ will be the momentary solution. If we consider the time evolution of $q(t,\xi;\mu)$, then it evolves on a two-dimensional submanifold $\bar{\mathcal{M}} := \{\xi\mapsto{}q(t,\xi;\mu):t\in{}I,\mu\in\mathbb{P}\}$.

      General workflow

      In reduced order modeling we aim to construct a mapping to a space that is close to this solution manifold. This is done through the following steps:

      1. Discretize the PDE.

      2. Solve the discretized PDE for a certain set of parameter instances $\mu\in\mathbb{P}$.

      3. Build a reduced basis with the data obtained from having solved the discretized PDE. This step consists of finding two mappings: the reduction $\mathcal{P}$ and the reconstruction $\mathcal{R}$.

      The third step can be done with various machine learning (ML) techniques. Traditionally the most popular of these has been Proper orthogonal decomposition (POD), but in recent years autoencoders have also become a popular alternative (see (Fresca et al, 2021)).
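A minimal sketch of the POD variant of the third step in plain Julia (one common choice is then $\mathcal{P}(x) = \Phi^Tx$ and $\mathcal{R}(\xi) = \Phi\xi$):

```julia
using LinearAlgebra

# POD sketch: columns of the snapshot matrix M are solutions of the discretized PDE.
function pod_basis_sketch(M::AbstractMatrix, n::Integer)
    Φ = svd(M).U[:, 1:n]   # first n left singular vectors
    return Φ               # reduction P(x) = Φ' * x, reconstruction R(ξ) = Φ * ξ
end
```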

      References

      [27]
      S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).
      diff --git a/latest/reduced_order_modeling/kolmogorov_n_width/index.html b/latest/reduced_order_modeling/kolmogorov_n_width/index.html index 239554316..da95e0940 100644 --- a/latest/reduced_order_modeling/kolmogorov_n_width/index.html +++ b/latest/reduced_order_modeling/kolmogorov_n_width/index.html @@ -1,2 +1,2 @@ -Kolmogorov n-width · GeometricMachineLearning.jl

      Kolmogorov $n$-width

      The Kolmogorov $n$-width measures how well some set $\mathcal{M}$ (typically the solution manifold) can be approximated with a linear subspace:

\[d_n(\mathcal{M}) := \mathrm{inf}_{V_n\subset{}V;\mathrm{dim}V_n=n}\mathrm{sup}_{u\in\mathcal{M}}\mathrm{inf}_{v_n\in{}V_n}|| u - v_n ||_V,\]

where $\mathcal{M}\subset{}V$ and $V$ is a (typically infinite-dimensional) Banach space. For advection-dominated problems (among others) the decay of the Kolmogorov $n$-width is very slow, i.e. one has to pick $n$ very high in order to obtain useful approximations (see [34] and [35]).

      In order to overcome this, techniques based on neural networks (see e.g. [26]) and optimal transport (see e.g. [35]) have been used.

      References

      [35]
      T. Blickhan. A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).
      [34]
      C. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).
      [26]
      K. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).
      +Kolmogorov n-width · GeometricMachineLearning.jl

      Kolmogorov $n$-width

      The Kolmogorov $n$-width measures how well some set $\mathcal{M}$ (typically the solution manifold) can be approximated with a linear subspace:

\[d_n(\mathcal{M}) := \mathrm{inf}_{V_n\subset{}V;\mathrm{dim}V_n=n}\mathrm{sup}_{u\in\mathcal{M}}\mathrm{inf}_{v_n\in{}V_n}|| u - v_n ||_V,\]

where $\mathcal{M}\subset{}V$ and $V$ is a (typically infinite-dimensional) Banach space. For advection-dominated problems (among others) the decay of the Kolmogorov $n$-width is very slow, i.e. one has to pick $n$ very high in order to obtain useful approximations (see [36] and [37]).

      In order to overcome this, techniques based on neural networks (see e.g. [28]) and optimal transport (see e.g. [37]) have been used.

      References

      [37]
      T. Blickhan. A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).
      [36]
      C. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).
      [28]
      K. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).
      diff --git a/latest/reduced_order_modeling/projection_reduction_errors/index.html b/latest/reduced_order_modeling/projection_reduction_errors/index.html index 72307e8a9..3e95a9be1 100644 --- a/latest/reduced_order_modeling/projection_reduction_errors/index.html +++ b/latest/reduced_order_modeling/projection_reduction_errors/index.html @@ -1,5 +1,5 @@ -Projection and Reduction Error · GeometricMachineLearning.jl

      Projection and Reduction Errors of Reduced Models

Two errors that are of great importance in reduced order modeling are the projection error and the reduction error. During training one typically aims to minimize the projection error, but for the actual application of the model the reduction error is often more important.

      Projection Error

The projection error computes how well a reduced basis, represented by the reduction $\mathcal{P}$ and the reconstruction $\mathcal{R}$, can represent the data with which it is built. In mathematical terms:

      \[e_\mathrm{proj}(\mu) := +Projection and Reduction Error · GeometricMachineLearning.jl

      Projection and Reduction Errors of Reduced Models

Two errors that are of great importance in reduced order modeling are the projection error and the reduction error. During training one typically aims to minimize the projection error, but for the actual application of the model the reduction error is often more important.

      Projection Error

The projection error computes how well a reduced basis, represented by the reduction $\mathcal{P}$ and the reconstruction $\mathcal{R}$, can represent the data with which it is built. In mathematical terms:

      \[e_\mathrm{proj}(\mu) := \frac{|| \mathcal{R}\circ\mathcal{P}(M) - M ||}{|| M ||},\]

      where $||\cdot||$ is the Frobenius norm (one could also optimize for different norms).

      Reduction Error

      The reduction error measures how far the reduced system diverges from the full-order system during integration (online stage). In mathematical terms (and for a single initial condition):

      \[e_\mathrm{red}(\mu) := \sqrt{ \frac{\sum_{t=0}^K|| \mathbf{x}^{(t)}(\mu) - \mathcal{R}(\mathbf{x}^{(t)}_r(\mu)) ||^2}{\sum_{t=0}^K|| \mathbf{x}^{(t)}(\mu) ||^2} -},\]

      where $\mathbf{x}^{(t)}$ is the solution of the FOM at point $t$ and $\mathbf{x}^{(t)}_r$ is the solution of the ROM (in the reduced basis) at point $t$. The reduction error, as opposed to the projection error, not only measures how well the solution manifold is represented by the reduced basis, but also measures how well the FOM dynamics are approximated by the ROM dynamics (via the induced vector field on the reduced basis).

      +},\]

      where $\mathbf{x}^{(t)}$ is the solution of the FOM at point $t$ and $\mathbf{x}^{(t)}_r$ is the solution of the ROM (in the reduced basis) at point $t$. The reduction error, as opposed to the projection error, not only measures how well the solution manifold is represented by the reduced basis, but also measures how well the FOM dynamics are approximated by the ROM dynamics (via the induced vector field on the reduced basis).
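For a linear reduced basis $\Phi$ (so that $\mathcal{P}(x) = \Phi^Tx$ and $\mathcal{R}(\xi) = \Phi\xi$) the two errors can be sketched in a few lines of plain Julia (hypothetical helper names):

```julia
using LinearAlgebra

# Projection error: how well R ∘ P reproduces the snapshot matrix M.
projection_error(M, Φ) = norm(Φ * (Φ' * M) - M) / norm(M)

# Reduction error: X holds the FOM solution, X_r the ROM solution in the reduced
# basis, one column per time step.
reduction_error(X, X_r, Φ) =
    sqrt(sum(norm(X[:, t] - Φ * X_r[:, t])^2 for t in axes(X, 2)) /
         sum(norm(X[:, t])^2 for t in axes(X, 2)))
```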

      diff --git a/latest/reduced_order_modeling/symplectic_autoencoder/index.html b/latest/reduced_order_modeling/symplectic_autoencoder/index.html index 6f622f6cb..6ef02beb3 100644 --- a/latest/reduced_order_modeling/symplectic_autoencoder/index.html +++ b/latest/reduced_order_modeling/symplectic_autoencoder/index.html @@ -1,5 +1,5 @@ -PSD and Symplectic Autoencoders · GeometricMachineLearning.jl

      Symplectic Autoencoder

Symplectic Autoencoders are a type of neural network suitable for treating Hamiltonian parametrized PDEs with slowly decaying Kolmogorov $n$-width. They are based on proper symplectic decomposition (PSD) and symplectic neural networks (SympNets).

      Hamiltonian Model Order Reduction

Hamiltonian PDEs are partial differential equations that, like their ODE counterparts, have a Hamiltonian associated with them. An example of this is the linear wave equation (see [32]) with Hamiltonian

      \[\mathcal{H}(q, p; \mu) := \frac{1}{2}\int_\Omega\mu^2(\partial_\xi{}q(t,\xi;\mu))^2 + p(t,\xi;\mu)^2d\xi.\]

The PDE for this Hamiltonian can be obtained similarly to the ODE case:

      \[\partial_t{}q(t,\xi;\mu) = \frac{\delta{}\mathcal{H}}{\delta{}p} = p(t,\xi;\mu), \quad \partial_t{}p(t,\xi;\mu) = -\frac{\delta{}\mathcal{H}}{\delta{}q} = \mu^2\partial_{\xi{}\xi}q(t,\xi;\mu)\]

      Symplectic Solution Manifold

      As with regular parametric PDEs, we also associate a solution manifold with Hamiltonian PDEs. This is a finite-dimensional manifold, on which the dynamics can be described through a Hamiltonian ODE. I NEED A PROOF OR SOME EXPLANATION FOR THIS!

      Workflow for Symplectic ROM

      As with any other reduced order modeling technique we first discretize the PDE. This should be done with a structure-preserving scheme, thus yielding a (high-dimensional) Hamiltonian ODE as a result. Discretizing the wave equation above with finite differences yields a Hamiltonian system:

\[\mathcal{H}_\mathrm{discr}(z(t;\mu);\mu) := \frac{1}{2}z(t;\mu)^T\begin{bmatrix} -\mu^2D_{\xi{}\xi} & \mathbb{O} \\ \mathbb{O} & \mathbb{I} \end{bmatrix} z(t;\mu).\]

      In Hamiltonian reduced order modelling we try to find a symplectic submanifold of the solution space[1] that captures the dynamics of the full system as well as possible.

      Similar to the regular PDE case we again build an encoder $\Psi^\mathrm{enc}$ and a decoder $\Psi^\mathrm{dec}$; but now both these mappings are required to be symplectic!

      Concretely this means:

      1. The encoder is a mapping from a high-dimensional symplectic space to a low-dimensional symplectic space, i.e. $\Psi^\mathrm{enc}:\mathbb{R}^{2N}\to\mathbb{R}^{2n}$ such that $\nabla\Psi^\mathrm{enc}\mathbb{J}_{2N}(\nabla\Psi^\mathrm{enc})^T = \mathbb{J}_{2n}$.
      2. The decoder is a mapping from a low-dimensional symplectic space to a high-dimensional symplectic space, i.e. $\Psi^\mathrm{dec}:\mathbb{R}^{2n}\to\mathbb{R}^{2N}$ such that $(\nabla\Psi^\mathrm{dec})^T\mathbb{J}_{2N}\nabla\Psi^\mathrm{dec} = \mathbb{J}_{2n}$.

      If these two maps are constrained to linear maps, then one can easily find good solutions with proper symplectic decomposition (PSD).

      Proper Symplectic Decomposition

      For PSD the two mappings $\Psi^\mathrm{enc}$ and $\Psi^\mathrm{dec}$ are constrained to be linear, orthonormal (i.e. $\Psi^T\Psi = \mathbb{I}$) and symplectic. The easiest way to enforce this is through the so-called cotangent lift:

      \[\Psi_\mathrm{CL} = +PSD and Symplectic Autoencoders · GeometricMachineLearning.jl

      Symplectic Autoencoder

Symplectic Autoencoders are a type of neural network suitable for treating Hamiltonian parametrized PDEs with slowly decaying Kolmogorov $n$-width. They are based on proper symplectic decomposition (PSD) and symplectic neural networks (SympNets).

      Hamiltonian Model Order Reduction

Hamiltonian PDEs are partial differential equations that, like their ODE counterparts, have a Hamiltonian associated with them. An example of this is the linear wave equation (see [34]) with Hamiltonian

      \[\mathcal{H}(q, p; \mu) := \frac{1}{2}\int_\Omega\mu^2(\partial_\xi{}q(t,\xi;\mu))^2 + p(t,\xi;\mu)^2d\xi.\]

The PDE for this Hamiltonian can be obtained similarly to the ODE case:

      \[\partial_t{}q(t,\xi;\mu) = \frac{\delta{}\mathcal{H}}{\delta{}p} = p(t,\xi;\mu), \quad \partial_t{}p(t,\xi;\mu) = -\frac{\delta{}\mathcal{H}}{\delta{}q} = \mu^2\partial_{\xi{}\xi}q(t,\xi;\mu)\]

      Symplectic Solution Manifold

      As with regular parametric PDEs, we also associate a solution manifold with Hamiltonian PDEs. This is a finite-dimensional manifold, on which the dynamics can be described through a Hamiltonian ODE. I NEED A PROOF OR SOME EXPLANATION FOR THIS!

      Workflow for Symplectic ROM

      As with any other reduced order modeling technique we first discretize the PDE. This should be done with a structure-preserving scheme, thus yielding a (high-dimensional) Hamiltonian ODE as a result. Discretizing the wave equation above with finite differences yields a Hamiltonian system:

\[\mathcal{H}_\mathrm{discr}(z(t;\mu);\mu) := \frac{1}{2}z(t;\mu)^T\begin{bmatrix} -\mu^2D_{\xi{}\xi} & \mathbb{O} \\ \mathbb{O} & \mathbb{I} \end{bmatrix} z(t;\mu).\]

      In Hamiltonian reduced order modelling we try to find a symplectic submanifold of the solution space[1] that captures the dynamics of the full system as well as possible.

      Similar to the regular PDE case we again build an encoder $\Psi^\mathrm{enc}$ and a decoder $\Psi^\mathrm{dec}$; but now both these mappings are required to be symplectic!

      Concretely this means:

      1. The encoder is a mapping from a high-dimensional symplectic space to a low-dimensional symplectic space, i.e. $\Psi^\mathrm{enc}:\mathbb{R}^{2N}\to\mathbb{R}^{2n}$ such that $\nabla\Psi^\mathrm{enc}\mathbb{J}_{2N}(\nabla\Psi^\mathrm{enc})^T = \mathbb{J}_{2n}$.
      2. The decoder is a mapping from a low-dimensional symplectic space to a high-dimensional symplectic space, i.e. $\Psi^\mathrm{dec}:\mathbb{R}^{2n}\to\mathbb{R}^{2N}$ such that $(\nabla\Psi^\mathrm{dec})^T\mathbb{J}_{2N}\nabla\Psi^\mathrm{dec} = \mathbb{J}_{2n}$.

      If these two maps are constrained to linear maps, then one can easily find good solutions with proper symplectic decomposition (PSD).

      Proper Symplectic Decomposition

      For PSD the two mappings $\Psi^\mathrm{enc}$ and $\Psi^\mathrm{dec}$ are constrained to be linear, orthonormal (i.e. $\Psi^T\Psi = \mathbb{I}$) and symplectic. The easiest way to enforce this is through the so-called cotangent lift:

      \[\Psi_\mathrm{CL} = \begin{bmatrix} \Phi & \mathbb{O} \\ \mathbb{O} & \Phi \end{bmatrix},\]

and $\Phi\in{}St(n,N)\subset\mathbb{R}^{N\times{}n}$, i.e. $\Phi$ is an element of the Stiefel manifold. If the snapshot matrix is of the form:

      \[M = \left[\begin{array}{c:c:c:c} \hat{q}_1(t_0) & \hat{q}_1(t_1) & \quad\ldots\quad & \hat{q}_1(t_f) \\ \hat{q}_2(t_0) & \hat{q}_2(t_1) & \ldots & \hat{q}_2(t_f) \\ @@ -9,4 +9,4 @@ \hat{p}_2(t_0) & \hat{p}_2(t_1) & \ldots & \hat{p}_2(t_f) \\ \ldots & \ldots & \ldots & \ldots \\ \hat{p}_{N}(t_0) & \hat{p}_{N}(t_1) & \ldots & \hat{p}_{N}(t_f) \\ -\end{array}\right],\]

      then $\Phi$ can be computed in a very straight-forward manner:

      1. Rearrange the rows of the matrix $M$ such that we end up with a $N\times2(f+1)$ matrix: $\hat{M} := [M_q, M_p]$.
      2. Perform SVD: $\hat{M} = U\Sigma{}V^T$; set $\Phi\gets{}U\mathtt{[:,1:n]}$.

      For details on the cotangent lift (and other methods for linear symplectic model reduction) consult [33].

      Symplectic Autoencoders

PSD suffers from similar shortcomings as regular POD: it is a linear map and the approximation space $\tilde{\mathcal{M}}= \{\Psi^\mathrm{dec}(z_r)\in\mathbb{R}^{2N}:z_r\in\mathbb{R}^{2n}\}$ is strictly linear. For problems with slowly-decaying Kolmogorov $n$-width this leads to very poor approximations.

In order to overcome this difficulty we use neural networks, more specifically SympNets, together with cotangent lift-like matrices. The resulting architecture, the symplectic autoencoder, is shown in the following image:

      So we alternate between SympNet and PSD layers. Because all the PSD layers are based on matrices $\Phi\in{}St(n,N)$ we have to optimize on the Stiefel manifold.

      References

      [32]
      P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).
      [33]
      L. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).
• 1The submanifold is: $\tilde{\mathcal{M}} = \{\Psi^\mathrm{dec}(z_r)\in\mathbb{R}^{2N}:z_r\in\mathbb{R}^{2n}\}$ where $z_r$ is the reduced state of the system.
      +\end{array}\right],\]

      then $\Phi$ can be computed in a very straight-forward manner:

      1. Rearrange the rows of the matrix $M$ such that we end up with a $N\times2(f+1)$ matrix: $\hat{M} := [M_q, M_p]$.
      2. Perform SVD: $\hat{M} = U\Sigma{}V^T$; set $\Phi\gets{}U\mathtt{[:,1:n]}$.

      For details on the cotangent lift (and other methods for linear symplectic model reduction) consult [35].
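A minimal plain-Julia sketch of these two steps (hypothetical helper name; the $q$-rows of $M$ are assumed to be stacked on top of the $p$-rows):

```julia
using LinearAlgebra

# Cotangent lift: build Φ from the snapshot matrix M ∈ ℝ^{2N×(f+1)} and return Ψ_CL.
function cotangent_lift_sketch(M::AbstractMatrix, n::Integer)
    N = size(M, 1) ÷ 2
    Mhat = hcat(M[1:N, :], M[N+1:end, :])    # step 1: N × 2(f+1) matrix [M_q, M_p]
    Φ = svd(Mhat).U[:, 1:n]                  # step 2: first n left singular vectors
    return [Φ zeros(N, n); zeros(N, n) Φ]    # Ψ_CL as in the block matrix above
end
```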

      Symplectic Autoencoders

PSD suffers from similar shortcomings as regular POD: it is a linear map and the approximation space $\tilde{\mathcal{M}}= \{\Psi^\mathrm{dec}(z_r)\in\mathbb{R}^{2N}:z_r\in\mathbb{R}^{2n}\}$ is strictly linear. For problems with slowly-decaying Kolmogorov $n$-width this leads to very poor approximations.

In order to overcome this difficulty we use neural networks, more specifically SympNets, together with cotangent lift-like matrices. The resulting architecture, the symplectic autoencoder, is shown in the following image:

      So we alternate between SympNet and PSD layers. Because all the PSD layers are based on matrices $\Phi\in{}St(n,N)$ we have to optimize on the Stiefel manifold.

      References

      [34]
      P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).
      [35]
      L. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).
• 1The submanifold is: $\tilde{\mathcal{M}} = \{\Psi^\mathrm{dec}(z_r)\in\mathbb{R}^{2N}:z_r\in\mathbb{R}^{2n}\}$ where $z_r$ is the reduced state of the system.
      diff --git a/latest/references/index.html b/latest/references/index.html index 5b12abac7..337b28166 100644 --- a/latest/references/index.html +++ b/latest/references/index.html @@ -1,2 +1,2 @@ -References · GeometricMachineLearning.jl

      References

      [1]
      S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).
      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      [3]
R. L. Bishop and S. I. Goldberg. Tensor Analysis on Manifolds (Dover Publications, 1980).
      [4]
      S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).
      [5]
M. P. do Carmo and F. Flaherty. Riemannian geometry. Vol. 2 (Springer, 1992).
      [6]
      P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).
      [7]
      E. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).
      [8]
      F. Mezzadri. How to generate random matrices from the classical compact groups, arXiv preprint math-ph/0609050 (2006).
      [9]
      W. S. Moses, V. Churavy, L. Paehler, J. Hückelheim, S. H. Narayanan, M. Schanen and J. Doerfert. Reverse-Mode Automatic Differentiation and Optimization of GPU Kernels via Enzyme. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC '21 (Association for Computing Machinery, New York, NY, USA, 2021).
      [10]
      M. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).
      [11]
      J. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. Advances in Neural Information Processing Systems 33, 10809–10819 (2020).
      [12]
J. Nocedal and S. J. Wright. Numerical optimization (Springer Science+Business Media, 2006).
      [13]
      D. Bahdanau, K. Cho and Y. Bengio. Neural machine translation by jointly learning to align and translate, arXiv preprint arXiv:1409.0473 (2014).
      [14]
      A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).
      [15]
      K. Jacobs. Discrete Stochastics (Birkhäuser Verlag, Basel, Switzerland, 1992).
      [16]
      P.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).
      [17]
      K. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).
      [18]
      M.-T. Luong, H. Pham and C. D. Manning. Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015).
      [19]
      K. Feng and M.-z. Qin. The symplectic methods for the computation of Hamiltonian equations. In: Numerical Methods for Partial Differential Equations: Proceedings of a Conference held in Shanghai, PR China, March 25–29, 1987 (Springer, 1987); pp. 1–37.
      [20]
      Z. Ge and K. Feng. On the approximation of linear Hamiltonian systems. Journal of Computational Mathematics, 88–97 (1988).
      [21]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312.11166v2 (2024).
      [22]
B. Leimkuhler and S. Reich. Simulating Hamiltonian dynamics. No. 14 (Cambridge University Press, 2004).
[23]
M. Kraus. GeometricIntegrators.jl: Geometric Numerical Integration in Julia, https://github.com/JuliaGNI/GeometricIntegrators.jl (2020).
      [24]
      S. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).
      [25]
      S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).
      [26]
      K. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).
      [27]
      A. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).
      [28]
      A. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, $\beta$-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).
      [29]
      P. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).
      [30]
      P. Jin, Z. Lin and B. Xiao. Optimal unit triangular factorization of symplectic matrices. Linear Algebra and its Applications (2022).
      [31]
N. Patwardhan, S. Marrone and C. Sansone. Transformers in the real world: A survey on NLP applications. Information 14, 242 (2023).
      [32]
      P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).
      [33]
      L. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).
      [34]
      C. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).
      [35]
      T. Blickhan. A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).
      [36]
      B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).
      [37]
      T. Lin and H. Zha. Riemannian manifold learning. IEEE transactions on pattern analysis and machine intelligence 30, 796–809 (2008).
      [38]
      T. Blickhan. BrenierTwoFluids.jl, https://github.com/ToBlick/BrenierTwoFluids (2023).
      [39]
      I. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).
      [40]
      B. Brantner and M. Kraus. Symplectic autoencoders for Model Reduction of Hamiltonian Systems, arXiv preprint arXiv:2312.10004 (2023).
      [41]
      T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).
      [42]
      T. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).
      [43]
      T. Bendokat, R. Zimmermann and P.-A. Absil. A Grassmann manifold handbook: Basic geometry and computational aspects, arXiv preprint arXiv:2011.13699 (2020).
      [44]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Structure-Preserving Transformers for Learning Parametrized Hamiltonian Systems, arXiv preprint arXiv:2312.11166 (2023).
      +References · GeometricMachineLearning.jl

      References

      [1]
      S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).
      [2]
      S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).
      [3]
R. L. Bishop and S. I. Goldberg. Tensor Analysis on Manifolds (Dover Publications, 1980).
      [4]
      S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).
      [5]
M. P. do Carmo and F. Flaherty. Riemannian geometry. Vol. 2 (Springer, 1992).
      [6]
      P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).
      [7]
      E. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).
      [8]
      F. Mezzadri. How to generate random matrices from the classical compact groups, arXiv preprint math-ph/0609050 (2006).
      [9]
      D. D. Holm, T. Schmah and C. Stoica. Geometric mechanics and symmetry: from finite to infinite dimensions. Vol. 12 (Oxford University Press, Oxford, UK, 2009).
      [10]
      P.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).
      [11]
      T. Bendokat, R. Zimmermann and P.-A. Absil. A Grassmann manifold handbook: Basic geometry and computational aspects, arXiv preprint arXiv:2011.13699 (2020).
      [12]
      W. S. Moses, V. Churavy, L. Paehler, J. Hückelheim, S. H. Narayanan, M. Schanen and J. Doerfert. Reverse-Mode Automatic Differentiation and Optimization of GPU Kernels via Enzyme. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC '21 (Association for Computing Machinery, New York, NY, USA, 2021).
      [13]
      M. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).
      [14]
      J. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. Advances in Neural Information Processing Systems 33, 10809–10819 (2020).
      [15]
J. Nocedal and S. J. Wright. Numerical optimization (Springer Science+Business Media, 2006).
      [16]
      D. Bahdanau, K. Cho and Y. Bengio. Neural machine translation by jointly learning to align and translate, arXiv preprint arXiv:1409.0473 (2014).
      [17]
      A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).
      [18]
      K. Jacobs. Discrete Stochastics (Birkhäuser Verlag, Basel, Switzerland, 1992).
      [19]
      K. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).
      [20]
      M.-T. Luong, H. Pham and C. D. Manning. Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015).
      [21]
      K. Feng and M.-z. Qin. The symplectic methods for the computation of Hamiltonian equations. In: Numerical Methods for Partial Differential Equations: Proceedings of a Conference held in Shanghai, PR China, March 25–29, 1987 (Springer, 1987); pp. 1–37.
      [22]
      Z. Ge and K. Feng. On the approximation of linear Hamiltonian systems. Journal of Computational Mathematics, 88–97 (1988).
      [23]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312.11166v2 (2024).
      [24]
B. Leimkuhler and S. Reich. Simulating Hamiltonian dynamics. No. 14 (Cambridge University Press, 2004).
[25]
M. Kraus. GeometricIntegrators.jl: Geometric Numerical Integration in Julia, https://github.com/JuliaGNI/GeometricIntegrators.jl (2020).
      [26]
      S. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).
      [27]
      S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).
      [28]
      K. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).
      [29]
      A. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).
      [30]
      A. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, $\beta$-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).
      [31]
      P. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).
      [32]
      P. Jin, Z. Lin and B. Xiao. Optimal unit triangular factorization of symplectic matrices. Linear Algebra and its Applications (2022).
      [33]
N. Patwardhan, S. Marrone and C. Sansone. Transformers in the real world: A survey on NLP applications. Information 14, 242 (2023).
      [34]
      P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).
      [35]
      L. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).
      [36]
      C. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).
      [37]
      T. Blickhan. A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).
      [38]
      B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).
      [39]
      T. Lin and H. Zha. Riemannian manifold learning. IEEE transactions on pattern analysis and machine intelligence 30, 796–809 (2008).
      [40]
      T. Blickhan. BrenierTwoFluids.jl, https://github.com/ToBlick/BrenierTwoFluids (2023).
      [41]
      I. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).
      [42]
      B. Brantner and M. Kraus. Symplectic autoencoders for Model Reduction of Hamiltonian Systems, arXiv preprint arXiv:2312.10004 (2023).
      [43]
      T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).
      [44]
      T. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).
      [45]
B. Brantner, G. de Romemont, M. Kraus and Z. Li. Structure-Preserving Transformers for Learning Parametrized Hamiltonian Systems, arXiv preprint arXiv:2312.11166 (2023).
      diff --git a/latest/search_index.js b/latest/search_index.js index 46c194912..b7779c36a 100644 --- a/latest/search_index.js +++ b/latest/search_index.js @@ -1,3 +1,3 @@ var documenterSearchIndex = {"docs": -[{"location":"architectures/neural_network_integrators/#Neural-Network-Integrators","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"In GeometricMachineLearning we can divide most neural network architectures (that are used for applications to physical systems) into two categories: autoencoders and integrators. Integrator in its most general form refers to an approximation of the flow of an ODE (see the section on the existence and uniqueness theorem) by a numerical scheme. Traditionally these numerical schemes were constructed by defining certain relationships between a known time step z^(t) and a future unknown one z^(t+1) [7, 22]: ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":" f(z^(t) z^(t+1)) = 0","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"One usually refers to such a relationship as an \"integration scheme\". If this relationship can be reformulated as ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":" z^(t+1) = g(z^(t))","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"then we refer to the scheme as explicit, if it cannot be reformulated in such a way then we refer to it as implicit. Implicit schemes are typically more expensive to solve than explicit ones. The Julia library GeometricIntegrators [23] offers a wide variety of integration schemes both implicit and explicit. ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"The neural network integrators in GeometricMachineLearning (the corresponding type is NeuralNetworkIntegrator) are all explicit integration schemes where the function g above is modeled with a neural network.","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"Neural networks, as an alternative to traditional methods, are employed because of (i) potentially superior performance and (ii) an ability to learn unknown dynamics from data. 
","category":"page"},{"location":"architectures/neural_network_integrators/#Multi-step-methods","page":"Neural Network Integrators","title":"Multi-step methods","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"Multi-step method [19, 20] refers to schemes that are of the form[1]: ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"[1]: We again assume that all the steps up to and including t are known.","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":" f(z^(t - mathttsl + 1) z^(t - mathttsl + 2) ldots z^(t) z^(t + 1) ldots z^(mathttpw + 1)) = 0","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"where sl is short for sequence length and pw is short for prediction window. In contrast to traditional single-step methods, sl and pw can be greater than 1. An explicit multi-step method has the following form: ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"z^(t+1) ldots z^(t+mathttpw) = g(z^(t - mathttsl + 1) ldots z^(t))","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"There are essentially two ways to construct multi-step methods with neural networks: the older one is using recurrent neural networks such as long short-term memory cells (LSTMs, [24]) and the newer one is using transformer neural networks [14]. Both of these approaches have been successfully employed to learn multi-step methods (see [25, 26] for the former and [21, 27, 28] for the latter), but because the transformer architecture exhibits superior performance on modern hardware and can be imbued with geometric properties it is recommended to always use a transformer-derived architecture when dealing with time series[2].","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"[2]: GeometricMachineLearning also has an LSTM implementation, but this may be deprecated in the future. ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"Explicit multi-step methods derived from he transformer are always subtypes of the type TransformerIntegrator in GeometricMachineLearning. In GeometricMachineLearning the standard transformer, the volume-preserving transformer and the linear symplectic transformer are implemented. 
","category":"page"},{"location":"architectures/neural_network_integrators/#Library-Functions","page":"Neural Network Integrators","title":"Library Functions","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"NeuralNetworkIntegrator \nTransformerIntegrator","category":"page"},{"location":"architectures/neural_network_integrators/#GeometricMachineLearning.NeuralNetworkIntegrator-architectures-neural_network_integrators","page":"Neural Network Integrators","title":"GeometricMachineLearning.NeuralNetworkIntegrator","text":"This is a super type of various neural network architectures such as SympNet and ResNet whose purpose is to approximate the flow of an ordinary differential equation (ODE).\n\n\n\n\n\n","category":"type"},{"location":"architectures/neural_network_integrators/#GeometricMachineLearning.TransformerIntegrator-architectures-neural_network_integrators","page":"Neural Network Integrators","title":"GeometricMachineLearning.TransformerIntegrator","text":"Encompasses various transformer architectures, such as the structure-preserving transformer and the linear symplectic transformer. \n\n\n\n\n\n","category":"type"},{"location":"architectures/neural_network_integrators/#References","page":"Neural Network Integrators","title":"References","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"E. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).\n\n\n\nB. Leimkuhler and S. Reich. Simulating hamiltonian dynamics. No. 14 (Cambridge university press, 2004).\n\n\n\nM. Kraus. GeometricIntegrators.jl: Geometric Numerical Integration in Julia, https://github.com/JuliaGNI/GeometricIntegrators.jl (2020).\n\n\n\nK. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).\n\n\n\nS. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).\n\n\n\nA. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).\n\n\n\nS. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).\n\n\n\nK. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).\n\n\n\nA. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).\n\n\n\nA. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, beta-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).\n\n\n\nB. Brantner, G. de Romemont, M. Kraus and Z. Li. 
Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312:11166v2 (2024).\n\n\n\n","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/#Projection-and-Reduction-Errors-of-Reduced-Models","page":"Projection and Reduction Error","title":"Projection and Reduction Errors of Reduced Models","text":"","category":"section"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"Two errors that are of very big importance in reduced order modeling are the projection and the reduction error. During training one typically aims to miminimze the projection error, but for the actual application of the model the reduction error is often more important. ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/#Projection-Error","page":"Projection and Reduction Error","title":"Projection Error","text":"","category":"section"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"The projection error computes how well a reduced basis, represented by the reduction mathcalP and the reconstruction mathcalR, can represent the data with which it is build. In mathematical terms: ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"e_mathrmproj(mu) = \n frac mathcalRcircmathcalP(M) - M M ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"where cdot is the Frobenius norm (one could also optimize for different norms).","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/#Reduction-Error","page":"Projection and Reduction Error","title":"Reduction Error","text":"","category":"section"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"The reduction error measures how far the reduced system diverges from the full-order system during integration (online stage). In mathematical terms (and for a single initial condition): ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"e_mathrmred(mu) = sqrt\n fracsum_t=0^K mathbfx^(t)(mu) - mathcalR(mathbfx^(t)_r(mu)) ^2sum_t=0^K mathbfx^(t)(mu) ^2\n","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"where mathbfx^(t) is the solution of the FOM at point t and mathbfx^(t)_r is the solution of the ROM (in the reduced basis) at point t. 
The reduction error, as opposed to the projection error, not only measures how well the solution manifold is represented by the reduced basis, but also measures how well the FOM dynamics are approximated by the ROM dynamics (via the induced vector field on the reduced basis).","category":"page"},{"location":"architectures/autoencoders/#Variational-Autoencoders","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Variational autoencoders (Lee and Carlberg, 2020) train on the following set: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"mathcalX(mathbbP_mathrmtrain) = mathbfx^k(mu) - mathbfx^0(mu)0leqkleqKmuinmathbbP_mathrmtrain","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathbfx^k(mu)approxmathbfx(t^kmu). Note that mathbf0inmathcalX(mathbbP_mathrmtrain) as k can also be zero. ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"The encoder Psi^mathrmenc and decoder Psi^mathrmdec are then trained on this set mathcalX(mathbbP_mathrmtrain) by minimizing the reconstruction error: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":" mathbfx - Psi^mathrmdeccircPsi^mathrmenc(mathbfx) text for mathbfxinmathcalX(mathbbP_mathrmtrain)","category":"page"},{"location":"architectures/autoencoders/#Initial-condition","page":"Variational Autoencoders","title":"Initial condition","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"No matter the parameter mu the initial condition in the reduced system is always mathbfx_r0(mu) = mathbfx_r0 = Psi^mathrmenc(mathbf0). ","category":"page"},{"location":"architectures/autoencoders/#Reconstructed-solution","page":"Variational Autoencoders","title":"Reconstructed solution","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"In order to arrive at the reconstructed solution one first has to decode the reduced state and then add the reference state:","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"mathbfx^mathrmreconstr(tmu) = mathbfx^mathrmref(mu) + Psi^mathrmdec(mathbfx_r(tmu))","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathbfx^mathrmref(mu) = mathbfx(t_0mu) - Psi^mathrmdeccircPsi^mathrmdec(mathbf0).","category":"page"},{"location":"architectures/autoencoders/#Symplectic-reduced-vector-field","page":"Variational Autoencoders","title":"Symplectic reduced vector field","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"A symplectic vector field is one whose flow conserves the symplectic structure mathbbJ. This is equivalent[1] to there existing a Hamiltonian H s.t. 
the vector field X can be written as X = mathbbJnablaH.","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"[1]: Technically speaking the definitions are equivalent only for simply-connected manifolds, so also for vector spaces. ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"If the full-order Hamiltonian is H^mathrmfullequivH we can obtain another Hamiltonian on the reduces space by simply setting: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"H^mathrmred(mathbfx_r(tmu)) = H(mathbfx^mathrmreconstr(tmu)) = H(mathbfx^mathrmref(mu) + Psi^mathrmdec(mathbfx_r(tmu)))","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"The ODE associated to this Hamiltonian is also the one corresponding to Manifold Galerkin ROM (see (Lee and Carlberg, 2020)).","category":"page"},{"location":"architectures/autoencoders/#Manifold-Galerkin-ROM","page":"Variational Autoencoders","title":"Manifold Galerkin ROM","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Define the FOM ODE residual as: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"r (mathbfv xi tau mu) mapsto mathbfv - f(xi tau mu)","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"The reduced ODE is then defined to be: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"dothatmathbfx(tmu) = mathrmargmin_hatmathbfvinmathbbR^p r(mathcalJ(hatmathbfx(tmu))hatmathbfvhatmathbfx^mathrmref(mu) + Psi^mathrmdec(hatmathbfx(tmu))tmu) _2^2","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathcalJ is the Jacobian of the decoder Psi^mathrmdec. This leads to: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"mathcalJ(hatmathbfx(tmu))hatmathbfv - f(hatmathbfx^mathrmref(mu) + Psi^mathrmdec(hatmathbfx(tmu)) t mu) overset= 0 implies \nhatmathbfv = mathcalJ(hatmathbfx(tmu))^+f(hatmathbfx^mathrmref(mu) + Psi^mathrmdec(hatmathbfx(tmu)) t mu)","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathcalJ(hatmathbfx(tmu))^+ is the pseudoinverse of mathcalJ(hatmathbfx(tmu)). Because mathcalJ(hatmathbfx(tmu)) is a symplectic matrix the pseudoinverse is the symplectic inverse (see (Peng and Mohseni, 2016)).","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Furthermore, because f is Hamiltonian, the vector field describing dothatmathbfx(tmu) will also be Hamiltonian. 
","category":"page"},{"location":"architectures/autoencoders/#References","page":"Variational Autoencoders","title":"References","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"K. Lee and K. Carlberg. “Model reduction of dynamical systems on nonlinear manifolds using","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"deep convolutional autoencoders”. In: Journal of Computational Physics 404 (2020), p. 108973.","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Peng L, Mohseni K. Symplectic model reduction of Hamiltonian systems[J]. SIAM Journal on Scientific Computing, 2016, 38(1): A1-A27.","category":"page"},{"location":"tutorials/mnist_tutorial/#MNIST-tutorial","page":"MNIST","title":"MNIST tutorial","text":"","category":"section"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"This is a short tutorial that shows how we can use GeometricMachineLearning to build a vision transformer and apply it for MNIST, while also putting some of the weights on a manifold. This is also the result presented in [36].","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"First, we need to import the relevant packages: ","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"using GeometricMachineLearning, CUDA, Plots\nimport Zygote, MLDatasets, KernelAbstractions","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"For the AD routine we here use the GeometricMachineLearning default and we get the dataset from MLDatasets. First we need to load the data set, and put it on GPU (if you have one):","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"train_x, train_y = MLDatasets.MNIST(split=:train)[:]\ntest_x, test_y = MLDatasets.MNIST(split=:test)[:]\ntrain_x = train_x |> cu \ntest_x = test_x |> cu \ntrain_y = train_y |> cu \ntest_y = test_y |> cu","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"GeometricMachineLearning has built-in data loaders that make it particularly easy to handle data: ","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"patch_length = 7\ndl = DataLoader(train_x, train_y, patch_length=patch_length)\ndl_test = DataLoader(train_x, train_y, patch_length=patch_length)","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"Here patch_length indicates the size one patch has. One image in MNIST is of dimension 28times28, this means that we decompose this into 16 (7times7) images (also see [36]).","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"We next define the model with which we want to train:","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"model = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=true)","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"Here we have chosen a ClassificationTransformer, i.e. a composition of a specific number of transformer layers composed with a classification layer. 
We also set the Stiefel option to true, i.e. we are optimizing on the Stiefel manifold.","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"We now have to initialize the neural network weights. This is done with the constructor for NeuralNetwork:","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"backend = KernelAbstractions.get_backend(dl)\nT = eltype(dl)\nnn = NeuralNetwork(model, backend, T)","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"And with this we can finally perform the training:","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"# an instance of batch is needed for the optimizer\nbatch = Batch(batch_size)\n\noptimizer_instance = Optimizer(AdamOptimizer(), nn)\n\n# this prints the accuracy and is optional\nprintln(\"initial test accuracy: \", accuracy(Ψᵉ, ps, dl_test), \"\\n\")\n\nloss_array = optimizer_instance(nn, dl, batch, n_epochs)\n\nprintln(\"final test accuracy: \", accuracy(Ψᵉ, ps, dl_test), \"\\n\")","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"It is instructive to play with n_layers, n_epochs and the Stiefel property.","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).\n\n\n\n","category":"page"},{"location":"tutorials/volume_preserving_attention/#Comparison-of-different-VolumePreservingAttention","page":"Volume-Preserving Attention","title":"Comparison of different VolumePreservingAttention","text":"","category":"section"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"In the section of volume-preserving attention we mentioned two ways of computing volume-preserving attention: one where we compute the correlations with a skew-symmetric matrix and one where we compute the correlations with an arbitrary matrix. Here we compare the two approaches. When calling the VolumePreservingAttention layer we can specify whether we want to use the skew-symmetric or the arbitrary weighting by setting the keyword skew_sym = true and skew_sym = false respectively. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"In here we demonstrate the differences between the two approaches for computing correlations. For this we first generate a training set consisting of two collections of curves: (i) sine curves and (ii) cosine curve. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"using GeometricMachineLearning # hide\nusing GeometricMachineLearning: FeedForwardLoss, TransformerLoss # hide\nusing Plots # hide\nimport Random # hide \nRandom.seed!(123) # hide\n\nsine_cosine = zeros(1, 1000, 2)\nsine_cosine[1, :, 1] .= sin.(0.:.1:99.9)\nsine_cosine[1, :, 2] .= cos.(0.:.1:99.9)\n\n\nconst dl = DataLoader(Float16.(sine_cosine))","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"The third axis (i.e. 
the parameter axis) has length two, meaning we have two different kinds of curves: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"plot(dl.input[1, :, 1], label = \"sine\")\nplot!(dl.input[1, :, 2], label = \"cosine\")","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We want to train a single neural network on both these curves. We compare three networks which are of the following form: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"mathttnetwork = mathcalNN_dcircPsicircmathcalNN_u","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"where mathcalNN_u refers to a neural network that scales up and mathcalNN_d refers to a neural network that scales down. The up and down scaling is done with simple dense layers: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"mathcalNN_u(x) = mathrmtanh(a_ux + b_u) text and mathcalNN_d(x) = a_d^Tx + b_d","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"where a_u b_u a_dinmathbbR^mathrmud and b_d is a scalar. ud refers to upscaling dimension. For Psi we consider three different choices:","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"a volume-preserving attention with skew-symmetric weighting,\na volume-preserving attention with arbitrary weighting,\nan identity layer.","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We further choose a sequence length 5 (i.e. the network always sees the last 5 time steps) and always predict one step into the future (i.e. 
the prediction window is set to 1):","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"const seq_length = 3\nconst prediction_window = 1\n\nconst upscale_dimension_1 = 2\n\nconst T = Float16\n\nfunction set_up_networks(upscale_dimension::Int = upscale_dimension_1)\n model_skew = Chain(Dense(1, upscale_dimension, tanh), VolumePreservingAttention(upscale_dimension, seq_length; skew_sym = true), Dense(upscale_dimension, 1, identity; use_bias = true))\n model_arb = Chain(Dense(1, upscale_dimension, tanh), VolumePreservingAttention(upscale_dimension, seq_length; skew_sym = false), Dense(upscale_dimension, 1, identity; use_bias = true))\n model_comp = Chain(Dense(1, upscale_dimension, tanh), Dense(upscale_dimension, 1, identity; use_bias = true))\n\n nn_skew = NeuralNetwork(model_skew, CPU(), T)\n nn_arb = NeuralNetwork(model_arb, CPU(), T)\n nn_comp = NeuralNetwork(model_comp, CPU(), T)\n\n nn_skew, nn_arb, nn_comp\nend\n\nnn_skew, nn_arb, nn_comp = set_up_networks()","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We expect the third network to not be able to learn anything useful since it cannot resolve time series data: a regular feedforward network only ever sees one datum at a time. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"Next we train the networks (here we pick a batch size of 30):","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"function set_up_optimizers(nn_skew, nn_arb, nn_comp)\n o_skew = Optimizer(AdamOptimizer(T), nn_skew)\n o_arb = Optimizer(AdamOptimizer(T), nn_arb)\n o_comp = Optimizer(AdamOptimizer(T), nn_comp)\n\n o_skew, o_arb, o_comp\nend\n\no_skew, o_arb, o_comp = set_up_optimizers(nn_skew, nn_arb, nn_comp)\n\nconst n_epochs = 1000\n\nconst batch_size = 30\n\nconst batch = Batch(batch_size, seq_length, prediction_window)\nconst batch2 = Batch(batch_size)\n\nfunction train_networks!(nn_skew, nn_arb, nn_comp)\n loss_array_skew = o_skew(nn_skew, dl, batch, n_epochs, TransformerLoss(batch))\n loss_array_arb = o_arb( nn_arb, dl, batch, n_epochs, TransformerLoss(batch))\n loss_array_comp = o_comp(nn_comp, dl, batch2, n_epochs, FeedForwardLoss())\n\n loss_array_skew, loss_array_arb, loss_array_comp\nend\n\nloss_array_skew, loss_array_arb, loss_array_comp = train_networks!(nn_skew, nn_arb, nn_comp)\n\nfunction plot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)\n p = plot(loss_array_skew, color = 2, label = \"skew\", yaxis = :log)\n plot!(p, loss_array_arb, color = 3, label = \"arb\")\n plot!(p, loss_array_comp, color = 4, label = \"comp\")\n\n p\nend\n\nplot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"Looking at the training errors, we can see that the network with the skew-symmetric weighting is stuck at a relatively high error rate, whereas the loss for the network with the arbitrary weighting is decreasing to a significantly lower level. The feedforward network without the attention mechanism is not able to learn anything useful (as was expected). 
","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"The following demonstrates the predictions of our approaches[1]: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"[1]: Here we have to use the architectures DummyTransformer and DummyNNIntegrator to reformulate the three neural networks defined here as NeuralNetworkIntegrators. Normally the user should try to use predefined architectures in GeometricMachineLearning, that way they never use DummyTransformer and DummyNNIntegrator. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"initial_condition = dl.input[:, 1:seq_length, 2]\n\nfunction make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)\n nn_skew = NeuralNetwork(GeometricMachineLearning.DummyTransformer(seq_length), nn_skew.model, nn_skew.params, CPU())\n nn_arb = NeuralNetwork(GeometricMachineLearning.DummyTransformer(seq_length), nn_arb.model, nn_arb.params, CPU())\n nn_comp = NeuralNetwork(GeometricMachineLearning.DummyNNIntegrator(), nn_comp.model, nn_comp.params, CPU())\n\n nn_skew, nn_arb, nn_comp\nend\n\nnn_skew, nn_arb, nn_comp = make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)\n\nfunction produce_validation_plot(n_points::Int, nn_skew = nn_skew, nn_arb = nn_arb, nn_comp = nn_comp; initial_condition::Matrix=initial_condition, type = :cos)\n validation_skew = iterate(nn_skew, initial_condition; n_points = n_points, prediction_window = 1)\n validation_arb = iterate(nn_arb, initial_condition; n_points = n_points, prediction_window = 1)\n validation_comp = iterate(nn_comp, initial_condition[:, 1]; n_points = n_points)\n\n p2 = type == :cos ? plot(dl.input[1, 1:n_points, 2], color = 1, label = \"reference\") : plot(dl.input[1, 1:n_points, 1], color = 1, label = \"reference\")\n\n plot!(validation_skew[1, :], color = 2, label = \"skew\")\n plot!(p2, validation_arb[1, :], color = 3, label = \"arb\")\n plot!(p2, validation_comp[1, :], color = 4, label = \"comp\")\n vline!([seq_length], color = :red, label = \"start of prediction\")\n\n p2 \nend\n\np2 = produce_validation_plot(40)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"In the above plot we can see that the network with the arbitrary weighting performs much better; even though the green line does not fit the blue line very well either, it manages to least qualitatively reflect the training data. 
We can also plot the predictions for longer time intervals: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"p3 = produce_validation_plot(400)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We can also plot the comparison with the sine function: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"initial_condition = dl.input[:, 1:seq_length, 1]\n\np2 = produce_validation_plot(40, initial_condition = initial_condition, type = :sin)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"This advantage of the volume-preserving attention with arbitrary weighting may however be due to the fact that the skew-symmetric attention only has 3 learnable parameters, as opposed to 9 for the arbitrary weighting. If we increase the upscaling dimension the result changes: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"const upscale_dimension_2 = 10\n\nnn_skew, nn_arb, nn_comp = set_up_networks(upscale_dimension_2)\n\no_skew, o_arb, o_comp = set_up_optimizers(nn_skew, nn_arb, nn_comp)\n\nloss_array_skew, loss_array_arb, loss_array_comp = train_networks!(nn_skew, nn_arb, nn_comp)\n\nplot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"initial_condition = dl.input[:, 1:seq_length, 2]\n\nnn_skew, nn_arb, nn_comp = make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)\n\np2 = produce_validation_plot(40, nn_skew, nn_arb, nn_comp)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"And for a longer time interval: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"p3 = produce_validation_plot(200, nn_skew, nn_arb, nn_comp)","category":"page"},{"location":"optimizers/manifold_related/geodesic/#Geodesic-Retraction","page":"Geodesic Retraction","title":"Geodesic Retraction","text":"","category":"section"},{"location":"optimizers/manifold_related/geodesic/","page":"Geodesic Retraction","title":"Geodesic Retraction","text":"General retractions are approximations of the exponential map. In GeometricMachineLearning we can, instead of using an approximation, solve the geodesic equation exactly (up to numerical error) by specifying Geodesic() as the argument of layers that have manifold weights. ","category":"page"},{"location":"optimizers/manifold_related/cayley/#The-Cayley-Retraction","page":"Cayley Retraction","title":"The Cayley Retraction","text":"","category":"section"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"The Cayley transformation is one of the most popular retractions. For several matrix Lie groups it is a mapping from the Lie algebra mathfrakg onto the Lie group G. 
They Cayley retraction reads: ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":" mathrmCayley(C) = left(mathbbI -frac12Cright)^-1left(mathbbI +frac12Cright)","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"This is easily checked to be a retraction, i.e. mathrmCayley(mathbbO) = mathbbI and fracpartialpartialtmathrmCayley(tC) = C.","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"What we need in practice is not the computation of the Cayley transform of an arbitrary matrix, but the Cayley transform of an element of mathfrakg^mathrmhor, the global tangent space representation. ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"The elements of mathfrakg^mathrmhor can be written as: ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"C = beginbmatrix\n A -B^T \n B mathbbO\nendbmatrix = beginbmatrix frac12A mathbbI B mathbbO endbmatrix beginbmatrix mathbbI mathbbO frac12A -B^T endbmatrix","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"where the second expression exploits the sparse structure of the array, i.e. it is a multiplication of a Ntimes2n with a 2ntimesN matrix. We can hence use the Sherman-Morrison-Woodbury formula to obtain:","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"(mathbbI - frac12UV)^-1 = mathbbI + frac12U(mathbbI - frac12VU)^-1V","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"So what we have to invert is the term ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"mathbbI - frac12beginbmatrix mathbbI mathbbO frac12A -B^T endbmatrixbeginbmatrix frac12A mathbbI B mathbbO endbmatrix = \nbeginbmatrix mathbbI - frac14A - frac12mathbbI frac12B^TB - frac18A^2 mathbbI - frac14A endbmatrix","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"The whole Cayley transform is then: ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"left(mathbbI + frac12beginbmatrix frac12A mathbbI B mathbbO endbmatrix beginbmatrix mathbbI - frac14A - frac12mathbbI frac12B^TB - frac18A^2 mathbbI - frac14A endbmatrix^-1 beginbmatrix mathbbI mathbbO frac12A -B^T endbmatrix right)left( E + frac12beginbmatrix frac12A mathbbI B mathbbO endbmatrix beginbmatrix mathbbI frac12A endbmatrix right) = \nE + frac12beginbmatrix frac12A mathbbI B mathbbO endbmatrixleft(\n beginbmatrix mathbbI frac12A endbmatrix + \n beginbmatrix mathbbI - frac14A - frac12mathbbI frac12B^TB - frac18A^2 mathbbI - frac14A endbmatrix^-1left(\n beginbmatrix mathbbI frac12A endbmatrix + \n beginbmatrix frac12A frac14A^2 - frac12B^TB endbmatrix\n right)\n right)","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"Note that for computational reason we compute 
mathrmCayley(C)E instead of just the Cayley transform (see the section on retractions).","category":"page"},{"location":"data_loader/snapshot_matrix/#Snapshot-matrix","page":"Snapshot matrix & tensor","title":"Snapshot matrix","text":"","category":"section"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"The snapshot matrix stores solutions of the high-dimensional ODE (obtained from discretizing a PDE). This is then used to construct reduced bases in a data-driven way. So (for a single parameter[1]) the snapshot matrix takes the following form: ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"[1]: If we deal with a parametrized PDE then there are two stages at which the snapshot matrix has to be processed: the offline stage and the online stage. ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"M = leftbeginarraycccc\nhatu_1(t_0) hatu_1(t_1) quadldotsquad hatu_1(t_f) \nhatu_2(t_0) hatu_2(t_1) ldots hatu_2(t_f) \nhatu_3(t_0) hatu_3(t_1) ldots hatu_3(t_f) \nldots ldots ldots ldots \nhatu_2N(t_0) hatu_2N(t_1) ldots hatu_2N(t_f) \nendarrayright","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"In the above example we store a matrix whose first axis is the system dimension (i.e. a state is an element of mathbbR^2n) and the second dimension gives the time step. ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"The starting point for using the snapshot matrix as data for a machine learning model is that all the columns of M live on a lower-dimensional solution manifold and we can use techniques such as POD and autoencoders to find this solution manifold. We also note that the second axis of M does not necessarily indicate time but can also represent various parameters (including initial conditions). The second axis in the DataLoader struct is therefore saved in the field n_params.","category":"page"},{"location":"data_loader/snapshot_matrix/#Snapshot-tensor","page":"Snapshot matrix & tensor","title":"Snapshot tensor","text":"","category":"section"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"The snapshot tensor fulfills the same role as the snapshot matrix but has a third axis that describes different initial parameters (such as different initial conditions). ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"Main.include_graphics(\"../tikz/tensor\") # hide","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"When drawing training samples from the snapshot tensor we also need to specify a sequence length (as an argument to the Batch struct). When sampling a batch from the snapshot tensor we sample over the starting point of the time interval (which is of length seq_length) and the third axis of the tensor (the parameters). The total number of batches in this case is lceilmathtt(dlinput_time_steps - batchseq_length) * dln_params batchbatch_sizerceil. 
","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/#Horizontal-component-of-the-Lie-algebra-\\mathfrak{g}","page":"Stiefel Global Tangent Space","title":"Horizontal component of the Lie algebra mathfrakg","text":"","category":"section"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"What we use to optimize Adam (and other algorithms) to manifolds is a global tangent space representation of the homogeneous spaces. ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"For the Stiefel manifold, this global tangent space representation takes a simple form: ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"mathcalB = beginbmatrix\n A -B^T \n B mathbbO\nendbmatrix","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"where AinmathbbR^ntimesn is skew-symmetric and BinmathbbR^Ntimesn is arbitary. In GeometricMachineLearning the struct StiefelLieAlgHorMatrix implements elements of this form.","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/#Theoretical-background","page":"Stiefel Global Tangent Space","title":"Theoretical background","text":"","category":"section"},{"location":"arrays/stiefel_lie_alg_horizontal/#Vertical-and-horizontal-components","page":"Stiefel Global Tangent Space","title":"Vertical and horizontal components","text":"","category":"section"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"The Stiefel manifold St(n N) is a homogeneous space obtained from SO(N) by setting two matrices, whose first n columns conincide, equivalent. Another way of expressing this is: ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"A_1 sim A_2 iff A_1E = A_2E","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"for ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"E = beginbmatrix mathbbI mathbbOendbmatrix","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"Because St(nN) is a homogeneous space, we can take any element YinSt(nN) and SO(N) acts transitively on it, i.e. can produce any other element in SO(N). A similar statement is also true regarding the tangent spaces of St(nN), namely: ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"T_YSt(nN) = mathfrakgcdotY","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"i.e. every tangent space can be expressed through an action of the associated Lie algebra. 
","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"The kernel of the mapping mathfrakgtoT_YSt(nN) BmapstoBY is referred to as mathfrakg^mathrmverY, the vertical component of the Lie algebra at Y. In the case Y=E it is easy to see that elements belonging to mathfrakg^mathrmverE are of the following form: ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"beginbmatrix\nhatmathbbO tildemathbbO^T \ntildemathbbO C\nendbmatrix","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"where hatmathbbOinmathbbR^ntimesn is a \"small\" matrix and tildemathbbOinmathbbR^Ntimesn is a bigger one. CinmathbbR^NtimesN is a skew-symmetric matrix. ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"The orthogonal complement of the vertical component is referred to as the horizontal component and denoted by mathfrakg^mathrmhor Y. It is isomorphic to T_YSt(nN) and this isomorphism can be found explicitly. In the case of the Stiefel manifold: ","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"Omega(Y cdot)T_YSt(nN)tomathfrakg^mathrmhorY Delta mapsto (mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"The elements of mathfrakg^mathrmhorE=mathfrakg^mathrmhor, i.e. for the special case Y=E. Its elements are of the form described on top of this page.","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/#Special-functions","page":"Stiefel Global Tangent Space","title":"Special functions","text":"","category":"section"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"You can also draw random elements from mathfrakg^mathrmhor through e.g. 
","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"rand(CUDADevice(), StiefelLieAlgHorMatrix{Float32}, 10, 5)","category":"page"},{"location":"arrays/stiefel_lie_alg_horizontal/","page":"Stiefel Global Tangent Space","title":"Stiefel Global Tangent Space","text":"In this example: N=10 and n=5.","category":"page"},{"location":"architectures/symplectic_autoencoder/#Symplectic-Autoencoder","page":"Symplectic Autoencoders","title":"Symplectic Autoencoder","text":"","category":"section"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"A visualization of an instance of SymplecticAutoencoder is shown below: ","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Main.include_graphics(\"../tikz/symplectic_autoencoder_architecture\") # hide","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"The intermediate dimension M is calculated via n : (N - n) ÷ (n_blocks - 1) : N. Further we have the following choices:","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"n_encoder_layers::Integer = 4\nn_encoder_blocks::Integer = 2 \nn_decoder_layers::Integer = 2 \nn_decoder_blocks::Integer = 3\nencoder_init_q::Bool = true\ndecoder_init_q::Bool = true","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Note that all of these are keyword arguments that can be supplied to SymplecticAutoencoder.","category":"page"},{"location":"tutorials/linear_symplectic_transformer/#linear_symplectic_transformer_tutorial","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"","category":"section"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"In this tutorial we compare the linear symplectic transformer to the standard transformer. 
","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"using GeometricMachineLearning # hide\nusing GeometricProblems.CoupledHarmonicOscillator: hodeensemble, default_parameters\nusing GeometricIntegrators: ImplicitMidpoint, integrate \nusing LaTeXStrings\nusing Plots\nimport Random\n\nRandom.seed!(123)\n\nconst tstep = .3\nconst n_init_con = 5\n\n# ensemble problem\nep = hodeensemble([rand(2) for _ in 1:n_init_con], [rand(2) for _ in 1:n_init_con]; tstep = tstep)\n\ndl_nt = DataLoader(integrate(ep, ImplicitMidpoint()))\ndl = DataLoader(vcat(dl_nt.input.q, dl_nt.input.p))\n\nnothing # hide","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"We now define the architectures and train them: ","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"const seq_length = 4\nconst batch_size = 16384\nconst n_epochs = 2000\n\narch_standard = StandardTransformerIntegrator(dl.input_dim; n_heads = 2, L = 1, n_blocks = 2)\narch_symplectic = LinearSymplecticTransformer(dl.input_dim, seq_length; n_sympnet = 2, L = 1, upscaling_dimension = 2 * dl.input_dim)\narch_sympnet = GSympNet(dl.input_dim; n_layers = 4, upscaling_dimension = 2 * dl.input_dim)\n\nnn_standard = NeuralNetwork(arch_standard)\nnn_symplectic = NeuralNetwork(arch_symplectic)\nnn_sympnet = NeuralNetwork(arch_sympnet)\n\no_method = AdamOptimizerWithDecay(n_epochs, Float64)\n\no_standard = Optimizer(o_method, nn_standard)\no_symplectic = Optimizer(o_method, nn_symplectic)\no_sympnet = Optimizer(o_method, nn_sympnet)\n\nbatch = Batch(batch_size, seq_length)\nbatch2 = Batch(batch_size)\n\nloss_array_standard = o_standard(nn_standard, dl, batch, n_epochs)\nloss_array_symplectic = o_symplectic(nn_symplectic, dl, batch, n_epochs)\nloss_array_sympnet = o_sympnet(nn_sympnet, dl, batch2, n_epochs)\n\np_train = plot(loss_array_standard; color = 2, xlabel = \"epoch\", ylabel = \"training error\", label = \"ST\", yaxis = :log)\nplot!(p_train, loss_array_symplectic; color = 4, label = \"LST\")\nplot!(p_train, loss_array_sympnet; color = 3, label = \"SympNet\")\n\np_train","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"We further evaluate a trajectory with the trained networks: ","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"const index = 1\ninit_con = dl.input[:, 1:seq_length, index]\n\nconst n_steps = 30\n\nfunction make_validation_plot(n_steps = n_steps; kwargs...)\n prediction_standard = iterate(nn_standard, init_con; n_points = n_steps, prediction_window = seq_length)\n prediction_symplectic = iterate(nn_symplectic, init_con; n_points = n_steps, prediction_window = seq_length)\n prediction_sympnet = iterate(nn_sympnet, init_con[:, 1]; n_points = n_steps)\n\n p_validate = plot(dl.input[1, 1:n_steps, index]; color = 1, ylabel = L\"q_1\", label = \"implicit midpoint\", kwargs...)\n plot!(p_validate, prediction_standard[1, :]; color = 2, label = \"ST\", kwargs...)\n plot!(p_validate, prediction_symplectic[1, :]; color = 4, label = \"LST\", kwargs...)\n plot!(p_validate, prediction_sympnet[1, :]; color = 3, label = 
\"SympNet\", kwargs...)\n\n p_validate\nend\n\nmake_validation_plot(; linewidth = 2)","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"We can see that the standard transformer is not able to stay close to the trajectory coming from implicit midpoint very well. The linear symplectic transformer outperforms the standard transformer as well as the SympNet while needed much fewer parameters than the standard transformer: ","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"parameterlength(nn_standard), parameterlength(nn_symplectic), parameterlength(nn_sympnet)","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"It is also interesting to note that the training error for the SympNet gets lower than the one for the linear symplectic transformer, but it does not manage to outperform it when looking at the validation. ","category":"page"},{"location":"layers/multihead_attention_layer/#Multihead-Attention","page":"Multihead Attention","title":"Multihead Attention","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"In order to arrive from the attention layer at the multihead attention layer we have to do a few modifications: ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Note that these neural networks were originally developed for natural language processing (NLP) tasks and the terminology used here bears some resemblance to that field. The input to a multihead attention layer typicaly comprises three components:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Values VinmathbbR^ntimesT: a matrix whose columns are value vectors, \nQueries QinmathbbR^ntimesT: a matrix whose columns are query vectors, \nKeys KinmathbbR^ntimesT: a matrix whose columns are key vectors.","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Regular attention performs the following operation: ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"mathrmAttention(QKV) = Vmathrmsoftmax(fracK^TQsqrtn)","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"where n is the dimension of the vectors in V, Q and K. The softmax activation function here acts column-wise, so it can be seen as a transformation mathrmsoftmaxmathbbR^TtomathbbR^T with mathrmsoftmax(v)_i = e^v_ileft(sum_j=1e^v_jright). The K^TQ term is a similarity matrix between the queries and the vectors. ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"The transformer contains a self-attention mechanism, i.e. takes an input X and then transforms it linearly to V, Q and K, i.e. V = P^VX, Q = P^QX and K = P^KX. 
What distinguishes the multihead attention layer from the singlehead attention layer is that there is not just one P^V, P^Q and P^K, but there are several: one for each head of the multihead attention layer. After computing the individual values, queries and keys, and after applying the softmax, the outputs are then concatenated together in order to obtain again an array that is of the same size as the input array:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Main.include_graphics(\"../tikz/mha\") # hide","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Here the various P matrices can be interpreted as being projections onto lower-dimensional subspaces, hence the designation by the letter P. Because of this interpretation as projection matrices onto smaller spaces that should capture features in the input data it makes sense to constrain these elements to be part of the Stiefel manifold. ","category":"page"},{"location":"layers/multihead_attention_layer/#Computing-Correlations-in-the-Multihead-Attention-Layer","page":"Multihead Attention","title":"Computing Correlations in the Multihead-Attention Layer","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"The attention mechanism describes a reweighting of the \"values\" V_i based on correlations between the \"keys\" K_i and the \"queries\" Q_i. First note the structure of these matrices: they are all collections of T (Ndivmathttn_heads)-dimensional vectors, i.e. V_i=v_i^(1) ldots v_i^(T) K_i=k_i^(1) ldots k_i^(T) Q_i=q_i^(1) ldots q_i^(T) . 
Those vectors have been obtained by applying the respective projection matrices onto the original input I_iinmathbbR^NtimesT.","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"When performing the reweighting of the columns of V_i we first compute the correlations between the vectors in K_i and in Q_i and store the results in a correlation matrix C_i: ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":" C_i_mn = left(k_i^(m)right)^Tq_i^(n)","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"The columns of this correlation matrix are then rescaled with a softmax function, obtaining a matrix of probability vectors mathcalP_i:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":" mathcalP_i_bulletn = mathrmsoftmax(C_i_bulletn)","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Finally the matrix mathcalP_i is multiplied onto V_i from the right, resulting in T convex combinations of the T vectors v_i^(m) with m=1ldotsT:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":" V_imathcalP_i = leftsum_m=1^TmathcalP_i_m1v_i^(m) ldots sum_m=1^TmathcalP_i_mTv_i^(m)right","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"With this we can now give a better interpretation of what the projection matrices W_i^V, W_i^K and W_i^Q should do: they map the original data to lower-dimensional subspaces. We then compute correlations between the representation in the K and in the Q basis and use this correlation to perform a convex reweighting of the vectors in the V basis. These reweighted values are then fed into a standard feedforward neural network.","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Because the main task of the W_i^V, W_i^K and W_i^Q matrices here is for them to find bases, it makes sense to constrain them onto the Stiefel manifold; they do not and should not have the maximum possible generality.","category":"page"},{"location":"layers/multihead_attention_layer/#Library-Functions","page":"Multihead Attention","title":"Library Functions","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"MultiHeadAttention","category":"page"},{"location":"layers/multihead_attention_layer/#GeometricMachineLearning.MultiHeadAttention-layers-multihead_attention_layer","page":"Multihead Attention","title":"GeometricMachineLearning.MultiHeadAttention","text":"MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors based on correlations within those data. \n\nConstructor\n\nTakes input arguments: \n\ndim::Int: The system dimension \nn_heads::Int: The number of heads. \nStiefel::Bool=true (keyword argument): whether the weights should be put on the Stiefel manifold. \nretraction::AbstractRetraction (keyword argument): what kind of retraction should be used. By default this is the geodesic retraction. 
\nadd_connection::Bool=true (keyword argument): determines if the input should be added to the output for the final result. \n\n\n\n\n\n","category":"type"},{"location":"layers/multihead_attention_layer/#References","page":"Multihead Attention","title":"References","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).\n\n\n\n","category":"page"},{"location":"layers/sympnet_gradient/#SympNet-Gradient-Layer","page":"Sympnet Gradient Layers","title":"SympNet Gradient Layer","text":"","category":"section"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"The Sympnet gradient layer (called GradientLayer in GeometricMachineLearning) is based on the following theorem: ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"Main.theorem(raw\"\"\"Given a symplectic vector space ``\\mathbb{R}^{2n}`` whose coordinates are ``q_1, \\ldots, q_n, p_1, \\ldots, p_n`` and a function ``f:\\mathbb{R}^n\\to\\mathbb{R}`` that only acts on the ``q`` part, the map ``(q, p) \\mapsto (q, p + \\nabla_qf)`` is symplectic. A similar statement holds if ``f`` only acts on the ``p`` part.\"\"\")","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"Proving this is straightforward by looking at the gradient of the mapping:","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" beginpmatrix\n mathbbI mathbbO \n nabla_q^2f mathbbI\n endpmatrix","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"where nabla_q^2f is the Hessian of f. This matrix is symmetric and for any symmetric matrix A we have that: ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" beginpmatrix\n mathbbI mathbbO \n A mathbbI\n endpmatrix^T mathbbJ_2n \n beginpmatrix \n mathbbI mathbbO \n A mathbbI \n endpmatrix = \n beginpmatrix\n mathbbI A \n mathbbO mathbbI\n endpmatrix \n beginpmatrix \n mathbbO mathbbI \n -mathbbI mathbbO \n endpmatrix \n beginpmatrix\n mathbbI mathbbO \n A mathbbI\n endpmatrix = \n beginpmatrix\n mathbbO mathbbI \n -mathbbI mathbbO \n endpmatrix","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"If we deal with GSympNets this function f is ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" f(q) = a^T Sigma(Kq + b)","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"where a binmathbbR^m, KinmathbbR^mtimesn and Sigma is the antiderivative of some common activation function sigma. We routinely refer to m as the upscaling dimension in GeometricMachineLearning. 
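As a rough illustration (not the GradientLayer implementation of GeometricMachineLearning), a single gradient layer of this kind can be sketched as follows; it uses the closed-form gradient of f that is derived in the next paragraph, and all names and sizes are invented.

```julia
# sketch of a single SympNet gradient layer acting on the q part
σ(x) = tanh(x)                            # common activation; its antiderivative is Σ = log ∘ cosh

function gradient_layer_q(q, p, K, a, b)
    ∇qf = K' * (a .* σ.(K * q .+ b))      # gradient of f(q) = aᵀΣ(Kq + b), see the derivation below
    return q, p .+ ∇qf                    # (q, p) ↦ (q, p + ∇_q f) is a symplectic map
end

n, m = 2, 5                               # system dimension n and upscaling dimension m
K, a, b = randn(m, n), randn(m), randn(m)
q, p = randn(n), randn(n)
q_new, p_new = gradient_layer_q(q, p, K, a, b)
```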
Computing the gradient of f gives: ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" nabla_qf_k = sum_i=1^m a_i sigma(sum_j=1^nk_ijq_j + b_i)k_ik = leftK^Tleft(a odot sigma(Kq + b)right)right_k","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"where odot is the element-wise product, i.e. aodotv_k = a_kv_k.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Symplectic-Autoencoder","page":"PSD and Symplectic Autoencoders","title":"Symplectic Autoencoder","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Symplectic Autoencoders are a type of neural network suitable for treating Hamiltonian parametrized PDEs with slowly decaying Kolmogorov n-width. They are based on proper symplectic decomposition (PSD) and symplectic neural networks (SympNets).","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Hamiltonian-Model-Order-Reduction","page":"PSD and Symplectic Autoencoders","title":"Hamiltonian Model Order Reduction","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Hamiltonian PDEs are partial differential equations that, like their ODE counterparts, have a Hamiltonian associated with them. An example of this is the linear wave equation (see [32]) with Hamiltonian ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"mathcalH(q p mu) = frac12int_Omegamu^2(partial_xiq(tximu))^2 + p(tximu)^2dxi","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"The PDE corresponding to this Hamiltonian can be obtained in a similar way to the ODE case:","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"partial_tq(tximu) = fracdeltamathcalHdeltap = p(tximu) quad partial_tp(tximu) = -fracdeltamathcalHdeltaq = mu^2partial_xixiq(tximu)","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Symplectic-Solution-Manifold","page":"PSD and Symplectic Autoencoders","title":"Symplectic Solution Manifold","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"As with regular parametric PDEs, we also associate a solution manifold with Hamiltonian PDEs. This is a finite-dimensional manifold, on which the dynamics can be described through a Hamiltonian ODE. (A proof or a more detailed explanation of this statement still needs to be added here.)","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Workflow-for-Symplectic-ROM","page":"PSD and Symplectic Autoencoders","title":"Workflow for Symplectic ROM","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"As with any other reduced order modeling technique we first discretize the PDE. 
This should be done with a structure-preserving scheme, thus yielding a (high-dimensional) Hamiltonian ODE as a result. Discretizing the wave equation above with finite differences yields a Hamiltonian system: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"mathcalH_mathrmdiscr(z(tmu)mu) = frac12x(tmu)^Tbeginbmatrix -mu^2D_xixi mathbbO mathbbO mathbbI endbmatrix x(tmu)","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"In Hamiltonian reduced order modelling we try to find a symplectic submanifold of the solution space[1] that captures the dynamics of the full system as well as possible.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"[1]: The submanifold is: tildemathcalM = Psi^mathrmdec(z_r)inmathbbR^2Nu_rinmathrmR^2n where z_r is the reduced state of the system. ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Similar to the regular PDE case we again build an encoder Psi^mathrmenc and a decoder Psi^mathrmdec; but now both these mappings are required to be symplectic!","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Concretely this means: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"The encoder is a mapping from a high-dimensional symplectic space to a low-dimensional symplectic space, i.e. Psi^mathrmencmathbbR^2NtomathbbR^2n such that nablaPsi^mathrmencmathbbJ_2N(nablaPsi^mathrmenc)^T = mathbbJ_2n.\nThe decoder is a mapping from a low-dimensional symplectic space to a high-dimensional symplectic space, i.e. Psi^mathrmdecmathbbR^2ntomathbbR^2N such that (nablaPsi^mathrmdec)^TmathbbJ_2NnablaPsi^mathrmdec = mathbbJ_2n.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"If these two maps are constrained to linear maps, then one can easily find good solutions with proper symplectic decomposition (PSD).","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Proper-Symplectic-Decomposition","page":"PSD and Symplectic Autoencoders","title":"Proper Symplectic Decomposition","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"For PSD the two mappings Psi^mathrmenc and Psi^mathrmdec are constrained to be linear, orthonormal (i.e. Psi^TPsi = mathbbI) and symplectic. 
The easiest way to enforce this is through the so-called cotangent lift: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Psi_mathrmCL = \nbeginbmatrix Phi mathbbO mathbbO Phi endbmatrix","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"and PhiinSt(nN)subsetmathbbR^Ntimesn, i.e. is an element of the Stiefel manifold. If the snapshot matrix is of the form: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"M = leftbeginarraycccc\nhatq_1(t_0) hatq_1(t_1) quadldotsquad hatq_1(t_f) \nhatq_2(t_0) hatq_2(t_1) ldots hatq_2(t_f) \nldots ldots ldots ldots \nhatq_N(t_0) hatq_N(t_1) ldots hatq_N(t_f) \nhatp_1(t_0) hatp_1(t_1) ldots hatp_1(t_f) \nhatp_2(t_0) hatp_2(t_1) ldots hatp_2(t_f) \nldots ldots ldots ldots \nhatp_N(t_0) hatp_N(t_1) ldots hatp_N(t_f) \nendarrayright","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"then Phi can be computed in a very straightforward manner: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Rearrange the rows of the matrix M such that we end up with an Ntimes2(f+1) matrix: hatM = M_q M_p.\nPerform SVD: hatM = USigmaV^T; set PhigetsUmathtt1n.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"For details on the cotangent lift (and other methods for linear symplectic model reduction) consult [33].","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Symplectic-Autoencoders","page":"PSD and Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"PSD suffers from the same shortcomings as regular POD: it is a linear map and the approximation space tildemathcalM= Psi^mathrmdec(z_r)inmathbbR^2Nz_rinmathbbR^2n is strictly linear. For problems with slowly-decaying Kolmogorov n-width this leads to very poor approximations. ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"In order to overcome this difficulty we use neural networks, more specifically SympNets, together with cotangent lift-like matrices. The resulting architecture, the symplectic autoencoder, is shown in the following image: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Main.include_graphics(\"../tikz/symplectic_autoencoder\") # hide","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"So we alternate between SympNet and PSD layers. 
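The two PSD steps described above can be sketched in a few lines of plain Julia. This is only an illustration of the procedure with invented sizes, not the routine used in GeometricMachineLearning.

```julia
using LinearAlgebra

# invented sizes: full dimension N, reduced dimension n, f+1 time steps
N, n, f = 50, 5, 199
M_q, M_p = randn(N, f + 1), randn(N, f + 1)      # q- and p-rows of the snapshot matrix

M̂ = hcat(M_q, M_p)                               # step 1: an N × 2(f+1) matrix [M_q  M_p]
U = svd(M̂).U                                     # step 2: singular value decomposition of M̂
Φ = U[:, 1:n]                                     # first n left singular vectors, Φ ∈ St(n, N)

Ψ_CL = [Φ zeros(N, n); zeros(N, n) Φ]             # cotangent lift, a 2N × 2n symplectic matrix
Φ' * Φ ≈ I                                        # orthonormality of Φ
```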
Because all the PSD layers are based on matrices PhiinSt(nN) we have to optimize on the Stiefel manifold.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#References","page":"PSD and Symplectic Autoencoders","title":"References","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).\n\n\n\nL. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).\n\n\n\n","category":"page"},{"location":"manifolds/metric_and_vector_spaces/#(Topological)-Metric-Spaces","page":"Metric and Vector Spaces","title":"(Topological) Metric Spaces","text":"","category":"section"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"A metric space is a certain class of a topological space where the topology is induced through a metric.","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **metric** on a topological space ``\\mathcal{M}`` is a mapping ``d:\\mathcal{M}\\times\\mathcal{M}\\to\\mathbb{R}`` such that the following three conditions hold: \n\" * \nMain.indentation * raw\"1. ``d(x, y) = 0 \\iff x = y`` for every ``x,y\\in\\mathcal{M}``, i.e. the distance between 2 points is only zero if and only if they are the same,\n\" * \nMain.indentation * raw\"2. ``d(x, y) = d(y, x)``,\n\" *\nMain.indentation * raw\"3. 
``d(x, z) \\leq d(x, y) + d(y, z)``.\n\" *\nMain.indentation * raw\"The second condition is referred to as *symmetry* and the third condition is referred to as the *triangle inequality*.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"We give some examples of metric spaces that are relevant for us: ","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"The real line ``\\mathbb{R}`` with the metric defined by the absolute distance between two points: ``d(x, y) = |y - x|``.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"The vector space ``\\mathbb{R}^n`` with the *Euclidean distance* ``d_2(x, y) = \\sqrt{\\sum_{i=1}^n (x_i - y_i)^2}``.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"The space of continuous functions ``\\mathcal{C} = \\{f:(-\\epsilon, \\epsilon)\\to\\mathbb{R}^n\\}`` with the metric ``d_\\infty(f_1, f_2) = \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_1(t) - f_2(t)|.``\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.proof(raw\"We have to show the triangle inequality: \n\" * \nMain.indentation * raw\"```math\n\" * \nMain.indentation * raw\"\\begin{aligned}\n\" *\nMain.indentation * raw\"d_\\infty(f_1, f_3) = \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_1(t) - f_3(t)| & \\leq \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}(|f_1(t) - f_2(t)| + |f_2(t) - f_3(t)|) \\\\\n\" *\nMain.indentation * raw\"& \\leq \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_1(t) - f_2(t)| + \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_2(t) - f_3(t)|.\n\" * \nMain.indentation * raw\"\\end{aligned}\n\" * \nMain.indentation * raw\"```\n\" *\nMain.indentation * raw\"This shows that ``d_\\infty`` is indeed a metric.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"Any Riemannian manifold is a metric space.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"This last example shows that metric spaces need not be vector spaces: there are spaces on which we can define a metric, but not the addition of two elements. 
This will be discussed in more detail in the section on Riemannian manifolds.","category":"page"},{"location":"manifolds/metric_and_vector_spaces/#Complete-Metric-Spaces","page":"Metric and Vector Spaces","title":"Complete Metric Spaces","text":"","category":"section"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"To define complete metric spaces we first need the definition of a Cauchy sequence.","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **Cauchy sequence** is a sequence ``(a_n)_{n\\in\\mathbb{N}}`` for which, given any ``\\epsilon>0``, we can find an integer ``N`` such that ``d(a_n, a_m) < \\epsilon`` for all ``n, m \\geq N``.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Now we can give the definition of a complete metric space:","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **complete metric space** is one for which every Cauchy sequence converges.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Completeness of the real numbers is most often seen as an axiom and therefore stated without proof. This also implies completeness of mathbbR^n [4].","category":"page"},{"location":"manifolds/metric_and_vector_spaces/#(Topological)-Vector-Spaces","page":"Metric and Vector Spaces","title":"(Topological) Vector Spaces","text":"","category":"section"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Vector Spaces are, like metric spaces, topological spaces which we endow with additional structure. ","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **vector space** ``\\mathcal{V}`` is a topological space for which we define an operation called *addition* and denoted by ``+`` and an operation called *scalar multiplication* (by elements of ``\\mathbb{R}``) denoted by ``x \\mapsto ax`` for ``x\\in\\mathcal{V}`` and ``a\\in\\mathbb{R}`` for which the following hold for all ``x, y, z\\in\\mathcal{V}`` and ``a, b\\in\\mathbb{R}``:\n\" * \nMain.indentation * raw\"1. ``x + (y + z) = (x + y) + z,``\n\" * \nMain.indentation * raw\"2. ``x + y = y + x,``\n\" * \nMain.indentation * raw\"3. ``\\exists 0 \\in \\mathcal{V}`` such that ``x + 0 = x,``\n\" * \nMain.indentation * raw\"4. ``\\exists -x \\in \\mathcal{V}`` such that ``x + (-x) = 0,``\n\" * \nMain.indentation * raw\"5. ``a(bx) = (ab)x,``\n\" * \nMain.indentation * raw\"6. ``1x = x`` for ``1\\in\\mathbb{R},``\n\" * \nMain.indentation * raw\"7. ``a(x + y) = ax + ay,``\n\" * \nMain.indentation * raw\"8. ``(a + b)x = ax + bx.``\n\" * \nMain.indentation * raw\"The first law is known as *associativity*, the second one as *commutativity* and the last two are known as *distributivity*.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"The topological spaces mathbbR and mathbbR^n are (almost) trivially vector spaces. 
The same is true for many function spaces. One of the special aspects of GeometricMachineLearning is that it can deal with spaces that are not vector spaces, but manifolds. All vector spaces are however manifolds. ","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).\n\n\n\n","category":"page"},{"location":"architectures/volume_preserving_transformer/#Volume-Preserving-Transformer","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"","category":"section"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"The volume-preserving transformer is, similar to the standard transformer, a combination of two different neural networks: a volume-preserving attention layer and a volume-preserving feedforward layer. It is visualized below:","category":"page"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"Main.include_graphics(\"../tikz/vp_transformer\") # hide","category":"page"},{"location":"architectures/volume_preserving_transformer/#Library-Functions","page":"Volume-Preserving Transformer","title":"Library Functions","text":"","category":"section"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"VolumePreservingTransformer","category":"page"},{"location":"architectures/volume_preserving_transformer/#GeometricMachineLearning.VolumePreservingTransformer-architectures-volume_preserving_transformer","page":"Volume-Preserving Transformer","title":"GeometricMachineLearning.VolumePreservingTransformer","text":"The volume-preserving transformer with the Cayley activation function and built-in upscaling.\n\nConstructor\n\nThe arguments for the constructor are: \n\nsys_dim::Int\nseq_length::Int: The sequence length of the data fed into the transformer.\n\nThe following are keyword arguments:\n\nn_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function.\ninit_upper::Bool=false: Specifies if the network first acts on the q component. \nskew_sym::Bool=false: specifies if the weight matrix is skew-symmetric or arbitrary.\n\n\n\n\n\n","category":"type"},{"location":"architectures/volume_preserving_transformer/#References","page":"Volume-Preserving Transformer","title":"References","text":"","category":"section"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"B. Brantner, G. de Romemont, M. Kraus and Z. Li. 
Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312:11166v2 (2024).\n\n\n\n","category":"page"},{"location":"layers/attention_layer/#The-Attention-Layer","page":"Attention","title":"The Attention Layer","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The attention mechanism was originally developed for image and natural language processing (NLP) tasks. It is motivated by the need to handle time series data in an efficient way[1]. Its essential idea is to compute correlations between vectors in input sequences. I.e. given sequences ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z_q^(1) z_q^(2) ldots z_q^(T)) text and (z_p^(1) z_p^(2) ldots z_p^(T))","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"an attention mechanism computes pair-wise correlations between all combinations of two input vectors from these sequences. In [13] \"additive\" attention is used to compute such correlations: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[1]: Recurrent neural networks have the same motivation. ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z_q z_k) mapsto v^Tsigma(Wz_q + Uz_k) ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"where z_q z_k in mathbbR^d are elements of the input sequences. The learnable parameters are W U in mathbbR^ntimesd and v in mathbbR^n.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"However multiplicative attention (see e.g. [14])is more straightforward to interpret and cheaper to handle computationally: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z_q z_k) mapsto z_q^TWz_k","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"where W in mathbbR^dtimesd is a learnable weight matrix with respect to which correlations are computed as scalar products. Regardless of the type of attention used, they all try to compute correlations among input sequences on whose basis further computation is performed. Given two input sequences Z_q = (z_q^(1) ldots z_q^(T)) and Z_k = (z_k^(1) ldots z_k^(T)), we can arrange the various correlations into a correlation matrix CinmathbbR^TtimesT with entries C_ij = mathttattention(z_q^(i) z_k^(j)). In the case of multiplicative attention this matrix is just C = Z^TWZ.","category":"page"},{"location":"layers/attention_layer/#Reweighting-of-the-input-sequence","page":"Attention","title":"Reweighting of the input sequence","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In GeometricMachineLearning we always compute self-attention, meaning that the two input sequences Z_q and Z_k are the same, i.e. Z = Z_q = Z_k.[2]","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[2]: Multihead attention also falls into this category. Here the input Z is multiplied from the left with several projection matrices P^Q_i and P^K_i, where i indicates the head. For each head we then compute a correlation matrix (P^Q_i Z)^T(P^K Z). 
","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"This is then used to reweight the columns in the input sequence Z. For this we first apply a nonlinearity sigma onto C and then multiply sigma(C) onto Z from the right, i.e. the output of the attention layer is Zsigma(C). So we perform the following mappings:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z xrightarrowmathrmcorrelations C(Z) = C xrightarrowsigma sigma(C) xrightarrowtextright multiplication Z sigma(C)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"After the right multiplication the outputs is of the following form: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":" sum_i=1^Tp^(1)_iz^(i) ldots sum_i=1^Tp^(T)_iz^(i)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"for p^(i) = sigma(C)_bulleti. What is learned during training are T different linear combinations of the input vectors, where the coefficients p^(i)_j in these linear combinations depend on the input Z nonlinearly. ","category":"page"},{"location":"layers/attention_layer/#Volume-Preserving-Attention","page":"Attention","title":"Volume-Preserving Attention","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The attention layer (and the activation function sigma defined for it) in GeometricMachineLearning was specifically designed to apply it to data coming from physical systems that can be described through a divergence-free or a symplectic vector field. Traditionally the nonlinearity in the attention mechanism is a softmax[3] (see [14]) and the self-attention layer performs the following mapping: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[3]: The softmax acts on the matrix C in a vector-wise manner, i.e. it operates on each column of the input matrix C = c^(1) ldots c^(T). The result is a sequence of probability vectors p^(1) ldots p^(T) for which sum_i=1^Tp^(j)_i=1quadforalljin1dotsT","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z = z^(1) ldots z^(T) mapsto Zmathrmsoftmax(Z^TWZ)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The softmax activation acts vector-wise, i.e. if we supply it with a matrix C as input it returns: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"mathrmsoftmax(C) = mathrmsoftmax(c_bullet1) ldots mathrmsoftmax(c_bulletT)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The output of a softmax is a probability vector (also called stochastic vector) and the matrix P = p^(1) ldots p^(T), where each column is a probability vector, is sometimes referred to as a stochastic matrix (see [15]). This attention mechanism finds application in transformer neural networks [14]. The problem with this matrix from a geometric point of view is that all the columns are independent of each other and the nonlinear transformation could in theory produce a stochastic matrix for which all columns are identical and thus lead to a loss of information. So the softmax activation function is inherently non-geometric. 
","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Besides the traditional attention mechanism GeometricMachineLearning therefore also has a volume-preserving transformation that fulfills a similar role. There are two approaches implemented to realize similar transformations. Both of them however utilize the Cayley transform to produce orthogonal matrices sigma(C) instead of stochastic matrices. For an orthogonal matrix Sigma we have Sigma^TSigma = mathbbI, so all the columns are linearly independent which is not necessarily true for a stochastic matrix P. The following explains how this new activation function is implemented.","category":"page"},{"location":"layers/attention_layer/#The-Cayley-transform","page":"Attention","title":"The Cayley transform","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The Cayley transform maps from skew-symmetric matrices to orthonormal matrices[4]. It takes the form:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[4]: A matrix A is skew-symmetric if A = -A^T and a matrix B is orthonormal if B^TB = mathbbI. The orthonormal matrices form a Lie group, i.e. the set of orthonormal matrices can be endowed with the structure of a differential manifold and this set also satisfies the group axioms. The corresponding Lie algebra are the skew-symmetric matrices and the Cayley transform is a so-called retraction in this case. For more details consult e.g. [7] and [16].","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"mathrmCayley A mapsto (mathbbI - A)(mathbbI + A)^-1","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"We can easily check that mathrmCayley(A) is orthogonal if A is skew-symmetric. For this consider varepsilon mapsto A(varepsilon)inmathcalS_mathrmskew with A(0) = mathbbI and A(0) = B. Then we have: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"fracdeltamathrmCayleydeltaA = fracddvarepsilon_varepsilon=0 mathrmCayley(A(varepsilon))^T mathrmCayley(A(varepsilon)) = mathbbO","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In order to use the Cayley transform as an activation function we further need a mapping from the input Z to a skew-symmetric matrix. This is realized in two ways in GeometricMachineLearning: via a scalar-product with a skew-symmetric weighting and via a scalar-product with an arbitrary weighting.","category":"page"},{"location":"layers/attention_layer/#First-approach:-scalar-products-with-a-skew-symmetric-weighting","page":"Attention","title":"First approach: scalar products with a skew-symmetric weighting","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"For this the attention layer is modified in the following way: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z = z^(1) ldots z^(T) mapsto Zsigma(Z^TAZ)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"where sigma(C)=mathrmCayley(C) and A is a skew-symmetric matrix that is learnable, i.e. 
the parameters of the attention layer are stored in A.","category":"page"},{"location":"layers/attention_layer/#Second-approach:-scalar-products-with-an-arbitrary-weighting","page":"Attention","title":"Second approach: scalar products with an arbitrary weighting","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"For this approach we compute correlations between the input vectors with an arbitrary weighting. The correlations we consider here are based on: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z^(2))^TAz^(1) (z^(3))^TAz^(1) ldots (z^(T))^TAz^(1) (z^(3))^TAz^(2) ldots (z^(T))^TAz^(2) ldots (z^(T))^TAz^(T-1)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"So in total we consider correlations (z^(i))^TAz^(j) for which i > j. We now arrange these correlations into a skew-symmetric matrix: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"C = beginbmatrix\n 0 -(z^(2))^TAz^(1) -(z^(3))^TAz^(1) ldots -(z^(T))^TAz^(1) \n (z^(2))^TAz^(1) 0 -(z^(3))^TAz^(2) ldots -(z^(T))^TAz^(2) \n ldots ldots ldots ldots ldots \n (z^(T))^TAz^(1) (z^(T))^TAz^(2) (z^(T))^TAz^(3) ldots 0 \nendbmatrix","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"This correlation matrix can now again be used as an input for the Cayley transform to produce an orthogonal matrix.","category":"page"},{"location":"layers/attention_layer/#How-is-structure-preserved?","page":"Attention","title":"How is structure preserved?","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In order to discuss how structure is preserved we first have to define what structure we mean precisely. This structure is strongly inspired by traditional multi-step methods (see [17]). We now define what volume preservation means for the product space mathbbR^dtimescdotstimesmathbbR^dequivtimes_textT timesmathbbR^d.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Consider an isomorphism hat times_text(T times)mathbbR^dstackrelapproxlongrightarrowmathbbR^dT. Specifically, this isomorphism takes the form:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z = leftbeginarraycccc\n z_1^(1) z_1^(2) quadcdotsquad z_1^(T) \n z_2^(1) z_2^(2) cdots z_2^(T) \n cdots cdots cdots cdots \n z_d^(1) z_d^(2) cdots z_d^(T)\n endarrayright mapsto \n leftbeginarrayc z_1^(1) z_1^(2) cdots z_1^(T) z_2^(1) cdots z_d^(T) endarrayright = Z_mathrmvec","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The inverse of Z mapsto hatZ we refer to as Y mapsto tildeY. 
In the following we also write hatvarphi for the mapping hatcircvarphicirctilde.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"DEFINITION: We say that a mapping varphi times_textT timesmathbbR^d to times_textT timesmathbbR^d is volume-preserving if the associated hatvarphi is volume-preserving.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In the transformed coordinate system (in terms of the vector Z_mathrmvec defined above) this is equivalent to multiplication by a sparse matrix tildeLambda(Z) from the left:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":" tildeLambda(Z) Z_mathrmvec =\n beginpmatrix\n Lambda(Z) mathbbO cdots mathbbO \n mathbbO Lambda(Z) cdots mathbbO \n cdots cdots ddots cdots \n mathbbO mathbbO cdots Lambda(Z) \n endpmatrix\n leftbeginarrayc z_1^(1) z_1^(2) ldots z_1^(T) z_2^(1) ldots z_d^(T) endarrayright ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"tildeLambda(Z) in m[eq:LambdaApplication]m(@latex) is easily shown to be an orthogonal matrix. ","category":"page"},{"location":"layers/attention_layer/#Historical-Note","page":"Attention","title":"Historical Note","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Attention was used before, but always in connection with recurrent neural networks (see [18] and [13]). ","category":"page"},{"location":"layers/attention_layer/#References","page":"Attention","title":"References","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"D. Bahdanau, K. Cho and Y. Bengio. Neural machine translation by jointly learning to align and translate, arXiv preprint arXiv:1409.0473 (2014).\n\n\n\nM.-T. Luong, H. Pham and C. D. Manning. Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015).\n\n\n\n","category":"page"},{"location":"manifolds/homogeneous_spaces/#Homogeneous-Spaces","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Homogeneous spaces are very important in GeometricMachineLearning as we can generalize existing neural network optimizers from vector spaces to such homogenous spaces. They are intricately linked to the notion of a Lie Group and its Lie Algebra[1].","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"[1]: Recall that a Lie group is a manifold that also has group structure. We say that a Lie group G acts on a manifold mathcalM if there is a map GtimesmathcalM to mathcalM such that (ab)x = a(bx) for abinG and xinmathcalM. For us the Lie algebra belonging to a Lie group, denoted by mathfrakg, is the tangent space to the identity element T_mathbbIG. ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.definition(raw\"A **homogeneous space** is a manifold ``\\mathcal{M}`` on which a Lie group ``G`` acts transitively, i.e.\n\" * Main.indentation * raw\" ```math\n\" * Main.indentation * raw\"\\forall X,Y\\in\\mathcal{M} \\exists{}A\\in{}G\\text{ s.t. 
}AX = Y.\n\" * Main.indentation * raw\"```\n\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Now fix a distinct element EinmathcalM; we will refer to this as the canonical element. We can also establish an isomorphism between mathcalM and the quotient space Gsim with the equivalence relation: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"A_1 sim A_2 iff A_1E = A_2E","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Note that this is independent of the chosen E.","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The tangent spaces of mathcalM are of the form T_YmathcalM = mathfrakgcdotY, i.e. can be fully described through its Lie algebra. Based on this we can perform a splitting of mathfrakg into two parts:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.definition(raw\"A **splitting of the Lie algebra** ``mathfrak{g}`` at an element of a homogeneous space ``Y`` is a decomposition into a **vertical** and a **horizontal** component, denoted by ``\\mathfrak{g} = \\mathfrak{g}^{\\mathrm{ver},Y} \\oplus \\mathfrak{g}^{\\mathrm{hor},Y}`` such that\n\" * Main.indentation * raw\"1. The *vertical component* ``\\mathfrak{g}^{\\mathrm{ver},Y}`` is the kernel of the map ``\\mathfrak{g}\\to{}T_Y\\mathcal{M}, V \\mapsto VY``, i.e. ``\\mathfrak{g}^{\\mathrm{ver},Y} = \\{V\\in\\mathfrak{g}:VY = 0\\}.``\n\" * Main.indentation * raw\"2. The *horizontal component* ``\\mathfrak{g}^{\\mathrm{hor},Y}`` is the orthogonal complement of ``\\mathfrak{g}^{\\mathrm{ver},Y}`` in ``\\mathfrak{g}``. It is isomorphic to ``T_Y\\mathcal{M}``.\n\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"We will refer to the mapping from T_YmathcalM to mathfrakg^mathrmhor Y by Omega. We will give explicit examples of Omega below. If we have now defined a metric langlecdotcdotrangle on mathfrakg, then this induces a Riemannian metric on mathcalM:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"g_Y(Delta_1 Delta_2) = langleOmega(YDelta_1)Omega(YDelta_2)rangletext for Delta_1Delta_2inT_YmathcalM","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Two examples of homogeneous spaces implemented in GeometricMachineLearning are the Stiefel and the Grassmann manifold. The Lie group SO(N) acts transitively on both of these manifolds, i.e. turns them into homogeneous spaces. The Lie algebra of SO(N) are the skew-symmetric matrices mathfrakso(N)=VinmathbbR^NtimesNV^T + V = 0 and the canonical metric associated with it is simply (V_1V_2)mapstofrac12mathrmTr(V_1^TV_2).","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Stiefel-Manifold","page":"Homogeneous Spaces","title":"The Stiefel Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The Stiefel manifold St(n N) is the space of all orthonormal frames in mathbbR^Ntimesn, i.e. matrices YinmathbbR^Ntimesn s.t. Y^TY = mathbbI_n. 
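A quick numerical check of this defining property (the rand constructor for StiefelManifold is explained below):

using GeometricMachineLearning
using LinearAlgebra

Y = rand(StiefelManifold{Float32}, 5, 3)
Y' * Y ≈ I    # true: the columns of Y form an orthonormal frame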
It can also be seen as SO(N) modulo an equivalence relation: AsimBiffAE = BE for ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"E = beginbmatrix\nmathbbI_n \nmathbbO\nendbmatrixinSt(n N)","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"which is the canonical element of the Stiefel manifold. In words: the first n columns of A and B are the same. We also use this principle to draw random elements from the Stiefel manifold.","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.example(raw\"Drawing random elements from the Stiefel (and the Grassmann) manifold is done by first calling `rand(N, n)` (i.e. drawing from a normal distribution) and then performing a ``QR`` decomposition. We then take the first ``n`` columns of the ``Q`` matrix to be an element of the Stiefel manifold.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The tangent space to the element YinSt(nN) can be determined by considering C^infty curves on SO(N) through mathbbI which we write tmapstoA(t). Because SO(N) acts transitively on St(n N) each C^infty curve on St(n N) through Y can be written as A(t)Y and we get: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"T_YSt(nN)=BY Binmathfrakg = DeltainmathbbR^Ntimesn Delta^TY + Y^TDelta = mathbbO","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"where the last equality can be established through the isomorphism ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Omega T_YSt(n N) to mathfrakg^mathrmvec Y Delta mapsto (mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"That this is an isomorphism can be easily checked: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" Omega(Delta)Y = (mathbbI - frac12YY^T)Delta - frac12YDelta^TY = Delta","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The isomorphism is also implemented in GeometricMachineLearning:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"using GeometricMachineLearning\n\nY = rand(StiefelManifold{Float32}, 5, 3)\nΔ = rgrad(Y, rand(Float32, 5, 3))\nGeometricMachineLearning.Ω(Y, Δ) * Y ≈ Δ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The function rgrad is introduced below. 
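We can further check numerically that Omega maps into the Lie algebra mathfrakso(N), i.e. that Omega(Y, Delta) is skew-symmetric (a small sketch set up in the same way as the example above):

using GeometricMachineLearning

Y = rand(StiefelManifold{Float32}, 5, 3)
Δ = rgrad(Y, rand(Float32, 5, 3))
A = GeometricMachineLearning.Ω(Y, Δ)
A ≈ -A'    # true: Ω(Y, Δ) lies in the Lie algebra of skew-symmetric matrices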
","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Riemannian-Gradient-for-the-Stiefel-Manifold","page":"Homogeneous Spaces","title":"The Riemannian Gradient for the Stiefel Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"We defined the Riemannian gradient to be a vector field mathrmgrad^gL such that it is compatible with the Riemannian metric in some sense; the definition we gave relied on an explicit coordinate chart. We can also express the Riemannian gradient for matrix manifolds by not relying on an explicit coordinate representation (which would be computationally expensive) [6].","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.definition(raw\"Given a Riemannian matrix manifold ``\\mathcal{M}`` we define the **Riemannian gradient** of ``L:\\mathcal{M}\\to\\mathbb{R}`` at ``Y``, called ``\\mathrm{grad}_YL\\in{}T_Y\\mathcal{M}``, as the unique element of ``T_Y\\mathcal{M}`` such that for any other ``\\Delta\\in{}T_Y\\mathcal{M}`` we have\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\mathrm{Tr}((\\nabla{}L)^T\\Delta) = g_Y(\\mathrm{grad}_YL, \\Delta),\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where Tr indicates the usual matrix trace.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"For the Stiefel manifold the Riemannian gradient is given by: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" mathrmgrad_YL = nabla_YL - Y(nabla_YL)^TY = mathttrgrad(Y nabla_YL)","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"where nabla_YL refers to the Euclidean gradient, i.e. ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" nabla_YL_ij = fracpartialLpartialy_ij","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The Euclidean gradient nablaL can in practice be obtained with an AD routine. We then use the function rgrad to map nabla_YL from mathbbR^Ntimesn to T_YSt(nN). We can check that this mapping indeed maps to the Riemannian gradient","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"using GeometricMachineLearning\nusing LinearAlgebra: tr\n\nY = rand(StiefelManifold{Float32}, 5, 3)\n∇L = rand(Float32, 5, 3)\ngradL = rgrad(Y, ∇L)\nΔ = rgrad(Y, rand(Float32, 5, 3))\n\nmetric(Y, gradL, Δ) ≈ tr(∇L' * Δ)","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Grassmann-Manifold","page":"Homogeneous Spaces","title":"The Grassmann Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The Grassmann manifold is closely related to the Stiefel manifold, and an element of the Grassmann manifold can be represented through an element of the Stiefel manifold (but not vice-versa). An element of the Grassmann manifold G(nN) is a vector subspace subsetmathbbR^N of dimension n. Each such subspace (i.e. 
element of the Grassmann manifold) can be represented by a full-rank matrix AinmathbbR^Ntimesn and we identify two elements with the following equivalence relation: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" A_1 sim A_2 iff existsCinmathbbR^ntimesntext st A_1C = A_2","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The resulting manifold is of dimension n(N-n). One can find a parametrization of the manifold the following way: Because the matrix Y has full rank, there have to be n independent columns in it: i_1 ldots i_n. For simplicity assume that i_1 = 1 i_2=2 ldots i_n=n and call the matrix made up of these columns C. Then the mapping to the coordinate chart is: YC^-1 and the last N-n columns are the coordinates.","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"We can also define the Grassmann manifold based on the Stiefel manifold since elements of the Stiefel manifold are already full-rank matrices. In this case we have the following equivalence relation (for Y_1 Y_2inSt(nN)): ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" Y_1 sim Y_2 iff existsCinSO(n)text st Y_1C = Y_2","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"In GeometricMachineLearning elements of the Grassmann manifold are drawn the same way as elements of the Stiefel manifold:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"using GeometricMachineLearning\n\nrand(GrassmannManifold{Float32}, 5, 3)","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Riemannian-Gradient-of-the-Grassmann-Manifold","page":"Homogeneous Spaces","title":"The Riemannian Gradient of the Grassmann Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Obtaining the Riemannian Gradient for the Grassmann manifold is slightly more difficult than it is in the case of the Stiefel manifold [6]. Since the Grassmann manifold can be obtained from the Stiefel manifold through an equivalence relation, we can however use this as a starting point. ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.theorem(raw\"The Riemannian gradient of a function ``L`` defined on the Grassmann manifold can be written as\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\mathrm{grad}_\\mathcal{Y}^{Gr}L = \\nabla_Y{}L - YY^T\\nabla_YL,\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where ``\\nabla_Y{}L`` again is again the Euclidean gradient.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.proof(raw\"In a first step we identify charts on the Grassmann manifold to make dealing with it easier. For this consider the following open cover of the Grassmann manifold. 
\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\{\\mathcal{U}_W\\}_{W\\in{}St(n, N)} \\quad\\text{where}\\quad \\mathcal{U}_W = \\{\\mathrm{span}(Y):\\mathrm{det}(W^TY)\\neq0\\}.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"We can find a canonical bijective mapping from the set ``\\mathcal{U}_W`` to the set ``\\mathcal{S}_W := \\{Y\\in\\mathbb{R}^{N\\times{}n}:W^TY=\\mathbb{I}_n\\}``:\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\sigma_W: \\mathcal{U}_W \\to \\mathcal{S}_W,\\, \\mathcal{Y}=\\mathrm{span}(Y)\\mapsto{}Y(W^TY)^{-1} =: \\hat{Y}.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"That ``\\sigma_W`` is well-defined is easy to see: Consider ``YC`` with ``C\\in\\mathbb{R}^{n\\times{}n}`` non-singular. Then ``YC(W^TYC)^{-1}=Y(W^TY)^{-1} = \\hat{Y}``. With this isomorphism we can also find a representation of elements of the tangent space:\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"T_\\mathcal{Y}\\sigma_W: T_\\mathcal{Y}Gr(n,N)\\to{}T_{\\hat{Y}}\\mathcal{S}_W.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"We give an explicit representation of this isomorphism; because the map ``\\sigma_W`` does not care about the representation of ``\\mathrm{span}(Y)`` we can perform the variations in ``St(n,N)``. We write the variations as ``Y(t)\\in{}St(n,N)`` for ``t\\in(-\\varepsilon,\\varepsilon)``. We also set ``Y(0) = Y`` and hence\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\frac{d}{dt}Y(t)(W^TY(t))^{-1} = (\\dot{Y}(0) - Y(W^TY)^{-1}W^T\\dot{Y}(0))(W^TY)^{-1},\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where ``\\dot{Y}(0)\\in{}T_YSt(n,N)``. Also note note that we have ``T_\\mathcal{Y}\\mathcal{U}_W = T_\\mathcal{Y}Gr(n,N)`` because ``\\mathcal{U}_W`` is an open subset of ``Gr(n,N)``. We thus can identify the tangent space ``T_\\mathcal{Y}Gr(n,N)`` with the following set:\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"T_{\\hat{Y}}\\mathcal{S}_W = \\{(\\Delta - YW^T\\Delta)(W^T\\Delta)^{-1}: Y\\in{}St(n,N)\\text{ s.t. }\\mathrm{span}(Y)=\\mathcal{Y}\\text{ and }\\Delta\\in{}T_YSt(n,N)\\}.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"Further note that we can pick any element ``W`` to construct the charts for a neighborhood around the point ``\\mathcal{Y}\\in{}Gr(n,N)`` as long as we have ``\\mathrm{det}(W^TY)\\neq0`` for ``\\mathrm{span}(Y)=\\mathcal{Y}``. We hence take ``W=Y`` and get the identification: \n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"T_\\mathcal{Y}Gr(n,N) \\equiv \\{\\Delta - YY^T\\Delta: Y\\in{}St(n,N)\\text{ s.t. }\\mathrm{span}(Y)=\\mathcal{Y}\\text{ and }\\Delta\\in{}T_YSt(n,N)\\},\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"which is very easy to handle computationally (we simply store and change the matrix ``Y`` that represents an element of the Grassmann manifold). The Riemannian gradient is then \n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\mathrm{grad}_\\mathcal{Y}^{Gr}L = \\mathrm{grad}_Y^{St}L - YY^T\\mathrm{grad}_Y^{St}L = \\nabla_Y{}L - YY^T\\nabla_YL,\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where ``\\mathrm{grad}^{St}_YL`` is the Riemannian gradient of the Stiefel manifold at ``Y``. 
We proved our assertion.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/#Library-Functions","page":"Homogeneous Spaces","title":"Library Functions","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"StiefelManifold\nrand(manifold_type::Type{MT}, ::Integer, ::Integer) where MT <: Manifold\nGeometricMachineLearning.rgrad(::StiefelManifold, ::AbstractMatrix)\nGeometricMachineLearning.metric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix)\nGeometricMachineLearning.Ω(::StiefelManifold{T}, ::AbstractMatrix{T}) where T","category":"page"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.StiefelManifold-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.StiefelManifold","text":"An implementation of the Stiefel manifold [7]. The Stiefel manifold is the collection of all matrices YinmathbbR^Ntimesn whose columns are orthonormal, i.e. \n\n St(n N) = Y Y^TY = mathbbI_n \n\nThe Stiefel manifold can be shown to have manifold structure (as the name suggests) and this is heavily used in GeometricMachineLearning. It is further a compact space. More information can be found in the docstrings for rgrad(::StiefelManifold, ::AbstractMatrix)andmetric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix)`.\n\n\n\n\n\n","category":"type"},{"location":"manifolds/homogeneous_spaces/#Base.rand-Union{Tuple{MT}, Tuple{Type{MT}, Integer, Integer}} where MT<:Manifold-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"Base.rand","text":"rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold\n\nDraw random elements from the Stiefel and the Grassmann manifold. \n\nBecause both of these manifolds are compact spaces we can sample them uniformly [8].\n\nExamples\n\nWhen we call ...\n\nusing GeometricMachineLearning\nimport Random\nRandom.seed!(123)\n\nN, n = 5, 3\nrand(StiefelManifold{Float32}, N, n)\n\n# output\n\n5×3 StiefelManifold{Float32, Matrix{Float32}}:\n -0.275746 0.329913 0.772753\n -0.624851 -0.332242 -0.0685991\n -0.693326 0.36724 -0.189882\n -0.0929493 -0.731446 0.460639\n 0.210203 0.333008 0.387173\n\n... the sampling is done by first allocating a random matrix of size Ntimesn via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output are then the first n columns of the Q matrix. \n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.rgrad-Tuple{StiefelManifold, AbstractMatrix}-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.rgrad","text":"Computes the Riemannian gradient for the Stiefel manifold given an element YinSt(Nn) and a matrix nablaLinmathbbR^Ntimesn (the Euclidean gradient). It computes the Riemannian gradient with respect to the canonical metric (see the documentation for the function metric for an explanation of this). The precise form of the mapping is: \n\nmathttrgrad(Y nablaL) mapsto nablaL - Y(nablaL)^TY\n\nIt is called with inputs:\n\nY::StiefelManifold\ne_grad::AbstractMatrix: i.e. 
the Euclidean gradient (what was called nablaL) above.\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.metric-Tuple{StiefelManifold, AbstractMatrix, AbstractMatrix}-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.metric","text":"Implements the canonical Riemannian metric for the Stiefel manifold:\n\ng_Y (Delta_1 Delta_2) mapsto mathrmtr(Delta_1^T(mathbbI - frac12YY^T)Delta_2)\n\nIt is called with: \n\nY::StiefelManifold\nΔ₁::AbstractMatrix\nΔ₂::AbstractMatrix\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.Ω-Union{Tuple{T}, Tuple{StiefelManifold{T, AT} where AT<:AbstractMatrix{T}, AbstractMatrix{T}}} where T-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.Ω","text":"Implements the canonical horizontal lift for the Stiefel manifold:\n\n (mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)\n\nInternally this performs \n\nSkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')\n\nto save memory. \n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#References","page":"Homogeneous Spaces","title":"References","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).\n\n\n\nT. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).\n\n\n\nT. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).\n\n\n\n","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/#Kolmogorov-n-width","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"","category":"section"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"The Kolmogorov n-width measures how well some set mathcalM (typically the solution manifold) can be approximated with a linear subspace:","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"d_n(mathcalM) = mathrminf_V_nsubsetVmathrmdimV_n=nmathrmsup(uinmathcalM)mathrminf_v_ninV_n u - v_n _V","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"with mathcalMsubsetV and V is a (typically infinite-dimensional) Banach space. For advection-dominated problems (among others) the decay of the Kolmogorov n-width is very slow, i.e. one has to pick n very high in order to obtain useful approximations (see [34] and [35]).","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"In order to overcome this, techniques based on neural networks (see e.g. [26]) and optimal transport (see e.g. [35]) have been used. ","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/#References","page":"Kolmogorov n-width","title":"References","text":"","category":"section"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"T. Blickhan. 
A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).\n\n\n\nC. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).\n\n\n\nK. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).\n\n\n\n","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/#The-Horizontal-Lift","page":"Horizontal Lift","title":"The Horizontal Lift","text":"","category":"section"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"For each element YinmathcalM we can perform a splitting mathfrakg = mathfrakg^mathrmhor Yoplusmathfrakg^mathrmver Y, where the two subspaces are the horizontal and the vertical component of mathfrakg at Y respectively. For homogeneous spaces: T_YmathcalM = mathfrakgcdotY, i.e. every tangent space to mathcalM can be expressed through the application of the Lie algebra to the relevant element. The vertical component consists of those elements of mathfrakg which are mapped to the zero element of T_YmathcalM, i.e. ","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"mathfrakg^mathrmver Y = mathrmker(mathfrakgtoT_YmathcalM)","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"The orthogonal complement[1] of mathfrakg^mathrmver Y is the horizontal component and is referred to by mathfrakg^mathrmhor Y. This is naturally isomorphic to T_YmathcalM. For the Stiefel manifold the horizontal lift has the simple form: ","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"Omega(Y V) = left(mathbbI - frac12right)VY^T - YV^T(mathbbI - frac12YY^T)","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"If the element Y is the distinct element E, then the elements of mathfrakg^mathrmhorE take a particularly simple form, see Global Tangent Space for a description of this. ","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"[1]: The orthogonal complement is taken with respect to a metric defined on mathfrakg. For the case of G=SO(N) and mathfrakg=mathfrakso(N) = AA+A^T =0 this metric can be chosen as (A_1A_2)mapstofrac12A_1^TA_2.","category":"page"},{"location":"Optimizer/#Optimizer","page":"Optimizers","title":"Optimizer","text":"","category":"section"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"In order to generalize neural network optimizers to homogeneous spaces, a class of manifolds we often encounter in machine learning, we have to find a global tangent space representation which we call mathfrakg^mathrmhor here. 
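As a small numerical illustration of the vertical-horizontal splitting described above for the Stiefel manifold (plain linear algebra; the matrices W and V below are constructed only for this illustration): an element of the vertical component annihilates Y, while the horizontal lift Omega(Y, Delta) recovers the tangent vector Delta when applied to Y.

using GeometricMachineLearning
using LinearAlgebra

Y = rand(StiefelManifold{Float32}, 5, 3)

W = rand(Float32, 5, 5)
W = W - W'                               # an arbitrary skew-symmetric matrix
V = (I - Y * Y') * W * (I - Y * Y')      # skew-symmetric and V * Y = 0, so V is vertical at Y
norm(V * Y) / norm(V) < 1f-4             # true up to round-off

Δ = rgrad(Y, rand(Float32, 5, 3))
GeometricMachineLearning.Ω(Y, Δ) * Y ≈ Δ # true: the horizontal lift reproduces Δ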
","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"Starting from an element of the tangent space T_YmathcalM[1], we need to perform two mappings to arrive at mathfrakg^mathrmhor, which we refer to by Omega and a red horizontal arrow:","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"[1]: In practice this is obtained by first using an AD routine on a loss function L, and then computing the Riemannian gradient based on this. See the section of the Stiefel manifold for an example of this.","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"Main.include_graphics(\"tikz/general_optimization_with_boundary\") # hide","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"Here the mapping Omega is a horizontal lift from the tangent space onto the horizontal component of the Lie algebra at Y. ","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"The red line maps the horizontal component at Y, i.e. mathfrakg^mathrmhorY, to the horizontal component at mathfrakg^mathrmhor.","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"The mathrmcache stores information about previous optimization steps and is dependent on the optimizer. The elements of the mathrmcache are also in mathfrakg^mathrmhor. Based on this the optimer (Adam in this case) computes a final velocity, which is the input of a retraction. Because this update is done for mathfrakg^mathrmhorequivT_YmathcalM, we still need to perform a mapping, called apply_section here, that then finally updates the network parameters. The two red lines are described in global sections.","category":"page"},{"location":"Optimizer/#References","page":"Optimizers","title":"References","text":"","category":"section"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"B. Brantner. 
Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).\n\n\n\n","category":"page"},{"location":"","page":"Home","title":"Home","text":"CurrentModule = GeometricMachineLearning","category":"page"},{"location":"#Geometric-Machine-Learning","page":"Home","title":"Geometric Machine Learning","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"GeometricMachineLearning.jl implements various scientific machine learning models that aim at learning dynamical systems with geometric structure, such as Hamiltonian (symplectic) or Lagrangian (variational) systems.","category":"page"},{"location":"#Installation","page":"Home","title":"Installation","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"GeometricMachineLearning.jl and all of its dependencies can be installed via the Julia REPL by typing ","category":"page"},{"location":"","page":"Home","title":"Home","text":"]add GeometricMachineLearning","category":"page"},{"location":"#Architectures","page":"Home","title":"Architectures","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"There are several architectures tailored towards problems in scientific machine learning implemented in GeometricMachineLearning.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"architectures/sympnet.md\",\n]","category":"page"},{"location":"#Manifolds","page":"Home","title":"Manifolds","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"GeometricMachineLearning supports putting neural network weights on manifolds. These include:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"manifolds/grassmann_manifold.md\",\n \"manifolds/stiefel_manifold.md\",\n]","category":"page"},{"location":"#Special-Neural-Network-Layer","page":"Home","title":"Special Neural Network Layer","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Many layers have been adapted in order to be used for problems in scientific machine learning. Including:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"layers/attention_layer.md\",\n]","category":"page"},{"location":"#Tutorials","page":"Home","title":"Tutorials","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Tutorials for using GeometricMachineLearning are: ","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"tutorials/sympnet_tutorial.md\",\n \"tutorials/mnist_tutorial.md\",\n]","category":"page"},{"location":"#Reduced-Order-Modeling","page":"Home","title":"Reduced Order Modeling","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"A short description of the key concepts in reduced order modeling (where GeometricMachineLearning can be used) are in:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"reduced_order_modeling/autoencoder.md\",\n \"reduced_order_modeling/symplectic_autoencoder.md\",\n \"reduced_order_modeling/kolmogorov_n_width.md\",\n]","category":"page"},{"location":"references/#References","page":"References","title":"References","text":"","category":"section"},{"location":"references/","page":"References","title":"References","text":"S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).\n\n\n\nS. Lang. Fundamentals of differential geometry. Vol. 
191 (Springer Science & Business Media, 2012).\n\n\n\nS. I. Richard L. Bishop. Tensor Analysis on Manifolds (Dover Publications, 1980).\n\n\n\nS. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).\n\n\n\nM. P. Do Carmo and J. Flaherty Francis. Riemannian geometry. Vol. 2 (Springer, 1992).\n\n\n\nP.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).\n\n\n\nE. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).\n\n\n\nF. Mezzadri. How to generate random matrices from the classical compact groups, arXiv preprint math-ph/0609050 (2006).\n\n\n\nW. S. Moses, V. Churavy, L. Paehler, J. Hückelheim, S. H. Narayanan, M. Schanen and J. Doerfert. Reverse-Mode Automatic Differentiation and Optimization of GPU Kernels via Enzyme. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC '21 (Association for Computing Machinery, New York, NY, USA, 2021).\n\n\n\nM. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).\n\n\n\nJ. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. Advances in Neural Information Processing Systems 33, 10809–10819 (2020).\n\n\n\nJ. N. Stephen J. Wright. Numerical optimization (Springer Science+Business Media, 2006).\n\n\n\nD. Bahdanau, K. Cho and Y. Bengio. Neural machine translation by jointly learning to align and translate, arXiv preprint arXiv:1409.0473 (2014).\n\n\n\nA. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).\n\n\n\nK. Jacobs. Discrete Stochastics (Birkhäuser Verlag, Basel, Switzerland, 1992).\n\n\n\nP.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).\n\n\n\nK. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).\n\n\n\nM.-T. Luong, H. Pham and C. D. Manning. Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015).\n\n\n\nK. Feng and M.-z. Qin. The symplectic methods for the computation of Hamiltonian equations. In: Numerical Methods for Partial Differential Equations: Proceedings of a Conference held in Shanghai, PR China, March 25–29, 1987 (Springer, 1987); pp. 1–37.\n\n\n\nZ. Ge and K. Feng. On the approximation of linear Hamiltonian systems. Journal of Computational Mathematics, 88–97 (1988).\n\n\n\nB. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312:11166v2 (2024).\n\n\n\nB. Leimkuhler and S. Reich. Simulating hamiltonian dynamics. No. 14 (Cambridge university press, 2004).\n\n\n\nM. Kraus. GeometricIntegrators.jl: Geometric Numerical Integration in Julia, https://github.com/JuliaGNI/GeometricIntegrators.jl (2020).\n\n\n\nS. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).\n\n\n\nS. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. 
Journal of Scientific Computing 87, 1–36 (2021).\n\n\n\nK. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).\n\n\n\nA. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).\n\n\n\nA. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, beta-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).\n\n\n\nP. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).\n\n\n\nP. Jin, Z. Lin and B. Xiao. Optimal unit triangular factorization of symplectic matrices. Linear Algebra and its Applications (2022).\n\n\n\nN. Patwardhan, S. Marrone and C. Sansone. Transformers in the real world: A survey on nlp applications. Information 14, 242 (2023).\n\n\n\nP. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).\n\n\n\nL. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).\n\n\n\nC. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).\n\n\n\nT. Blickhan. A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).\n\n\n\nB. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).\n\n\n\nT. Lin and H. Zha. Riemannian manifold learning. IEEE transactions on pattern analysis and machine intelligence 30, 796–809 (2008).\n\n\n\nT. Blickhan. BrenierTwoFluids.jl, https://github.com/ToBlick/BrenierTwoFluids (2023).\n\n\n\nI. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).\n\n\n\nB. Brantner and M. Kraus. Symplectic autoencoders for Model Reduction of Hamiltonian Systems, arXiv preprint arXiv:2312.10004 (2023).\n\n\n\nT. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).\n\n\n\nT. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).\n\n\n\nT. Bendokat, R. Zimmermann and P.-A. Absil. A Grassmann manifold handbook: Basic geometry and computational aspects, arXiv preprint arXiv:2011.13699 (2020).\n\n\n\nB. Brantner, G. de Romemont, M. Kraus and Z. Li. Structure-Preserving Transformers for Learning Parametrized Hamiltonian Systems, arXiv preprint arXiv:2312:11166 (2023).\n\n\n\n","category":"page"},{"location":"manifolds/manifolds/#(Matrix)-Manifolds","page":"General Theory on Manifolds","title":"(Matrix) Manifolds","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Manifolds are topological spaces that locally look like vector spaces. In the following we restrict ourselves to finite-dimensional smooth[1] manifolds. 
In this section we routinely denote points on a manifold by lower case letters like x y and z if we speak about general properties and by upper case letters like A and B if we talk about specific examples of matrix manifolds. All manifolds that can be used to build neural networks in GeometricMachineLearning, such as the Stiefel manifold and the Grassmann manifold are matrix manifolds.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[1]: Smooth here means C^infty.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A **finite-dimensional smooth manifold** of dimension ``n`` is a second-countable Hausdorff space ``\\mathcal{M}`` for which ``\\forall{}x\\in\\mathcal{M}`` we can find a neighborhood ``U`` that contains ``x`` and a corresponding homeomorphism ``\\varphi_U:U\\cong{}W\\subset\\mathbb{R}^n`` where ``W`` is an open subset. The homeomorphisms ``\\varphi_U`` are referred to as *coordinate charts*. If two such coordinate charts overlap, i.e. if ``U_1\\cap{}U_2\\neq\\{\\}``, then the map ``\\varphi_{U_2}^{-1}\\circ\\varphi_{U_1}`` has to be ``C^\\infty``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"One example of a manifold that is also important for GeometricMachineLearning is the Lie group[2] of orthonormal matrices SO(N). Before we can proof that SO(N) is a manifold we first need the preimage theorem.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[2]: Lie groups are manifolds that also have a group structure, i.e. there is an operation mathcalMtimesmathcalMtomathcalM(ab)mapstoab s.t. (ab)c = a(bc) and there exists a neutral elementemathcalM s.t. ae = a forallainmathcalM as well as an (for every a) inverse element a^-1 s.t. a(a^-1) = e. The neutral element e we refer to as mathbbI when dealing with matrix manifolds.","category":"page"},{"location":"manifolds/manifolds/#The-Preimage-Theorem","page":"General Theory on Manifolds","title":"The Preimage Theorem","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Before we can state the preimage theorem we need another definition[3]:","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[3]: In this definition we use the notation T_xg. This will be explained below. For we will interpret T_xg simply as (varphi_Ucircgcircpsi_V^-1) where varphi_U is a coordinate chart around y = g(x) and psi_V is a coordinate chart around x.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"Consider a smooth mapping ``g: \\mathcal{M}\\to\\mathcal{N}`` from one manifold to another. 
A point ``y\\in\\mathcal{N}`` is called a **regular value** of ``g`` if ``\\forall{}x\\in{}g^{-1}\\{y\\}`` the map ``T_xg:T_A\\mathcal{M}\\to{}T_{y}\\mathcal{N}`` is surjective.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"We now state the preimage theorem:","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.theorem(raw\"Consider a smooth map ``g:\\mathcal{M}\\to\\mathcal{N}`` from one manifold to another (we assume the dimensions of the two manifolds to be ``m+n`` and ``m`` respectively). Then the preimage of a regular point ``y`` of ``\\mathcal{N}`` is a submanifold of ``\\mathcal{M}``. Furthermore the codimension of ``g^{-1}\\{y\\}`` is equal to the dimension of ``\\mathcal{N}`` and the tangent space ``T_x(g^{-1}\\{y\\})`` is equal to the kernel of ``T_xg``.\"; name = \"Preimage Theorem\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.proof(raw\"Because ``\\mathcal{N}`` has manifold structure we can find a chart ``\\varphi_U:U\\to\\mathbb{R}^m`` for some neighborhood ``U`` that contains ``y``. We further consider a point ``A\\in{}g^{-1}\\{y\\}`` and a chart around it ``\\psi_V:V\\to\\mathbb{R}^{m+n}``. By the implicit function theorem we can then find a mapping ``h`` that turns ``\\varphi_U\\circ{}g\\circ\\psi_V^{-1}`` into a projection ``(x_1, \\ldots, x_{n+m}) \\mapsto (x_{n+1}, \\ldots, x_{n+m})``. We now consider the neighborhood ``V_1\\times\\{0\\} = \\psi(V \\cup f^{-1}\\{y\\})`` for ``\\psi(V) = V_1\\times{}V_2`` with the coordinate chart ``(x_1, \\ldots, x_n) \\mapsto \\psi(x_1, \\ldots, x_n, 0, \\ldots, 0).`` As this map is also smooth by the implicit function theorem this proofs our assertion.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.example(raw\"The group ``SO(N)`` is a Lie group (i.e. has manifold structure).\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.proof(raw\"The vector space ``\\mathbb{R}^{N\\times{}N}`` clearly has manifold structure. The group ``SO(N)`` is equivalent to one of the level sets of the mapping: ``g:\\mathbb{R}^{N\\times{}N}\\to\\mathcal{S}(N), A\\mapsto{}A^TA - \\mathbb{I}``, i.e. it is the component of ``f^{-1}\\{\\mathbb{I}\\}`` that contains ``\\mathbb{I}``. We still need to proof that ``\\mathbb{I}`` is a regular point of ``g``, i.e. that for ``A\\in{}SO(N)`` the mapping ``T_Ag`` is surjective. This means that ``\\forall{}B\\in\\mathcal{S}(N), A\\in\\mathbb{R}^{N\\times{}N}`` ``\\exists{}C\\in\\mathbb{R}^{N\\times{}N}`` s.t. ``C^TA + A^TC = B``. 
The element ``C=\\frac{1}{2}AB\\in\\mathbb{R}^{N\\times{}N}`` satisfies this property.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Similarly we can also prove: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.example(raw\"The sphere ``S^n:=\\{x\\in\\mathbb{R}^{n+1}: x^Tx = 1\\}`` is a manifold of dimension ``n``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.proof(raw\"Take ``g(x) = x^Tx - 1`` and proceed as in the case of ``SO(N)``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Note that both these manifolds, SO(N) and S^n, are matrix manifolds, i.e. an element of mathcalM can be written as an element of mathbbR^NtimesN in the first case and mathbbR^ntimes1 in the second case. The additional conditions we impose on these manifolds are A^TA = mathbbI in the first case and x^Tx = 1 in the second case. Both of these manifolds belong to the category of Stiefel manifolds.","category":"page"},{"location":"manifolds/manifolds/#Tangent-Spaces","page":"General Theory on Manifolds","title":"Tangent Spaces","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"A tangent space can be seen as the collection of all possible velocities a curve can take at a point on a manifold. For this consider a manifold mathcalM, a point x on it, and the collection of C^infty curves through x: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A mapping ``\\gamma:(-\\epsilon, \\epsilon)\\to\\mathcal{M}`` that is ``C^\\infty`` and for which we have ``\\gamma(0) = x`` is called a **``C^\\infty`` curve through ``x``**.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"The tangent space of mathcalM at x is the collection of the first derivatives of all gamma: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"The **tangent space** of ``\\mathcal{M}`` at ``x`` is the collection of all ``C^\\infty`` curves at ``x`` modulo the equivalence class ``\\gamma_1 \\sim \\gamma_2 \\iff \\gamma_1'(0) = \\gamma_2'(0)``. It is denoted by ``T_x\\mathcal{M}``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"As is customary we write gamma for the equivalence class of gamma and this is by definition equivalent to gamma(0). The tangent space T_xmathcalM can be shown to be homeomorphic[4] to mathbbR^n where n is the dimension of the manifold mathcalM. If the homeomorphism is constructed through the coordinate chart (varphi U) we call it varphi(x) or simply[5] varphi. If we are given a map gmathcalMtomathcalN we further define T_xg = (varphi)^-1circ(varphicircgpsi^-1)circpsi, i.e. 
a smooth map between two manifolds mathcalM and mathcalN induces a smooth map between the tangent spaces T_xmathcalM and T_g(x)mathcalN.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[4]: Note that we have not formally defined addition for T_xmathcalM. This can be done through the definition gamma + beta = alpha where alpha is any C^infty curve through x that satisfies alpha(0) = beta(0) + gamma(0). Note that we can always find such an alpha by the existence and uniqueness theorem.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[5]: We will further discuss this when we introduce the tangent bundle.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"We want to demonstrate this principle of constructing the tangent space from curves through the example of S^2. We consider the following curves: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"gamma_1(t) = beginpmatrix 0 sin(t) cos(t) endpmatrix\ngamma_2(t) = beginpmatrix sin(t) 0 cos(t) endpmatrix\ngamma_3(t) = beginpmatrix exp(-t ^ 2 2) t sin(t) exp(-t ^ 2 2) t cos(t) sqrt1 - (t ^ 2) exp(-t^2) endpmatrix","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"We now plot the manifold S^2, the three curves described above and the associated tangent vectors (visualized as arrows). Note that the tangent vectors induced by gamma_1 and gamma_3 are the same; for these curves we have gamma_1 sim gamma_3 and the tangent vectors of those two curves coincide: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"using CairoMakie\nusing ForwardDiff\nusing LaTeXStrings\n\nfunction plot_curve!(ax, gamma::Function; epsilon_range::T = 1.4, epsilon_spacing::T = .01, kwargs...) where T\n curve_domain = -epsilon_range : epsilon_spacing : epsilon_range\n curve = zeros(T, 3, length(curve_domain))\n for (i, t) in zip(axes(curve_domain, 1), curve_domain)\n curve[:, i] .= gamma(t)\n end\n lines!(ax, curve[1, :], curve[2, :], curve[3, :]; kwargs...)\nend\n\nfunction plot_arrow!(ax, gamma::Function; kwargs...)\n arrow_val = ForwardDiff.derivative(gamma, 0.)\n\n gamma_vec = ([gamma(0)[1]], [gamma(0)[2]], [gamma(0)[3]])\n gamma_deriv_vec = ([arrow_val[1]], [arrow_val[2]], [arrow_val[3]])\n\n arrows!(ax, gamma_vec..., gamma_deriv_vec...; kwargs...)\nend\n\nfunction tangent_space(; n = 100)\n xs = LinRange(-1.2, 1.2, n)\n ys = LinRange(-1.2, 1.2, n)\n zs = [one(x) * one(y) for x in xs, y in ys]\n xs, ys, zs\nend\n\ngamma_1(t) = [zero(t), sin(t), cos(t)]\ngamma_2(t) = [sin(t), zero(t), cos(t)]\ngamma_3(t) = [exp(-t ^ 2 / 2) * (t ^ 1) * sin(t), exp(-t ^ 2 / 2) * (t ^ 1) * cos(t), sqrt(1 - (t ^ 2) * exp(-t^2))]\n\ncurves = (gamma_1, gamma_2, gamma_3)\n\nmorange = RGBf(255 / 256, 127 / 256, 14 / 256)\nmblue = RGBf(31 / 256, 119 / 256, 180 / 256)\nmred = RGBf(214 / 256, 39 / 256, 40 / 256)\nmpurple = RGBf(148 / 256, 103 / 256, 189 / 256)\nmgreen = RGBf(44 / 256, 160 / 256, 44 / 256)\n\ncolors = (morange, mblue, mred)\n\nfunction make_plot(; theme = :light)\n text_color = theme == :light ? 
:black : :white\n\n fig = Figure(; backgroundcolor = :transparent)\n\n ax = Axis3(fig[1, 1]; \n backgroundcolor = :transparent, \n aspect = (1., 1., 0.8), \n azimuth = π / 6, \n elevation = π / 8, \n xlabel = rich(\"x\", subscript(\"1\"), font = :italic, color = text_color),\n ylabel = rich(\"x\", subscript(\"2\"), font = :italic, color = text_color),\n zlabel = rich(\"x\", subscript(\"3\"), font = :italic, color = text_color),\n )\n\n surface!(Main.sphere(1., [0., 0., 0.])...; alpha = .6)\n\n for (i, curve, color) in zip(1:length(curves), curves, colors)\n plot_curve!(ax, curve; label = rich(\"γ\", subscript(string(i)); color = text_color, font = :italic), linewidth = 2, color = color)\n end\n\n surface!(ax, tangent_space()...; alpha = .2)\n text!(.9, -.9, 1.; text = L\"T_x\\mathcal{M}\", color = text_color)\n\n for (i, curve, color) in zip(1:length(curves), curves, colors)\n plot_arrow!(ax, curve; linewidth = .03, color = color)\n end\n\n axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color)\n\n fig, ax\nend\n\nif Main.output_type == :html\n save(\"tangent_space.png\", make_plot(; theme = :light)[1]; px_per_unit = 1.5)\n save(\"tangent_space_dark.png\", make_plot(; theme = :dark )[1]; px_per_unit = 1.5)\nelseif Main.output_type == :latex\n save(\"tangent_space.png\", make_plot(; theme = :light)[1]; px_per_unit = 2.0)\nend\n\nnothing","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.include_graphics(\"tangent_space\"; caption = raw\"Visualization of how the tangent space is constructed.\", width = .8) # hide","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"The tangent space T_xmathcalM for","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"x = beginpmatrix0 0 1 endpmatrix\n","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"is also shown. ","category":"page"},{"location":"manifolds/manifolds/#Vector-Fields","page":"General Theory on Manifolds","title":"Vector Fields","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"A time-independent vector field[6] is an object that specifies a velocity for every point on a domain. 
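For instance, the undamped harmonic oscillator prescribes such a velocity at every point of mathbbR^2 (a tiny plain-Julia illustration):

X(x) = [x[2], -x[1]]    # the velocity prescribed at the point x = (q, p)
X([1.0, 0.0])           # returns [0.0, -1.0]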
We first give the definition of a vector field on the vector space mathbbR^n and limit ourselves here to C^infty vector fields:","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[6]: Also called ordinary differential equation (ODE).","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A **vector field** on ``\\mathbb{R}^n`` is a smooth map ``X:\\mathbb{R}^n\\to\\mathbb{R}^n``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"The definition of a vector field on a manifold is not much more complicated: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A **vector field** on ``\\mathcal{M}`` is a map ``X`` defined on ``\\mathcal{M}`` such that ``X(x)\\in{}T_x\\mathcal{M}`` and ``\\varphi'\\circ{}X\\circ(\\varphi)^{-1}`` is smooth for any coordinate chart ``(\\varphi, U)`` that contains ``x``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"In the section on the existence-and-uniqueness theorem we show that every vector field has a unique solution given an initial condition; i.e. given a point xinmathcalM and a vector field X we can find a curve gamma such that gamma(0) = x and gamma(t) = X(gamma(t)) for all t in some interval (-epsilon epsilon).","category":"page"},{"location":"manifolds/manifolds/#The-Tangent-Bundle","page":"General Theory on Manifolds","title":"The Tangent Bundle","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"To each manifold mathcalM we can associate another manifold which we call the tangent bundle and denote by TmathcalM. The points on this manifold are: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"TmathcalM = (x v_x) xinmathcalM v_xinT_xmathcalM ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Coordinate charts on this manifold can be constructed in a straightforward manner; for every coordinate chart varphi_U the map varphi_U(x) gives a homeomorphism between T_xmathcalM and mathbbR^nfor any xinU. We can then find a neighborhood of any point (x v_x) by taking pi^-1(U) = (x v_x) xinU v_xinT_xmathcalM and this neighborhood is isomorphic to mathbbR^2n via (x v_x) mapsto (varphi_U(x) varphi(x)v_x). The geodesic spray is an important vector field defined on TmathcalM.","category":"page"},{"location":"manifolds/manifolds/#Library-Functions","page":"General Theory on Manifolds","title":"Library Functions","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Manifold","category":"page"},{"location":"manifolds/manifolds/#GeometricMachineLearning.Manifold-manifolds-manifolds","page":"General Theory on Manifolds","title":"GeometricMachineLearning.Manifold","text":"A manifold in GeometricMachineLearning is a sutype of AbstractMatrix. All manifolds are matrix manifolds and therefore stored as matrices. 
More details can be found in the docstrings for the StiefelManifold and the GrassmannManifold.\n\n\n\n\n\n","category":"type"},{"location":"manifolds/manifolds/#References","page":"General Theory on Manifolds","title":"References","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"P.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).\n\n\n\n","category":"page"},{"location":"architectures/volume_preserving_feedforward/#Volume-Preserving-Feedforward-Neural-Network","page":"Volume-Preserving FeedForward","title":"Volume-Preserving Feedforward Neural Network","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/#Neural-network-architecture","page":"Volume-Preserving FeedForward","title":"Neural network architecture","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"The constructor produces the following architecture[1]:","category":"page"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"[1]: Based on the input arguments n_linear and n_blocks. In this example init_upper is set to false, which means that the first layer is of type lower followed by a layer of type upper. ","category":"page"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"Main.include_graphics(\"../tikz/vp_feedforward\") # hide","category":"page"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"Here LinearLowerLayer performs x mapsto x + Lx and NonLinearLowerLayer performs x mapsto x + sigma(Lx + b). The activation function sigma is the forth input argument to the constructor and tanh by default. ","category":"page"},{"location":"architectures/volume_preserving_feedforward/#Note-on-Sympnets","page":"Volume-Preserving FeedForward","title":"Note on Sympnets","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"As SympNets are symplectic maps, they also conserve phase space volume and therefore form a subcategory of volume-preserving feedforward layers. ","category":"page"},{"location":"architectures/volume_preserving_feedforward/#Library-Functions","page":"Volume-Preserving FeedForward","title":"Library Functions","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"VolumePreservingFeedForward","category":"page"},{"location":"architectures/volume_preserving_feedforward/#GeometricMachineLearning.VolumePreservingFeedForward-architectures-volume_preserving_feedforward","page":"Volume-Preserving FeedForward","title":"GeometricMachineLearning.VolumePreservingFeedForward","text":"Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer. \n\nConstructor\n\nThe constructor is called with the following arguments: \n\nsys_dim::Int: The system dimension. 
\nn_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nactivation: The activation function for the nonlinear layers in a block. \ninit_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper. \n\n\n\n\n\n","category":"type"},{"location":"library/","page":"Library","title":"Library","text":"CurrentModule = GeometricMachineLearning","category":"page"},{"location":"library/#GeometricMachineLearning-Library-Functions","page":"Library","title":"GeometricMachineLearning Library Functions","text":"","category":"section"},{"location":"library/","page":"Library","title":"Library","text":"Modules = [GeometricMachineLearning]","category":"page"},{"location":"library/#AbstractNeuralNetworks.Chain-Tuple{GSympNet}","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Chain can also be called with a neural network as input.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, false, false}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is false and init_upper_act is false.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, false, true}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is false and init_upper_act is true.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, true, false}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is true and init_upper_act is false.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, true, true}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is true and init_upper_act is true.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.AbstractCache","page":"Library","title":"GeometricMachineLearning.AbstractCache","text":"AbstractCache has subtypes: \n\nAdamCache\nMomentumCache\nGradientCache\nBFGSCache\n\nAll of them can be initialized with providing an array (also supporting manifold types).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AbstractLieAlgHorMatrix","page":"Library","title":"GeometricMachineLearning.AbstractLieAlgHorMatrix","text":"AbstractLieAlgHorMatrix is a supertype for various horizontal components of Lie algebras. We usually call this mathfrakg^mathrmhor.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AbstractRetraction","page":"Library","title":"GeometricMachineLearning.AbstractRetraction","text":"AbstractRetraction is a type that comprises all retraction methods for manifolds. 
For every manifold layer one has to specify a retraction method that takes the layer and elements of the (global) tangent space.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ActivationLayer","page":"Library","title":"GeometricMachineLearning.ActivationLayer","text":"ActivationLayer is the struct corresponding to the constructors ActivationLayerQ and ActivationLayerP. See those for more information.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ActivationLayerP-Tuple{Any, Any}","page":"Library","title":"GeometricMachineLearning.ActivationLayerP","text":"Performs:\n\nbeginpmatrix\n q p\nendpmatrix mapsto \nbeginpmatrix\n q p + mathrmdiag(a)sigma(q)\nendpmatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.ActivationLayerQ-Tuple{Any, Any}","page":"Library","title":"GeometricMachineLearning.ActivationLayerQ","text":"Performs:\n\nbeginpmatrix\n q p\nendpmatrix mapsto \nbeginpmatrix\n q + mathrmdiag(a)sigma(p) p\nendpmatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.AdamOptimizer","page":"Library","title":"GeometricMachineLearning.AdamOptimizer","text":"Defines the Adam Optimizer. Algorithm and suggested defaults are taken from [39] (page 301).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AdamOptimizerWithDecay","page":"Library","title":"GeometricMachineLearning.AdamOptimizerWithDecay","text":"Defines the Adam Optimizer with weight decay.\n\nConstructors\n\nThe default constructor takes as input: \n\nn_epochs::Int\nη₁: the learning rate at the start \nη₂: the learning rate at the end \nρ₁: the decay parameter for the first moment \nρ₂: the decay parameter for the second moment\nδ: the safety parameter \nT (keyword argument): the type. \n\nThe second constructor is called with: \n\nn_epochs::Int\nT\n\n... the rest are keyword arguments\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AutoEncoder","page":"Library","title":"GeometricMachineLearning.AutoEncoder","text":"The autoencoder architecture\n\nAn autoencoder [39] is a neural network consisting of an encoder Psi^e and a decoder Psi^d. In the simplest case they are trained on some data set mathcalD to reduce the following error: \n\nPsi^dcircPsi^e(mathcalD) - mathcalD\n\nwhich we call the reconstruction error or autoencoder error (see the docs for AutoEncoderLoss) and cdot is some norm.\n\nImplementation details.\n\nAbstract AutoEncoder type. If a custom <:AutoEncoder architecture is implemented it should have the fields full_dim, reduced_dim, n_encoder_blocks and n_decoder_blocks. Further the routines encoder, decoder, encoder_parameters and decoder_parameters should be extended.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AutoEncoderLoss","page":"Library","title":"GeometricMachineLearning.AutoEncoderLoss","text":"This loss should always be used together with a neural network of type AutoEncoder (and it is also the default for training such a network). \n\nIt simply computes: \n\nmathttAutoEncoderLoss(nnmathttLoss input) = nn(input) - input\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BFGSCache","page":"Library","title":"GeometricMachineLearning.BFGSCache","text":"The cache for the BFGS optimizer.\n\nIt stores an array for the previous time step B and the inverse of the Hessian matrix H.\n\nIt is important to note that setting up this cache already requires a derivative! 
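A minimal usage sketch for the optimizer types listed in this library section (not part of the generated docstrings; the NeuralNetwork constructor signature and the default Adam hyperparameters are assumptions):

using GeometricMachineLearning

arch = GSympNet(2)                         # any architecture works here
nn = NeuralNetwork(arch, CPU(), Float64)   # assumed constructor: architecture, backend, number type
method = AdamOptimizer()                   # Adam with the suggested default hyperparameters
o = Optimizer(method, nn)                  # sets up a cache for every parameter array of nn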
This is not the case for the other optimizers.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BFGSDummyCache","page":"Library","title":"GeometricMachineLearning.BFGSDummyCache","text":"In order to initialize BGGSCache we first need gradient information. This is why we initially have this BFGSDummyCache until gradient information is available.\n\nNOTE: we may not need this. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BFGSOptimizer","page":"Library","title":"GeometricMachineLearning.BFGSOptimizer","text":"This is an implementation of the Broyden-Fletcher-Goldfarb-Shanno (BFGS) optimizer. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Batch","page":"Library","title":"GeometricMachineLearning.Batch","text":"Batch is a struct whose functor acts on an instance of DataLoader to produce a sequence of training samples for training for one epoch. \n\nThe Constructor\n\nThe constructor for Batch is called with: \n\nbatch_size::Int\nseq_length::Int (optional)\nprediction_window::Int (optional)\n\nThe first one of these arguments is required; it indicates the number of training samples in a batch. If we deal with time series data then we can additionaly supply a sequence length and a prediction window as input arguments to Batch. These indicate the number of input vectors and the number of output vectors.\n\nThe functor\n\nAn instance of Batch can be called on an instance of DataLoader to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. The output of applying batch:Batch to dl::DataLoader is a tuple of vectors of integers. Each of these vectors contains two integers: the first is the time index and the second one is the parameter index.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BiasLayer","page":"Library","title":"GeometricMachineLearning.BiasLayer","text":"A bias layer that does nothing more than add a vector to the input. This is needed for LA-SympNets.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Classification","page":"Library","title":"GeometricMachineLearning.Classification","text":"Classification Layer that takes a matrix as an input and returns a vector that is used for MNIST classification. \n\nIt has the following arguments: \n\nM: input dimension \nN: output dimension \nactivation: the activation function \n\nAnd the following optional argument: \n\naverage: If this is set to true, then the output is computed as frac1Nsum_i=1^Ninput_bulleti. If set to false (the default) it picks the last column of the input. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ClassificationTransformer","page":"Library","title":"GeometricMachineLearning.ClassificationTransformer","text":"This is a transformer neural network for classification purposes. At the moment this is only used for training on MNIST, but can in theory be used for any classification problem.\n\nIt has to be called with a DataLoader that stores an input and an output tensor. The optional arguments are: \n\nn_heads: The number of heads in the MultiHeadAttention (mha) layers. Default: 7.\nn_layers: The number of transformer layers. Default: 16.\nactivation: The activation function. Default: softmax.\nStiefel: Wheter the matrices in the mha layers are on the Stiefel manifold. \nadd_connection: Whether the input is appended to the output of the mha layer. 
(skip connection)\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.DataLoader","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. \n\nConstructor\n\nThe data loader can be called with various inputs:\n\nA single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).\nA single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps. \nA single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.\nA tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are n_p matrices (first input argument) and n_p integers (second input argument).\nA NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors. \nAn EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.\n\nWhen we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.\n\nFields of DataLoader\n\nThe fields of the DataLoader struct are the following: \n\ninput: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters.\noutput: The tensor that contains the output (supervised learning) - this may be of type Nothing if the constructor is only called with one tensor (unsupervised learning).\ninput_dim: The dimension of the system, i.e. what is taken as input by a regular neural network.\ninput_time_steps: The length of the entire time series (length of the second axis).\nn_params: The number of parameters that are present in the data set (length of third axis)\noutput_dim: The dimension of the output tensor (first axis). If output is of type Nothing, then this is also of type Nothing.\noutput_time_steps: The size of the second axis of the output tensor. If output is of type Nothing, then this is also of type Nothing.\n\nThe input and output fields of DataLoader\n\nEven though the arguments to the Constructor may be vectors or matrices, internally DataLoader always stores tensors.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.DataLoader-Union{Tuple{@NamedTuple{q::AT, p::AT}}, Tuple{AT}, Tuple{T}} where {T, AT<:AbstractMatrix{T}}","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. \n\nConstructor\n\nThe data loader can be called with various inputs:\n\nA single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. 
the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).\nA single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps. \nA single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.\nA tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are n_p matrices (first input argument) and n_p integers (second input argument).\nA NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors. \nAn EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.\n\nWhen we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.DataLoader-Union{Tuple{GeometricSolutions.EnsembleSolution{T, T1, Vector{ST}}}, Tuple{ST}, Tuple{DT}, Tuple{T1}, Tuple{T}} where {T, T1, DT, ST<:(GeometricSolutions.GeometricSolution{T, T1, @NamedTuple{q::DT}})}","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Constructor for EnsembleSolution from package GeometricSolutions with field q.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.DataLoader-Union{Tuple{GeometricSolutions.EnsembleSolution{T, T1, Vector{ST}}}, Tuple{ST}, Tuple{DT}, Tuple{T1}, Tuple{T}} where {T, T1, DT<:(GeometricSolutions.DataSeries{T, AT} where AT<:Union{AbstractArray{T}, T}), ST<:(GeometricSolutions.GeometricSolution{T, T1, @NamedTuple{q::DT, p::DT}})}","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Constructor for EnsembleSolution form package GeometricSolutions with fields q and p.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Decoder","page":"Library","title":"GeometricMachineLearning.Decoder","text":"Abstract Decoder type. If a custom <:Decoder architecture is implemented it should have the fields full_dim, reduced_dim and n_decoder_blocks.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Encoder","page":"Library","title":"GeometricMachineLearning.Encoder","text":"Abstract Encoder type. If a custom <:Encoder architecture is implemented it should have the fields full_dim, reduced_dim and n_encoder_blocks.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GSympNet","page":"Library","title":"GeometricMachineLearning.GSympNet","text":"GSympNet is called with a single input argument, the system dimension, or with an instance of DataLoader. Optional input arguments are: \n\nupscaling_dimension::Int: The upscaling dimension of the gradient layer. See the documentation for GradientLayerQ and GradientLayerP for further explanation. The default is 2*dim.\nn_layers::Int: The number of layers (i.e. the total number of GradientLayerQ and GradientLayerP). The default is 2.\nactivation: The activation function that is applied. 
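A hedged sketch of how the DataLoader and GSympNet constructors described above fit together (field names are taken from the docstring; treating the optional GSympNet arguments as keywords is an assumption):

using GeometricMachineLearning

data = rand(Float64, 2, 100, 5)                   # (system dimension, time steps, parameters)
dl = DataLoader(data)                             # stored as an integration problem
(dl.input_dim, dl.input_time_steps, dl.n_params)  # -> (2, 100, 5)
arch = GSympNet(dl; n_layers = 4)                 # system dimension is read off the DataLoader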
By default this is tanh.\ninit_upper::Bool: Initialize the gradient layer so that it first modifies the q-component. The default is true.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GlobalSection","page":"Library","title":"GeometricMachineLearning.GlobalSection","text":"This implements global sections for the Stiefel manifold and the Symplectic Stiefel manifold. \n\nIn practice this is implemented using Householder reflections, with the auxiliary columns given by the standard basis vectors e_i (a one in the ith spot and zeros elsewhere) for i in (n+1) to N, or with random columns.\n\nNote: one may consider dividing the output in the check functions by n. A general global section Tₓ𝔐 → G×𝔤 still needs to be implemented (think about random initialization).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientLayer","page":"Library","title":"GeometricMachineLearning.GradientLayer","text":"GradientLayer is the struct corresponding to the constructors GradientLayerQ and GradientLayerP. See those for more information.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientLayerP","page":"Library","title":"GeometricMachineLearning.GradientLayerP","text":"The gradient layer that changes the p component. It is of the form: \n\nbeginbmatrix\n mathbbI mathbbO nablaV mathbbI \nendbmatrix\n\nwith V(q) = sum_i=1^Ma_iSigma(sum_jk_ijq_j+b_i), where Sigma is the antiderivative of the activation function sigma (one-layer neural network). We refer to M as the upscaling dimension. Such layers are by construction symplectic.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientLayerQ","page":"Library","title":"GeometricMachineLearning.GradientLayerQ","text":"The gradient layer that changes the q component. It is of the form: \n\nbeginbmatrix\n mathbbI nablaV mathbbO mathbbI \nendbmatrix\n\nwith V(p) = sum_i=1^Ma_iSigma(sum_jk_ijp_j+b_i), where Sigma is the antiderivative of the activation function sigma (one-layer neural network). We refer to M as the upscaling dimension. Such layers are by construction symplectic.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientOptimizer","page":"Library","title":"GeometricMachineLearning.GradientOptimizer","text":"Define the Gradient optimizer, i.e. W ← W - η*∇f(W), or the Riemannian manifold equivalent, if applicable.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GrassmannLayer","page":"Library","title":"GeometricMachineLearning.GrassmannLayer","text":"Defines a layer that performs simple multiplication with an element of the Grassmann manifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GrassmannLieAlgHorMatrix","page":"Library","title":"GeometricMachineLearning.GrassmannLieAlgHorMatrix","text":"This implements the horizontal component of a Lie algebra that is isomorphic to the Grassmann manifold. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GrassmannManifold","page":"Library","title":"GeometricMachineLearning.GrassmannManifold","text":"The GrassmannManifold is based on the StiefelManifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.HRedSys","page":"Library","title":"GeometricMachineLearning.HRedSys","text":"HRedSys computes the reconstructed dynamics in the full system based on the reduced one. 
Optionally it can be compared to the FOM solution.\n\nIt can be called using the following constructor: HRedSys(N, n; encoder, decoder, v_full, f_full, v_reduced, f_reduced, parameters, tspan, tstep, ics, projection_error) where \n\nencoder: a function mathbbR^2NmapstomathbbR^2n\ndecoder: a (differentiable) function mathbbR^2nmapstomathbbR^2N\nv_full: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nf_full: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nv_reduced: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nf_reduced: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nparameters: a NamedTuple that parametrizes the vector fields (the same for fullvectorfield and reducedvectorfield)\ntspan: a tuple (t₀, tₗ) that specifies start and end point of the time interval over which integration is performed. \ntstep: the time step \nics: the initial condition for the big system.\nprojection_error: the error M - mathcalRcircmathcalP(M) where M is the snapshot matrix; mathcalP and mathcalR are the reduction and reconstruction respectively.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LASympNet","page":"Library","title":"GeometricMachineLearning.LASympNet","text":"LASympNet is called with a single input argument, the system dimension, or with an instance of DataLoader. Optional input arguments are: \n\ndepth::Int: The number of linear layers that are applied. The default is 5.\nnhidden::Int: The number of hidden layers (i.e. layers that are not input or output layers). The default is 2.\nactivation: The activation function that is applied. By default this is tanh.\ninit_upper_linear::Bool: Initialize the linear layer so that it first modifies the q-component. The default is true.\ninit_upper_act::Bool: Initialize the activation layer so that it first modifies the q-component. The default is true.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LayerWithManifold","page":"Library","title":"GeometricMachineLearning.LayerWithManifold","text":"LayerWithManifold is a subtype of AbstractExplicitLayer that contains manifolds as weights.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LayerWithOptionalManifold","page":"Library","title":"GeometricMachineLearning.LayerWithOptionalManifold","text":"LayerWithOptionalManifold is a subtype of AbstractExplicitLayer that can contain manifolds as weights.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearLayer","page":"Library","title":"GeometricMachineLearning.LinearLayer","text":"LinearLayer is the struct corresponding to the constructors LinearLayerQ and LinearLayerP. 
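A brief sketch for the LASympNet constructor listed above (keyword usage of the optional arguments and the NeuralNetwork constructor signature are assumptions; the argument values are arbitrary):

using GeometricMachineLearning

arch = LASympNet(4; depth = 3, nhidden = 2, activation = tanh)  # system dimension 4, i.e. (q, p) with q, p of dimension 2
nn = NeuralNetwork(arch, CPU(), Float64)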
See those for more information.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearLayerP-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.LinearLayerP","text":"Equivalent to a left multiplication by the matrix:\n\nbeginpmatrix\nmathbbI mathbbO \nB mathbbI\nendpmatrix \n\nwhere B is a symmetric matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.LinearLayerQ-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.LinearLayerQ","text":"Equivalent to a left multiplication by the matrix:\n\nbeginpmatrix\nmathbbI B \nmathbbO mathbbI\nendpmatrix \n\nwhere B is a symmetric matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.LinearSymplecticAttention","page":"Library","title":"GeometricMachineLearning.LinearSymplecticAttention","text":"Implements the linear symplectic attention layers. Analogous to GradientLayer it performs mappings that only change the Q or the P part. For more information see LinearSymplecticAttentionQ and LinearSymplecticAttentionP.\n\nConstructor\n\nFor the constructors simply call \n\nLinearSymplecticAttentionQ(sys_dim, seq_length)\n\nor \n\nLinearSymplecticAttentionP(sys_dim, seq_length)\n\nwhere sys_dim is the system dimension and seq_length is the sequence length.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearSymplecticAttentionP","page":"Library","title":"GeometricMachineLearning.LinearSymplecticAttentionP","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearSymplecticAttentionQ","page":"Library","title":"GeometricMachineLearning.LinearSymplecticAttentionQ","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearSymplecticTransformer","page":"Library","title":"GeometricMachineLearning.LinearSymplecticTransformer","text":"Realizes the linear Symplectic Transformer.\n\nConstructor:\n\nThe constructor is called with the following arguments\n\ndim::Int: System dimension \nseq_length::Int: Number of time steps that the transformer considers. \n\nOptional keyword arguments:\n\nn_sympnet::Int=2: The number of sympnet layers in the transformer.\nupscaling_dimension::Int=2*dim: The upscaling that is done by the gradient layer. \nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function for the SympNet layers. \ninit_upper::Bool=true: Specifies if the first layer is a Q-type layer (init_upper=true) or if it is a P-type layer (init_upper=false).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LowerTriangular","page":"Library","title":"GeometricMachineLearning.LowerTriangular","text":"A lower-triangular matrix is an ntimesn matrix that has ones on the diagonal and zeros on the upper triangular.\n\nThe data are stored in a vector S similarly to SkewSymMatrix.\n\nThe struct two fields: S and n. 
The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension n for AinmathbbR^ntimesn.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Manifold","page":"Library","title":"GeometricMachineLearning.Manifold","text":"A manifold in GeometricMachineLearning is a subtype of AbstractMatrix. All manifolds are matrix manifolds and therefore stored as matrices. More details can be found in the docstrings for the StiefelManifold and the GrassmannManifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ManifoldLayer","page":"Library","title":"GeometricMachineLearning.ManifoldLayer","text":"This defines a manifold layer that only has one matrix-valued manifold A associated with it and performs xmapstoAx. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.MomentumOptimizer","page":"Library","title":"GeometricMachineLearning.MomentumOptimizer","text":"Define the Momentum optimizer, i.e. V ← αV - ∇f(W), W ← W + ηV, or the Riemannian manifold equivalent, if applicable.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.MultiHeadAttention","page":"Library","title":"GeometricMachineLearning.MultiHeadAttention","text":"MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors based on correlations within those data. \n\nConstructor\n\nTakes input arguments: \n\ndim::Int: The system dimension \nn_heads::Int: The number of heads. \nStiefel::Bool=true (keyword argument): whether the weights should be put on the Stiefel manifold. \nretraction::AbstractRetraction (keyword argument): what kind of retraction should be used. By default this is the geodesic retraction. \nadd_connection::Bool=true (keyword argument): determines if the input should be added to the output for the final result. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.NeuralNetworkIntegrator","page":"Library","title":"GeometricMachineLearning.NeuralNetworkIntegrator","text":"This is a super type of various neural network architectures such as SympNet and ResNet whose purpose is to approximate the flow of an ordinary differential equation (ODE).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Optimizer","page":"Library","title":"GeometricMachineLearning.Optimizer","text":"Optimizer struct that stores the 'method' (i.e. Adam with corresponding hyperparameters), the cache and the optimization step.\n\nIt takes as input an optimization method and the parameters of a network. \n\nFor technical reasons we first specify an OptimizerMethod that stores all the hyperparameters of the optimizer. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Optimizer-Tuple{NeuralNetwork, DataLoader, Batch, Int64, GeometricMachineLearning.NetworkLoss}","page":"Library","title":"GeometricMachineLearning.Optimizer","text":"A functor for Optimizer. It is called with: - nn::NeuralNetwork - dl::DataLoader - batch::Batch - n_epochs::Int - loss\n\nThe last argument is a function through which Zygote differentiates. 
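A short usage sketch for the MultiHeadAttention constructor listed above (the argument values are arbitrary and chosen so that dim is divisible by n_heads; this is not part of the generated docstrings):

using GeometricMachineLearning

mha = MultiHeadAttention(64, 8; Stiefel = true)   # 8 heads acting on a 64-dimensional system, weights on the Stiefel manifold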
This argument is optional; if it is not supplied GeometricMachineLearning defaults to an appropriate loss for the DataLoader.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Optimizer-Tuple{OptimizerMethod, NeuralNetwork}","page":"Library","title":"GeometricMachineLearning.Optimizer","text":"Typically the Optimizer is not initialized with the network parameters, but instead with a NeuralNetwork struct.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.PSDArch","page":"Library","title":"GeometricMachineLearning.PSDArch","text":"The architecture\n\nProper symplectic decomposition (PSD) can be seen as a SymplecticAutoencoder for which the decoder and the encoder are both PSD-like matrices (see the docs for PSDLayer). \n\nTraining\n\nFor optimizing the parameters in this architecture no neural network training is necessary (see the docs for solve!).\n\nThe constructor\n\nThe constructor only takes two arguments as input:\n\nfull_dim::Integer\nreduced_dim::Integer\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.PSDLayer","page":"Library","title":"GeometricMachineLearning.PSDLayer","text":"This is a PSD-like layer used for symplectic autoencoders. One layer has the following shape:\n\nA = beginbmatrix Phi mathbbO mathbbO Phi endbmatrix\n\nwhere Phi is an element of the Stiefel manifold St(n N).\n\nThe constructor of PSDLayer is called by PSDLayer(M, N; retraction=retraction): \n\nM is the input dimension.\nN is the output dimension. \nretraction is an instance of a struct with supertype AbstractRetraction. The only options at the moment are Geodesic() and Cayley().\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ResNet","page":"Library","title":"GeometricMachineLearning.ResNet","text":"A ResNet is a neural network that realizes a mapping of the form: x = mathcalNN(x) + x, so the input is again added to the output (a so-called add connection). In GeometricMachineLearning the specific ResNet that we use consists of a series of simple ResNetLayers.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ResNetLayer","page":"Library","title":"GeometricMachineLearning.ResNetLayer","text":"The ResNetLayer is a simple feedforward neural network to which we add the input after applying it, i.e. it realizes x mapsto x + sigma(Ax + b).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SkewSymMatrix","page":"Library","title":"GeometricMachineLearning.SkewSymMatrix","text":"A SkewSymMatrix is a matrix A s.t. A^T = -A.\n\nIf the constructor is called with a matrix as input it returns a skew-symmetric matrix via the projection A mapsto frac12(A - A^T). This is a projection defined via the canonical metric mathbbR^ntimesntimesmathbbR^ntimesntomathbbR (AB) mapsto mathrmTr(A^TB).\n\nThe first index is the row index, the second one the column index.\n\nThe struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension n for AinmathbbR^ntimesn.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StandardTransformerIntegrator","page":"Library","title":"GeometricMachineLearning.StandardTransformerIntegrator","text":"The regular transformer used as an integrator (multi-step method). 
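A small sketch of the projection performed by the SkewSymMatrix constructor described above (not part of the generated docstrings):

using GeometricMachineLearning

A = rand(4, 4)
W = SkewSymMatrix(A)   # stores the independent entries in a vector (field S)
W ≈ (A - A') / 2       # the projection A ↦ 1/2(A - Aᵀ)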
\n\nThe constructor is called with one argument: \n\nsys_dim::Int\n\nThe following are keyword arguments:\n\ntransformer_dim::Int: the default is transformer_dim = sys_dim.\nn_blocks::Int: The default is 1.\nn_heads::Int: the number of heads in the multihead attentio layer (default is n_heads = sys_dim)\nL::Int the number of transformer blocks (default is L = 2).\nupscaling_activation: by default identity\nresnet_activation: by default tanh\nadd_connection:Bool=true: if the input should be added to the output.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelLayer","page":"Library","title":"GeometricMachineLearning.StiefelLayer","text":"Defines a layer that performs simple multiplication with an element of the Stiefel manifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelLieAlgHorMatrix","page":"Library","title":"GeometricMachineLearning.StiefelLieAlgHorMatrix","text":"StiefelLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is: (\\pi:S \\to SE ) where \n\nE = beginpmatrix mathbbI_n mathbbO_(N-n)timesn endpmatrix\n\nThe matrix (E) is implemented under StiefelProjection in GeometricMachineLearning.\n\nAn element of StiefelLieAlgMatrix takes the form: \n\nbeginpmatrix\nA B^T B mathbbO\nendpmatrix\n\nwhere (A) is skew-symmetric (this is SkewSymMatrix in GeometricMachineLearning).\n\nIf the constructor is called with a big (N\\times{}N) matrix, then the projection is performed the following way: \n\nbeginpmatrix\nA B_1 \nB_2 D\nendpmatrix mapsto \nbeginpmatrix\nmathrmskew(A) -B_2^T \nB_2 mathbbO\nendpmatrix\n\nThe operation mathrmskewmathbbR^ntimesntomathcalS_mathrmskew(n) is the skew-symmetrization operation. This is equivalent to calling the constructor of SkewSymMatrix with an (n\\times{}n) matrix.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelManifold","page":"Library","title":"GeometricMachineLearning.StiefelManifold","text":"An implementation of the Stiefel manifold [7]. The Stiefel manifold is the collection of all matrices YinmathbbR^Ntimesn whose columns are orthonormal, i.e. \n\n St(n N) = Y Y^TY = mathbbI_n \n\nThe Stiefel manifold can be shown to have manifold structure (as the name suggests) and this is heavily used in GeometricMachineLearning. It is further a compact space. More information can be found in the docstrings for rgrad(::StiefelManifold, ::AbstractMatrix)andmetric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix)`.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelProjection","page":"Library","title":"GeometricMachineLearning.StiefelProjection","text":"Outer constructor for StiefelProjection. This works with two integers as input and optionally the type.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelProjection-2","page":"Library","title":"GeometricMachineLearning.StiefelProjection","text":"An array that essentially does vcat(I(n), zeros(N-n, n)) with GPU support. It has three inner constructors. The first one is called with the following arguments: \n\nbackend: backends as supported by KernelAbstractions.\nT::Type\nN::Integer\nn::Integer\n\nThe second constructor is called by supplying a matrix as input. The constructor will then extract the backend, the type and the dimensions of that matrix. 
\n\nThe third constructor is called by supplying an instance of StiefelLieAlgHorMatrix. \n\nTechnically this should be a subtype of StiefelManifold. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SymmetricMatrix","page":"Library","title":"GeometricMachineLearning.SymmetricMatrix","text":"A SymmetricMatrix A is a matrix A^T = A.\n\nThis is a projection defined via the canonical metric (AB) mapsto mathrmtr(A^TB).\n\nInternally the struct saves a vector S of size n(n+1)div2. The conversion is done the following way: \n\nA_ij = begincases S( (i-1) i ) div 2 + j textif igeqj \n S( (j-1) j ) div 2 + i textelse endcases\n\nSo S stores a string of vectors taken from A: S = tildea_1 tildea_2 ldots tildea_n with tildea_i = A_i1A_i2ldotsA_ii.\n\nConstructor\n\nIf the constructor is called with a matrix as input it returns a symmetric matrix via the projection:\n\nA mapsto frac12(A + A^T)\n\nIt can also be called with two arguments S::AbstractVector and n::Integer where length(S) == n * (n + 1) ÷ 2 has to be true.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SympNet","page":"Library","title":"GeometricMachineLearning.SympNet","text":"The SympNet type encompasses GSympNets and LASympNets. SympNets are universal approximators of symplectic flows, i.e. maps varphimathbbR^2ntomathbbR^2n for which (nablavarphi)^TmathbbJnablavarphi = mathbbJ holds.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SympNetLayer","page":"Library","title":"GeometricMachineLearning.SympNetLayer","text":"Implements the various layers from the SympNet paper [29]. This is a super type of GradientLayer, ActivationLayer and LinearLayer.\n\nFor the linear layer, the activation and the bias are left out, and for the activation layer K and b are left out!\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SympNetLayer-Tuple{AbstractArray, NamedTuple}","page":"Library","title":"GeometricMachineLearning.SympNetLayer","text":"This is called when a SympnetLayer is applied to a NamedTuple. It calls apply_layer_to_nt_and_return_array.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.SymplecticAutoencoder","page":"Library","title":"GeometricMachineLearning.SymplecticAutoencoder","text":"The architecture\n\nThe symplectic autoencoder architecture was introduced in [40]. Like any other autoencoder it consists of an encoder Psi^emathbbR^2NtomathbbR^2n and a decoder Psi^dmathbbR^2ntomathbbR^2N with nllN. 
These satisfy the following properties: \n\nnabla_zPsi^emathbbJ_2N(nabla_zPsi^emathbbJ_2N)^T = mathbbJ_2n text and (nabla_xiPsi^d)^TmathbbJ_2Nnabla_xiPsi^d = mathbbJ_2n\n\nBecause the decoder has this particular property, the reduced system can be described by the Hamiltonian HcircPsi^d: \n\nmathbbJ_2nnabla_xi(HcircPsi^d) = mathbbJ_2n(nabla_xiPsi^d)^Tnabla_Psi^d(xi)H = mathbbJ_2n(nabla_xiPsi^d)^TmathbbJ_2N^TmathbbJ_2Nnabla_Psi^d(xi)H = (nabla_xiPsi^d)^+X_H(Psi^d(xi))\n\nwhere (nabla_xiPsi^d)^+ is the pseudoinverse of nabla_xiPsi^d (for more details see the docs on the AutoEncoder type).\n\nThe constructor\n\nThe constructor is called with\n\nfull_dim::Integer \nreduced_dim::Integer \nn_encoder_layers::Integer = 4 (keyword argument)\nn_encoder_blocks::Integer = 2 (keyword argument)\nn_decoder_layers::Integer = 1 (keyword argument)\nn_decoder_blocks::Integer = 3 (keyword argument)\nsympnet_upscale::Integer = 5 (keyword argument)\nactivation = tanh (keyword argument)\nencoder_init_q::Bool = true (keyword argument)\ndecoder_init_q::Bool = true (keyword argument)\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SymplecticPotential","page":"Library","title":"GeometricMachineLearning.SymplecticPotential","text":"SymplecticPotential(n)\n\nReturns a symplectic matrix of size 2n x 2n\n\nbeginpmatrix\nmathbbO mathbbI \n-mathbbI mathbbO \nendpmatrix\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.TrainingData","page":"Library","title":"GeometricMachineLearning.TrainingData","text":"TrainingData stores: \n\n - problem \n\n - shape \n\n - get \n\n - symbols \n\n - dim \n\n - noisemaker\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.TransformerIntegrator","page":"Library","title":"GeometricMachineLearning.TransformerIntegrator","text":"Encompasses various transformer architectures, such as the structure-preserving transformer and the linear symplectic transformer. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.TransformerLoss","page":"Library","title":"GeometricMachineLearning.TransformerLoss","text":"The loss for a transformer network (especially a transformer integrator). The constructor is called with:\n\nseq_length::Int\nprediction_window::Int (default is 1).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.UpperTriangular","page":"Library","title":"GeometricMachineLearning.UpperTriangular","text":"An upper-triangular matrix is an ntimesn matrix that has ones on the diagonal and zeros in the lower triangular part.\n\nThe data are stored in a vector S similarly to SkewSymMatrix.\n\nThe struct has two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension n for AinmathbbR^ntimesn.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingAttention","page":"Library","title":"GeometricMachineLearning.VolumePreservingAttention","text":"Volume-preserving attention (single head attention)\n\nDrawbacks: \n\nthe super fast activation is only implemented for sequence lengths of 2, 3, 4 and 5.\nother sequence lengths only work on CPU for now (lu decomposition has to be implemented to work for tensors in parallel).\n\nConstructor\n\nThe constructor is called with: \n\ndim::Int: The system dimension \nseq_length::Int: The sequence length to be considered. The default is zero, i.e. 
arbitrary sequence lengths; this works for all sequence lengths but doesn't apply the super-fast activation. \nskew_sym::Bool (keyword argument): specifies if the weight matrix is skew-symmetric or arbitrary (default is false).\n\nFunctor\n\nApplying a layer of type VolumePreservingAttention does the following: \n\nFirst we perform the operation X mapsto X^T A X = C, where XinmathbbR^Ntimesmathttseq_length is a matrix containing time series data and A is the skew symmetric matrix associated with the layer. \nIn a second step we compute the Cayley transform of C; Lambda = mathrmCayley(C).\nThe output of the layer is then XLambda.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingFeedForward","page":"Library","title":"GeometricMachineLearning.VolumePreservingFeedForward","text":"Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer. \n\nConstructor\n\nThe constructor is called with the following arguments: \n\nsys_dim::Int: The system dimension. \nn_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nactivation: The activation function for the nonlinear layers in a block. \ninit_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingFeedForwardLayer","page":"Library","title":"GeometricMachineLearning.VolumePreservingFeedForwardLayer","text":"Super-type of VolumePreservingLowerLayer and VolumePreservingUpperLayer. The layers do the following: \n\nx mapsto begincases x + sigma(Lx + b) textwhere L is mathttLowerTriangular x + sigma(Ux + b) textwhere U is mathttUpperTriangular endcases\n\nThe functor can be applied to a vector, a matrix or a tensor. \n\nConstructor\n\nThe constructors are called with:\n\nsys_dim::Int: the system dimension. \nactivation=tanh: the activation function. \ninclude_bias::Bool=true (keyword argument): specifies whether a bias should be used. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingLowerLayer","page":"Library","title":"GeometricMachineLearning.VolumePreservingLowerLayer","text":"See the documentation for VolumePreservingFeedForwardLayer.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingTransformer","page":"Library","title":"GeometricMachineLearning.VolumePreservingTransformer","text":"The volume-preserving transformer with the Cayley activation function and built-in upscaling.\n\nConstructor\n\nThe arguments for the constructor are: \n\nsys_dim::Int\nseq_length::Int: The sequence length of the data fed into the transformer.\n\nThe following are keyword arguments:\n\nn_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function.\ninit_upper::Bool=false: Specifies if the network first acts on the q component. 
\nskew_sym::Bool=false: specifies if we the weight matrix is skew symmetric or arbitrary.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingUpperLayer","page":"Library","title":"GeometricMachineLearning.VolumePreservingUpperLayer","text":"See the documentation for VolumePreservingFeedForwardLayer.\n\n\n\n\n\n","category":"type"},{"location":"library/#AbstractNeuralNetworks.update!-Union{Tuple{CT}, Tuple{T}, Tuple{Optimizer{<:BFGSOptimizer}, CT, AbstractArray{T}}} where {T, CT<:(BFGSCache{T, AT} where AT<:(AbstractArray{T}))}","page":"Library","title":"AbstractNeuralNetworks.update!","text":"Optimization for an entire neural networks with BFGS. What is different in this case is that we still have to initialize the cache.\n\nIf o.step == 1, then we initialize the cache\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.iterate-Union{Tuple{AT}, Tuple{T}, Tuple{NeuralNetwork{<:TransformerIntegrator}, @NamedTuple{q::AT, p::AT}}} where {T, AT<:AbstractMatrix{T}}","page":"Library","title":"Base.iterate","text":"This function computes a trajectory for a Transformer that has already been trained for valuation purposes.\n\nIt takes as input: \n\nnn: a NeuralNetwork (that has been trained).\nics: initial conditions (a matrix in mathbbR^2ntimesmathttseq_length or NamedTuple of two matrices in mathbbR^ntimesmathttseq_length)\nn_points::Int=100 (keyword argument): The number of steps for which we run the prediction. \nprediction_window::Int=size(ics.q, 2): The prediction window (i.e. the number of steps we predict into the future) is equal to the sequence length (i.e. the number of input time steps) by default. \n\n\n\n\n\n","category":"method"},{"location":"library/#Base.iterate-Union{Tuple{BT}, Tuple{AT}, Tuple{T}, Tuple{NeuralNetwork{<:NeuralNetworkIntegrator}, BT}} where {T, AT<:AbstractVector{T}, BT<:@NamedTuple{q::AT, p::AT}}","page":"Library","title":"Base.iterate","text":"This function computes a trajectory for a SympNet that has already been trained for valuation purposes.\n\nIt takes as input: \n\nnn: a NeuralNetwork (that has been trained).\nics: initial conditions (a NamedTuple of two vectors)\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.rand-Union{Tuple{MT}, Tuple{KernelAbstractions.Backend, Type{MT}, Integer, Integer}} where MT<:Manifold","page":"Library","title":"Base.rand","text":"rand(backend::KernelAbstractions.Backend, manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold)\n\nDraw random elements for a specific device.\n\nExamples\n\nusing GeometricMachineLearning\nimport Random\nRandom.seed!(123)\n\nN, n = 5, 3\nrand(CPU(), StiefelManifold{Float32}, N, n)\n\n# output\n\n5×3 StiefelManifold{Float32, Matrix{Float32}}:\n -0.275746 0.329913 0.772753\n -0.624851 -0.332242 -0.0685991\n -0.693326 0.36724 -0.189882\n -0.0929493 -0.731446 0.460639\n 0.210203 0.333008 0.387173\n\nRandom elements of the manifold can also be allocated on GPU, via e.g. ...\n\nrand(CUDABackend(), StiefelManifold{Float32}, N, n)\n\n... for drawing elements on a CUDA device.\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.rand-Union{Tuple{MT}, Tuple{Type{MT}, Integer, Integer}} where MT<:Manifold","page":"Library","title":"Base.rand","text":"rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold\n\nDraw random elements from the Stiefel and the Grassmann manifold. 
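A hedged sketch for the Base.iterate method for NeuralNetworkIntegrators described above (nn is assumed to be a trained SympNet of dimension 2; the shape of the return value is an assumption):

ics = (q = [π / 4], p = [0.0])   # initial condition as a NamedTuple of two vectors
trajectory = iterate(nn, ics)    # assumed to return the predicted time series for q and p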
\n\nBecause both of these manifolds are compact spaces we can sample them uniformly [8].\n\nExamples\n\nWhen we call ...\n\nusing GeometricMachineLearning\nimport Random\nRandom.seed!(123)\n\nN, n = 5, 3\nrand(StiefelManifold{Float32}, N, n)\n\n# output\n\n5×3 StiefelManifold{Float32, Matrix{Float32}}:\n -0.275746 0.329913 0.772753\n -0.624851 -0.332242 -0.0685991\n -0.693326 0.36724 -0.189882\n -0.0929493 -0.731446 0.460639\n 0.210203 0.333008 0.387173\n\n... the sampling is done by first allocating a random matrix of size Ntimesn via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output is then the first n columns of the Q matrix. \n\n\n\n\n\n","category":"method"},{"location":"library/#Base.vec-Tuple{GeometricMachineLearning.AbstractTriangular}","page":"Library","title":"Base.vec","text":"If vec is applied onto Triangular, then the output is the associated vector. \n\n\n\n\n\n","category":"method"},{"location":"library/#Base.vec-Tuple{SkewSymMatrix}","page":"Library","title":"Base.vec","text":"If vec is applied onto SkewSymMatrix, then the output is the associated vector. \n\n\n\n\n\n","category":"method"},{"location":"library/#ChainRulesCore.rrule-Union{Tuple{T}, Tuple{typeof(GeometricMachineLearning.tensor_transpose_mat_mul), AbstractArray{T, 3}, AbstractMatrix{T}}} where T","page":"Library","title":"ChainRulesCore.rrule","text":"This implements the custom pullback for tensor_transpose_mat_mul\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Gradient","page":"Library","title":"GeometricMachineLearning.Gradient","text":"This is an old constructor and will be deprecated. For change_q=true it is equivalent to GradientLayerQ; for change_q=false it is equivalent to GradientLayerP.\n\nIf full_grad=false then ActivationLayer is called.\n\n\n\n\n\n","category":"function"},{"location":"library/#GeometricMachineLearning.Transformer-Tuple{Integer, Integer, Integer}","page":"Library","title":"GeometricMachineLearning.Transformer","text":"The architecture for a \"transformer encoder\" is essentially taken from arXiv:2010.11929, but with the difference that no layer normalization is employed. This is because we still need to find a generalization of layer normalization to manifolds. \n\nThe transformer is called with the following inputs: \n\ndim: the dimension of the transformer \nn_heads: the number of heads \nL: the number of transformer blocks\n\nIn addition we have the following optional arguments: \n\nactivation: the activation function used for the ResNet (tanh by default)\nStiefel::Bool: if the matrices P^V, P^Q and P^K should live on a manifold (false by default)\nretraction: which retraction should be used (Geodesic() by default)\nadd_connection::Bool: if the input should be added to the output after the MultiHeadAttention layer is used (true by default)\nuse_bias::Bool: If the ResNet should use a bias (true by default)\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.accuracy-Union{Tuple{BT}, Tuple{AT}, Tuple{T1}, Tuple{T}, Tuple{Chain, Tuple, DataLoader{T, AT, BT}}} where {T, T1<:Integer, AT<:(AbstractArray{T}), BT<:(AbstractArray{T1})}","page":"Library","title":"GeometricMachineLearning.accuracy","text":"Computes the accuracy (as opposed to the loss) of a neural network classifier. 
\n\nIt takes as input:\n\nmodel::Chain\nps: parameters of the network\ndl::DataLoader\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.apply_layer_to_nt_and_return_array-Tuple{AbstractArray, AbstractNeuralNetworks.AbstractExplicitLayer, NamedTuple}","page":"Library","title":"GeometricMachineLearning.apply_layer_to_nt_and_return_array","text":"This function is used in the wrappers where the input to the SympNet layers is not a NamedTuple (as it should be) but an AbstractArray.\n\nIt converts the Array to a NamedTuple (via assign_q_and_p), then calls the SympNet routine(s) and converts back to an AbstractArray (with vcat).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_batch_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.assign_batch_kernel!","text":"Takes as input a batch tensor (to which the data are assigned), the whole data tensor and two vectors params and time_steps that include the specific parameters and time steps we want to assign. \n\nNote that this assigns sequential data! For e.g. being processed by a transformer.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_output_estimate-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, Int64}} where T","page":"Library","title":"GeometricMachineLearning.assign_output_estimate","text":"The function assign_output_estimate is closely related to the transformer. It takes the last prediction_window columns of the output and uses them for the final prediction. i.e.\n\nmathbbR^NtimesmathttpwtomathbbR^Ntimesmathttpw \nbeginbmatrix \n z^(1)_1 cdots z^(T)_1 \n cdots cdots cdots \n z^(1)_n cdots z^(T)_n\n endbmatrix mapsto \n beginbmatrix \n z^(T - mathttpw)_1 cdots z^(T)_1 \n cdots cdots cdots \n z^(T - mathttpw)_n cdots z^(T)_nendbmatrix \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_output_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.assign_output_kernel!","text":"This should be used together with assign_batch_kernel!. It assigns the corresponding output (i.e. target).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_q_and_p-Tuple{AbstractVector, Int64}","page":"Library","title":"GeometricMachineLearning.assign_q_and_p","text":"Allocates two new arrays q and p whose first dimension is half of that of the input x. This should also be supplied through the second argument N.\n\nThe output is a Tuple containing q and p.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.augment_zeros_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.augment_zeros_kernel!","text":"Used for differentiating assignoutputestimate (this appears in the loss). \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.build_v_reduced-Tuple{Any, Any, NeuralNetwork{<:GeometricMachineLearning.SymplecticDecoder}}","page":"Library","title":"GeometricMachineLearning.build_v_reduced","text":"Builds the reduced vector field based on the full vector field for a Hamiltonian system. We derive the reduced vector field via the reduced Hamiltonian: tildeH = HcircPsi^mathrmdec. 
We then get \n\nmathbbJ_2nnabla_xitildeH = mathbbJ_2n(nablaPsi^mathrmdec)^TmathbbJ_2N^TmathbbJ_2Nnabla_zH = mathbbJ_2n(nablaPsi^mathrmdec)^TmathbbJ_2N^T beginpmatrix v(z) f(z) endpmatrix = beginpmatrix - (nabla_pPsi_q)^Tf(z) + (nabla_pPsi_p)^Tv(z) (nabla_qPsi_q)^Tf(z) - (nabla_qPsi_p)^Tv(z) endpmatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.compute_iterations-Tuple{Integer, Integer, Integer}","page":"Library","title":"GeometricMachineLearning.compute_iterations","text":"This function gives iterations from the full dimension to the reduced dimension (i.e. the intermediate steps). The iterations are given in ascending order. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.compute_iterations_for_symplectic_system-Tuple{Integer, Integer, Integer}","page":"Library","title":"GeometricMachineLearning.compute_iterations_for_symplectic_system","text":"This function gives iterations from the full dimension to the reduced dimension (i.e. the intermediate steps). The iterations are given in ascending order. Only even steps are allowed here.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.compute_output_of_mha-Union{Tuple{T}, Tuple{M}, Tuple{MultiHeadAttention{M, M}, AbstractMatrix{T}, NamedTuple}} where {M, T}","page":"Library","title":"GeometricMachineLearning.compute_output_of_mha","text":"Applies MHA to an abstract matrix. This is the same independent of whether the input is added to the output or not. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.convert_input_and_batch_indices_to_array-Union{Tuple{BT}, Tuple{AT}, Tuple{T}, Tuple{DataLoader{T, BT}, Batch, Vector{Tuple{Int64, Int64}}}} where {T, AT<:AbstractArray{T, 3}, BT<:@NamedTuple{q::AT, p::AT}}","page":"Library","title":"GeometricMachineLearning.convert_input_and_batch_indices_to_array","text":"Takes the output of the batch functor and uses it to create the corresponding array (NamedTuples). \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.convert_input_and_batch_indices_to_array-Union{Tuple{BT}, Tuple{T}, Tuple{DataLoader{T, BT}, Batch, Vector{Tuple{Int64, Int64}}}} where {T, BT<:AbstractArray{T, 3}}","page":"Library","title":"GeometricMachineLearning.convert_input_and_batch_indices_to_array","text":"Takes the output of the batch functor and uses it to create the corresponding array. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.crop_array_for_transformer_loss-Union{Tuple{BT}, Tuple{AT}, Tuple{T2}, Tuple{T}, Tuple{AT, BT}} where {T, T2, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T2, 3}}","page":"Library","title":"GeometricMachineLearning.crop_array_for_transformer_loss","text":"This crops the output array of the neural network so that it conforms with the output it should be compared to. This is needed for the transformer loss. 
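The assign_q_and_p routine documented above can be illustrated with a small sketch. The qualified function name is used here since the helper may not be exported; the only claims made are the ones from the docstring (the input is split into two halves of length N):

```julia
using GeometricMachineLearning

x = Float32.(1:10)
# split x into a q part and a p part; the second argument is half the length of x
q, p = GeometricMachineLearning.assign_q_and_p(x, 5)

length(q) == length(p) == 5
```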
\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.custom_mat_mul-Tuple{AbstractMatrix, AbstractVecOrMat}","page":"Library","title":"GeometricMachineLearning.custom_mat_mul","text":"Multiplies a matrix with a vector, a matrix or a tensor.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.decoder_layers_from_iteration-Tuple{GeometricMachineLearning.AutoEncoder, AbstractVector{<:Integer}}","page":"Library","title":"GeometricMachineLearning.decoder_layers_from_iteration","text":"Takes as input the autoencoder architecture and a vector of integers specifying the layer dimensions in the decoder. Has to return a tuple of AbstractExplicitLayers.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.draw_batch!-Union{Tuple{T}, Tuple{AbstractMatrix{T}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.draw_batch!","text":"This assigns the batch if the data are in form of a matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.encoder_layers_from_iteration-Tuple{GeometricMachineLearning.AutoEncoder, AbstractVector{<:Integer}}","page":"Library","title":"GeometricMachineLearning.encoder_layers_from_iteration","text":"Takes as input the autoencoder architecture and a vector of integers specifying the layer dimensions in the encoder. Has to return a tuple of AbstractExplicitLayers.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.geodesic-Union{Tuple{T}, Tuple{Manifold{T}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.geodesic","text":"The geodesic map for the manifolds. It takes as input an element x of mathcalM and an element of T_xmathcalM and returns mathttgeodesic(x v_x) = exp(v_x) For example: \n\nY = rand(StiefelManifold{Float64}, N, n)\nΔ = rgrad(Y, rand(N, n))\ngeodesic(Y, Δ)\n\nSee the docstring for rgrad for details on this function.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.init_optimizer_cache-Tuple{GradientOptimizer, Any}","page":"Library","title":"GeometricMachineLearning.init_optimizer_cache","text":"Wrapper for the functions setup_adam_cache, setup_momentum_cache, setup_gradient_cache, setup_bfgs_cache. These appear outside of optimizer_caches.jl because the OptimizerMethods first have to be defined.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.initialize_hessian_inverse-Union{Tuple{AbstractArray{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.initialize_hessian_inverse","text":"This initializes the inverse of the Hessian for various arrays. This requires an implementation of a vectorization operation vec. This is important for custom arrays.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.map_index_for_symplectic_potential-Tuple{Int64, Int64}","page":"Library","title":"GeometricMachineLearning.map_index_for_symplectic_potential","text":"This assigns the right index for the symplectic potential. 
To be used with assign_ones_for_symplectic_potential_kernel!.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul-Union{Tuple{AT}, Tuple{ST}, Tuple{BT}, Tuple{T}, Tuple{AT, AbstractArray{T, 3}}} where {T, BT<:(AbstractArray{T}), ST<:StiefelManifold{T, BT}, AT<:LinearAlgebra.Adjoint{T, ST}}","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul","text":"Extend mat_tensor_mul to a multiplication by the adjoint of an element of StiefelManifold. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul-Union{Tuple{T}, Tuple{StiefelManifold, AbstractArray{T, 3}}} where T","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul","text":"Extend mat_tensor_mul to a multiplication by an element of StiefelManifold. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.metric-Tuple{StiefelManifold, AbstractMatrix, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.metric","text":"Implements the canonical Riemannian metric for the Stiefel manifold:\n\ng_Y (Delta_1 Delta_2) mapsto mathrmtr(Delta_1^T(mathbbI - frac12YY^T)Delta_2)\n\nIt is called with: \n\nY::StiefelManifold\nΔ₁::AbstractMatrix\nΔ₂::AbstractMatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.number_of_batches-Union{Tuple{OT}, Tuple{AT}, Tuple{BT}, Tuple{T}, Tuple{DataLoader{T, AT, OT, :TimeSeries}, Batch}} where {T, BT<:AbstractArray{T, 3}, AT<:Union{@NamedTuple{q::BT, p::BT}, BT}, OT}","page":"Library","title":"GeometricMachineLearning.number_of_batches","text":"Gives the number of batches. Inputs are of type DataLoader and Batch.\n\nHere the big distinction is between data that are time-series like and data that are autoencoder like.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.onehotbatch-Union{Tuple{AbstractVector{T}}, Tuple{T}} where T<:Integer","page":"Library","title":"GeometricMachineLearning.onehotbatch","text":"One-hot-batch encoding of a vector of integers: inputin01ldots9^ell. The output is a tensor of shape 10times1timesell. \n\n0 mapsto beginbmatrix 1 0 ldots 0 endbmatrix\n\nIn more abstract terms: i mapsto e_i.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.optimization_step!-Tuple{Optimizer, Chain, Tuple, Tuple}","page":"Library","title":"GeometricMachineLearning.optimization_step!","text":"Optimization for an entire neural network, the way this function should be called. \n\ninputs: \n\no::Optimizer\nmodel::Chain\nps::Tuple\ndx::Tuple\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.optimization_step!-Tuple{Optimizer, Union{AbstractNeuralNetworks.AbstractExplicitCell, AbstractNeuralNetworks.AbstractExplicitLayer}, NamedTuple, NamedTuple, NamedTuple}","page":"Library","title":"GeometricMachineLearning.optimization_step!","text":"Optimization for a single layer. \n\ninputs: \n\no::Optimizer\nd::Union{AbstractExplicitLayer, AbstractExplicitCell}\nps::NamedTuple: the parameters \nC::NamedTuple: NamedTuple of the caches \ndx::NamedTuple: NamedTuple of the derivatives (output of AD routine)\n\nps, C and dx must have the same keys. 
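The snippet from the geodesic docstring above can be extended into a small self-contained check: the point returned by geodesic should again lie on the Stiefel manifold. This is an illustrative sketch, not part of the library's documentation or test suite:

```julia
using GeometricMachineLearning, LinearAlgebra
import Random
Random.seed!(123)

N, n = 5, 3
Y = rand(StiefelManifold{Float64}, N, n)
Δ = rgrad(Y, rand(N, n))   # a tangent vector at Y (see the rgrad docstring)
Y₂ = geodesic(Y, Δ)        # walk along the geodesic through Y in direction Δ

Y₂' * Y₂ ≈ I(n)            # the result is again an element of the Stiefel manifold
```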
\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.optimize_for_one_epoch!-Union{Tuple{T}, Tuple{Optimizer, Any, Union{Tuple, NamedTuple}, DataLoader{T, AT} where AT<:Union{AbstractArray{T}, NamedTuple}, Batch, Union{typeof(GeometricMachineLearning.loss), GeometricMachineLearning.NetworkLoss}}} where T","page":"Library","title":"GeometricMachineLearning.optimize_for_one_epoch!","text":"Optimize for an entire epoch. For this you have to supply: \n\nan instance of the optimizer.\nthe neural network model \nthe parameters of the model \nthe data (in form of DataLoader)\nin instance of Batch that contains batch_size (and optionally seq_length)\n\nWith the optional argument:\n\nthe loss, which takes the model, the parameters ps and an instance of DataLoader as input.\n\nThe output of optimize_for_one_epoch! is the average loss over all batches of the epoch:\n\noutput = frac1mathttsteps_per_epochsum_t=1^mathttsteps_per_epochloss(theta^(t-1))\n\nThis is done because any reverse differentiation routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of zygote: loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps) (if the loss only depends on the parameters).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.patch_index-Union{Tuple{T}, Tuple{T, T, T}, NTuple{4, T}} where T<:Integer","page":"Library","title":"GeometricMachineLearning.patch_index","text":"Based on coordinates i,j this returns the batch index (for MNIST data set for now).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.rgrad-Tuple{StiefelManifold, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.rgrad","text":"Computes the Riemannian gradient for the Stiefel manifold given an element YinSt(Nn) and a matrix nablaLinmathbbR^Ntimesn (the Euclidean gradient). It computes the Riemannian gradient with respect to the canonical metric (see the documentation for the function metric for an explanation of this). The precise form of the mapping is: \n\nmathttrgrad(Y nablaL) mapsto nablaL - Y(nablaL)^TY\n\nIt is called with inputs:\n\nY::StiefelManifold\ne_grad::AbstractMatrix: i.e. the Euclidean gradient (what was called nablaL) above.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.solve!-Tuple{NeuralNetwork{<:PSDArch}, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.solve!","text":"PSDArch does not require neural network training since it is a strictly linear operation that can be solved with singular value decomposition (SVD).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.split_and_flatten-Union{Tuple{AbstractArray{T, 3}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.split_and_flatten","text":"split_and_flatten takes a tensor as input and produces another one as output (essentially rearranges the input data in an intricate way) so that it can easily be processed with a transformer.\n\nThe optional arguments are: \n\npatch_length: by default this is 7. \nnumber_of_patches: by default this is 16.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.tensor_mat_skew_sym_assign-Union{Tuple{AT}, Tuple{T}, Tuple{AT, AbstractMatrix{T}}} where {T, AT<:AbstractArray{T, 3}}","page":"Library","title":"GeometricMachineLearning.tensor_mat_skew_sym_assign","text":"Takes as input: \n\nZ::AbstractArray{T, 3}: A tensor that stores a bunch of time series. 
\nA::AbstractMatrix: A matrix that is used to perform various scalar products. \n\nFor one of these time series the function performs the following computation: \n\n (z^(i) z^(j)) mapsto (z^(i))^TAz^(j) text for i j\n\nThis results in n(n-1)div2 scalar products. These scalar products are written into a lower-triangular matrix and the final output of the function is a tensor of these lower-triangular matrices. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.tensor_mat_skew_sym_assign_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.tensor_mat_skew_sym_assign_kernel!","text":"A kernel that computes the weighted scalar products of all combinations of vectors in the matrix Z except where the two vectors are the same and writes the result into a tensor of skew symmetric matrices C. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.train!","page":"Library","title":"GeometricMachineLearning.train!","text":"train!(...)\n\nPerform the training of a neural network on data using a given training method.\n\nDifferent ways of calling it:\n\ntrain!(neuralnetwork, data, optimizer = GradientOptimizer(1e-2), training_method; nruns = 1000, batch_size = default(data, type), showprogress = false )\n\nArguments\n\nneuralnetwork::LuxNeuralNetwork : the neural network using LuxBackend\ndata : the data (see TrainingData)\noptimizer = GradientOptimizer: the optimization method (see Optimizer)\ntraining_method : specifies the loss function used \nnruns : number of iterations through the process (1000 by default) \nbatch_size : size of the batch of data used for each step\n\n\n\n\n\n","category":"function"},{"location":"library/#GeometricMachineLearning.train!-Tuple{AbstractNeuralNetworks.AbstractNeuralNetwork{<:AbstractNeuralNetworks.Architecture}, AbstractTrainingData, TrainingParameters}","page":"Library","title":"GeometricMachineLearning.train!","text":"train!(neuralnetwork, data, optimizer, training_method; nruns = 1000, batch_size, showprogress = false )\n\nArguments\n\nneuralnetwork::LuxNeuralNetwork : the neural network using LuxBackend\ndata::AbstractTrainingData : the data\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.within_patch_index-Union{Tuple{T}, Tuple{T, T, T}} where T<:Integer","page":"Library","title":"GeometricMachineLearning.within_patch_index","text":"Based on coordinates i,j this returns the index within the batch.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.write_ones_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.write_ones_kernel!","text":"Kernel that is needed for functions relating to SymmetricMatrix and SkewSymMatrix. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Ω-Union{Tuple{T}, Tuple{StiefelManifold{T, AT} where AT<:AbstractMatrix{T}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.Ω","text":"Implements the canonical horizontal lift for the Stiefel manifold:\n\n (mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)\n\nInternally this performs \n\nSkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')\n\nto save memory. 
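The Riemannian gradient rgrad documented above always lands in the tangent space T_Y St(N, n), which is characterized by Y^T Δ being skew-symmetric. A minimal numerical check of this property (illustrative sketch only):

```julia
using GeometricMachineLearning, LinearAlgebra
import Random
Random.seed!(123)

N, n = 5, 3
Y  = rand(StiefelManifold{Float64}, N, n)
∇L = rand(N, n)        # a made-up Euclidean gradient
Δ  = rgrad(Y, ∇L)      # computes ∇L - Y * (∇L)' * Y, see the rgrad docstring

# tangent vectors at Y satisfy Y'Δ + (Y'Δ)' = 0
norm(Y' * Δ + (Y' * Δ)') < 1e-12
```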
\n\n\n\n\n\n","category":"method"},{"location":"optimizers/adam_optimizer/#The-Adam-Optimizer","page":"Adam Optimizer","title":"The Adam Optimizer","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"The Adam Optimizer is one of the most widely (if not the most widely used) neural network optimizer. Like most modern neural network optimizers it contains a cache that is updated based on first-order gradient information and then, in a second step, the cache is used to compute a velocity estimate for updating the neural networ weights. ","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"Here we first describe the Adam algorithm for the case where all the weights are on a vector space and then show how to generalize this to the case where the weights are on a manifold. ","category":"page"},{"location":"optimizers/adam_optimizer/#All-weights-on-a-vector-space","page":"Adam Optimizer","title":"All weights on a vector space","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"The cache of the Adam optimizer consists of first and second moments. The first moments B_1 store linear information about the current and previous gradients, and the second moments B_2 store quadratic information about current and previous gradients (all computed from a first-order gradient). ","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"If all the weights are on a vector space, then we directly compute updates for B_1 and B_2:","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"B_1 gets ((rho_1 - rho_1^t)(1 - rho_1^t))cdotB_1 + (1 - rho_1)(1 - rho_1^t)cdotnablaL\nB_2 gets ((rho_2 - rho_1^t)(1 - rho_2^t))cdotB_2 + (1 - rho_2)(1 - rho_2^t)cdotnablaLodotnablaL\nwhere odotmathbbR^ntimesmathbbR^ntomathbbR^n is the Hadamard product: aodotb_i = a_ib_i. rho_1 and rho_2 are hyperparameters. Their defaults, rho_1=09 and rho_2=099, are taken from (Goodfellow et al., 2016, page 301). After having updated the cache (i.e. B_1 and B_2) we compute a velocity (step 3) with which the parameters Y_t are then updated (step 4).\nW_tgets -etaB_1sqrtB_2 + delta\nY_t+1 gets Y_t + W_t","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"Here eta (with default 0.01) is the learning rate and delta (with default 3cdot10^-7) is a small constant that is added for stability. The division, square root and addition in step 3 are performed element-wise. ","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"Main.include_graphics(\"../tikz/adam_optimizer\") # hide","category":"page"},{"location":"optimizers/adam_optimizer/#Weights-on-manifolds","page":"Adam Optimizer","title":"Weights on manifolds","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"The problem with generalizing Adam to manifolds is that the Hadamard product odot as well as the other element-wise operations (, sqrt and + in step 3 above) lack a clear geometric interpretation. In GeometricMachineLearning we get around this issue by utilizing a so-called global tangent space representation. 
","category":"page"},{"location":"optimizers/adam_optimizer/#References","page":"Adam Optimizer","title":"References","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"Goodfellow I, Bengio Y, Courville A. Deep learning[M]. MIT press, 2016.","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"I. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).\n\n\n\n","category":"page"},{"location":"data_loader/TODO/#DATA-Loader-TODO","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"","category":"section"},{"location":"data_loader/TODO/","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"[x] Implement @views instead of allocating a new array in every step. \n[x] Implement sampling without replacement.\n[x] Store information on the epoch and the current loss. \n[x] Usually the training loss is computed over the entire data set, we are probably going to do this for one epoch via ","category":"page"},{"location":"data_loader/TODO/","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"loss_e = frac1batchessum_batchinbatchesloss(batch)","category":"page"},{"location":"data_loader/TODO/","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"Point 4 makes sense because the output of an AD routine is the value of the loss function as well as the pullback. ","category":"page"},{"location":"data_loader/data_loader/#Data-Loader","page":"Routines","title":"Data Loader","text":"","category":"section"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning, Markdown\nMarkdown.parse(description(Val(:DataLoader)))","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"The data loader can be called with various types of arrays as input, for example a snapshot matrix:","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSnapshotMatrix = rand(Float32, 10, 100)\n\ndl = DataLoader(SnapshotMatrix)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"or a snapshot tensor: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSnapshotTensor = rand(Float32, 10, 100, 5)\n\ndl = DataLoader(SnapshotTensor)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Here the DataLoader has different properties :RegularData and :TimeSeries. This indicates that in the first case we treat all columns in the input tensor independently (this is mostly used for autoencoder problems), whereas in the second case we have time series-like data, which are mostly used for integration problems. 
We can also treat a problem with a matrix as input as a time series-like problem by providing an additional keyword argument: autoencoder=false:","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSnapshotMatrix = rand(Float32, 10, 100)\n\ndl = DataLoader(SnapshotMatrix; autoencoder=false)\ndl.input_time_steps","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning, Markdown\nMarkdown.parse(description(Val(:data_loader_for_named_tuple)))","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSymplecticSnapshotTensor = (q = rand(Float32, 10, 100, 5), p = rand(Float32, 10, 100, 5))\n\ndl = DataLoader(SymplecticSnapshotTensor)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"dl.input_dim","category":"page"},{"location":"data_loader/data_loader/#The-Batch-struct","page":"Routines","title":"The Batch struct","text":"","category":"section"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning, Markdown\nMarkdown.parse(description(Val(:Batch)))","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nmatrix_data = rand(Float32, 2, 10)\ndl = DataLoader(matrix_data; autoencoder = true)\n\nbatch = Batch(3)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"This also works if the data are in qp form: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nqp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))\ndl = DataLoader(qp_data; autoencoder = true)\n\nbatch = Batch(3)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"In those two examples the autoencoder keyword was set to true (the default). This is why the first index was always 1. This changes if we set autoencoder = false: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nqp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))\ndl = DataLoader(qp_data; autoencoder = false) # false is default \n\nbatch = Batch(3)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Specifically the routines do the following: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"mathttn_indicesleftarrow mathttn_paramslormathttinput_time_steps \nmathttindices leftarrow mathttshuffle(mathtt1mathttn_indices)\nmathcalI_i leftarrow mathttindices(i - 1) cdot mathttbatch_size + 1 mathtt i cdot mathttbatch_sizetext for i=1 ldots (mathrmlast -1)\nmathcalI_mathttlast leftarrow mathttindices(mathttn_batches - 1) cdot mathttbatch_size + 1mathttend","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Note that the routines are implemented in such a way that no two indices appear double. 
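The index bookkeeping described above can be mimicked in a few lines. This is a toy sketch with made-up sizes, not the library's implementation; it only illustrates that shuffling and partitioning the indices samples every index exactly once per epoch:

```julia
import Random
Random.seed!(123)

n_indices, batch_size = 10, 3
indices   = Random.shuffle(1:n_indices)
n_batches = Int(ceil(n_indices / batch_size))

batches = [indices[(i - 1) * batch_size + 1 : min(i * batch_size, n_indices)] for i in 1:n_batches]

# every index appears exactly once over the whole epoch (sampling without replacement)
sort(vcat(batches...)) == collect(1:n_indices)
```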
","category":"page"},{"location":"data_loader/data_loader/#Sampling-from-a-tensor","page":"Routines","title":"Sampling from a tensor","text":"","category":"section"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"We can also sample tensor data.","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nqp_data = (q = rand(Float32, 2, 20, 3), p = rand(Float32, 2, 20, 3))\ndl = DataLoader(qp_data)\n\n# also specify sequence length here\nbatch = Batch(4, 5)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Sampling from a tensor is done the following way (mathcalI_i again denotes the batch indices for the i-th batch): ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"mathtttime_indices leftarrow mathttshuffle(mathtt1(mathttinput_time_steps - mathttseq_length - mathttprediction_window)\nmathttparameter_indices leftarrow mathttshuffle(mathtt1n_params)\nmathttcomplete_indices leftarrow mathttproduct(mathtttime_indices mathttparameter_indices)\nmathcalI_i leftarrow mathttcomplete_indices(i - 1) cdot mathttbatch_size + 1 i cdot mathttbatch_sizetext for i=1 ldots (mathrmlast -1)\nmathcalI_mathrmlast leftarrow mathttcomplete_indices(mathrmlast - 1) cdot mathttbatch_size + 1mathttend","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"This algorithm can be visualized the following way (here batch_size = 4):","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Main.include_graphics(\"../tikz/tensor_sampling\") # hide","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Here the sampling is performed over the second axis (the time step dimension) and the third axis (the parameter dimension). Whereas each block has thickness 1 in the x direction (i.e. pertains to a single parameter), its length in the y direction is seq_length. In total we sample as many such blocks as the batch size is big. By construction those blocks are never the same throughout a training epoch but may intersect each other!","category":"page"},{"location":"manifolds/basic_topology/#Basic-Concepts-from-General-Topology","page":"Concepts from General Topology","title":"Basic Concepts from General Topology","text":"","category":"section"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"On this page we discuss basic notions of topology that are necessary to define manifolds and work with them. Here we largely omit concrete examples and only define concepts that are necessary for defining a manifold[1], namely the properties of being Hausdorff and second countable. For a detailed discussion of the theory and for a wide range of examples that illustrate the theory see e.g. [1]. The here-presented concepts are also (rudimentarily) covered in most differential geometry books such as [2, 3]. ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"[1]: Some authors (see e.g. [2]) do not require these properties. But since they constitute very weak restrictions and are always satisfied by the manifolds relevant for our purposes we require them here. 
","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now start by giving all the definitions, theorem and corresponding proofs that are needed to define manifolds. Every manifold is a topological space which is why we give this definition first: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A **topological space** is a set ``\\mathcal{M}`` for which we define a collection of subsets of ``\\mathcal{M}``, which we denote by ``\\mathcal{T}`` and call the *open subsets*. ``\\mathcal{T}`` further has to satisfy the following three conditions:\n\" *\nMain.indentation * raw\"1. The empty set and ``\\mathcal{M}`` belong to ``\\mathcal{T}``.\n\" *\nMain.indentation * raw\"2. Any union of an arbitrary number of elements of ``\\mathcal{T}`` again belongs to ``\\mathcal{T}``.\n\" *\nMain.indentation * raw\"3. Any intersection of a finite number of elements of ``\\mathcal{T}`` again belongs to ``\\mathcal{T}``.\n\" *\nMain.indentation * \"So an arbitrary union of open sets is again open and a finite intersection of open sets is again open.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Based on this definition of a topological space we can now define what it means to be Hausdorff: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A topological space ``\\mathcal{M}`` is said to be **Hausdorff** if for any two points ``x,y\\in\\mathcal{M}`` we can find two open sets ``U_x,U_y\\in\\mathcal{T}`` s.t. ``x\\in{}U_x, y\\in{}U_y`` and ``U_x\\cap{}U_y=\\{\\}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now give the second definition that we need for defining manifolds, that of second countability:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A topological space ``\\mathcal{M}`` is said to be **second-countable** if we can find a countable subcollection of ``\\mathcal{T}`` called ``\\mathcal{U}`` s.t. ``\\forall{}U\\in\\mathcal{T}`` and ``x\\in{}U`` we can find an element ``V\\in\\mathcal{U}`` for which ``x\\in{}V\\subset{}U``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now give a few definitions and results that are needed for the inverse function theorem which is essential for practical applications of manifold theory. We start with the definition of continuity: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A mapping ``f`` between topological spaces ``\\mathcal{M}`` and ``\\mathcal{N}`` is called **continuous** if the preimage of every open set is again an open set, i.e. 
if ``f^{-1}\\{U\\}\\in\\mathcal{T}`` for ``U`` open in ``\\mathcal{N}`` and ``\\mathcal{T}`` the topology on ``\\mathcal{M}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Continuity can also be formulated in terms of closed sets instead of doing it with open sets. The definition of closed sets is given below:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A **closed set** of a topological space ``\\mathcal{M}`` is one whose complement is an open set, i.e. ``F`` is closed if ``F^c\\in\\mathcal{T}``, where the superscript ``{}^c`` indicates the complement. For closed sets we thus have the following three properties:\n\" *\nMain.indentation * raw\"1. The empty set and ``\\mathcal{M}`` are closed sets.\n\" *\nMain.indentation * raw\"2. Any union of a finite number of closed sets is again closed.\n\" *\nMain.indentation * raw\"3. Any intersection of an arbitrary number of closed sets is again closed.\n\" *\nMain.indentation * \"So a finite union of closed sets is again closed and an arbitrary intersection of closed sets is again closed.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now give an equivalent definition of continuity: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"The definition of continuity is equivalent to the following, second definition: ``f:\\mathcal{M}\\to\\mathcal{N}`` is continuous if ``f^{-1}\\{F\\}\\subset\\mathcal{M}`` is a closed set for each closed set ``F\\subset\\mathcal{N}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"First assume that ``f`` is continuous according to the first definition and not to the second. Then ``f^{-1}\\{F\\}`` is not closed but ``f^{-1}\\{F^c\\}`` is open. But ``f^{-1}\\{F^c\\} = \\{x\\in\\mathcal{M}:f(x)\\not\\in\\mathcal{N}\\} = (f^{-1}\\{F\\})^c`` cannot be open, else ``f^{-1}\\{F\\}`` would be closed. The implication of the first definition under assumption of the second can be shown analogously.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"The next theorem makes the rather abstract definition of closed sets more concrete; this definition is especially important for many practical proofs:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"The property of a set ``F`` being closed is equivalent to the following statement: If a point ``y`` is such that for every open set ``U`` containing it we have ``U\\cap{}F\\ne\\{\\}`` then this point is contained in ``F``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"We first proof that if a set is closed then the statement holds. Consider a closed set ``F`` and a point ``y\\not\\in{}F`` s.t. every open set containing ``y`` has nonempty intersection with ``F``. 
But the complement ``F^c`` also is such a set, which is a clear contradiction. Now assume the above statement for a set ``F`` and further assume ``F`` is not closed. Its complement ``F^c`` is thus not open. Now consider the *interior* of this set: ``\\mathrm{int}(F^c):=\\cup\\{U:U\\subset{}F^c\\text{ and $U$ open}\\}``, i.e. the biggest open set contained within ``F^c``. Hence there must be a point ``y`` which is in ``F^c`` but is not in its interior, else ``F^c`` would be equal to its interior, i.e. would be open. We further must be able to find an open set ``U`` that contains ``y`` but is also contained in ``F^c``, else ``y`` would be an element of ``F``. A contradiction.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Next we define open covers, a concept that is very important in developing a theory of manifolds: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"An **open cover** of a topological space ``\\mathcal{M}`` is a (not necessarily countable) collection of open sets ``\\{U_i\\}_{i\\mathcal{I}}`` s.t. their union contains ``\\mathcal{M}``. A **finite open cover** is a finite collection of open sets that cover ``\\mathcal{M}``. We say that an open cover is **reducible** to a finite cover if we can find a finite number of elements in the open cover whose union still contains ``\\mathcal{M}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"And connected to this definition we state what it means for a topological space to be compact. This is a rather strong property that some of the manifolds treated in here have, for example the Stiefel manifold.","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A topological space ``\\mathcal{M}`` is called **compact** if every open cover is reducible to a finite cover.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"A very important result from general topology is that continuous functions preserve compactness[2]: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"[2]: We also say that compactness is a topological property [1].","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"Consider a continuous function ``f:\\mathcal{M}\\to\\mathcal{N}`` and a compact set ``K\\in\\mathcal{M}``. Then ``f(K)`` is also compact.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"Consider an open cover of ``f(K)``: ``\\{U_i\\}_{i\\in\\mathcal{I}}``. Then ``\\{f^{-1}\\{U_i\\}\\}_{i\\in\\mathcal{I}}`` is an open cover of ``K`` and hence reducible to a finite cover ``\\{f^{-1}\\{U_i\\}\\}_{i\\in\\{i_1,\\ldots,i_n\\}}``. 
But then ``\\{{U_i\\}_{i\\in\\{i_1,\\ldots,i_n}}`` also covers ``f(K)``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Moreover compactness is a property that is inherited by closed subspaces:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"A closed subset of a compact space is compact.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"Call the closed set ``F`` and consider an open cover of this set: ``\\{U\\}_{i\\in\\mathcal{I}}``. Then this open cover combined with ``F^c`` is an open cover for the entire compact space, hence reducible to a finite cover.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"A compact subset of a Hausdorff space is closed.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"Consider a compact subset ``K``. If ``K`` is not closed, then there has to be a point ``y\\not\\in{}K`` s.t. every open set containing ``y`` intersects ``K``. Because the surrounding space is Hausdorff we can now find the following two collections of open sets: ``\\{(U_z, U_{z,y}: U_z\\cap{}U_{z,y}=\\{\\})\\}_{z\\in{}K}``. The open cover ``\\{U_z\\}_{z\\in{}K}`` is then reducible to a finite cover ``\\{U_z\\}_{z\\in\\{z_1, \\ldots, z_n\\}}``. The intersection ``\\cap_{z\\in{z_1, \\ldots, z_n}}U_{z,y}`` is then an open set that contains ``y`` but has no intersection with ``K``. A contraction.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"This last theorem we will use in proofing the inverse function theorem:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"If ``\\mathcal{M}`` is compact and ``\\mathcal{N}`` is Hausdorff, then the inverse of a continuous function ``f:\\mathcal{M}\\to\\mathcal{N}`` is again continuous, i.e. ``f(V)`` is an open set in ``\\mathcal{N}`` for ``V\\in\\mathcal{T}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"We can equivalently show that every closed set is mapped to a closed set. First consider the set ``K\\in\\mathcal{M}``. Its image is again compact and hence closed because ``\\mathcal{N}`` is Hausdorff.\")","category":"page"},{"location":"manifolds/basic_topology/#References","page":"Concepts from General Topology","title":"References","text":"","category":"section"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).\n\n\n\nS. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).\n\n\n\nS. I. Richard L. Bishop. 
Tensor Analysis on Manifolds (Dover Publications, 1980).\n\n\n\n","category":"page"},{"location":"layers/linear_symplectic_attention/#Linear-Symplectic-Attention","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"","category":"section"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"The attention layer introduced here is an extension of the Sympnet gradient layer to the setting where we deal with time series data. We first have to define a notion of symplecticity for multi-step methods. ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"This definition is essentially taken from [19, 20] and similar to the definition of volume-preservation in [21]. ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"Main.definition(raw\"\"\"\nA multi-step method ``\\times_T\\mathbb{R}^{2n}\\to\\times_T\\mathbb{R}^{2n}`` is called **symplectic** if it preserves the the symplectic product structure.\n\"\"\")","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"The symplectic product structure is the following skew-symmetric non-degenerate bilinear form: ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"mathbbJ(z^(1) ldots z^(T) tildez^(1) ldots tildez^(T)) = sum_i=1^T (z^(i))^Ttildez^(i)","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"In order to construct a symplectic attention mechanism we extend the principle SympNet gradient layer, i.e. we construct scalar functions that only depend on q^(1) ldots q^(T) or p^(1) ldots p^(T). The specific choice we make here is the following: ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"F(q^(1) q^(T)) = frac12mathrmTr(QAQ^T)","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"where Q = q^(1) ldots q^(T). We therefore have for the gradient:","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"nabla_Qf = frac12Q(A + A^T) = QbarA","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"where ``A\\in\\mathcal{S}_\\mathrm{skew}(T). 
So the map performs:","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"q^(1) ldots q^(T) mapsto left sum_i=1^Ta_1iq^(i) ldots sum_i=1^Ta_Tiq^(i) right","category":"page"},{"location":"layers/linear_symplectic_attention/#Library-Functions","page":"Linear Symplectic Attention","title":"Library Functions","text":"","category":"section"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"LinearSymplecticAttention\nLinearSymplecticAttentionQ\nLinearSymplecticAttentionP","category":"page"},{"location":"layers/linear_symplectic_attention/#GeometricMachineLearning.LinearSymplecticAttention-layers-linear_symplectic_attention","page":"Linear Symplectic Attention","title":"GeometricMachineLearning.LinearSymplecticAttention","text":"Implements the linear symplectic attention layers. Analogous to GradientLayer it performs mappings that only change the Q or the P part. For more information see LinearSymplecticAttentionQ and LinearSymplecticAttentionP.\n\nConstructor\n\nFor the constructors simply call \n\nLinearSymplecticAttentionQ(sys_dim, seq_length)\n\nor \n\nLinearSymplecticAttentionP(sys_dim, seq_length)\n\nwhere sys_dim is the system dimension and seq_length is the sequence length.\n\n\n\n\n\n","category":"type"},{"location":"layers/linear_symplectic_attention/#GeometricMachineLearning.LinearSymplecticAttentionQ-layers-linear_symplectic_attention","page":"Linear Symplectic Attention","title":"GeometricMachineLearning.LinearSymplecticAttentionQ","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). \n\n\n\n\n\n","category":"type"},{"location":"layers/linear_symplectic_attention/#GeometricMachineLearning.LinearSymplecticAttentionP-layers-linear_symplectic_attention","page":"Linear Symplectic Attention","title":"GeometricMachineLearning.LinearSymplecticAttentionP","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). \n\n\n\n\n\n","category":"type"},{"location":"manifolds/riemannian_manifolds/#Riemannian-Manifolds","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"A Riemannian manifold is a manifold mathcalM that we endow with a mapping g that smoothly[1] assigns a metric g_x to each tangent space T_xmathcalM. By a slight abuse of notation we will also refer to this g as a metric.","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"[1]: Smooth here refers to the fact that gmathcalMtotext(Space of Metrics) has to be a smooth map. But in order to discuss this in detail we would have to define a topology on the space of metrics. 
A more detailed discussion can be found in [2, 3, 5].","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"After having defined a metric g we can associate a length to each curve gamma0 t to mathcalM through: ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"L(gamma) = int_0^t sqrtg_gamma(s)(gamma(s) gamma(s))ds","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"This L turns mathcalM into a metric space:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Main.definition(raw\"The **metric on a Riemannian manifold** ``\\mathcal{M}`` is \n\" * \nMain.indentation * raw\"```math\n\" *\nMain.indentation * raw\"d(x, y) = \\mathrm{inf}_{\\text{$\\gamma(0) = x$ and $\\gamma(t) = y$}}L(\\gamma),\n\" * \nMain.indentation * raw\"```\n\" *\nMain.indentation * raw\"where ``t`` can be chosen arbitrarily.\")","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"If a curve is minimal with respect to the function L we call it the shortest curve or a geodesic. So we say that a curve gamma0 ttomathcalM is a geodesic if there is no shorter curve that can connect two points in gamma(0 t), i.e. ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"d(gamma(t_i) gamma(t_f)) = int_t_i^t_fsqrtg_gamma(s)(gamma(s) gamma(s))ds","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"for any t_i t_fin0 t.","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"An important result of Riemannian geometry states that there exists a vector field X on TmathcalM, called the geodesic spray, whose integral curves are derivatives of geodesics.","category":"page"},{"location":"manifolds/riemannian_manifolds/#Geodesic-Sprays-and-the-Exponential-Map","page":"Riemannian Manifolds","title":"Geodesic Sprays and the Exponential Map","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"To every Riemannian manifold we can naturally associate a vector field called the geodesic spray or geodesic equation. For our purposes it is enough to state that this vector field is unique and well-defined [5].","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"The important property of the geodesic spray is","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Main.theorem(raw\"Given an initial point ``x`` and an initial velocity ``v_x``, an integral curve for the geodesic spray is of the form ``t \\mapsto (\\gamma_{v_x}(t), \\gamma_{v_x}'(t))`` where ``\\gamma_{v_x}`` is a geodesic. 
We further have the property that the integral curve for the geodesic spray for an initial point ``x`` and an initial velocity ``\\eta\\cdot{}v_x`` (where ``\\eta`` is a scalar) is of the form ``t \\mapsto (\\gamma_{\\eta\\cdot{}v_x}(t), \\gamma_{\\eta\\cdot{}v_x}'(t)) = (\\gamma_{v_x}(\\eta\\cdot{}t), \\eta\\cdot\\gamma_{v_x}'(\\eta\\cdot{}t)).``\")","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"It is therefore customary to introduce the exponential map expT_xmathcalMtomathcalM as","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"exp(v_x) = gamma_v_x(1)","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"and we see that gamma_v_x(t) = exp(tcdotv_x). In GeometricMachineLearning we denote the exponential map by geodesic to avoid confusion with the matrix exponential map[2]:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"[2]: The Riemannian exponential map and the matrix exponential map coincide for many matrix Lie groups.","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":" mathttgeodesic(x v_x) equiv exp(v_x)","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"We give an example here:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"using GeometricMachineLearning\nusing CairoMakie # hide\nimport Random # hide\nRandom.seed!(123) # hide\n\nY = rand(StiefelManifold, 3, 1)\n\nv = 5 * rand(3, 1)\nΔ = v - Y * (v' * Y)\n\nfig = Figure(; backgroundcolor = :transparent) # hide\ntext_color = Main.output_type == :html ? 
:white : :black # hide\nax = Axis3(fig[1, 1]; # hide\n backgroundcolor = :transparent, # hide\n aspect = (1., 1., 1.), # hide\n azimuth = π / 6, # hide\n elevation = π / 8, # hide\n xlabel = rich(\"x\", subscript(\"1\"), font = :italic, color = text_color), # hide\n ylabel = rich(\"x\", subscript(\"2\"), font = :italic, color = text_color), # hide\n zlabel = rich(\"x\", subscript(\"3\"), font = :italic, color = text_color), # hide\n ) # hide\n\n# plot a sphere with radius one and origin 0\nsurface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .6)\n\nmorange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide\npoint_vec = ([Y[1]], [Y[2]], [Y[3]])\nscatter!(ax, point_vec...; color = morange, marker = :star5)\n\nmred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide\narrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]])\narrows!(ax, point_vec..., arrow_vec...; color = mred, linewidth = .02)\n\nfig","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"We now solve the geodesic spray for etacdotDelta for eta = 01 02 03 ldots 25 and plot the corresponding points:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Δ_increments = [Δ * η for η in 0.1 : 0.1 : 2.5]\n\nY_increments = [geodesic(Y, Δ_increment) for Δ_increment in Δ_increments]\n\nfor Y_increment in Y_increments\n scatter!(ax, [Y_increment[1]], [Y_increment[2]], [Y_increment[3]]; \n color = mred, markersize = 5)\nend\n\nfig","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"So a geodesic can be seen as the equivalent of a straight line on a manifold. Also note that we drew a random element form StiefelManifold here and not from S^2. This is because Stiefel manifolds are more general spaces than S^n and also comprise them. ","category":"page"},{"location":"manifolds/riemannian_manifolds/#The-Riemannian-Gradient","page":"Riemannian Manifolds","title":"The Riemannian Gradient","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"The Riemannian gradient of a function LmathcalMtomathbbR is a vector field[3] mathrmgrad^gL (or simply mathrmgradL) for which we have","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"[3]: We also write mathrmgrad^gL(x) = mathrmgrad^g_xL","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":" g_x(mathrmgrad_x^gL v_x) = (nabla_varphi_U(x)(Lcircvarphi_U^-1))^T varphi_U(v_x) ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"where ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":" nabla_xf = beginpmatrix fracpartialfpartialx_1 cdots fracpartialfpartialx_n endpmatrix","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"is the Euclidean gradient. By the non-degeneracy of g the Riemannian gradient always exists [3]. We will give specific examples of this when discussing the Stiefel manifold and the Grassmann manifold. 
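As a small illustration of this definition (a sketch in plain Julia, independent of GeometricMachineLearning; the function L, the point x and the tangent vector v are made up), consider the sphere S^2 embedded in mathbbR^3 with the metric induced by the Euclidean inner product. Here the Riemannian gradient is the projection of the Euclidean gradient onto the tangent space, and the defining relation above can be checked directly:\n\nusing LinearAlgebra\n\nL(x) = x[1]^2 + 2 * x[2] * x[3]        # a made-up smooth function on R^3\n∇L(x) = [2 * x[1], 2 * x[3], 2 * x[2]] # its Euclidean gradient\n\nx = normalize([1., 2., 3.])            # a point on the sphere\ngrad_L = (I - x * x') * ∇L(x)          # Riemannian gradient: project onto the tangent space\n\nv = (I - x * x') * [0., 1., 0.]        # a tangent vector at x\ngrad_L' * v ≈ ∇L(x)' * v               # the metric here is the Euclidean inner product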
","category":"page"},{"location":"manifolds/riemannian_manifolds/#Gradient-Flows-and-Riemannian-Optimization","page":"Riemannian Manifolds","title":"Gradient Flows and Riemannian Optimization","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"In GeometricMachineLearning we can include weights in neural networks that are part of a manifold. Training such neural networks amounts to Riemannian optimization and hence solving the gradient flow equation. The gradient flow equation is given by","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"X(x) = - mathrmgrad_xL","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Solving this gradient flow equation will then lead us to a local minimum on mathcalM. This will be elaborated on when talking about optimizers. In practice we cannot solve the gradient flow equation directly and have to rely on approximations. The most straightforward approximation (and one that serves as a basis for all the optimization algorithms in GeometricMachineLearning) is to take the point (x X(x)) as an initial condition for the geodesic spray and then solve the ODE for a small time step. We will call this ","category":"page"},{"location":"manifolds/riemannian_manifolds/#Library-Functions","page":"Riemannian Manifolds","title":"Library Functions","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"geodesic(::Manifold{T}, ::AbstractMatrix{T}) where T","category":"page"},{"location":"manifolds/riemannian_manifolds/#GeometricMachineLearning.geodesic-Union{Tuple{T}, Tuple{Manifold{T}, AbstractMatrix{T}}} where T-manifolds-riemannian_manifolds","page":"Riemannian Manifolds","title":"GeometricMachineLearning.geodesic","text":"The geodesic map for the manifolds. It takes as input an element x of mathcalM and an element of T_xmathcalM and returns mathttgeodesic(x v_x) = exp(v_x) For example: \n\nY = rand(StiefelManifold{Float64}, N, n)\nΔ = rgrad(Y, rand(N, n))\ngeodesic(Y, Δ)\n\nSee the docstring for rgrad for details on this function.\n\n\n\n\n\n","category":"method"},{"location":"manifolds/riemannian_manifolds/#References","page":"Riemannian Manifolds","title":"References","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).\n\n\n\nM. P. Do Carmo and J. Flaherty Francis. Riemannian geometry. Vol. 2 (Springer, 1992).\n\n\n\n","category":"page"},{"location":"architectures/transformer/#Standard-Transformer","page":"Standard Transformer","title":"Standard Transformer","text":"","category":"section"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"The transformer is a relatively modern neural network architecture [14] that has come to dominate the field of natural language processing (NLP, [31]) and replaced the previously dominant long-short term memory cells (LSTM, [24]). 
Its success is due to a variety of factors: ","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"unlike LSTMs it consists of very simple building blocks and hence is easier to interpret mathematically,\nit is very flexible in its application and the data it is fed with do not have to conform to a rigid pattern, \ntransformers utilize modern hardware (especially GPUs) very effectively. ","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"The transformer architecture is sketched below: ","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"Main.include_graphics(\"../tikz/transformer_encoder\") # hide","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"It is nothing more than a combination of a multihead attention layer and a residual neural network[1] (ResNet).","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"[1]: A ResNet is nothing more than a neural network to whose output we again add the input, i.e. every ResNet is of the form mathrmResNet(x) = x + mathcalNN(x).","category":"page"},{"location":"architectures/transformer/#Library-Functions","page":"Standard Transformer","title":"Library Functions","text":"","category":"section"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"StandardTransformerIntegrator","category":"page"},{"location":"architectures/transformer/#GeometricMachineLearning.StandardTransformerIntegrator-architectures-transformer","page":"Standard Transformer","title":"GeometricMachineLearning.StandardTransformerIntegrator","text":"The regular transformer used as an integrator (multi-step method). \n\nThe constructor is called with one argument: \n\nsys_dim::Int\n\nThe following are keyword arguments:\n\ntransformer_dim::Int: the default is transformer_dim = sys_dim.\nn_blocks::Int: the default is 1.\nn_heads::Int: the number of heads in the multihead attention layer (default is n_heads = sys_dim).\nL::Int: the number of transformer blocks (default is L = 2).\nupscaling_activation: by default identity.\nresnet_activation: by default tanh.\nadd_connection::Bool=true: if the input should be added to the output.\n\n\n\n\n\n","category":"type"},{"location":"optimizers/manifold_related/global_sections/#Global-Sections","page":"Global Sections","title":"Global Sections","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"Global sections are needed for the generalization of Adam and other optimizers to homogeneous spaces. They are necessary to perform the two mappings represented by horizontal and vertical red lines in the section on the general optimizer framework.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#Computing-the-global-section","page":"Global Sections","title":"Computing the global section","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"In differential geometry a section is always associated to some bundle, in our case this bundle is piGtomathcalMAmapstoAE. 
A section is a mapping mathcalMtoG for which pi is a left inverse, i.e. picirclambda = mathrmid. ","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"For the Stiefel manifold St(n N)subsetmathbbR^Ntimesn we compute the global section the following way: ","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"Start with an element YinSt(nN),\nDraw a random matrix AinmathbbR^Ntimes(N-n),\nRemove the subspace spanned by Y from the range of A: AgetsA-YY^TA\nCompute a QR decomposition of A and take as section lambda(Y) = Y Q_1N 1(N-n) = Y barlambda.","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"It is easy to check that lambda(Y)inG=SO(N).","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"In GeometricMachineLearning, GlobalSection takes an element of YinSt(nN)equivStiefelManifold{T} and returns an instance of GlobalSection{T, StiefelManifold{T}}. The application O(N)timesSt(nN)toSt(nN) is done with the functions apply_section! and apply_section.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#Computing-the-global-tangent-space-representation-based-on-a-global-section","page":"Global Sections","title":"Computing the global tangent space representation based on a global section","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"The output of the horizontal lift Omega is an element of mathfrakg^mathrmhorY. For this mapping Omega(Y BY) = B if Binmathfrakg^mathrmhorY, i.e. there is no information loss and no projection is performed. We can map the Binmathfrakg^mathrmhorY to mathfrakg^mathrmhor with Bmapstolambda(Y)^-1Blambda(Y).","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"The function global_rep performs both mappings at once[1], i.e. 
it takes an instance of GlobalSection and an element of T_YSt(nN), and then returns an element of frakg^mathrmhorequivStiefelLieAlgHorMatrix.","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"[1]: For computational reasons.","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"In practice we use the following: ","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"beginaligned\nlambda(Y)^TOmega(YDelta)lambda(Y) = lambda(Y)^T(mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)lambda(Y) \n = lambda(Y)^T(mathbbI - frac12YY^T)DeltaE^T - YDelta^T(lambda(Y) - frac12YE^T) \n = lambda(Y)^TDeltaE^T - frac12EY^TDeltaE^T - EDelta^Tlambda(Y) + frac12EDelta^TYE^T \n = beginbmatrix Y^TDeltaE^T barlambdaDeltaE^T endbmatrix - frac12EY^TDeltaE - beginbmatrix EDelta^TY EDelta^Tbarlambda endbmatrix + frac12EDelta^TYE^T \n = beginbmatrix Y^TDeltaE^T barlambdaDeltaE^T endbmatrix + EDelta^TYE^T - beginbmatrixEDelta^TY EDelta^Tbarlambda endbmatrix \n = EY^TDeltaE^T + EDelta^TYE^T - EDelta^TYE^T + beginbmatrix mathbbO barlambdaDeltaE^T endbmatrix - beginbmatrix mathbbO EDelta^Tbarlambda endbmatrix \n = EY^TDeltaE^T + beginbmatrix mathbbO barlambdaDeltaE^T endbmatrix - beginbmatrix mathbbO EDelta^Tbarlambda endbmatrix\nendaligned","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"meaning that for an element of the horizontal component of the Lie algebra mathfrakg^mathrmhor we store A=Y^TDelta and B=barlambda^TDelta.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#Optimization","page":"Global Sections","title":"Optimization","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"The output of global_rep is then used for all the optimization steps.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#References","page":"Global Sections","title":"References","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).\n\n\n\n","category":"page"},{"location":"optimizers/bfgs_optimizer/#The-BFGS-Algorithm","page":"BFGS Optimizer","title":"The BFGS Algorithm","text":"","category":"section"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The presentation shown here is largely taken from chapters 3 and 6 of reference [12] with a derivation based on an online comment. The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second order optimizer that can be also be used to train a neural network.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"It is a version of a quasi-Newton method and is therefore especially suited for convex problems. 
As is the case with any other (quasi-)Newton method the BFGS algorithm approximates the objective with a quadratic function in each optimization step:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"m_k(x) = f(x_k) + (nabla_x_kf)^T(x - x_k) + frac12(x - x_k)^TB_k(x - x_k)","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where B_k is referred to as the approximate Hessian. We further require B_k to be symmetric and positive definite. Differentiating the above expression and setting the derivative to zero gives us: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"nabla_xm_k = nabla_x_kf + B_k(x - x_k) = 0","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"or written differently: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"x - x_k = -B_k^-1nabla_x_kf","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"This value we will from now on call p_k = x - x_k and refer to as the search direction. The new iterate then is: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"x_k+1 = x_k + alpha_kp_k","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where alpha_k is the step length. Techniques that describe how to pick an appropriate alpha_k are called line-search methods and are discussed below. First we discuss what requirements we impose on B_k. A first reasonable condition would be to require the gradient of m_k to be equal to that of f at the points x_k-1 and x_k: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nnabla_x_km_k = nabla_x_kf + B_k(x_k - x_k) overset= nabla_x_kf text and \nnabla_x_k-1m_k = nablax_kf + B_k(x_k-1 - x_k) overset= nabla_x_k-1f\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The first one of these conditions is of course automatically satisfied. The second one can be rewritten as: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"B_k(x_k - x_k-1) = overset= nabla_x_kf - nabla_x_k-1f ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The following notations are often used: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"s_k-1 = alpha_k-1p_k-1 = x_k - x_k-1 text and y_k-1 = nabla_x_kf - nabla_x_k-1f ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The conditions mentioned above then becomes: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"B_ks_k-1 overset= y_k-1","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"and we call it the secant equation. 
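To get a feeling for the secant equation we can check it on a quadratic objective (a hypothetical example in plain Julia, not taken from the library): for f(x) = frac12x^TAx + b^Tx the difference of two successive gradients is exactly A(x_k - x_k-1), so the exact Hessian A satisfies the secant equation and B_k can be understood as an approximation to it.\n\nusing LinearAlgebra\n\nA = [4. 1.; 1. 3.]           # made-up symmetric positive-definite Hessian\nb = [1., 2.]\n∇f(x) = A * x + b            # gradient of f(x) = x' * A * x / 2 + b' * x\n\nx₀, x₁ = [0., 0.], [1., -1.] # two arbitrary iterates\ns = x₁ - x₀\ny = ∇f(x₁) - ∇f(x₀)\n\nA * s ≈ y                    # the exact Hessian satisfies the secant equation\n\n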
A second condition we impose on B_k is that it has to be positive-definite at point s_k-1:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"s_k-1^Ty_k-1 0","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"This is referred to as the curvature condition. If we impose the Wolfe conditions, the curvature condition holds automatically. The Wolfe conditions are stated with respect to the parameter alpha_k.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The Wolfe conditions are:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"f(x_k+alphap_k)leqf(x_k) + c_1alpha(nabla_x_kf)^Tp_k for c_1in(01).\n(nabla_(x_k + alpha_kp_k)f)^Tp_k geq c_2(nabla_x_kf)^Tp_k for c_2in(c_11).","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"A possible choice for c_1 and c_2 is 10^-4 and 09 (see [12]). The two Wolfe conditions above are called the sufficient decrease condition and the curvature condition respectively. Note that the second Wolfe condition (also called curvature condition) is stronger than the one mentioned before under the assumption that the first Wolfe condition is true:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"(nabla_x_kf)^Tp_k-1 - c_2(nabla_x_k-1f)^Tp_k-1 = y_k-1^Tp_k-1 + (1 - c_2)(nabla_x_k-1f)^Tp_k-1 geq 0","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"and the second term in this expression is (1 - c_2)(nabla_x_k-1f)^Tp_k-1geqfrac1-c_2c_1alpha_k-1(f(x_k) - f(x_k-1)), which is negative. ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"In order to pick the ideal B_k we solve the following problem: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nmin_B B - B_k-1_W \ntextst B = B^Ttext and Bs_k-1=y_k-1\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where the first condition is symmetry and the second one is the secant equation. 
For the norm cdot_W we pick the weighted Frobenius norm:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"A_W = W^12AW^12_F","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where cdot_F is the usual Frobenius norm[1] and the matrix W=tildeB_k-1 is the inverse of the average Hessian:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB_k-1 = int_0^1 nabla^2f(x_k-1 + taualpha_k-1p_k-1)dtau","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"[1]: The Frobenius norm is A_F^2 = sum_ija_ij^2.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"In order to find the ideal B_k under the conditions described above, we introduce some notation: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB_k-1 = W^12B_k-1W^12,\ntildeB = W^12BW^12, \ntildey_k-1 = W^12y_k-1, \ntildes_k-1 = W^-12s_k-1.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"With this notation we can rewrite the problem of finding B_k as: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nmin_tildeB tildeB - tildeB_k-1_F \ntextst tildeB = tildeB^Ttext and tildeBtildes_k-1=tildey_k-1\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"We further have Wy_k-1 = s_k-1 (by the mean value theorem ?) and therefore tildey_k-1 = tildes_k-1.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"Now we rewrite B and B_k-1 in a new basis U = uu_perp, where u = tildes_k-1tildes_k-1 and u_perp is an orthogonal complement[2] of u:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"[2]: So we must have u^Tu_perp=0 and further u_perp^Tu_perp=mathbbI.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nU^TtildeB_k-1U - U^TtildeBU = beginbmatrix u^T u_perp^T endbmatrix(tildeB_k-1 - tildeB)beginbmatrix u u_perp endbmatrix = \nbeginbmatrix\n u^TtildeB_k-1u - 1 u^TtildeB_k-1u \n u_perp^TtildeB_k-1u u_perp^T(tildeB_k-1-tildeB_k)u_perp\nendbmatrix\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"By a property of the Frobenius norm: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB_k-1 - tildeB^2_F = (u^TtildeB_k-1 -1)^2 + u^TtildeB_k-1u_perp_F^2 + u_perp^TtildeB_k-1u_F^2 + u_perp^T(tildeB_k-1 - tildeB)u_perp_F^2","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"We see that tildeB only appears in the last term, which should therefore be made zero. 
This then gives: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB = Ubeginbmatrix 1 0 0 u^T_perptildeB_k-1u_perp endbmatrix = uu^T + (mathbbI-uu^T)tildeB_k-1(mathbbI-uu^T)","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"If we now map back to the original coordinate system, the ideal solution for B_k is: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"B_k = (mathbbI - frac1y_k-1^Ts_k-1y_k-1s_k-1^T)B_k-1(mathbbI - frac1y_k-1^Ts_k-1s_k-1y_k-1^T) + frac1y_k-1^Ts_k-1y_ky_k^T","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"What we need in practice however is not B_k, but its inverse H_k. This is because we need to find s_k-1 based on y_k-1. To get H_k based on the expression for B_k above we can use the Sherman-Morrison-Woodbury formula[3] to obtain:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"[3]: The Sherman-Morrison-Woodbury formula states (A + UCV)^-1 = A^-1 - A^-1 - A^-1U(C^-1 + VA^-1U)^-1VA^-1.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"H_k = H_k-1 - fracH_k-1y_k-1y_k-1^TH_k-1y_k-1^TH_k-1y_k-1 + fracs_k-1s_k-1^Ty_k-1^Ts_k-1","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"TODO: Example where this works well!","category":"page"},{"location":"optimizers/bfgs_optimizer/#References","page":"BFGS Optimizer","title":"References","text":"","category":"section"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"J. N. Stephen J. Wright. Numerical optimization (Springer Science+Business Media, 2006).\n\n\n\n","category":"page"},{"location":"manifolds/inverse_function_theorem/#Foundational-Theorem-for-Differential-Manifolds","page":"Foundations of Differential Manifolds","title":"Foundational Theorem for Differential Manifolds","text":"","category":"section"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Here we state and proof all the theorem necessary to define differential manifold. All these theorems (including proofs) can be found in e.g. [2].","category":"page"},{"location":"manifolds/inverse_function_theorem/#The-Fixed-Point-Theorem","page":"Foundations of Differential Manifolds","title":"The Fixed-Point Theorem","text":"","category":"section"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"The fixed-point theorem will be used in the proof of the inverse function theorem below and the existence-and-uniqueness theorem. ","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.theorem(raw\"A function ``f:U \\to U`` defined on an open subset ``U`` of a complete metric vector space ``\\mathcal{V} \\supset U`` that is contractive, i.e. ``|f(z) - f(y)| \\leq q|z - y|`` with ``q < 1``, has a unique fixed point ``y^*`` such that ``f(y^*) = y^*``. 
Further ``y^*`` can be found by taking any ``y\\in{}U`` through ``y^* = \\lim_{m\\to\\infty}f^m(y)``.\"; name = \"Banach Fixed-Point Theorem\")","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.proof(raw\"Fix a point ``y\\in{}U``. We proof that the sequence ``(f^m(y))_{m\\in\\mathbb{N}}`` is Cauchy and because ``\\mathcal{V}`` is a complete metric space, the limit of this sequence exists. Take ``\\tilde{m} > m`` and we have\n\" *\nMain.indentation * raw\"```math\n\" *\nMain.indentation * raw\"\\begin{aligned}\n\" *\nMain.indentation * raw\"|f^{\\tilde{m}}(y) - f^m(y)| & \\leq \\sum_{i = m}^{\\tilde{m} - 1}|f^{i+1}(y) - f^{i}(y)| \\\\\n\" *\nMain.indentation * raw\" & \\leq \\sum_{i = m}^{\\tilde{m} - 1}q^i|f(y) - y| \\\\ \n\" *\nMain.indentation * raw\" & \\leq \\sum_{i = m}^\\infty{}q^i|f(y) - y| = (f(y) - y)\\left( \\frac{q}{1 - q} - \\sum_{i = 1}^{m-1}q^i \\right)\\\\\n\" *\nMain.indentation * raw\" & = (f(y) - y)\\left( \\frac{q}{1 - q} - \\frac{q - q^m}{q - 1} \\right) = (f(y) - y)\\frac{q^{m+1}}{1 - q}.\n\" *\nMain.indentation * raw\"\\end{aligned} \n\" *\nMain.indentation * raw\"```\n\" *\nMain.indentation * raw\"And the sequence is clearly Cauchy.\")","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Note that we stated the fixed-point theorem for arbitrary complete metric spaces here, not just for mathbbR^n. For the section on manifolds we only need the theorem for mathbbR^n, but for the existence-and-uniqueness theorem we need the statement for more general spaces. ","category":"page"},{"location":"manifolds/inverse_function_theorem/#The-Inverse-Function-Theorem","page":"Foundations of Differential Manifolds","title":"The Inverse Function Theorem","text":"","category":"section"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"The inverse function theorem gives a sufficient condition on a vector-valued function to be invertible in a neighborhood of a specific point. This theorem serves as a basis for the implicit function theorem and further for the preimage theorem and is critical in developing a theory of manifolds. Here we first state the theorem and then give a proof.","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.theorem(raw\"Consider a vector-valued differentiable function ``F:\\mathbb{R}^N\\to\\mathbb{R}^N`` and assume its Jacobian is non-degenerate at a point ``x\\in\\mathbb{R}^N``. Then there exists a neighborhood ``U`` that contains ``F(x)`` and on which ``F`` is invertible, i.e. ``\\exists{}H:U\\to\\mathbb{R}^N`` s.t. ``\\forall{}y\\in{}U,\\,F\\circ{}H(y) = y`` and ``H`` is differentiable.\"; name = \"Inverse function theorem\")","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.proof(raw\"\"\"Consider a mapping ``F:\\mathbb{R}^N\\to\\mathbb{R}^N`` and assume its Jacobian has full rank at point ``x``, i.e. ``\\det{}F'(x)\\neq0``. We further assume that ``F(x) = 0``, ``F'(x) = \\mathbb{I}`` and ``x = 0``. 
Now consider a ball around ``x`` whose radius ``r`` we do not yet fix and two points ``y`` and ``z`` in that ball: ``y,z\\in{}B(r)``. We further introduce the function ``G(y):=y-F(y)``. By the *mean value theorem* we have ``|G(y)| = |G(y) - x| = |G(y) - G(x)|\\leq|y-x|\\sup_{05, width is set to 5). See the theory section for more details; there depth was called n.\nnhidden : the number of pairs of linear and activation layers with default value set to 1 (i.e the LA-SympNet is a composition of a linear layer, an activation layer and then again a single layer). \nactivation : the activation function for all the activations layers with default set to tanh,\ninitupperlinear : a boolean that indicates whether the first linear layer changes q first. By default this is true.\ninitupperact : a boolean that indicates whether the first activation layer changes q first. By default this is true.","category":"page"},{"location":"tutorials/sympnet_tutorial/#G-SympNet","page":"Sympnets","title":"G-SympNet","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"To call a G-SympNet, one needs to write","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"gsympnet = GSympNet(dim; upscaling_dimension=2*dim, n_layers=2, activation=tanh, init_upper=true) ","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"GSympNet takes one obligatory argument:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"dim : the dimension of the phase space (i.e. an integer) or optionally an instance of DataLoader. This latter option will be used below.","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"and severals keywords argument :","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"upscaling_dimension: The first dimension of the matrix with which the input is multiplied. In the theory section this matrix is called K and the upscaling dimension is called m.\nn_layers: the number of gradient layers with default value set to 2.\nactivation : the activation function for all the activations layers with default set to tanh.\ninit_upper : a boolean that indicates whether the first gradient layer changes q first. 
By default this is true.","category":"page"},{"location":"tutorials/sympnet_tutorial/#Loss-function","page":"Sympnets","title":"Loss function","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"The loss function described in the theory section is the default choice used in GeometricMachineLearning.jl for training SympNets.","category":"page"},{"location":"tutorials/sympnet_tutorial/#Data-Structures-in-GeometricMachineLearning.jl","page":"Sympnets","title":"Data Structures in GeometricMachineLearning.jl","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Main.include_graphics(\"../tikz/structs_visualization\") # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/#Examples","page":"Sympnets","title":"Examples","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Let us see how to use it on several examples.","category":"page"},{"location":"tutorials/sympnet_tutorial/#Example-of-a-pendulum-with-G-SympNet","page":"Sympnets","title":"Example of a pendulum with G-SympNet","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Let us begin with a simple example, the pendulum system, the Hamiltonian of which is ","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"H(qp)inmathbbR^2 mapsto frac12p^2-cos(q) in mathbbR","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Here we generate pendulum data with the script GeometricMachineLearning/scripts/pendulum.jl:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"using GeometricMachineLearning # hide\nimport Random # hide\n\nRandom.seed!(1234)\n\n# load script\ninclude(\"../../../scripts/pendulum.jl\")\n# specify the data type\ntype = Float16 \n# get data \nqp_data = GeometricMachineLearning.apply_toNT(a -> type.(a), pendulum_data((q=[0.], p=[1.]); tspan=(0.,100.)))\n# call the DataLoader\ndl = DataLoader(qp_data)\n# this last line is a hack so as to not display the output # hide\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Next we specify the architectures. GeometricMachineLearning.jl provides useful defaults for all parameters although they can be specified manually (which is done in the following):","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"# layer dimension for gradient module \nconst upscaling_dimension = 2\n# hidden layers\nconst nhidden = 1\n# activation function\nconst activation = tanh\n\n# calling G-SympNet architecture \ngsympnet = GSympNet(dl, upscaling_dimension=upscaling_dimension, n_layers=4, activation=activation)\n\n# calling LA-SympNet architecture \nlasympnet = LASympNet(dl, nhidden=nhidden, activation=activation)\n\n# specify the backend\nconst backend = CPU()\n\n# initialize the networks\nla_nn = NeuralNetwork(lasympnet, backend, type) \ng_nn = NeuralNetwork(gsympnet, backend, type)\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"If we want to obtain information on the number of parameters in a neural network, we can do that very simply with the function parameterlength. 
For the LASympNet:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"parameterlength(la_nn.model)","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"And for the GSympNet:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"parameterlength(g_nn.model)","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Remark: We can also specify whether we would like to start with a layer that changes the q-component or one that changes the p-component. This can be done via the keywords init_upper for GSympNet, and init_upper_linear and init_upper_act for LASympNet.","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We have to define an optimizer which will be used in the training of the SympNet. For more details on optimizers, please see the corresponding documentation. In this example we use Adam:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"# set up the optimizers; for this we first need to specify the optimization method\nopt_method = AdamOptimizer(type)\nla_opt = Optimizer(opt_method, la_nn)\ng_opt = Optimizer(opt_method, g_nn)\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We can now perform the training of the neural networks. The syntax is the following:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"# number of training epochs\nconst nepochs = 300\n# batch size used to compute the gradient of the loss function with respect to the parameters of the neural networks\nconst batch_size = 100\n\nbatch = Batch(batch_size)\n\n# perform training (returns array that contains the total loss for each training step)\ng_loss_array = g_opt(g_nn, dl, batch, nepochs)\nla_loss_array = la_opt(la_nn, dl, batch, nepochs)\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We can also plot the training errors against the epoch (here the y-axis is in log-scale):","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"using Plots\np1 = plot(g_loss_array, xlabel=\"Epoch\", ylabel=\"Training error\", label=\"G-SympNet\", color=3, yaxis=:log)\nplot!(p1, la_loss_array, label=\"LA-SympNet\", color=2)","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"The training data data_q and data_p must be matrices of mathbbR^ntimes d where n is the length of the data and d is half the dimension of the system, i.e. data_q[i,j] is q_j(t_i) where (t_1t_n) are the corresponding times of the training data.","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Now we can make a prediction. 
Let's compare the initial data with a prediction starting from the same phase space point using the function iterate:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"ics = (q=qp_data.q[:,1], p=qp_data.p[:,1])\n\nsteps_to_plot = 200\n\n#predictions\nla_trajectory = iterate(la_nn, ics; n_points = steps_to_plot)\ng_trajectory = iterate(g_nn, ics; n_points = steps_to_plot)\n\nusing Plots\np2 = plot(qp_data.q'[1:steps_to_plot], qp_data.p'[1:steps_to_plot], label=\"training data\")\nplot!(p2, la_trajectory.q', la_trajectory.p', label=\"LA Sympnet\")\nplot!(p2, g_trajectory.q', g_trajectory.p', label=\"G Sympnet\")","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We see that GSympNet outperforms the LASympNet on this problem.","category":"page"},{"location":"optimizers/general_optimization/#Optimization-for-Neural-Networks","page":"General Optimization","title":"Optimization for Neural Networks","text":"","category":"section"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the gradient flow equation:","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"dottheta = -nabla_thetaL","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"by means of a Euler time-stepping scheme: ","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"theta^t+1 = theta^t - hnabla_theta^tL","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"where eta (the time step of the Euler scheme) is referred to as the learning rate","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"This equation can easily be generalized to manifolds by replacing the Euclidean gradient nabla_theta^tL by a Riemannian gradient -hmathrmgrad_theta^tL and addition by -hnabla_theta^tL with a retraction by -hmathrmgrad_theta^tL.","category":"page"},{"location":"arrays/grassmann_lie_alg_hor_matrix/#The-horizontal-component-of-the-Lie-algebra-\\mathfrak{g}-for-the-Grassmann-manifold","page":"Grassmann Global Tangent Space","title":"The horizontal component of the Lie algebra mathfrakg for the Grassmann manifold","text":"","category":"section"},{"location":"arrays/grassmann_lie_alg_hor_matrix/#Tangent-space-to-the-element-\\mathcal{E}","page":"Grassmann Global Tangent Space","title":"Tangent space to the element mathcalE","text":"","category":"section"},{"location":"arrays/grassmann_lie_alg_hor_matrix/","page":"Grassmann Global Tangent Space","title":"Grassmann Global Tangent Space","text":"Consider the tangent space to the distinct element mathcalE=mathrmspan(E)inGr(nN), where E is again:","category":"page"},{"location":"arrays/grassmann_lie_alg_hor_matrix/","page":"Grassmann Global Tangent Space","title":"Grassmann Global Tangent Space","text":"E = beginbmatrix\nmathbbI_n \nmathbbO\nendbmatrix","category":"page"},{"location":"arrays/grassmann_lie_alg_hor_matrix/","page":"Grassmann Global Tangent Space","title":"Grassmann 
Global Tangent Space","text":"The tangent space T_mathcalEGr(nN) can be represented through matrices: ","category":"page"},{"location":"arrays/grassmann_lie_alg_hor_matrix/","page":"Grassmann Global Tangent Space","title":"Grassmann Global Tangent Space","text":"beginpmatrix\n 0 cdots 0 \n cdots cdots cdots \n 0 cdots 0 \n a_11 cdots a_1n \n cdots cdots cdots \n a_(N-n)1 cdots a_(N-n)n\nendpmatrix","category":"page"},{"location":"arrays/grassmann_lie_alg_hor_matrix/","page":"Grassmann Global Tangent Space","title":"Grassmann Global Tangent Space","text":"where we have used the identification T_mathcalEGr(nN)toT_EmathcalS_E that was discussed in the section on the Grassmann manifold. The Grassmann manifold can also be seen as the Stiefel manifold modulo an equivalence class. This leads to the following (which is used for optimization):","category":"page"},{"location":"arrays/grassmann_lie_alg_hor_matrix/","page":"Grassmann Global Tangent Space","title":"Grassmann Global Tangent Space","text":"mathfrakg^mathrmhor = mathfrakg^mathrmhormathcalE = leftbeginpmatrix 0 -B^T B 0 endpmatrix textB arbitraryright","category":"page"},{"location":"arrays/grassmann_lie_alg_hor_matrix/","page":"Grassmann Global Tangent Space","title":"Grassmann Global Tangent Space","text":"This is equivalent to the horizontal component of mathfrakg for the Stiefel manifold in the case where A is zero. This is a reflection of the rotational invariance of the Grassmann manifold: the skew-symmetric matrices A are connected to the group of rotations O(n) which is factored out in the Grassmann manifold Gr(nN)simeqSt(nN)O(n).","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Symplectic-Autoencoders-and-the-Toda-Lattice","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders and the Toda Lattice","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"In this tutorial we use a SymplecticAutoencoder to approximate the Toda lattice with a lower-dimensional Hamiltonian model and compare it with standard proper symplectic decomposition (PSD).","category":"page"},{"location":"tutorials/symplectic_autoencoder/#The-system","page":"Symplectic Autoencoders","title":"The system","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"The Toda lattice is a prototypical example of a Hamiltonian PDE. It is described by ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":" H(q p) = sum_ninmathbbZleft( fracp_n^22 + alpha e^q_n - q_n+1 right)","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We further assume a finite number of particles N and impose periodic boundary conditions: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"beginaligned\n q_n+N equiv q_n \n p_n+N equiv p_n\nendaligned","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"In this tutorial we want to reduce the dimension of the big system by a significant factor with (i) proper symplectic decomposition (PSD) and (ii) symplectic autoencoders. 
The first approach is strictly linear whereas the second one allows for more general mappings. ","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Using-the-Toda-lattice-in-numerical-experiments","page":"Symplectic Autoencoders","title":"Using the Toda lattice in numerical experiments","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"In order to use the Toda lattice in numerical experiments we have to pick suitable initial conditions. For this, consider the third-degree spline: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"h(s) = begincases\n 1 - frac32s^2 + frac34s^3 textif 0 leq s leq 1 \n frac14(2 - s)^3 textif 1 s leq 2 \n 0 textelse \nendcases","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Plotted on the relevant domain it looks like this: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Main.include_graphics(\"../tikz/third_degree_spline\") # hide","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We end up with the following choice of parametrized initial conditions: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"u_0(mu)(omega) = h(s(omega mu)) quad s(omega mu) = 20 mu omega + fracmu2","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"For the purposes of this tutorial we will use the default value for mu provided in GeometricMachineLearning:","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"using GeometricProblems.TodaLattice: μ\n\nμ","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Get-the-data","page":"Symplectic Autoencoders","title":"Get the data","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"The training data can very easily be obtained by using the packages GeometricProblems and GeometricIntegrators:","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"using GeometricProblems.TodaLattice: hodeproblem\nusing GeometricIntegrators: integrate, ImplicitMidpoint\nusing GeometricMachineLearning \nusing Plots\nimport Random\n\npr = hodeproblem(; tspan = (0.0, 100.))\nsol = integrate(pr, ImplicitMidpoint())\ndl = DataLoader(sol; autoencoder = true)\n\ndl.input_dim","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Here we first integrate the system with implicit midpoint and then put the training data into the right format by calling DataLoader. We can get the dimension of the system by calling dl.input_dim. 
Also note that the keyword autoencoder was set to true.","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Train-the-network","page":"Symplectic Autoencoders","title":"Train the network","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We now want to compare two different approaches: PSDArch and SymplecticAutoencoder. For this we first have to set up the networks: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"const reduced_dim = 2\n\npsd_arch = PSDArch(dl.input_dim, reduced_dim)\nsae_arch = SymplecticAutoencoder(dl.input_dim, reduced_dim; n_encoder_blocks = 4, n_decoder_blocks = 4, n_encoder_layers = 4, n_decoder_layers = 1)\n\nRandom.seed!(123)\npsd_nn = NeuralNetwork(psd_arch)\nsae_nn = NeuralNetwork(sae_arch)\n\nnothing # hide","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Training a neural network is usually done by calling an instance of Optimizer in GeometricMachineLearning. PSDArch however can be solved directly by using singular value decomposition and this is done by calling solve!. The SymplecticAutoencoder we train with the AdamOptimizer however: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"const n_epochs = 8\nconst batch_size = 16\n\no = Optimizer(sae_nn, AdamOptimizer(Float64))\n\npsd_error = solve!(psd_nn, dl)\nsae_error = o(sae_nn, dl, Batch(batch_size), n_epochs)\n\nhline([psd_error]; color = 2, label = \"PSD error\")\nplot!(sae_error; color = 3, label = \"SAE error\", xlabel = \"epoch\", ylabel = \"training error\")","category":"page"},{"location":"tutorials/symplectic_autoencoder/#The-online-stage","page":"Symplectic Autoencoders","title":"The online stage","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"After having trained our neural network we can now evaluate it in the online stage of reduced complexity modeling: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"psd_rs = HRedSys(pr, encoder(psd_nn), decoder(psd_nn); integrator = ImplicitMidpoint())\nsae_rs = HRedSys(pr, encoder(sae_nn), decoder(sae_nn); integrator = ImplicitMidpoint())\n\nprojection_error(psd_rs)","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"projection_error(sae_rs)","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Next we plot a comparison between the PSD prediction and the symplectic autoencoder prediction: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"sol_full = integrate_full_system(psd_rs)\nsol_psd_reduced = integrate_reduced_system(psd_rs)\nsol_sae_reduced = integrate_reduced_system(sae_rs)\n\nconst t_step = 100\nplot(sol_full.s.q[t_step], label = \"Implicit Midpoint\")\nplot!(psd_rs.decoder((q = sol_psd_reduced.s.q[t_step], p = sol_psd_reduced.s.p[t_step])).q, label = \"PSD\")\nplot!(sae_rs.decoder((q = 
sol_sae_reduced.s.q[t_step], p = sol_sae_reduced.s.p[t_step])).q, label = \"SAE\")","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We can see that the autoencoder approach has much greater approximation capabilities than the PSD approach. The jiggly lines are due to the fact that training was done for only 8 epochs. ","category":"page"},{"location":"tutorials/symplectic_autoencoder/#References","page":"Symplectic Autoencoders","title":"References","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).\n\n\n\nL. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).\n\n\n\nC. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).\n\n\n\n","category":"page"},{"location":"optimizers/manifold_related/retractions/#Retractions","page":"Retractions","title":"Retractions","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/#Classical-Definition","page":"Retractions","title":"Classical Definition","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Classically, retractions are defined as smooth maps ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"R TmathcalMtomathcalM(xv)mapstoR_x(v)","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"such that each curve c(t) = R_x(tv) satisfies c(0) = x and c(0) = v.","category":"page"},{"location":"optimizers/manifold_related/retractions/#In-GeometricMachineLearning","page":"Retractions","title":"In GeometricMachineLearning","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Retractions are maps from the horizontal component of the Lie algebra mathfrakg^mathrmhor to the respective manifold.","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"For optimization in neural networks (almost always first order) we solve a gradient flow equation ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"dotW = -mathrmgrad_WL ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"where mathrmgrad_WL is the Riemannian gradient of the loss function L evaluated at position W.","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"If we deal with Euclidean spaces (vector spaces), then the Riemannian gradient is just the result of an AD routine and the solution of the equation above can be approximated with W^t+1 gets W^t - etanabla_W^tL, where eta is the learning rate. 
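For the Euclidean case this is nothing else than ordinary gradient descent, which we can sketch in a few lines of plain Julia (the quadratic loss here is made up and not related to any library functionality):\n\nloss(W) = sum(abs2, W)          # a made-up loss with minimum at W = 0\ngrad(W) = 2 * W                 # its Euclidean gradient\n\nfunction descend(W, η, n_steps)\n    for _ in 1:n_steps\n        W = W - η * grad(W)     # Euler step for the gradient flow\n    end\n    W\nend\n\nW = descend(rand(3, 2), 0.1, 100)\nloss(W)                         # now close to zero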
","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"For manifolds, after we obtained the Riemannian gradient (see e.g. the section on Stiefel manifold), we have to solve a geodesic equation. This is a canonical ODE associated with any Riemannian manifold. ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"The general theory of Riemannian manifolds is rather complicated, but for the neural networks treated in GeometricMachineLearning, we only rely on optimization of matrix Lie groups and homogeneous spaces, which is much simpler. ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"For Lie groups each tangent space is isomorphic to its Lie algebra mathfrakgequivT_mathbbIG. The geodesic map from mathfrakg to G, for matrix Lie groups with bi-invariant Riemannian metric like SO(N), is simply the application of the matrix exponential exp. Alternatively this can be replaced by the Cayley transform (see (Absil et al, 2008).)","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Starting from this basic map expmathfrakgtoG we can build mappings for more complicated cases: ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"General tangent space to a Lie group T_AG: The geodesic map for an element VinT_AG is simply Aexp(A^-1V).\nSpecial tangent space to a homogeneous space T_EmathcalM: For V=BEinT_EmathcalM the exponential map is simply exp(B)E. \nGeneral tangent space to a homogeneous space T_YmathcalM with Y = AE: For Delta=ABEinT_YmathcalM the exponential map is simply Aexp(B)E. This is the general case which we deal with. ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function retraction in GeometricMachineLearning performs mathfrakg^mathrmhortomathcalM, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with apply_section and represented through the red vertical line in the diagram on the general optimizer framework.","category":"page"},{"location":"optimizers/manifold_related/retractions/#Word-of-caution","page":"Retractions","title":"Word of caution","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"The Lie group corresponding to the Stiefel manifold SO(N) has a bi-invariant Riemannian metric associated with it: (B_1B_2)mapsto mathrmTr(B_1^TB_2). For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult (see (Bendokat et al, 2021).)","category":"page"},{"location":"optimizers/manifold_related/retractions/#References","page":"Retractions","title":"References","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Absil P A, Mahony R, Sepulchre R. Optimization algorithms on matrix manifolds[M]. Princeton University Press, 2008.\nBendokat T, Zimmermann R. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications[J]. 
arXiv preprint arXiv:2108.12447, 2021.\nO'Neill, Barrett. Semi-Riemannian geometry with applications to relativity. Academic press, 1983.","category":"page"},{"location":"arrays/skew_symmetric_matrix/#SymmetricMatrix-and-SkewSymMatrix","page":"Symmetric and Skew-Symmetric Matrices","title":"SymmetricMatrix and SkewSymMatrix","text":"","category":"section"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"There are special implementations of symmetric and skew-symmetric matrices in GeometricMachineLearning.jl. They are implemented to work on GPU and for multiplication with tensors. The following image demonstrates how the data necessary for an instance of SkewSymMatrix are stored[1]:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"[1]: It works similarly for SymmetricMatrix. ","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"Main.include_graphics(\"../tikz/skew_sym_visualization\") # hide","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"So what is stored internally is a vector of size n(n-1)2 for the skew-symmetric matrix and a vector of size n(n+1)2 for the symmetric matrix. We can sample a random skew-symmetric matrix: ","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"using GeometricMachineLearning # hide\n\nA = rand(SkewSymMatrix, 5)","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"and then access the vector:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"A.S ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#Pullbacks-and-Automatic-Differentiation","page":"Pullbacks","title":"Pullbacks and Automatic Differentiation","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"Automatic Differentiation is an important part of modern machine learning libraries. It is essentially a tool to compute the gradient of a loss function with respect to its input arguments. ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#How-to-Compute-Pullbacks","page":"Pullbacks","title":"How to Compute Pullbacks","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"GeometricMachineLearning has many pullbacks for custom array types and other operations implemented. The need for this essentially comes from the fact that we cannot trivially differentiate custom GPU kernels at the moment[1].","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"[1]: This will change once we switch to Enzyme (see [9]), but the package is still in its infancy. 
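As a purely illustrative sketch of what implementing such a pullback can look like in practice (the formal definition is given below), one can supply a hand-written reverse rule following the ChainRulesCore conventions that Zygote consumes. The function `myscale` is a made-up example and not part of GeometricMachineLearning:

```julia
using ChainRulesCore

# a made-up operation for which we supply a hand-written pullback
myscale(a::Real, x::AbstractVector) = a .* x

function ChainRulesCore.rrule(::typeof(myscale), a::Real, x::AbstractVector)
    y = myscale(a, x)
    function myscale_pullback(dy)
        # sensitivities with respect to a and x
        da = sum(dy .* x)
        dx = a .* dy
        return NoTangent(), da, dx
    end
    return y, myscale_pullback
end
```

Zygote then uses this rule instead of attempting to differentiate through the body of `myscale`.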
","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#What-is-a-pullback?","page":"Pullbacks","title":"What is a pullback?","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"Here we first explain the principle of a pullback with the example of a vector-valued function. The generalization to matrices and higher-order tensors is straight-forward. ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"The pullback of a vector-valued function fmathbbR^ntomathbbR^m can be interpreted as the sensitivities in the input space mathbbR^n with respect to variations in the output space mathbbR^m via the function f: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"leftmathrmpullback(f)ainmathbbR^n dbinmathbbR^mright_i = sum_j=1^mfracpartialf_jpartiala_idb_j","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"This principle can easily be generalized to matrices. For this consider the function gmathbbR^n_1timesn_2tomathbbR^m_1timesm_2. For this case we have: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"leftmathrmpullback(g)AinmathbbR^n_1timesn_2 dBinmathbbR^m_1timesm_2right_(i_1 i_2) = sum_j_1=1^m_1sum_j_2=1^m_2fracpartialf_(j_1 j_2)partiala_(i_1 i_2)db_(j_1 j_2)","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"The generalization to higher-order tensors is again straight-forward.","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#Illustrative-example","page":"Pullbacks","title":"Illustrative example","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"Consider the matrix inverse mathrminv mathbbR^ntimesntomathbbR^ntimesn as an example. This fits into the above framework where inv is a matrix-valued function from mathbbR^ntimesn to mathbbR^ntimesn. We here write B = A^-1 = mathrminv(A). 
We thus have to compute: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"leftmathrmpullback(mathrminv)AinmathbbR^ntimesn dBinmathbbR^ntimesnright_(i j) = sum_k=1^nsum_ell=1^nfracpartialb_k ellpartiala_i jdb_k ell","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"For a matrix A that depends on a parameter varepsilon we have that: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"fracpartialpartialvarepsilonB = -Bleft( fracpartialpartialvarepsilon right) B","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"This can easily be checked: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"mathbbO = fracpartialpartialvarepsilonmathbbI = fracpartialpartialvarepsilon(AB) = AfracpartialpartialvarepsilonB + left(fracpartialpartialvarepsilonAright)B","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"We can then write: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"beginaligned\nsum_kellleft( fracpartialpartiala_ij b_kell right) db_kell = sum_kellleft fracpartialpartiala_ij B right_kell db_kell \n = - sum_kellleftB left(fracpartialpartiala_ij Aright) B right_kell db_kell \n = - sum_kellmnb_km left(fracpartiala_mnpartiala_ijright) b_nell db_kell \n = - sum_kellmnb_km delta_imdelta_jn b_nell db_kell \n = - sum_kellb_ki b_jell db_kell \n equiv - B^TcdotdBcdotB^T \nendaligned","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#Motivation-from-a-differential-geometric-perspective","page":"Pullbacks","title":"Motivation from a differential-geometric perspective","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"The notions of a pullback in automatic differentiation and differential geometry are closely related (see e.g. [10] and [11]). In both cases we want to compute, based on a mapping fmathcalVtomathcalW a mapsto f(a) = b, a map of differentials db mapsto da. In the differential geometry case db and da are part of the associated cotangent spaces, i.e. dbinT^*_bmathcalW and dainT^*_amathcalV; in AD we (mostly) deal with spaces of arrays, i.e. vector spaces, which means that dbinmathcalW and dainmathcalV.","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"M. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).\n\n\n\nJ. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. 
Advances in Neural Information Processing Systems 33, 10809–10819 (2020).\n\n\n\n","category":"page"},{"location":"reduced_order_modeling/autoencoder/#Reduced-Order-modeling-and-Autoencoders","page":"POD and Autoencoders","title":"Reduced Order modeling and Autoencoders","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"Reduced order modeling is a data-driven technique that exploits the structure of parametric PDEs to make solving those PDEs easier.","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"Consider a parametric PDE written in the form: F(z(mu)mu)=0 where z(mu) evolves on a infinite-dimensional Hilbert space V. ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"In modeling any PDE we have to choose a discretization (particle discretization, finite element method, ...) of V, which will be denoted by V_h. ","category":"page"},{"location":"reduced_order_modeling/autoencoder/#Solution-manifold","page":"POD and Autoencoders","title":"Solution manifold","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"To any parametric PDE we associate a solution manifold: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"mathcalM = z(mu)F(z(mu)mu)=0 muinmathbbP","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"(Image: )","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"In the image above a 2-dimensional solution manifold is visualized as a sub-manifold in 3-dimensional space. In general the embedding space is an infinite-dimensional function space.","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"As an example of this consider the 1-dimensional wave equation: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"partial_tt^2q(tximu) = mu^2partial_xixi^2q(tximu)text on ItimesOmega","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"where I = (01) and Omega=(-1212). As initial condition for the first derivative we have partial_tq(0ximu) = -mupartial_xiq_0(ximu) and furthermore q(tximu)=0 on the boundary (i.e. xiin-1212).","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"The solution manifold is a 1-dimensional submanifold: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"mathcalM = (t xi)mapstoq(tximu)=q_0(xi-mutmu)muinmathbbPsubsetmathbbR","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"If we provide an initial condition u_0, a parameter instance mu and a time t, then ximapstoq(tximu) will be the momentary solution. 
If we consider the time evolution of q(tximu), then it evolves on a two-dimensional submanifold barmathcalM = ximapstoq(tximu)tinImuinmathbbP.","category":"page"},{"location":"reduced_order_modeling/autoencoder/#General-workflow","page":"POD and Autoencoders","title":"General workflow","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"In reduced order modeling we aim to construct a mapping to a space that is close to this solution manifold. This is done through the following steps: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"Discretize the PDE.\nSolve the discretized PDE for a certain set of parameter instances muinmathbbP.\nBuild a reduced basis with the data obtained from having solved the discretized PDE. This step consists of finding two mappings: the reduction mathcalP and the reconstruction mathcalR.","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"The third step can be done with various machine learning (ML) techniques. Traditionally the most popular of these has been Proper orthogonal decomposition (POD), but in recent years autoencoders have also become a popular alternative (see (Fresca et al, 2021)). ","category":"page"},{"location":"reduced_order_modeling/autoencoder/#References","page":"POD and Autoencoders","title":"References","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).\n\n\n\n","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/#The-Existence-And-Uniqueness-Theorem","page":"Differential Equations and the EAU theorem","title":"The Existence-And-Uniqueness Theorem","text":"","category":"section"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"The existence-and-uniqueness theorem, also known as the Picard-Lindelöf theorem, Picard's existence theorem and the Cauchy-Lipschitz theorem gives a proof of the existence of solutions for ODEs. Here we state the existence-and-uniqueness theorem for manifolds as vector fields are just a special case of this. Its proof relies on the Banach fixed-point theorem[1].","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"[1]: It has to be noted that the proof given here is not entirely self-contained. The proof of the fundamental theorem of calculus, i.e. the proof of the existence of an antiderivative of a continuous function [4], is omitted for example. ","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"Main.theorem(raw\"Let ``X`` a vector field on the manifold ``\\mathcal{M}`` that is differentiable at ``x``. 
Then we can find an ``\\epsilon>0`` and a unique curve ``\\gamma:(-\\epsilon, \\epsilon)\\to\\mathcal{M}`` such that ``\\gamma'(t) = X(\\gamma(t))``.\"; name = \"Existence-And-Uniqueness Theorem\")","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"Main.proof(raw\"We consider a ball around a point ``x\\in\\mathcal{M}`` with radius ``r`` that we pick such that the ball ``B(x, r)`` fits into the ``U`` of some coordinate chart ``\\varphi_U``; we further use ``X`` and ``\\varphi'\\circ{}X\\circ\\varphi^{-1}`` interchangeably in this proof. We then define ``L := \\mathrm{sup}_{y,z\\in{}B(x,r)}|X(y) - X(z)|/|y - z|.`` Note that this ``L`` is always finite because ``X`` is bounded and differentiable. We now define the map ``\\Gamma: C^\\infty((-\\epsilon, \\epsilon), \\mathbb{R}^n)\\to{}C^\\infty((-\\epsilon, \\epsilon), \\mathbb{R}^n)`` (for some ``\\epsilon`` that we do not yet fix) as \n\" * \nMain.indentation * raw\"```math\n\" * \nMain.indentation * raw\"\\Gamma\\gamma(t) = x + \\int_0^tX(\\gamma(s))ds,\n\" * \nMain.indentation * raw\"```\n\" * \nMain.indentation * raw\"i.e. ``\\Gamma`` maps ``C^\\infty`` curves through ``x`` into ``C^\\infty`` curves through ``x``. We further have with the norm ``||\\gamma||_\\infty = \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}|\\gamma(t)|``:\n\" * \nMain.indentation * raw\"```math\n\" *\nMain.indentation * raw\"\\begin{aligned} \n\" * \nMain.indentation * raw\"||\\Gamma(\\gamma_1 - \\gamma_2)||_\\infty & = \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}\\left| \\int_0^t (X(\\gamma_1(s)) - X(\\gamma_2(s)))ds \\right| \\\\\n\" * \nMain.indentation * raw\"& \\leq \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}\\int_0^t | X(\\gamma_1(s)) - X(\\gamma_2(s)) | ds \\\\\n\" * \nMain.indentation * raw\"& \\leq \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}\\int_0^t L |\\gamma_1(s) - \\gamma_2(s)| ds \\\\\n\" * \nMain.indentation * raw\"& \\leq \\epsilon{}L \\cdot \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}|\\gamma_1(t) - \\gamma_2(t)|,\n\" * \nMain.indentation * raw\"\\end{aligned}\n\" * \nMain.indentation * raw\"```\n\" * \nMain.indentation * raw\"and we see that ``\\Gamma`` is a contractive mapping if we pick ``\\epsilon`` small enough and we can hence apply the fixed-point theorem. So there has to exist a ``C^\\infty`` curve through ``x`` that we call ``\\gamma^*`` such that \n\" * \nMain.indentation * raw\"```math\n\" * \nMain.indentation * raw\"\\gamma^*(t) = \\int_0^tX(\\gamma^*(s))ds,\n\" *\nMain.indentation * raw\"and this ``\\gamma^*`` is the curve we were looking for. Its uniqueness is guaranteed by the fixed-point theorem.\")","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"For all the problems we discuss here we can extend the integral curves of X from the finite interval (-epsilon epsilon) to all of mathbbR. 
The solution gamma we call an integral curve or flow of the vector field (ODE).","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/#Time-Dependent-Vector-Fields","page":"Differential Equations and the EAU theorem","title":"Time-Dependent Vector Fields","text":"","category":"section"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"We proved the theorem above for a time-independent vector field X, but it also holds for time-dependent vector fields, i.e. for mapping of the form: ","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"X 0TtimesmathcalMtoTM","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"The proof for this case proceeds analogously to the case of the time-independent vector field; to apply the proof we simply have to extend the vector field to (here written for a specific coordinate chart varphi_U): ","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"barX 0 TtimesmathbbR^ntomathbbR^n+1 (t x_1 ldots x_n) mapsto (1 X(x_1 ldots x_n))","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"More details on this can be found in e.g. [2]. For GeometricMachineLearning time-dependent vector fields are important because many of the optimizers we are using (such as the Adam optimizer) can be seen as approximating the flow of a time-dependent vector field.","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/#Reference","page":"Differential Equations and the EAU theorem","title":"Reference","text":"","category":"section"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).\n\n\n\nS. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).\n\n\n\n","category":"page"},{"location":"tutorials/grassmann_layer/#Example-of-a-Neural-Network-with-a-Grassmann-Layer","page":"Grassmann manifold","title":"Example of a Neural Network with a Grassmann Layer","text":"","category":"section"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"Here we show how to implement a neural network that contains a layer whose weight is an element of the Grassmann manifold and where this might be useful. 
","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"To answer where we would need this consider the following scenario","category":"page"},{"location":"tutorials/grassmann_layer/#Problem-statement","page":"Grassmann manifold","title":"Problem statement","text":"","category":"section"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"We are given data in a big space mathcalD=d_i_iinmathcalIsubsetmathbbR^N and know these data live on an n-dimensional[1] submanifold[2] in mathbbR^N. Based on these data we would now like to generate new samples from the distributions that produced our original data. This is where the Grassmann manifold is useful: each element V of the Grassmann manifold is an n-dimensional subspace of mathbbR^N from which we can easily sample. We can then construct a (bijective) mapping from this space V onto a space that contains our data points mathcalD. ","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"[1]: We may know n exactly or approximately. ","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"[2]: Problems and solutions related to this scenario are commonly summarized under the term manifold learning (see [37]).","category":"page"},{"location":"tutorials/grassmann_layer/#Example","page":"Grassmann manifold","title":"Example","text":"","category":"section"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"Consider the following toy example: We want to sample from the graph of the (scaled) Rosenbrock function f(xy) = ((1 - x)^2 + 100(y - x^2)^2)1000 while pretending we do not know the function. ","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"using Plots # hide\n# hide\nrosenbrock(x::Vector) = ((1.0 - x[1]) ^ 2 + 100.0 * (x[2] - x[1] ^ 2) ^ 2) / 1000\nx, y = -1.5:0.1:1.5, -1.5:0.1:1.5\nz = Surface((x,y)->rosenbrock([x,y]), x, y)\np = surface(x,y,z; camera=(30,20), alpha=.6, colorbar=false, xlims=(-1.5, 1.5), ylims=(-1.5, 1.5), zlims=(0.0, rosenbrock([-1.5, -1.5])))","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"We now build a neural network whose task it is to map a product of two Gaussians mathcalN(01)timesmathcalN(01) onto the graph of the Rosenbrock function where the range for x and for y is -1515.","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"For computing the loss between the two distributions, i.e. 
Psi(mathcalN(01)timesmathcalN(01)) and f(-1515 -1515) we use the Wasserstein distance[3].","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"[3]: The implementation of the Wasserstein distance is taken from [38].","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"using GeometricMachineLearning, Zygote, BrenierTwoFluid\nusing LinearAlgebra: norm # hide\nimport Random # hide \nRandom.seed!(123)\n\nmodel = Chain(GrassmannLayer(2,3), Dense(3, 8, tanh), Dense(8, 3, identity))\n\nnn = NeuralNetwork(model, CPU(), Float64)\n\n# this computes the cost that is associated to the Wasserstein distance\nc = (x,y) -> .5 * norm(x - y)^2\n∇c = (x,y) -> x - y\n\nconst ε = 0.1 # entropic regularization. √ε is a length. # hide\nconst q = 1.0 # annealing parameter # hide\nconst Δ = 1.0 # characteristic domain size # hide\nconst s = ε # current scale: no annealing -> equals ε # hide\nconst tol = 1e-6 # marginal condition tolerance # hide \nconst crit_it = 20 # acceleration inference # hide\nconst p_η = 2\n\nfunction compute_wasserstein_gradient(ensemble1::AT, ensemble2::AT) where AT<:AbstractArray\n number_of_particles1 = size(ensemble1, 2)\n number_of_particles2 = size(ensemble2, 2)\n V = SinkhornVariable(copy(ensemble1'), ones(number_of_particles1) / number_of_particles1)\n W = SinkhornVariable(copy(ensemble2'), ones(number_of_particles2) / number_of_particles2)\n params = SinkhornParameters(; ε=ε,q=1.0,Δ=1.0,s=s,tol=tol,crit_it=crit_it,p_η=p_η,sym=false,acc=true) # hide\n S = SinkhornDivergence(V, W, c, params; islog = true)\n initialize_potentials!(S)\n compute!(S)\n value(S), x_gradient!(S, ∇c)'\nend\n\nxyz_points = hcat([[x,y,rosenbrock([x,y])] for x in x for y in y]...)\n\nfunction compute_gradient(ps::Tuple)\n samples = randn(2, size(xyz_points, 2))\n\n estimate, nn_pullback = Zygote.pullback(ps -> model(samples, ps), ps)\n\n valS, wasserstein_gradient = compute_wasserstein_gradient(estimate, xyz_points)\n valS, nn_pullback(wasserstein_gradient)[1]\nend\n\n# note the very high value for the learning rate\noptimizer = Optimizer(nn, AdamOptimizer(1e-1))\n\n# note the small number of training steps\nconst training_steps = 40\nloss_array = zeros(training_steps)\nfor i in 1:training_steps\n val, dp = compute_gradient(nn.params)\n loss_array[i] = val\n optimization_step!(optimizer, model, nn.params, dp)\nend\nplot(loss_array, xlabel=\"training step\", label=\"loss\")","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"Now we plot a few points to check how well they match the graph:","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"const number_of_points = 35\n\ncoordinates = nn(randn(2, number_of_points))\nscatter3d!(p, [coordinates[1, :]], [coordinates[2, :]], [coordinates[3, :]], alpha=.5, color=4, label=\"mapped points\")","category":"page"},{"location":"architectures/linear_symplectic_transformer/#Linear-Symplectic-Transformer","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"","category":"section"},{"location":"architectures/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"The linear symplectic transformer consists of a combination of linear symplectic attention and gradient layers and is visualized 
below: ","category":"page"},{"location":"architectures/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"Main.include_graphics(\"../tikz/linear_symplectic_transformer\"; caption = raw\"Visualization of the linear symplectic transformer architecutre. \\texttt{n\\_sympnet} refers to the number of SympNet layers (\\texttt{n\\_sympnet=2} in this figure) and \\texttt{L} refers to the number of transformer blocks (\\texttt{L=1} in this figure).\", width = .3) # hide","category":"page"},{"location":"architectures/linear_symplectic_transformer/#Library-Functions","page":"Linear Symplectic Transformer","title":"Library Functions","text":"","category":"section"},{"location":"architectures/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"LinearSymplecticTransformer","category":"page"},{"location":"architectures/linear_symplectic_transformer/#GeometricMachineLearning.LinearSymplecticTransformer-architectures-linear_symplectic_transformer","page":"Linear Symplectic Transformer","title":"GeometricMachineLearning.LinearSymplecticTransformer","text":"Realizes the linear Symplectic Transformer.\n\nConstructor:\n\nThe constructor is called with the following arguments\n\ndim::Int: System dimension \nseq_length::Int: Number of time steps that the transformer considers. \n\nOptional keyword arguments:\n\nn_sympnet::Int=2: The number of sympnet layers in the transformer.\nupscaling_dimension::Int=2*dim: The upscaling that is done by the gradient layer. \nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function for the SympNet layers. \ninit_upper::Bool=true: Specifies if the first layer is a Q-type layer (init_upper=true) or if it is a P-type layer (init_upper=false).\n\n\n\n\n\n","category":"type"},{"location":"layers/volume_preserving_feedforward/#Volume-Preserving-Feedforward-Layer","page":"Volume-Preserving Layers","title":"Volume-Preserving Feedforward Layer","text":"","category":"section"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"Volume preserving feedforward layers are a special type of ResNet layer for which we restrict the weight matrices to be of a particular form. I.e. each layer computes: ","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"x mapsto x + sigma(Ax + b)","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"where sigma is a nonlinearity, A is the weight and b is the bias. The matrix A is either a lower-triangular matrix L or an upper-triangular matrix U[1]. 
The lower triangular matrix is of the form (the upper-triangular layer is simply the transpose of the lower triangular): ","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"[1]: Implemented as LowerTriangular and UpperTriangular in GeometricMachineLearning.","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"L = beginpmatrix\n 0 0 cdots 0 \n a_21 ddots vdots \n vdots ddots ddots vdots \n a_n1 cdots a_n(n-1) 0 \nendpmatrix","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"The Jacobian of a layer of the above form then is of the form","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"J = beginpmatrix\n 1 0 cdots 0 \n b_21 ddots vdots \n vdots ddots ddots vdots \n b_n1 cdots b_n(n-1) 1 \nendpmatrix","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"and the determinant of J is 1, i.e. the map is volume-preserving. ","category":"page"},{"location":"layers/volume_preserving_feedforward/#Library-Functions","page":"Volume-Preserving Layers","title":"Library Functions","text":"","category":"section"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"VolumePreservingFeedForwardLayer","category":"page"},{"location":"layers/volume_preserving_feedforward/#GeometricMachineLearning.VolumePreservingFeedForwardLayer-layers-volume_preserving_feedforward","page":"Volume-Preserving Layers","title":"GeometricMachineLearning.VolumePreservingFeedForwardLayer","text":"Super-type of VolumePreservingLowerLayer and VolumePreservingUpperLayer. The layers do the following: \n\nx mapsto begincases sigma(Lx + b) textwhere L is mathttLowerTriangular sigma(Ux + b) textwhere U is mathttUpperTriangular endcases\n\nThe functor can be applied to a vecotr, a matrix or a tensor. \n\nConstructor\n\nThe constructors are called with:\n\nsys_dim::Int: the system dimension. \nactivation=tanh: the activation function. \ninclude_bias::Bool=true (keyword argument): specifies whether a bias should be used. \n\n\n\n\n\n","category":"type"},{"location":"architectures/sympnet/#SympNet-Architecture","page":"SympNet","title":"SympNet Architecture","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"This document discusses the SympNet architecture and its implementation in GeometricMachineLearning.jl.","category":"page"},{"location":"architectures/sympnet/#Quick-overview-of-the-theory-of-SympNets","page":"SympNet","title":"Quick overview of the theory of SympNets","text":"","category":"section"},{"location":"architectures/sympnet/#Principle","page":"SympNet","title":"Principle","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"SympNets (see [29] for the eponymous paper) are a type of neural network that can model the trajectory of a Hamiltonian system in phase space. 
Take (q^Tp^T)^T=(q_1ldotsq_dp_1ldotsp_d)^Tin mathbbR^2d as the coordinates in phase space, where q=(q_1 ldots q_d)^Tin mathbbR^d is refered to as the position and p=(p_1 ldots p_d)^Tin mathbbR^d the momentum. Given a point (q^Tp^T)^T in mathbbR^2d the SympNet aims to compute the next position ((q)^T(p)^T)^T and thus predicts the trajectory while preserving the symplectic structure of the system. SympNets are enforcing symplecticity strongly, meaning that this property is hard-coded into the network architecture. The layers are reminiscent of traditional neural network feedforward layers, but have a strong restriction imposed on them in order to be symplectic.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"SympNets can be viewed as a \"symplectic integrator\" (see [7] and [22]). Their goal is to predict, based on an initial condition ((q^(0))^T(p^(0))^T)^T, a sequence of points in phase space that fit the training data as well as possible:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"beginpmatrix q^(0) p^(0) endpmatrix cdots beginpmatrix tildeq^(1) tildep^(1) endpmatrix cdots beginpmatrix tildeq^(n) tildep^(n) endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The tilde in the above equation indicates predicted data. The time step between predictions is not a parameter we can choose but is related to the temporal frequency of the training data. This means that if data is recorded in an interval of e.g. 0.1 seconds, then this will be the time step of our integrator.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" Main.include_graphics(\"../tikz/sympnet_architecture\"; # hide\n label = \"fig:SympNetArchitecture\", # hide\n caption = \"Visualization of the SympNet architecture\" # hide\n ) # hide","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"There are two types of SympNet architectures: LA-SympNets and G-SympNets. ","category":"page"},{"location":"architectures/sympnet/#LA-SympNet","page":"SympNet","title":"LA-SympNet","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The first type of SympNets, LA-SympNets, are obtained from composing two types of layers: symplectic linear layers and symplectic activation layers. For a given integer n, a symplectic linear layer is defined by","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"mathcalL^nq\nbeginpmatrix\n q \n p \nendpmatrix\n = \nbeginpmatrix \n I S^n0 \n 0S^n I \nendpmatrix\n cdots \nbeginpmatrix \n I 0 \n S^2 I \nendpmatrix\nbeginpmatrix \n I S^1 \n 0 I \nendpmatrix\nbeginpmatrix\n q \n p \nendpmatrix\n+ b ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"or ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"mathcalL^np\nbeginpmatrix q \n p endpmatrix = \n beginpmatrix \n I 0S^n \n S^n0 I\n endpmatrix cdots \n beginpmatrix \n I S^2 \n 0 I\n endpmatrix\n beginpmatrix \n I 0 \n S^1 I\n endpmatrix\n beginpmatrix q \n p endpmatrix\n + b ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The superscripts q and p indicate whether the q or the p part is changed. 
The learnable parameters are the symmetric matrices S^iinmathbbR^dtimes d and the bias binmathbbR^2d. The integer n is the width of the symplectic linear layer. It can be shown that five of these layers, i.e. ngeq5, can represent any linear symplectic map (see [30]), so n need not be larger than five. We denote the set of symplectic linear layers by mathcalM^L.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The second type of layer needed for LA-SympNets are so-called activation layers:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalA^q beginpmatrix q \n p endpmatrix = \n beginbmatrix \n Ihatsigma^a \n 0I\n endbmatrix beginpmatrix q \n p endpmatrix =\n beginpmatrix \n mathrmdiag(a)sigma(p)+q \n p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"and","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalA^p beginpmatrix q \n p endpmatrix = \n beginbmatrix \n I0 \n hatsigma^aI\n endbmatrix beginpmatrix q \n p endpmatrix\n =\n beginpmatrix \n q \n mathrmdiag(a)sigma(q)+p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The activation function sigma can be any nonlinearity (on which minor restrictions are imposed below). Here the scaling vector ainmathbbR^d constitutes the learnable weights. We denote the set of symplectic activation layers by mathcalM^A. ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"An LA-SympNet is a function of the form Psi=l_k circ a_k circ l_k-1 circ cdots circ a_1 circ l_0 where (l_i)_0leq ileq k subset (mathcalM^L)^k+1 and (a_i)_1leq ileq k subset (mathcalM^A)^k. We will refer to k as the number of hidden layers of the SympNet[1] and the number n above as the depth of the linear layer.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"[1]: Note that if k=1 then the LA-SympNet consists of only one linear layer.","category":"page"},{"location":"architectures/sympnet/#G-SympNets","page":"SympNet","title":"G-SympNets","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"G-SympNets are an alternative to LA-SympNets. They are built with only one kind of layer, called gradient layer. 
For a given activation function sigma and an integer ngeq d, a gradient layers is a symplectic map from mathbbR^2d to mathbbR^2d defined by","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalG^up beginpmatrix q \n p endpmatrix = \n beginbmatrix \n Ihatsigma^Kab \n 0I\n endbmatrix beginpmatrix q \n p endpmatrix =\n beginpmatrix \n K^T mathrmdiag(a)sigma(Kp+b)+q \n p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"or","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalG^low beginpmatrix q \n p endpmatrix = \n beginbmatrix \n I0 \n hatsigma^KabI\n endbmatrix beginpmatrix q \n p endpmatrix\n =\n beginpmatrix \n q \n K^T mathrmdiag(a)sigma(Kq+b)+p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The parameters of this layer are the scaling matrix KinmathbbR^mtimes d, the bias binmathbbR^m and the scaling vector ainmathbbR^m. The name \"gradient layer\" has its origin in the fact that the expression K^Tmathrmdiag(a)sigma(Kq+b)_i = sum_jk_jia_jsigma(sum_ellk_jellq_ell+b_j) is the gradient of a function sum_ja_jtildesigma(sum_ellk_jellq_ell+b_j), where tildesigma is the antiderivative of sigma. The first dimension of K we refer to as the upscaling dimension.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"If we denote by mathcalM^G the set of gradient layers, a G-SympNet is a function of the form Psi=g_k circ g_k-1 circ cdots circ g_0 where (g_i)_0leq ileq k subset (mathcalM^G)^k. The index k is again the number of hidden layers.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Further note here the different roles played by round and square brackets: the latter indicates a nonlinear operation as opposed to a regular vector or matrix. ","category":"page"},{"location":"architectures/sympnet/#Universal-approximation-theorems","page":"SympNet","title":"Universal approximation theorems","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"In order to state the universal approximation theorem for both architectures we first need a few definitions:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Let U be an open set of mathbbR^2d, and let us denote by mathcalSP^r(U) the set of C^r smooth symplectic maps on U. We now define a topology on C^r(K mathbbR^n), the set of C^r-smooth maps from a compact set KsubsetmathbbR^n to mathbbR^n through the norm","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"f_C^r(KmathbbR^n) = undersetalphaleq rsum underset1leq i leq nmaxundersetxin Ksup D^alpha f_i(x)","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"where the differential operator D^alpha is defined by ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"D^alpha f = fracpartial^alpha fpartial x_1^alpha_1x_n^alpha_n","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"with alpha = alpha_1 ++ alpha_n. 
","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Definition sigma is r-finite if sigmain C^r(mathbbRmathbbR) and int D^rsigma(x)dx +infty.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Definition Let mnrin mathbbN with mn0 be given, U an open set of mathbbR^m, and IJsubset C^r(UmathbbR^n). We say J is r-uniformly dense on compacta in I if J subset I and for any fin I, epsilon0, and any compact Ksubset U, there exists gin J such that f-g_C^r(KmathbbR^n) epsilon.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"We can now state the universal approximation theorems:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Theorem (Approximation theorem for LA-SympNet) For any positive integer r0 and open set Uin mathbbR^2d, the set of LA-SympNet is r-uniformly dense on compacta in SP^r(U) if the activation function sigma is r-finite.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Theorem (Approximation theorem for G-SympNet) For any positive integer r0 and open set Uin mathbbR^2d, the set of G-SympNet is r-uniformly dense on compacta in SP^r(U) if the activation function sigma is r-finite.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"There are many r-finite activation functions commonly used in neural networks, for example:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"sigmoid sigma(x)=frac11+e^-x for any positive integer r, \ntanh tanh(x)=frace^x-e^-xe^x+e^-x for any positive integer r. ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on mathbbR^2d. But this does not tell us anything about how to optimize the network. This is can be done with any common neural network optimizer and these neural network optimizers always rely on a corresponding loss function. ","category":"page"},{"location":"architectures/sympnet/#Loss-function","page":"SympNet","title":"Loss function","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"To train the SympNet, one need data along a trajectory such that the model is trained to perform an integration. These data are (QP) where Qij (respectively Pij) is the real number q_j(t_i) (respectively pij) which is the j-th coordinates of the generalized position (respectively momentum) at the i-th time step. One also need a loss function defined as :","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Loss(QP) = undersetisum d(Phi(Qi-Pi-) Qi- Pi-^T)","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"where d is a distance on mathbbR^d.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"See the tutorial section for an introduction into using SympNets with GeometricMachineLearning.jl.","category":"page"},{"location":"architectures/sympnet/#References","page":"SympNet","title":"References","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"P. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. 
Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).\n\n\n\n","category":"page"}] +[{"location":"architectures/neural_network_integrators/#Neural-Network-Integrators","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"In GeometricMachineLearning we can divide most neural network architectures (that are used for applications to physical systems) into two categories: autoencoders and integrators. Integrator in its most general form refers to an approximation of the flow of an ODE (see the section on the existence and uniqueness theorem) by a numerical scheme. Traditionally these numerical schemes were constructed by defining certain relationships between a known time step z^(t) and a future unknown one z^(t+1) [7, 24]: ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":" f(z^(t) z^(t+1)) = 0","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"One usually refers to such a relationship as an \"integration scheme\". If this relationship can be reformulated as ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":" z^(t+1) = g(z^(t))","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"then we refer to the scheme as explicit, if it cannot be reformulated in such a way then we refer to it as implicit. Implicit schemes are typically more expensive to solve than explicit ones. The Julia library GeometricIntegrators [25] offers a wide variety of integration schemes both implicit and explicit. ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"The neural network integrators in GeometricMachineLearning (the corresponding type is NeuralNetworkIntegrator) are all explicit integration schemes where the function g above is modeled with a neural network.","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"Neural networks, as an alternative to traditional methods, are employed because of (i) potentially superior performance and (ii) an ability to learn unknown dynamics from data. 
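As a minimal illustration of an explicit scheme of the form z^(t+1) = g(z^(t)), consider an explicit Euler step for the harmonic oscillator; this is a plain numerical sketch (with illustrative names) and not an interface of GeometricMachineLearning:

```julia
# explicit Euler for ż = f(z) with z = (q, p) and Hamiltonian H(q, p) = (q² + p²)/2
f(z) = [z[2], -z[1]]     # Hamiltonian vector field of the harmonic oscillator
g(z, h) = z + h * f(z)   # the explicit update map z ↦ z + h f(z)

function integrate(z₀, h, n_steps)
    trajectory = [z₀]
    for _ in 1:n_steps
        push!(trajectory, g(trajectory[end], h))
    end
    trajectory
end

trajectory = integrate([1.0, 0.0], 0.1, 100)
```

A neural network integrator replaces the hand-derived map g by a learned one while keeping this explicit iteration structure.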
","category":"page"},{"location":"architectures/neural_network_integrators/#Multi-step-methods","page":"Neural Network Integrators","title":"Multi-step methods","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"Multi-step method [21, 22] refers to schemes that are of the form[1]: ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"[1]: We again assume that all the steps up to and including t are known.","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":" f(z^(t - mathttsl + 1) z^(t - mathttsl + 2) ldots z^(t) z^(t + 1) ldots z^(mathttpw + 1)) = 0","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"where sl is short for sequence length and pw is short for prediction window. In contrast to traditional single-step methods, sl and pw can be greater than 1. An explicit multi-step method has the following form: ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"z^(t+1) ldots z^(t+mathttpw) = g(z^(t - mathttsl + 1) ldots z^(t))","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"There are essentially two ways to construct multi-step methods with neural networks: the older one is using recurrent neural networks such as long short-term memory cells (LSTMs, [26]) and the newer one is using transformer neural networks [17]. Both of these approaches have been successfully employed to learn multi-step methods (see [27, 28] for the former and [23, 29, 30] for the latter), but because the transformer architecture exhibits superior performance on modern hardware and can be imbued with geometric properties it is recommended to always use a transformer-derived architecture when dealing with time series[2].","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"[2]: GeometricMachineLearning also has an LSTM implementation, but this may be deprecated in the future. ","category":"page"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"Explicit multi-step methods derived from he transformer are always subtypes of the type TransformerIntegrator in GeometricMachineLearning. In GeometricMachineLearning the standard transformer, the volume-preserving transformer and the linear symplectic transformer are implemented. 
","category":"page"},{"location":"architectures/neural_network_integrators/#Library-Functions","page":"Neural Network Integrators","title":"Library Functions","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"NeuralNetworkIntegrator \nTransformerIntegrator","category":"page"},{"location":"architectures/neural_network_integrators/#GeometricMachineLearning.NeuralNetworkIntegrator-architectures-neural_network_integrators","page":"Neural Network Integrators","title":"GeometricMachineLearning.NeuralNetworkIntegrator","text":"This is a super type of various neural network architectures such as SympNet and ResNet whose purpose is to approximate the flow of an ordinary differential equation (ODE).\n\n\n\n\n\n","category":"type"},{"location":"architectures/neural_network_integrators/#GeometricMachineLearning.TransformerIntegrator-architectures-neural_network_integrators","page":"Neural Network Integrators","title":"GeometricMachineLearning.TransformerIntegrator","text":"Encompasses various transformer architectures, such as the structure-preserving transformer and the linear symplectic transformer. \n\n\n\n\n\n","category":"type"},{"location":"architectures/neural_network_integrators/#References","page":"Neural Network Integrators","title":"References","text":"","category":"section"},{"location":"architectures/neural_network_integrators/","page":"Neural Network Integrators","title":"Neural Network Integrators","text":"E. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).\n\n\n\nB. Leimkuhler and S. Reich. Simulating hamiltonian dynamics. No. 14 (Cambridge university press, 2004).\n\n\n\nM. Kraus. GeometricIntegrators.jl: Geometric Numerical Integration in Julia, https://github.com/JuliaGNI/GeometricIntegrators.jl (2020).\n\n\n\nK. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).\n\n\n\nS. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).\n\n\n\nA. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).\n\n\n\nS. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).\n\n\n\nK. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).\n\n\n\nA. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).\n\n\n\nA. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, beta-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).\n\n\n\nB. Brantner, G. de Romemont, M. Kraus and Z. Li. 
Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312.11166v2 (2024).\n\n\n\n","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/#Projection-and-Reduction-Errors-of-Reduced-Models","page":"Projection and Reduction Error","title":"Projection and Reduction Errors of Reduced Models","text":"","category":"section"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"Two errors of great importance in reduced order modeling are the projection error and the reduction error. During training one typically aims to minimize the projection error, but for the actual application of the model the reduction error is often more important. ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/#Projection-Error","page":"Projection and Reduction Error","title":"Projection Error","text":"","category":"section"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"The projection error computes how well a reduced basis, represented by the reduction mathcalP and the reconstruction mathcalR, can represent the data with which it is built. In mathematical terms: ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"e_mathrmproj(mu) = \n frac mathcalRcircmathcalP(M) - M M ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"where cdot is the Frobenius norm (one could also optimize for different norms).","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/#Reduction-Error","page":"Projection and Reduction Error","title":"Reduction Error","text":"","category":"section"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"The reduction error measures how far the reduced system diverges from the full-order system during integration (online stage). In mathematical terms (and for a single initial condition): ","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"e_mathrmred(mu) = sqrt\n fracsum_t=0^K mathbfx^(t)(mu) - mathcalR(mathbfx^(t)_r(mu)) ^2sum_t=0^K mathbfx^(t)(mu) ^2\n","category":"page"},{"location":"reduced_order_modeling/projection_reduction_errors/","page":"Projection and Reduction Error","title":"Projection and Reduction Error","text":"where mathbfx^(t) is the solution of the FOM at point t and mathbfx^(t)_r is the solution of the ROM (in the reduced basis) at point t. 
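The two error measures can be sketched in a few lines of plain Julia. The helper names projection_error and reduction_error below are illustrative only and assume a linear (POD-like) reduction and reconstruction; GeometricMachineLearning provides its own functionality for this.

```julia
using LinearAlgebra

# Projection error of a linear reduced basis: ‖R∘P(M) - M‖ / ‖M‖ in the Frobenius
# norm, here with P(M) = Vᵣᵀ M and R(Mᵣ) = Vᵣ Mᵣ for a POD basis Vᵣ built from M.
function projection_error(M::AbstractMatrix, n::Integer)
    Vᵣ = svd(M).U[:, 1:n]                     # reduced basis built from the data
    norm(Vᵣ * (Vᵣ' * M) - M) / norm(M)        # norm of a matrix defaults to the Frobenius norm
end

# Reduction error for a single parameter: the columns of X_fom are the FOM solutions
# and the columns of X_rec are the reconstructed ROM solutions, for t = 0, ..., K.
reduction_error(X_fom, X_rec) = sqrt(sum(abs2, X_fom - X_rec) / sum(abs2, X_fom))

M = rand(100, 50)            # toy snapshot matrix
projection_error(M, 10)
```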
The reduction error, as opposed to the projection error, not only measures how well the solution manifold is represented by the reduced basis, but also measures how well the FOM dynamics are approximated by the ROM dynamics (via the induced vector field on the reduced basis).","category":"page"},{"location":"architectures/autoencoders/#Variational-Autoencoders","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Variational autoencoders (Lee and Carlberg, 2020) train on the following set: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"mathcalX(mathbbP_mathrmtrain) = mathbfx^k(mu) - mathbfx^0(mu)0leqkleqKmuinmathbbP_mathrmtrain","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathbfx^k(mu)approxmathbfx(t^kmu). Note that mathbf0inmathcalX(mathbbP_mathrmtrain) as k can also be zero. ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"The encoder Psi^mathrmenc and decoder Psi^mathrmdec are then trained on this set mathcalX(mathbbP_mathrmtrain) by minimizing the reconstruction error: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":" mathbfx - Psi^mathrmdeccircPsi^mathrmenc(mathbfx) text for mathbfxinmathcalX(mathbbP_mathrmtrain)","category":"page"},{"location":"architectures/autoencoders/#Initial-condition","page":"Variational Autoencoders","title":"Initial condition","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"No matter the parameter mu the initial condition in the reduced system is always mathbfx_r0(mu) = mathbfx_r0 = Psi^mathrmenc(mathbf0). ","category":"page"},{"location":"architectures/autoencoders/#Reconstructed-solution","page":"Variational Autoencoders","title":"Reconstructed solution","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"In order to arrive at the reconstructed solution one first has to decode the reduced state and then add the reference state:","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"mathbfx^mathrmreconstr(tmu) = mathbfx^mathrmref(mu) + Psi^mathrmdec(mathbfx_r(tmu))","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathbfx^mathrmref(mu) = mathbfx(t_0mu) - Psi^mathrmdeccircPsi^mathrmdec(mathbf0).","category":"page"},{"location":"architectures/autoencoders/#Symplectic-reduced-vector-field","page":"Variational Autoencoders","title":"Symplectic reduced vector field","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"A symplectic vector field is one whose flow conserves the symplectic structure mathbbJ. This is equivalent[1] to there existing a Hamiltonian H s.t. 
the vector field X can be written as X = mathbbJnablaH.","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"[1]: Technically speaking the definitions are equivalent only for simply-connected manifolds, so also for vector spaces. ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"If the full-order Hamiltonian is H^mathrmfullequivH we can obtain another Hamiltonian on the reduces space by simply setting: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"H^mathrmred(mathbfx_r(tmu)) = H(mathbfx^mathrmreconstr(tmu)) = H(mathbfx^mathrmref(mu) + Psi^mathrmdec(mathbfx_r(tmu)))","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"The ODE associated to this Hamiltonian is also the one corresponding to Manifold Galerkin ROM (see (Lee and Carlberg, 2020)).","category":"page"},{"location":"architectures/autoencoders/#Manifold-Galerkin-ROM","page":"Variational Autoencoders","title":"Manifold Galerkin ROM","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Define the FOM ODE residual as: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"r (mathbfv xi tau mu) mapsto mathbfv - f(xi tau mu)","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"The reduced ODE is then defined to be: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"dothatmathbfx(tmu) = mathrmargmin_hatmathbfvinmathbbR^p r(mathcalJ(hatmathbfx(tmu))hatmathbfvhatmathbfx^mathrmref(mu) + Psi^mathrmdec(hatmathbfx(tmu))tmu) _2^2","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathcalJ is the Jacobian of the decoder Psi^mathrmdec. This leads to: ","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"mathcalJ(hatmathbfx(tmu))hatmathbfv - f(hatmathbfx^mathrmref(mu) + Psi^mathrmdec(hatmathbfx(tmu)) t mu) overset= 0 implies \nhatmathbfv = mathcalJ(hatmathbfx(tmu))^+f(hatmathbfx^mathrmref(mu) + Psi^mathrmdec(hatmathbfx(tmu)) t mu)","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"where mathcalJ(hatmathbfx(tmu))^+ is the pseudoinverse of mathcalJ(hatmathbfx(tmu)). Because mathcalJ(hatmathbfx(tmu)) is a symplectic matrix the pseudoinverse is the symplectic inverse (see (Peng and Mohseni, 2016)).","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Furthermore, because f is Hamiltonian, the vector field describing dothatmathbfx(tmu) will also be Hamiltonian. 
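As a small illustration of the Manifold Galerkin reduced vector field, the following sketch uses a made-up decoder and full-order vector field and computes the reduced velocity with the ordinary Moore-Penrose pseudoinverse; for a symplectic decoder one would use the symplectic inverse instead, as noted above. ForwardDiff is assumed here purely to obtain the Jacobian of the decoder.

```julia
using LinearAlgebra
import ForwardDiff   # assumed here only to obtain the Jacobian of the decoder

# Sketch of the Manifold Galerkin reduced vector field
#   dx̂/dt = Jdec(x̂)⁺ f(x_ref + Ψdec(x̂), t)
# with a toy decoder and a toy full-order vector field.
Ψdec(x̂) = [x̂; x̂ .^ 2 ./ 2]          # toy decoder ℝ² → ℝ⁴
f(x, t) = -x                         # toy full-order vector field
x_ref   = zeros(4)                   # reference state in the full space

function reduced_vector_field(x̂, t)
    Jdec = ForwardDiff.jacobian(Ψdec, x̂)       # Jacobian of the decoder at x̂
    pinv(Jdec) * f(x_ref + Ψdec(x̂), t)         # Moore-Penrose pseudoinverse
end

reduced_vector_field([0.1, 0.2], 0.0)
```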
","category":"page"},{"location":"architectures/autoencoders/#References","page":"Variational Autoencoders","title":"References","text":"","category":"section"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"K. Lee and K. Carlberg. “Model reduction of dynamical systems on nonlinear manifolds using","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"deep convolutional autoencoders”. In: Journal of Computational Physics 404 (2020), p. 108973.","category":"page"},{"location":"architectures/autoencoders/","page":"Variational Autoencoders","title":"Variational Autoencoders","text":"Peng L, Mohseni K. Symplectic model reduction of Hamiltonian systems[J]. SIAM Journal on Scientific Computing, 2016, 38(1): A1-A27.","category":"page"},{"location":"tutorials/mnist_tutorial/#MNIST-tutorial","page":"MNIST","title":"MNIST tutorial","text":"","category":"section"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"This is a short tutorial that shows how we can use GeometricMachineLearning to build a vision transformer and apply it for MNIST, while also putting some of the weights on a manifold. This is also the result presented in [38].","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"First, we need to import the relevant packages: ","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"using GeometricMachineLearning, CUDA, Plots\nimport Zygote, MLDatasets, KernelAbstractions","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"For the AD routine we here use the GeometricMachineLearning default and we get the dataset from MLDatasets. First we need to load the data set, and put it on GPU (if you have one):","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"train_x, train_y = MLDatasets.MNIST(split=:train)[:]\ntest_x, test_y = MLDatasets.MNIST(split=:test)[:]\ntrain_x = train_x |> cu \ntest_x = test_x |> cu \ntrain_y = train_y |> cu \ntest_y = test_y |> cu","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"GeometricMachineLearning has built-in data loaders that make it particularly easy to handle data: ","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"patch_length = 7\ndl = DataLoader(train_x, train_y, patch_length=patch_length)\ndl_test = DataLoader(train_x, train_y, patch_length=patch_length)","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"Here patch_length indicates the size one patch has. One image in MNIST is of dimension 28times28, this means that we decompose this into 16 (7times7) images (also see [38]).","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"We next define the model with which we want to train:","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"model = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=true)","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"Here we have chosen a ClassificationTransformer, i.e. a composition of a specific number of transformer layers composed with a classification layer. 
We also set the Stiefel option to true, i.e. we are optimizing on the Stiefel manifold.","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"We now have to initialize the neural network weights. This is done with the constructor for NeuralNetwork:","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"backend = KernelAbstractions.get_backend(dl)\nT = eltype(dl)\nnn = NeuralNetwork(model, backend, T)","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"And with this we can finally perform the training:","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"# an instance of batch is needed for the optimizer\nbatch = Batch(batch_size)\n\noptimizer_instance = Optimizer(AdamOptimizer(), nn)\n\n# this prints the accuracy and is optional\nprintln(\"initial test accuracy: \", accuracy(Ψᵉ, ps, dl_test), \"\\n\")\n\nloss_array = optimizer_instance(nn, dl, batch, n_epochs)\n\nprintln(\"final test accuracy: \", accuracy(Ψᵉ, ps, dl_test), \"\\n\")","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"It is instructive to play with n_layers, n_epochs and the Stiefel property.","category":"page"},{"location":"tutorials/mnist_tutorial/","page":"MNIST","title":"MNIST","text":"B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).\n\n\n\n","category":"page"},{"location":"tutorials/volume_preserving_attention/#Comparison-of-different-VolumePreservingAttention","page":"Volume-Preserving Attention","title":"Comparison of different VolumePreservingAttention","text":"","category":"section"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"In the section of volume-preserving attention we mentioned two ways of computing volume-preserving attention: one where we compute the correlations with a skew-symmetric matrix and one where we compute the correlations with an arbitrary matrix. Here we compare the two approaches. When calling the VolumePreservingAttention layer we can specify whether we want to use the skew-symmetric or the arbitrary weighting by setting the keyword skew_sym = true and skew_sym = false respectively. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"In here we demonstrate the differences between the two approaches for computing correlations. For this we first generate a training set consisting of two collections of curves: (i) sine curves and (ii) cosine curve. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"using GeometricMachineLearning # hide\nusing GeometricMachineLearning: FeedForwardLoss, TransformerLoss # hide\nusing Plots # hide\nimport Random # hide \nRandom.seed!(123) # hide\n\nsine_cosine = zeros(1, 1000, 2)\nsine_cosine[1, :, 1] .= sin.(0.:.1:99.9)\nsine_cosine[1, :, 2] .= cos.(0.:.1:99.9)\n\n\nconst dl = DataLoader(Float16.(sine_cosine))","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"The third axis (i.e. 
the parameter axis) has length two, meaning we have two different kinds of curves: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"plot(dl.input[1, :, 1], label = \"sine\")\nplot!(dl.input[1, :, 2], label = \"cosine\")","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We want to train a single neural network on both these curves. We compare three networks which are of the following form: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"mathttnetwork = mathcalNN_dcircPsicircmathcalNN_u","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"where mathcalNN_u refers to a neural network that scales up and mathcalNN_d refers to a neural network that scales down. The up and down scaling is done with simple dense layers: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"mathcalNN_u(x) = mathrmtanh(a_ux + b_u) text and mathcalNN_d(x) = a_d^Tx + b_d","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"where a_u b_u a_dinmathbbR^mathrmud and b_d is a scalar. ud refers to upscaling dimension. For Psi we consider three different choices:","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"a volume-preserving attention with skew-symmetric weighting,\na volume-preserving attention with arbitrary weighting,\nan identity layer.","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We further choose a sequence length 5 (i.e. the network always sees the last 5 time steps) and always predict one step into the future (i.e. 
the prediction window is set to 1):","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"const seq_length = 3\nconst prediction_window = 1\n\nconst upscale_dimension_1 = 2\n\nconst T = Float16\n\nfunction set_up_networks(upscale_dimension::Int = upscale_dimension_1)\n model_skew = Chain(Dense(1, upscale_dimension, tanh), VolumePreservingAttention(upscale_dimension, seq_length; skew_sym = true), Dense(upscale_dimension, 1, identity; use_bias = true))\n model_arb = Chain(Dense(1, upscale_dimension, tanh), VolumePreservingAttention(upscale_dimension, seq_length; skew_sym = false), Dense(upscale_dimension, 1, identity; use_bias = true))\n model_comp = Chain(Dense(1, upscale_dimension, tanh), Dense(upscale_dimension, 1, identity; use_bias = true))\n\n nn_skew = NeuralNetwork(model_skew, CPU(), T)\n nn_arb = NeuralNetwork(model_arb, CPU(), T)\n nn_comp = NeuralNetwork(model_comp, CPU(), T)\n\n nn_skew, nn_arb, nn_comp\nend\n\nnn_skew, nn_arb, nn_comp = set_up_networks()","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We expect the third network to not be able to learn anything useful since it cannot resolve time series data: a regular feedforward network only ever sees one datum at a time. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"Next we train the networks (here we pick a batch size of 30):","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"function set_up_optimizers(nn_skew, nn_arb, nn_comp)\n o_skew = Optimizer(AdamOptimizer(T), nn_skew)\n o_arb = Optimizer(AdamOptimizer(T), nn_arb)\n o_comp = Optimizer(AdamOptimizer(T), nn_comp)\n\n o_skew, o_arb, o_comp\nend\n\no_skew, o_arb, o_comp = set_up_optimizers(nn_skew, nn_arb, nn_comp)\n\nconst n_epochs = 1000\n\nconst batch_size = 30\n\nconst batch = Batch(batch_size, seq_length, prediction_window)\nconst batch2 = Batch(batch_size)\n\nfunction train_networks!(nn_skew, nn_arb, nn_comp)\n loss_array_skew = o_skew(nn_skew, dl, batch, n_epochs, TransformerLoss(batch))\n loss_array_arb = o_arb( nn_arb, dl, batch, n_epochs, TransformerLoss(batch))\n loss_array_comp = o_comp(nn_comp, dl, batch2, n_epochs, FeedForwardLoss())\n\n loss_array_skew, loss_array_arb, loss_array_comp\nend\n\nloss_array_skew, loss_array_arb, loss_array_comp = train_networks!(nn_skew, nn_arb, nn_comp)\n\nfunction plot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)\n p = plot(loss_array_skew, color = 2, label = \"skew\", yaxis = :log)\n plot!(p, loss_array_arb, color = 3, label = \"arb\")\n plot!(p, loss_array_comp, color = 4, label = \"comp\")\n\n p\nend\n\nplot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"Looking at the training errors, we can see that the network with the skew-symmetric weighting is stuck at a relatively high error rate, whereas the loss for the network with the arbitrary weighting is decreasing to a significantly lower level. The feedforward network without the attention mechanism is not able to learn anything useful (as was expected). 
","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"The following demonstrates the predictions of our approaches[1]: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"[1]: Here we have to use the architectures DummyTransformer and DummyNNIntegrator to reformulate the three neural networks defined here as NeuralNetworkIntegrators. Normally the user should try to use predefined architectures in GeometricMachineLearning, that way they never use DummyTransformer and DummyNNIntegrator. ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"initial_condition = dl.input[:, 1:seq_length, 2]\n\nfunction make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)\n nn_skew = NeuralNetwork(GeometricMachineLearning.DummyTransformer(seq_length), nn_skew.model, nn_skew.params, CPU())\n nn_arb = NeuralNetwork(GeometricMachineLearning.DummyTransformer(seq_length), nn_arb.model, nn_arb.params, CPU())\n nn_comp = NeuralNetwork(GeometricMachineLearning.DummyNNIntegrator(), nn_comp.model, nn_comp.params, CPU())\n\n nn_skew, nn_arb, nn_comp\nend\n\nnn_skew, nn_arb, nn_comp = make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)\n\nfunction produce_validation_plot(n_points::Int, nn_skew = nn_skew, nn_arb = nn_arb, nn_comp = nn_comp; initial_condition::Matrix=initial_condition, type = :cos)\n validation_skew = iterate(nn_skew, initial_condition; n_points = n_points, prediction_window = 1)\n validation_arb = iterate(nn_arb, initial_condition; n_points = n_points, prediction_window = 1)\n validation_comp = iterate(nn_comp, initial_condition[:, 1]; n_points = n_points)\n\n p2 = type == :cos ? plot(dl.input[1, 1:n_points, 2], color = 1, label = \"reference\") : plot(dl.input[1, 1:n_points, 1], color = 1, label = \"reference\")\n\n plot!(validation_skew[1, :], color = 2, label = \"skew\")\n plot!(p2, validation_arb[1, :], color = 3, label = \"arb\")\n plot!(p2, validation_comp[1, :], color = 4, label = \"comp\")\n vline!([seq_length], color = :red, label = \"start of prediction\")\n\n p2 \nend\n\np2 = produce_validation_plot(40)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"In the above plot we can see that the network with the arbitrary weighting performs much better; even though the green line does not fit the blue line very well either, it manages to least qualitatively reflect the training data. 
We can also plot the predictions for longer time intervals: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"p3 = produce_validation_plot(400)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"We can also plot the comparison with the sine function: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"initial_condition = dl.input[:, 1:seq_length, 1]\n\np2 = produce_validation_plot(40, initial_condition = initial_condition, type = :sin)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"This advantage of the volume-preserving attention with arbitrary weighting may however be due to the fact that the skew-symmetric attention only has 3 learnable parameters, as opposed to 9 for the arbitrary weighting. If we increase the upscaling dimension the result changes: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"const upscale_dimension_2 = 10\n\nnn_skew, nn_arb, nn_comp = set_up_networks(upscale_dimension_2)\n\no_skew, o_arb, o_comp = set_up_optimizers(nn_skew, nn_arb, nn_comp)\n\nloss_array_skew, loss_array_arb, loss_array_comp = train_networks!(nn_skew, nn_arb, nn_comp)\n\nplot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"initial_condition = dl.input[:, 1:seq_length, 2]\n\nnn_skew, nn_arb, nn_comp = make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)\n\np2 = produce_validation_plot(40, nn_skew, nn_arb, nn_comp)","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"And for a longer time interval: ","category":"page"},{"location":"tutorials/volume_preserving_attention/","page":"Volume-Preserving Attention","title":"Volume-Preserving Attention","text":"p3 = produce_validation_plot(200, nn_skew, nn_arb, nn_comp)","category":"page"},{"location":"optimizers/manifold_related/geodesic/#Geodesic-Retraction","page":"Geodesic Retraction","title":"Geodesic Retraction","text":"","category":"section"},{"location":"optimizers/manifold_related/geodesic/","page":"Geodesic Retraction","title":"Geodesic Retraction","text":"General retractions are approximations of the exponential map. In GeometricMachineLearning we can, instead of using an approximation, solve the geodesic equation exactly (up to numerical error) by specifying Geodesic() as the argument of layers that have manifold weights. ","category":"page"},{"location":"optimizers/manifold_related/cayley/#The-Cayley-Retraction","page":"Cayley Retraction","title":"The Cayley Retraction","text":"","category":"section"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"The Cayley transformation is one of the most popular retractions. For several matrix Lie groups it is a mapping from the Lie algebra mathfrakg onto the Lie group G. 
They Cayley retraction reads: ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":" mathrmCayley(C) = left(mathbbI -frac12Cright)^-1left(mathbbI +frac12Cright)","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"This is easily checked to be a retraction, i.e. mathrmCayley(mathbbO) = mathbbI and fracpartialpartialtmathrmCayley(tC) = C.","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"What we need in practice is not the computation of the Cayley transform of an arbitrary matrix, but the Cayley transform of an element of mathfrakg^mathrmhor, the global tangent space representation. ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"The elements of mathfrakg^mathrmhor can be written as: ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"C = beginbmatrix\n A -B^T \n B mathbbO\nendbmatrix = beginbmatrix frac12A mathbbI B mathbbO endbmatrix beginbmatrix mathbbI mathbbO frac12A -B^T endbmatrix","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"where the second expression exploits the sparse structure of the array, i.e. it is a multiplication of a Ntimes2n with a 2ntimesN matrix. We can hence use the Sherman-Morrison-Woodbury formula to obtain:","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"(mathbbI - frac12UV)^-1 = mathbbI + frac12U(mathbbI - frac12VU)^-1V","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"So what we have to invert is the term ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"mathbbI - frac12beginbmatrix mathbbI mathbbO frac12A -B^T endbmatrixbeginbmatrix frac12A mathbbI B mathbbO endbmatrix = \nbeginbmatrix mathbbI - frac14A - frac12mathbbI frac12B^TB - frac18A^2 mathbbI - frac14A endbmatrix","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"The whole Cayley transform is then: ","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"left(mathbbI + frac12beginbmatrix frac12A mathbbI B mathbbO endbmatrix beginbmatrix mathbbI - frac14A - frac12mathbbI frac12B^TB - frac18A^2 mathbbI - frac14A endbmatrix^-1 beginbmatrix mathbbI mathbbO frac12A -B^T endbmatrix right)left( E + frac12beginbmatrix frac12A mathbbI B mathbbO endbmatrix beginbmatrix mathbbI frac12A endbmatrix right) = \nE + frac12beginbmatrix frac12A mathbbI B mathbbO endbmatrixleft(\n beginbmatrix mathbbI frac12A endbmatrix + \n beginbmatrix mathbbI - frac14A - frac12mathbbI frac12B^TB - frac18A^2 mathbbI - frac14A endbmatrix^-1left(\n beginbmatrix mathbbI frac12A endbmatrix + \n beginbmatrix frac12A frac14A^2 - frac12B^TB endbmatrix\n right)\n right)","category":"page"},{"location":"optimizers/manifold_related/cayley/","page":"Cayley Retraction","title":"Cayley Retraction","text":"Note that for computational reason we compute 
mathrmCayley(C)E instead of just the Cayley transform (see the section on retractions).","category":"page"},{"location":"data_loader/snapshot_matrix/#Snapshot-matrix","page":"Snapshot matrix & tensor","title":"Snapshot matrix","text":"","category":"section"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"The snapshot matrix stores solutions of the high-dimensional ODE (obtained from discretizing a PDE). This is then used to construct reduced bases in a data-driven way. So (for a single parameter[1]) the snapshot matrix takes the following form: ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"[1]: If we deal with a parametrized PDE then there are two stages at which the snapshot matrix has to be processed: the offline stage and the online stage. ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"M = leftbeginarraycccc\nhatu_1(t_0) hatu_1(t_1) quadldotsquad hatu_1(t_f) \nhatu_2(t_0) hatu_2(t_1) ldots hatu_2(t_f) \nhatu_3(t_0) hatu_3(t_1) ldots hatu_3(t_f) \nldots ldots ldots ldots \nhatu_2N(t_0) hatu_2N(t_1) ldots hatu_2N(t_f) \nendarrayright","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"In the above example we store a matrix whose first axis is the system dimension (i.e. a state is an element of mathbbR^2n) and the second dimension gives the time step. ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"The starting point for using the snapshot matrix as data for a machine learning model is that all the columns of M live on a lower-dimensional solution manifold and we can use techniques such as POD and autoencoders to find this solution manifold. We also note that the second axis of M does not necessarily indicate time but can also represent various parameters (including initial conditions). The second axis in the DataLoader struct is therefore saved in the field n_params.","category":"page"},{"location":"data_loader/snapshot_matrix/#Snapshot-tensor","page":"Snapshot matrix & tensor","title":"Snapshot tensor","text":"","category":"section"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"The snapshot tensor fulfills the same role as the snapshot matrix but has a third axis that describes different initial parameters (such as different initial conditions). ","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"Main.include_graphics(\"../tikz/tensor\") # hide","category":"page"},{"location":"data_loader/snapshot_matrix/","page":"Snapshot matrix & tensor","title":"Snapshot matrix & tensor","text":"When drawing training samples from the snapshot tensor we also need to specify a sequence length (as an argument to the Batch struct). When sampling a batch from the snapshot tensor we sample over the starting point of the time interval (which is of length seq_length) and the third axis of the tensor (the parameters). The total number of batches in this case is lceilmathtt(dlinput_time_steps - batchseq_length) * dln_params batchbatch_sizerceil. 
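The batch count stated above can be checked with a small sketch in plain Julia; the variable names are illustrative and only mirror the fields of DataLoader and Batch mentioned in the text.

```julia
# Sketch of the batch count for a snapshot tensor: we sample over the starting
# points of the time intervals and over the parameter axis.
sys_dim, input_time_steps, n_params = 4, 100, 3
snapshot_tensor = rand(sys_dim, input_time_steps, n_params)   # axes: state, time, parameter

seq_length = 5
batch_size = 32

# ⌈(input_time_steps - seq_length) * n_params / batch_size⌉ as stated above
n_batches = ceil(Int, (input_time_steps - seq_length) * n_params / batch_size)   # gives 9 here
```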
","category":"page"},{"location":"architectures/symplectic_autoencoder/#Symplectic-Autoencoder","page":"Symplectic Autoencoders","title":"Symplectic Autoencoder","text":"","category":"section"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"A visualization of an instance of SymplecticAutoencoder is shown below: ","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Main.include_graphics(\"../tikz/symplectic_autoencoder_architecture\") # hide","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"The intermediate dimension M is calculated via n : (N - n) ÷ (n_blocks - 1) : N. Further we have the following choices:","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"n_encoder_layers::Integer = 4\nn_encoder_blocks::Integer = 2 \nn_decoder_layers::Integer = 2 \nn_decoder_blocks::Integer = 3\nencoder_init_q::Bool = true\ndecoder_init_q::Bool = true","category":"page"},{"location":"architectures/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Note that all of these are keyword arguments that can be supplied to SymplecticAutoencoder.","category":"page"},{"location":"tutorials/linear_symplectic_transformer/#linear_symplectic_transformer_tutorial","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"","category":"section"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"In this tutorial we compare the linear symplectic transformer to the standard transformer. 
","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"using GeometricMachineLearning # hide\nusing GeometricProblems.CoupledHarmonicOscillator: hodeensemble, default_parameters\nusing GeometricIntegrators: ImplicitMidpoint, integrate \nusing LaTeXStrings\nusing Plots\nimport Random\n\nRandom.seed!(123)\n\nconst tstep = .3\nconst n_init_con = 5\n\n# ensemble problem\nep = hodeensemble([rand(2) for _ in 1:n_init_con], [rand(2) for _ in 1:n_init_con]; tstep = tstep)\n\ndl_nt = DataLoader(integrate(ep, ImplicitMidpoint()))\ndl = DataLoader(vcat(dl_nt.input.q, dl_nt.input.p))\n\nnothing # hide","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"We now define the architectures and train them: ","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"const seq_length = 4\nconst batch_size = 16384\nconst n_epochs = 2000\n\narch_standard = StandardTransformerIntegrator(dl.input_dim; n_heads = 2, L = 1, n_blocks = 2)\narch_symplectic = LinearSymplecticTransformer(dl.input_dim, seq_length; n_sympnet = 2, L = 1, upscaling_dimension = 2 * dl.input_dim)\narch_sympnet = GSympNet(dl.input_dim; n_layers = 4, upscaling_dimension = 2 * dl.input_dim)\n\nnn_standard = NeuralNetwork(arch_standard)\nnn_symplectic = NeuralNetwork(arch_symplectic)\nnn_sympnet = NeuralNetwork(arch_sympnet)\n\no_method = AdamOptimizerWithDecay(n_epochs, Float64)\n\no_standard = Optimizer(o_method, nn_standard)\no_symplectic = Optimizer(o_method, nn_symplectic)\no_sympnet = Optimizer(o_method, nn_sympnet)\n\nbatch = Batch(batch_size, seq_length)\nbatch2 = Batch(batch_size)\n\nloss_array_standard = o_standard(nn_standard, dl, batch, n_epochs)\nloss_array_symplectic = o_symplectic(nn_symplectic, dl, batch, n_epochs)\nloss_array_sympnet = o_sympnet(nn_sympnet, dl, batch2, n_epochs)\n\np_train = plot(loss_array_standard; color = 2, xlabel = \"epoch\", ylabel = \"training error\", label = \"ST\", yaxis = :log)\nplot!(p_train, loss_array_symplectic; color = 4, label = \"LST\")\nplot!(p_train, loss_array_sympnet; color = 3, label = \"SympNet\")\n\np_train","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"We further evaluate a trajectory with the trained networks: ","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"const index = 1\ninit_con = dl.input[:, 1:seq_length, index]\n\nconst n_steps = 30\n\nfunction make_validation_plot(n_steps = n_steps; kwargs...)\n prediction_standard = iterate(nn_standard, init_con; n_points = n_steps, prediction_window = seq_length)\n prediction_symplectic = iterate(nn_symplectic, init_con; n_points = n_steps, prediction_window = seq_length)\n prediction_sympnet = iterate(nn_sympnet, init_con[:, 1]; n_points = n_steps)\n\n p_validate = plot(dl.input[1, 1:n_steps, index]; color = 1, ylabel = L\"q_1\", label = \"implicit midpoint\", kwargs...)\n plot!(p_validate, prediction_standard[1, :]; color = 2, label = \"ST\", kwargs...)\n plot!(p_validate, prediction_symplectic[1, :]; color = 4, label = \"LST\", kwargs...)\n plot!(p_validate, prediction_sympnet[1, :]; color = 3, label = 
\"SympNet\", kwargs...)\n\n p_validate\nend\n\nmake_validation_plot(; linewidth = 2)","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"We can see that the standard transformer is not able to stay close to the trajectory coming from implicit midpoint very well. The linear symplectic transformer outperforms the standard transformer as well as the SympNet while needed much fewer parameters than the standard transformer: ","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"parameterlength(nn_standard), parameterlength(nn_symplectic), parameterlength(nn_sympnet)","category":"page"},{"location":"tutorials/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"It is also interesting to note that the training error for the SympNet gets lower than the one for the linear symplectic transformer, but it does not manage to outperform it when looking at the validation. ","category":"page"},{"location":"layers/multihead_attention_layer/#Multihead-Attention","page":"Multihead Attention","title":"Multihead Attention","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"In order to arrive from the attention layer at the multihead attention layer we have to do a few modifications: ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Note that these neural networks were originally developed for natural language processing (NLP) tasks and the terminology used here bears some resemblance to that field. The input to a multihead attention layer typicaly comprises three components:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Values VinmathbbR^ntimesT: a matrix whose columns are value vectors, \nQueries QinmathbbR^ntimesT: a matrix whose columns are query vectors, \nKeys KinmathbbR^ntimesT: a matrix whose columns are key vectors.","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Regular attention performs the following operation: ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"mathrmAttention(QKV) = Vmathrmsoftmax(fracK^TQsqrtn)","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"where n is the dimension of the vectors in V, Q and K. The softmax activation function here acts column-wise, so it can be seen as a transformation mathrmsoftmaxmathbbR^TtomathbbR^T with mathrmsoftmax(v)_i = e^v_ileft(sum_j=1e^v_jright). The K^TQ term is a similarity matrix between the queries and the vectors. ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"The transformer contains a self-attention mechanism, i.e. takes an input X and then transforms it linearly to V, Q and K, i.e. V = P^VX, Q = P^QX and K = P^KX. 
What distinguishes the multihead attention layer from the singlehead attention layer, is that there is not just one P^V, P^Q and P^K, but there are several: one for each head of the multihead attention layer. After computing the individual values, queries and vectors, and after applying the softmax, the outputs are then concatenated together in order to obtain again an array that is of the same size as the input array:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Main.include_graphics(\"../tikz/mha\") # hide","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Here the various P matrices can be interpreted as being projections onto lower-dimensional subspaces, hence the designation by the letter P. Because of this interpretation as projection matrices onto smaller spaces that should capture features in the input data it makes sense to constrain these elements to be part of the Stiefel manifold. ","category":"page"},{"location":"layers/multihead_attention_layer/#Computing-Correlations-in-the-Multihead-Attention-Layer","page":"Multihead Attention","title":"Computing Correlations in the Multihead-Attention Layer","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"The attention mechanism describes a reweighting of the \"values\" V_i based on correlations between the \"keys\" K_i and the \"queries\" Q_i. First note the structure of these matrices: they are all a collection of T vectors (Ndivmathttn_heads)-dimensional vectors, i.e. V_i=v_i^(1) ldots v_i^(T) K_i=k_i^(1) ldots k_i^(T) Q_i=q_i^(1) ldots q_i^(T) . 
Those vectors have been obtained by applying the respective projection matrices onto the original input I_iinmathbbR^NtimesT.","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"When performing the reweighting of the columns of V_i we first compute the correlations between the vectors in K_i and in Q_i and store the results in a correlation matrix C_i: ","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":" C_i_mn = left(k_i^(m)right)^Tq_i^(n)","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"The columns of this correlation matrix are then rescaled with a softmax function, obtaining a matrix of probability vectors mathcalP_i:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":" mathcalP_i_bulletn = mathrmsoftmax(C_i_bulletn)","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Finally the matrix mathcalP_i is multiplied onto V_i from the right, resulting in T convex combinations of the T vectors v_i^(m) with m=1ldotsT:","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":" V_imathcalP_i = leftsum_m=1^TmathcalP_i_m1v_i^(m) ldots sum_m=1^TmathcalP_i_mTv_i^(m)right","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"With this we can now give a better interpretation of what the projection matrices W_i^V, W_i^K and W_i^Q should do: they map the original data to lower-dimensional subspaces. We then compute correlations between the representation in the K and in the Q basis and use this correlation to perform a convex reweighting of the vectors in the V basis. These reweighted values are then fed into a standard feedforward neural network.","category":"page"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"Because the main task of the W_i^V, W_i^K and W_i^Q matrices here is for them to find bases, it makes sense to constrain them onto the Stiefel manifold; they do not and should not have the maximum possible generality.","category":"page"},{"location":"layers/multihead_attention_layer/#Library-Functions","page":"Multihead Attention","title":"Library Functions","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"MultiHeadAttention","category":"page"},{"location":"layers/multihead_attention_layer/#GeometricMachineLearning.MultiHeadAttention-layers-multihead_attention_layer","page":"Multihead Attention","title":"GeometricMachineLearning.MultiHeadAttention","text":"MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors based on correlations within the data. \n\nConstructor\n\nTakes input arguments: \n\ndim::Int: The system dimension \nn_heads::Int: The number of heads. \nStiefel::Bool=true (keyword argument): whether the weights should be put on the Stiefel manifold. \nretraction::AbstractRetraction (keyword argument): what kind of retraction should be used. By default this is the geodesic retraction. 
\nadd_connection::Bool=true (keyword argument): determines if the input should be added to the output for the final result. \n\n\n\n\n\n","category":"type"},{"location":"layers/multihead_attention_layer/#References","page":"Multihead Attention","title":"References","text":"","category":"section"},{"location":"layers/multihead_attention_layer/","page":"Multihead Attention","title":"Multihead Attention","text":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).\n\n\n\n","category":"page"},{"location":"layers/sympnet_gradient/#SympNet-Gradient-Layer","page":"Sympnet Gradient Layers","title":"SympNet Gradient Layer","text":"","category":"section"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"The Sympnet gradient layer (called GradientLayer in GeometricMachineLearning) is based on the following theorem: ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"Main.theorem(raw\"\"\"Given a symplectic vector space ``\\mathbb{R}^{2n}`` with coordinates ``q_1, \\ldots, q_n, p_1, \\ldots, p_n`` and a function ``f:\\mathbb{R}^n\\to\\mathbb{R}`` that only acts on the ``q`` part, the map ``(q, p) \\mapsto (q, p + \\nabla_qf)`` is symplectic. A similar statement holds if ``f`` only acts on the ``p`` part.\"\"\")","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"Proving this is straightforward by looking at the Jacobian of the mapping:","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" beginpmatrix\n mathbbI mathbbO \n nabla_q^2f mathbbI\n endpmatrix","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"where nabla_q^2f is the Hessian of f. This matrix is symmetric and for any symmetric matrix A we have that: ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" beginpmatrix\n mathbbI mathbbO \n A mathbbI\n endpmatrix^T mathbbJ_2n \n beginpmatrix \n mathbbI mathbbO \n A mathbbI \n endpmatrix = \n beginpmatrix\n mathbbI A \n mathbbO mathbbI\n endpmatrix \n beginpmatrix \n mathbbO mathbbI \n -mathbbI mathbbO \n endpmatrix \n beginpmatrix\n mathbbI mathbbO \n A mathbbI\n endpmatrix = \n beginpmatrix\n mathbbO mathbbI \n -mathbbI mathbbO \n endpmatrix","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"If we deal with GSympNets this function f is ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" f(q) = a^T Sigma(Kq + b)","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"where a binmathbbR^m, KinmathbbR^mtimesn and Sigma is the antiderivative of some common activation function sigma. We routinely refer to m as the upscaling dimension in GeometricMachineLearning. 
Computing the gradient of f gives: ","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":" nabla_qf_k = sum_i=1^m a_i sigma(sum_j=1^nk_ijq_j + b_i)k_ik = K^T a odot sigma(Kq + b)","category":"page"},{"location":"layers/sympnet_gradient/","page":"Sympnet Gradient Layers","title":"Sympnet Gradient Layers","text":"where odot is the element-wise product, i.e. aodotv_k = a_kv_k.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Symplectic-Autoencoder","page":"PSD and Symplectic Autoencoders","title":"Symplectic Autoencoder","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Symplectic Autoencoders are a type of neural network suitable for treating Hamiltonian parametrized PDEs with slowly decaying Kolmogorov n-width. They are based on proper symplectic decomposition (PSD) and symplectic neural networks (SympNets).","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Hamiltonian-Model-Order-Reduction","page":"PSD and Symplectic Autoencoders","title":"Hamiltonian Model Order Reduction","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Hamiltonian PDEs are partial differential equations that, like their ODE counterparts, have a Hamiltonian associated with them. An example of this is the linear wave equation (see [34]) with Hamiltonian ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"mathcalH(q p mu) = frac12int_Omegamu^2(partial_xiq(tximu))^2 + p(tximu)^2dxi","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"The PDE corresponding to this Hamiltonian can be obtained similarly as in the ODE case:","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"partial_tq(tximu) = fracdeltamathcalHdeltap = p(tximu) quad partial_tp(tximu) = -fracdeltamathcalHdeltaq = mu^2partial_xixiq(tximu)","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Symplectic-Solution-Manifold","page":"PSD and Symplectic Autoencoders","title":"Symplectic Solution Manifold","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"As with regular parametric PDEs, we also associate a solution manifold with Hamiltonian PDEs. This is a finite-dimensional manifold, on which the dynamics can be described through a Hamiltonian ODE. A proof or further explanation of this statement still needs to be added here.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Workflow-for-Symplectic-ROM","page":"PSD and Symplectic Autoencoders","title":"Workflow for Symplectic ROM","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"As with any other reduced order modeling technique we first discretize the PDE. 
This should be done with a structure-preserving scheme, thus yielding a (high-dimensional) Hamiltonian ODE as a result. Discretizing the wave equation above with finite differences yields a Hamiltonian system: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"mathcalH_mathrmdiscr(z(tmu)mu) = frac12x(tmu)^Tbeginbmatrix -mu^2D_xixi mathbbO mathbbO mathbbI endbmatrix x(tmu)","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"In Hamiltonian reduced order modelling we try to find a symplectic submanifold of the solution space[1] that captures the dynamics of the full system as well as possible.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"[1]: The submanifold is: tildemathcalM = Psi^mathrmdec(z_r)inmathbbR^2Nu_rinmathrmR^2n where z_r is the reduced state of the system. ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Similar to the regular PDE case we again build an encoder Psi^mathrmenc and a decoder Psi^mathrmdec; but now both these mappings are required to be symplectic!","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Concretely this means: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"The encoder is a mapping from a high-dimensional symplectic space to a low-dimensional symplectic space, i.e. Psi^mathrmencmathbbR^2NtomathbbR^2n such that nablaPsi^mathrmencmathbbJ_2N(nablaPsi^mathrmenc)^T = mathbbJ_2n.\nThe decoder is a mapping from a low-dimensional symplectic space to a high-dimensional symplectic space, i.e. Psi^mathrmdecmathbbR^2ntomathbbR^2N such that (nablaPsi^mathrmdec)^TmathbbJ_2NnablaPsi^mathrmdec = mathbbJ_2n.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"If these two maps are constrained to linear maps, then one can easily find good solutions with proper symplectic decomposition (PSD).","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Proper-Symplectic-Decomposition","page":"PSD and Symplectic Autoencoders","title":"Proper Symplectic Decomposition","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"For PSD the two mappings Psi^mathrmenc and Psi^mathrmdec are constrained to be linear, orthonormal (i.e. Psi^TPsi = mathbbI) and symplectic. 
The easiest way to enforce this is through the so-called cotangent lift: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Psi_mathrmCL = \nbeginbmatrix Phi mathbbO mathbbO Phi endbmatrix","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"and PhiinSt(nN)subsetmathbbR^Ntimesn, i.e. is an element of the Stiefel manifold. If the snapshot matrix is of the form: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"M = leftbeginarraycccc\nhatq_1(t_0) hatq_1(t_1) quadldotsquad hatq_1(t_f) \nhatq_2(t_0) hatq_2(t_1) ldots hatq_2(t_f) \nldots ldots ldots ldots \nhatq_N(t_0) hatq_N(t_1) ldots hatq_N(t_f) \nhatp_1(t_0) hatp_1(t_1) ldots hatp_1(t_f) \nhatp_2(t_0) hatp_2(t_1) ldots hatp_2(t_f) \nldots ldots ldots ldots \nhatp_N(t_0) hatp_N(t_1) ldots hatp_N(t_f) \nendarrayright","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"then Phi can be computed in a very straight-forward manner: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Rearrange the rows of the matrix M such that we end up with a Ntimes2(f+1) matrix: hatM = M_q M_p.\nPerform SVD: hatM = USigmaV^T; set PhigetsUmathtt1n.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"For details on the cotangent lift (and other methods for linear symplectic model reduction) consult [35].","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#Symplectic-Autoencoders","page":"PSD and Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"PSD suffers from the similar shortcomings as regular POD: it is a linear map and the approximation space tildemathcalM= Psi^mathrmdec(z_r)inmathbbR^2Nu_rinmathrmR^2n is strictly linear. For problems with slowly-decaying Kolmogorov n-width this leads to very poor approximations. ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"In order to overcome this difficulty we use neural networks, more specifically SympNets, together with cotangent lift-like matrices. The resulting architecture, symplectic autoencoders, are demonstrated in the following image: ","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"Main.include_graphics(\"../tikz/symplectic_autoencoder\") # hide","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"So we alternate between SympNet and PSD layers. 
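To make these two steps concrete, here is a minimal plain-Julia sketch of the cotangent lift; the snapshot matrix is random and all sizes are illustrative, and the package's own PSD routines are not called:

```julia
using LinearAlgebra

N, n, f = 10, 2, 20                          # full dimension per q/p block, reduced dimension, final time index
M = randn(2N, f + 1)                         # stand-in for the snapshot matrix
M_q, M_p = M[1:N, :], M[N+1:2N, :]

Mhat = hcat(M_q, M_p)                        # rearranged N × 2(f+1) matrix
Φ = svd(Mhat).U[:, 1:n]                      # element of St(n, N)
Ψ = [Φ zeros(N, n); zeros(N, n) Φ]           # cotangent lift

J(k) = [zeros(k, k) I; -I zeros(k, k)]       # Poisson matrix of size 2k
Ψ' * Ψ ≈ I(2n) && Ψ' * J(N) * Ψ ≈ J(n)       # orthonormal and symplectic
```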
Because all the PSD layers are based on matrices PhiinSt(nN) we have to optimize on the Stiefel manifold.","category":"page"},{"location":"reduced_order_modeling/symplectic_autoencoder/#References","page":"PSD and Symplectic Autoencoders","title":"References","text":"","category":"section"},{"location":"reduced_order_modeling/symplectic_autoencoder/","page":"PSD and Symplectic Autoencoders","title":"PSD and Symplectic Autoencoders","text":"P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).\n\n\n\nL. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).\n\n\n\n","category":"page"},{"location":"manifolds/metric_and_vector_spaces/#(Topological)-Metric-Spaces","page":"Metric and Vector Spaces","title":"(Topological) Metric Spaces","text":"","category":"section"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"A metric space is a certain class of a topological space where the topology is induced through a metric.","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **metric** on a topological space ``\\mathcal{M}`` is a mapping ``d:\\mathcal{M}\\times\\mathcal{M}\\to\\mathbb{R}`` such that the following three conditions hold: \n\" * \nMain.indentation * raw\"1. ``d(x, y) = 0 \\iff x = y`` for every ``x,y\\in\\mathcal{M}``, i.e. the distance between 2 points is only zero if and only if they are the same,\n\" * \nMain.indentation * raw\"2. ``d(x, y) = d(y, x)``,\n\" *\nMain.indentation * raw\"3. 
``d(x, z) \\leq d(x, y) + d(y, z)``.\n\" *\nMain.indentation * raw\"The second condition is referred to as *symmetry* and the third condition is referred to as the *triangle inequality*.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"We give some examples of metric spaces that are relevant for us: ","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"The real line ``\\mathbb{R}`` with the metric defined by the absolute distance between two points: ``d(x, y) = |y - x|``.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"The vector space ``\\mathbb{R}^n`` with the *Euclidean distance* ``d_2(x, y) = \\sqrt{\\sum_{i=1}^n (x_i - y_i)^2}``.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"The space of continuous functions ``\\mathcal{C} = \\{f:(-\\epsilon, \\epsilon)\\to\\mathbb{R}^n\\}`` with the metric ``d_\\infty(f_1, f_2) = \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_1(t) - f_2(t)|.``\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.proof(raw\"We have to show the triangle inequality: \n\" * \nMain.indentation * raw\"```math\n\" * \nMain.indentation * raw\"\\begin{aligned}\n\" *\nMain.indentation * raw\"d_\\infty(d_1, d_3) = \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_1(t) - f_3(t)| & \\leq \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}(|f_1(t) - f_2(t)| + |f_2(t) - f_3(t)|) \\\\\n\" *\nMain.indentation * raw\"& \\leq \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_1(t) - f_2(t)| + \\mathrm{sup}_{t\\in(-\\epsilon, \\epsilon)}|f_1(t) - f_2(t)|.\n\" * \nMain.indentation * raw\"\\end{aligned}\n\" * \nMain.indentation * raw\"```\n\" *\nMain.indentation * raw\"This shows that ``d_\\infty`` is indeed a metric.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.example(raw\"Any Riemannian manifold is a metric space.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"This last example shows that metric spaces need not be vector spaces, i.e. spaces for which we can define a metric but not addition of two elements. 
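As a small numerical aside (a finite grid standing in for the interval and three arbitrary functions; purely illustrative), the supremum metric and the triangle inequality from the proof can be checked directly:

```julia
ts = range(-0.9, 0.9; length = 101)              # grid on (-ϵ, ϵ) with ϵ = 1
d∞(f, g) = maximum(abs(f(t) - g(t)) for t in ts)

f₁(t) = sin(t); f₂(t) = t; f₃(t) = cos(t)
d∞(f₁, f₃) ≤ d∞(f₁, f₂) + d∞(f₂, f₃)             # true
```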
This will be discussed in more detail in the section on riemannian manifolds.","category":"page"},{"location":"manifolds/metric_and_vector_spaces/#Complete-Metric-Spaces","page":"Metric and Vector Spaces","title":"Complete Metric Spaces","text":"","category":"section"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"To define complete metric spaces we first need the definition of a Cauchy sequence.","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **Cauchy sequence** is a sequence ``(a_n)_{n\\in\\mathbb{N}}`` for which, given any `epsilon>0`, we can find an integer ``N`` such that ``d(a_n, a_m) < \\epsilon`` for all ``n, m \\geq N``.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Now we can give the definition of a complete metric space:","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **complete metric space** is one for which every Cauchy sequence converges.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Completeness of the real numbers is most often seen as an axiom and therefore stated without proof. This also implies completeness of mathbbR^n [4].","category":"page"},{"location":"manifolds/metric_and_vector_spaces/#(Topological)-Vector-Spaces","page":"Metric and Vector Spaces","title":"(Topological) Vector Spaces","text":"","category":"section"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Vector Spaces are, like metric spaces, topological spaces which we endow with additional structure. ","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"Main.definition(raw\"A **vector space** ``\\mathcal{V}`` is a topological space for which we define an operation called *addition* and denoted by ``+`` and an operation called *scalar multiplication* (by elements of ``\\mathbb{R}``) denoted by ``x \\mapsto ax`` for ``x\\in\\mathcal{V}`` and ``x\\in\\mathbb{R}`` for which the following hold for all ``x, y, z\\in\\mathcal{V}`` and ``a, b\\in\\mathbb{R}``:\n\" * \nMain.indentation * raw\"1. ``x + (y + z) = (x + y) + z,``\n\" * \nMain.indentation * raw\"2. ``x + y = y + x,``\n\" * \nMain.indentation * raw\"3. ``\\exists 0 \\in \\mathcal{V}`` such that ``x + 0 = x,``\n\" * \nMain.indentation * raw\"4. ``\\exists -x \\in \\mathcal{V} such that ``x + (-x) = 0,``\n\" * \nMain.indentation * raw\"5. ``a(ax) = (ab)x,``\n\" * \nMain.indentation * raw\"6. ``1x = x`` for ``1\\in\\mathbb{R},``\n\" * \nMain.indentation * raw\"7. ``a(x + y) = ax + ay,``\n\" * \nMain.indentation * raw\"8. ``(a + b)x = ax + bx.``\n\" * \nMain.indentation * raw\"The first law is known as *associativity*, the second one as *commutativity* and the last two ones are known *distributivity*.\")","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"The topological spaces mathbbR and mathbbR^n are (almost) trivially vector spaces. 
The same is true for many function spaces. One of the special aspects of GeometricMachineLearning is that it can deal with spaces that are not vector spaces, but manifolds. All vector spaces are however manifolds. ","category":"page"},{"location":"manifolds/metric_and_vector_spaces/","page":"Metric and Vector Spaces","title":"Metric and Vector Spaces","text":"S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).\n\n\n\n","category":"page"},{"location":"architectures/volume_preserving_transformer/#Volume-Preserving-Transformer","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"","category":"section"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"The volume-preserving transformer is, similar to the standard transformer, a combination of two different neural networks: a volume-preserving attention layer and a volume-preserving feedforward layer. It is visualized below:","category":"page"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"Main.include_graphics(\"../tikz/vp_transformer\") # hide","category":"page"},{"location":"architectures/volume_preserving_transformer/#Library-Functions","page":"Volume-Preserving Transformer","title":"Library Functions","text":"","category":"section"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"VolumePreservingTransformer","category":"page"},{"location":"architectures/volume_preserving_transformer/#GeometricMachineLearning.VolumePreservingTransformer-architectures-volume_preserving_transformer","page":"Volume-Preserving Transformer","title":"GeometricMachineLearning.VolumePreservingTransformer","text":"The volume-preserving transformer with the Cayley activation function and built-in upscaling.\n\nConstructor\n\nThe arguments for the constructor are: \n\nsys_dim::Int\nseq_length::Int: The sequence length of the data fed into the transformer.\n\nThe following are keyword argumetns:\n\nn_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function.\ninit_upper::Bool=false: Specifies if the network first acts on the q component. \nskew_sym::Bool=false: specifies if we the weight matrix is skew symmetric or arbitrary.\n\n\n\n\n\n","category":"type"},{"location":"architectures/volume_preserving_transformer/#References","page":"Volume-Preserving Transformer","title":"References","text":"","category":"section"},{"location":"architectures/volume_preserving_transformer/","page":"Volume-Preserving Transformer","title":"Volume-Preserving Transformer","text":"B. Brantner, G. de Romemont, M. Kraus and Z. Li. 
Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312:11166v2 (2024).\n\n\n\n","category":"page"},{"location":"layers/attention_layer/#The-Attention-Layer","page":"Attention","title":"The Attention Layer","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The attention mechanism was originally developed for image and natural language processing (NLP) tasks. It is motivated by the need to handle time series data in an efficient way[1]. Its essential idea is to compute correlations between vectors in input sequences. I.e. given sequences ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z_q^(1) z_q^(2) ldots z_q^(T)) text and (z_p^(1) z_p^(2) ldots z_p^(T))","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"an attention mechanism computes pair-wise correlations between all combinations of two input vectors from these sequences. In [16] \"additive\" attention is used to compute such correlations: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[1]: Recurrent neural networks have the same motivation. ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z_q z_k) mapsto v^Tsigma(Wz_q + Uz_k) ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"where z_q z_k in mathbbR^d are elements of the input sequences. The learnable parameters are W U in mathbbR^ntimesd and v in mathbbR^n.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"However multiplicative attention (see e.g. [17])is more straightforward to interpret and cheaper to handle computationally: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z_q z_k) mapsto z_q^TWz_k","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"where W in mathbbR^dtimesd is a learnable weight matrix with respect to which correlations are computed as scalar products. Regardless of the type of attention used, they all try to compute correlations among input sequences on whose basis further computation is performed. Given two input sequences Z_q = (z_q^(1) ldots z_q^(T)) and Z_k = (z_k^(1) ldots z_k^(T)), we can arrange the various correlations into a correlation matrix CinmathbbR^TtimesT with entries C_ij = mathttattention(z_q^(i) z_k^(j)). In the case of multiplicative attention this matrix is just C = Z^TWZ.","category":"page"},{"location":"layers/attention_layer/#Reweighting-of-the-input-sequence","page":"Attention","title":"Reweighting of the input sequence","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In GeometricMachineLearning we always compute self-attention, meaning that the two input sequences Z_q and Z_k are the same, i.e. Z = Z_q = Z_k.[2]","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[2]: Multihead attention also falls into this category. Here the input Z is multiplied from the left with several projection matrices P^Q_i and P^K_i, where i indicates the head. For each head we then compute a correlation matrix (P^Q_i Z)^T(P^K Z). 
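Returning to the single-head case, a short sketch (random data, illustrative sizes) of how the correlation matrix for multiplicative self-attention is assembled:

```julia
d, T = 4, 3
Z = randn(d, T)          # columns are z⁽¹⁾, …, z⁽ᵀ⁾
W = randn(d, d)          # learnable weight matrix
C = Z' * W * Z           # C[i, j] = (z⁽ⁱ⁾)ᵀ W z⁽ʲ⁾
```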
","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"This is then used to reweight the columns in the input sequence Z. For this we first apply a nonlinearity sigma onto C and then multiply sigma(C) onto Z from the right, i.e. the output of the attention layer is Zsigma(C). So we perform the following mappings:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z xrightarrowmathrmcorrelations C(Z) = C xrightarrowsigma sigma(C) xrightarrowtextright multiplication Z sigma(C)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"After the right multiplication the outputs is of the following form: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":" sum_i=1^Tp^(1)_iz^(i) ldots sum_i=1^Tp^(T)_iz^(i)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"for p^(i) = sigma(C)_bulleti. What is learned during training are T different linear combinations of the input vectors, where the coefficients p^(i)_j in these linear combinations depend on the input Z nonlinearly. ","category":"page"},{"location":"layers/attention_layer/#Volume-Preserving-Attention","page":"Attention","title":"Volume-Preserving Attention","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The attention layer (and the activation function sigma defined for it) in GeometricMachineLearning was specifically designed to apply it to data coming from physical systems that can be described through a divergence-free or a symplectic vector field. Traditionally the nonlinearity in the attention mechanism is a softmax[3] (see [17]) and the self-attention layer performs the following mapping: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[3]: The softmax acts on the matrix C in a vector-wise manner, i.e. it operates on each column of the input matrix C = c^(1) ldots c^(T). The result is a sequence of probability vectors p^(1) ldots p^(T) for which sum_i=1^Tp^(j)_i=1quadforalljin1dotsT","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z = z^(1) ldots z^(T) mapsto Zmathrmsoftmax(Z^TWZ)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The softmax activation acts vector-wise, i.e. if we supply it with a matrix C as input it returns: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"mathrmsoftmax(C) = mathrmsoftmax(c_bullet1) ldots mathrmsoftmax(c_bulletT)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The output of a softmax is a probability vector (also called stochastic vector) and the matrix P = p^(1) ldots p^(T), where each column is a probability vector, is sometimes referred to as a stochastic matrix (see [18]). This attention mechanism finds application in transformer neural networks [17]. The problem with this matrix from a geometric point of view is that all the columns are independent of each other and the nonlinear transformation could in theory produce a stochastic matrix for which all columns are identical and thus lead to a loss of information. So the softmax activation function is inherently non-geometric. 
","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Besides the traditional attention mechanism GeometricMachineLearning therefore also has a volume-preserving transformation that fulfills a similar role. There are two approaches implemented to realize similar transformations. Both of them however utilize the Cayley transform to produce orthogonal matrices sigma(C) instead of stochastic matrices. For an orthogonal matrix Sigma we have Sigma^TSigma = mathbbI, so all the columns are linearly independent which is not necessarily true for a stochastic matrix P. The following explains how this new activation function is implemented.","category":"page"},{"location":"layers/attention_layer/#The-Cayley-transform","page":"Attention","title":"The Cayley transform","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The Cayley transform maps from skew-symmetric matrices to orthonormal matrices[4]. It takes the form:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"[4]: A matrix A is skew-symmetric if A = -A^T and a matrix B is orthonormal if B^TB = mathbbI. The orthonormal matrices form a Lie group, i.e. the set of orthonormal matrices can be endowed with the structure of a differential manifold and this set also satisfies the group axioms. The corresponding Lie algebra are the skew-symmetric matrices and the Cayley transform is a so-called retraction in this case. For more details consult e.g. [7] and [10].","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"mathrmCayley A mapsto (mathbbI - A)(mathbbI + A)^-1","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"We can easily check that mathrmCayley(A) is orthogonal if A is skew-symmetric. For this consider varepsilon mapsto A(varepsilon)inmathcalS_mathrmskew with A(0) = mathbbI and A(0) = B. Then we have: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"fracdeltamathrmCayleydeltaA = fracddvarepsilon_varepsilon=0 mathrmCayley(A(varepsilon))^T mathrmCayley(A(varepsilon)) = mathbbO","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In order to use the Cayley transform as an activation function we further need a mapping from the input Z to a skew-symmetric matrix. This is realized in two ways in GeometricMachineLearning: via a scalar-product with a skew-symmetric weighting and via a scalar-product with an arbitrary weighting.","category":"page"},{"location":"layers/attention_layer/#First-approach:-scalar-products-with-a-skew-symmetric-weighting","page":"Attention","title":"First approach: scalar products with a skew-symmetric weighting","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"For this the attention layer is modified in the following way: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z = z^(1) ldots z^(T) mapsto Zsigma(Z^TAZ)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"where sigma(C)=mathrmCayley(C) and A is a skew-symmetric matrix that is learnable, i.e. 
the parameters of the attention layer are stored in A.","category":"page"},{"location":"layers/attention_layer/#Second-approach:-scalar-products-with-an-arbitrary-weighting","page":"Attention","title":"Second approach: scalar products with an arbitrary weighting","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"For this approach we compute correlations between the input vectors with a skew-symmetric weighting. The correlations we consider here are based on: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"(z^(2))^TAz^(1) (z^(3))^TAz^(1) ldots (z^(d))^TAz^(1) (z^(3))^TAz^(2) ldots (z^(d))^TAz^(2) ldots (z^(d))^TAz^(d-1)","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"So in total we consider correlations (z^(i))^Tz^(j) for which i j. We now arrange these correlations into a skew-symmetric matrix: ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"C = beginbmatrix\n 0 -(z^(2))^TAz^(1) -(z^(3))^TAz^(1) ldots -(z^(d))^TAz^(1) \n (z^(2))^TAz^(1) 0 -(z^(3))^TAz^(2) ldots -(z^(d))^TAz^(2) \n ldots ldots ldots ldots ldots \n (z^(d))^TAz^(1) (z^(d))^TAz^(2) (z^(d))^TAz^(3) ldots 0 \nendbmatrix","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"This correlation matrix can now again be used as an input for the Cayley transform to produce an orthogonal matrix.","category":"page"},{"location":"layers/attention_layer/#How-is-structure-preserved?","page":"Attention","title":"How is structure preserved?","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In order to discuss how structure is preserved we first have to define what structure we mean precisely. This structure is strongly inspired by traditional multi-step methods (see [19]). We now define what volume preservation means for the product space mathbbR^dtimescdotstimesmathbbR^dequivtimes_textT timesmathbbR^d.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Consider an isomorphism hat times_text(T times)mathbbR^dstackrelapproxlongrightarrowmathbbR^dT. Specifically, this isomorphism takes the form:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Z = leftbeginarraycccc\n z_1^(1) z_1^(2) quadcdotsquad z_1^(T) \n z_2^(1) z_2^(2) cdots z_2^(T) \n cdots cdots cdots cdots \n z_d^(1) z_d^(2) cdots z_d^(T)\n endarrayright mapsto \n leftbeginarrayc z_1^(1) z_1^(2) cdots z_1^(T) z_2^(1) cdots z_d^(T) endarrayright = Z_mathrmvec","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"The inverse of Z mapsto hatZ we refer to as Y mapsto tildeY. 
In the following we also write hatvarphi for the mapping hatcircvarphicirctilde.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"DEFINITION: We say that a mapping varphi times_textT timesmathbbR^d to times_textT timesmathbbR^d is volume-preserving if the associated hatvarphi is volume-preserving.","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"In the transformed coordinate system (in terms of the vector Z_mathrmvec defined above) this is equivalent to multiplication by a sparse matrix tildeLambda(Z) from the left:","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":" tildeLambda(Z) Z_mathrmvec =\n beginpmatrix\n Lambda(Z) mathbbO cdots mathbbO \n mathbbO Lambda(Z) cdots mathbbO \n cdots cdots ddots cdots \n mathbbO mathbbO cdots Lambda(Z) \n endpmatrix\n leftbeginarrayc z_1^(1) z_1^(2) ldots z_1^(T) z_2^(1) ldots z_d^(T) endarrayright ","category":"page"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"tildeLambda(Z) in m[eq:LambdaApplication]m(@latex) is easily shown to be an orthogonal matrix. ","category":"page"},{"location":"layers/attention_layer/#Historical-Note","page":"Attention","title":"Historical Note","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"Attention was used before, but always in connection with recurrent neural networks (see [20] and [16]). ","category":"page"},{"location":"layers/attention_layer/#References","page":"Attention","title":"References","text":"","category":"section"},{"location":"layers/attention_layer/","page":"Attention","title":"Attention","text":"D. Bahdanau, K. Cho and Y. Bengio. Neural machine translation by jointly learning to align and translate, arXiv preprint arXiv:1409.0473 (2014).\n\n\n\nM.-T. Luong, H. Pham and C. D. Manning. Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015).\n\n\n\n","category":"page"},{"location":"manifolds/homogeneous_spaces/#Homogeneous-Spaces","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Homogeneous spaces are very important in GeometricMachineLearning as we can generalize existing neural network optimizers from vector spaces to such homogenous spaces. They are intricately linked to the notion of a Lie Group and its Lie Algebra[1].","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"[1]: Recall that a Lie group is a manifold that also has group structure. We say that a Lie group G acts on a manifold mathcalM if there is a map GtimesmathcalM to mathcalM such that (ab)x = a(bx) for abinG and xinmathcalM. For us the Lie algebra belonging to a Lie group, denoted by mathfrakg, is the tangent space to the identity element T_mathbbIG. ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.definition(raw\"A **homogeneous space** is a manifold ``\\mathcal{M}`` on which a Lie group ``G`` acts transitively, i.e.\n\" * Main.indentation * raw\" ```math\n\" * Main.indentation * raw\"\\forall X,Y\\in\\mathcal{M} \\exists{}A\\in{}G\\text{ s.t. 
}AX = Y.\n\" * Main.indentation * raw\"```\n\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Now fix a distinct element EinmathcalM; we will refer to this as the canonical element. We can also establish an isomorphism between mathcalM and the quotient space Gsim with the equivalence relation: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"A_1 sim A_2 iff A_1E = A_2E","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Note that this is independent of the chosen E.","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The tangent spaces of mathcalM are of the form T_YmathcalM = mathfrakgcdotY, i.e. can be fully described through its Lie algebra. Based on this we can perform a splitting of mathfrakg into two parts:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.definition(raw\"A **splitting of the Lie algebra** ``mathfrak{g}`` at an element of a homogeneous space ``Y`` is a decomposition into a **vertical** and a **horizontal** component, denoted by ``\\mathfrak{g} = \\mathfrak{g}^{\\mathrm{ver},Y} \\oplus \\mathfrak{g}^{\\mathrm{hor},Y}`` such that\n\" * Main.indentation * raw\"1. The *vertical component* ``\\mathfrak{g}^{\\mathrm{ver},Y}`` is the kernel of the map ``\\mathfrak{g}\\to{}T_Y\\mathcal{M}, V \\mapsto VY``, i.e. ``\\mathfrak{g}^{\\mathrm{ver},Y} = \\{V\\in\\mathfrak{g}:VY = 0\\}.``\n\" * Main.indentation * raw\"2. The *horizontal component* ``\\mathfrak{g}^{\\mathrm{hor},Y}`` is the orthogonal complement of ``\\mathfrak{g}^{\\mathrm{ver},Y}`` in ``\\mathfrak{g}``. It is isomorphic to ``T_Y\\mathcal{M}``.\n\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"We will refer to the mapping from T_YmathcalM to mathfrakg^mathrmhor Y by Omega. We will give explicit examples of Omega below. If we have now defined a metric langlecdotcdotrangle on mathfrakg, then this induces a Riemannian metric on mathcalM:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"g_Y(Delta_1 Delta_2) = langleOmega(YDelta_1)Omega(YDelta_2)rangletext for Delta_1Delta_2inT_YmathcalM","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Two examples of homogeneous spaces implemented in GeometricMachineLearning are the Stiefel and the Grassmann manifold. The Lie group SO(N) acts transitively on both of these manifolds, i.e. turns them into homogeneous spaces. The Lie algebra of SO(N) are the skew-symmetric matrices mathfrakso(N)=VinmathbbR^NtimesNV^T + V = 0 and the canonical metric associated with it is simply (V_1V_2)mapstofrac12mathrmTr(V_1^TV_2).","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Stiefel-Manifold","page":"Homogeneous Spaces","title":"The Stiefel Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The Stiefel manifold St(n N) is the space of all orthonormal frames in mathbbR^Ntimesn, i.e. matrices YinmathbbR^Ntimesn s.t. Y^TY = mathbbI_n. 
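The defining property is easy to verify numerically; a quick sketch using the constructor discussed below and the `.A` field that stores the representing matrix:

```julia
using GeometricMachineLearning
using LinearAlgebra: I

Y = rand(StiefelManifold{Float32}, 5, 3)
Y.A' * Y.A ≈ I(3)        # true up to floating-point error
```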
It can also be seen as SO(N) modulo an equivalence relation: AsimBiffAE = BE for ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"E = beginbmatrix\nmathbbI_n \nmathbbO\nendbmatrixinSt(n N)","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"which is the canonical element of the Stiefel manifold. In words: the first n columns of A and B are the same. We also use this principle to draw random elements from the Stiefel manifold.","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.example(raw\"Drawing random elements from the Stiefel (and the Grassmann) manifold is done by first calling `rand(N, n)` (i.e. drawing from a normal distribution) and then performing a ``QR`` decomposition. We then take the first ``n`` columns of the ``Q`` matrix to be an element of the Stiefel manifold.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The tangent space to the element YinSt(nN) can be determined by considering C^infty curves on SO(N) through mathbbI which we write tmapstoA(t). Because SO(N) acts transitively on St(n N) each C^infty curve on St(n N) through Y can be written as A(t)Y and we get: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"T_YSt(nN)=BY Binmathfrakg = DeltainmathbbR^Ntimesn Delta^TY + Y^TDelta = mathbbO","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"where the last equality can be established through the isomorphism ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Omega T_YSt(n N) to mathfrakg^mathrmvec Y Delta mapsto (mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"That this is an isomorphism can be easily checked: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" Omega(Delta)Y = (mathbbI - frac12YY^T)Delta - frac12YDelta^TY = Delta","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The isomorphism is also implemented in GeometricMachineLearning:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"using GeometricMachineLearning\n\nY = rand(StiefelManifold{Float32}, 5, 3)\nΔ = rgrad(Y, rand(Float32, 5, 3))\nGeometricMachineLearning.Ω(Y, Δ) * Y.A ≈ Δ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The function rgrad is introduced below. 
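Since Ω maps into the Lie algebra, its output should also be skew-symmetric. A quick check of this, reusing `Y` and `Δ` from the snippet above and converting the `SkewSymMatrix` to a dense matrix for the comparison:

```julia
V = Matrix(GeometricMachineLearning.Ω(Y, Δ))
V ≈ -V'                  # true: the lift lands in 𝔰𝔬(5)
```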
","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Riemannian-Gradient-for-the-Stiefel-Manifold","page":"Homogeneous Spaces","title":"The Riemannian Gradient for the Stiefel Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"We defined the Riemannian gradient to be a vector field mathrmgrad^gL such that it is compatible with the Riemannian metric in some sense; the definition we gave relied on an explicit coordinate chart. We can also express the Riemannian gradient for matrix manifolds by not relying on an explicit coordinate representation (which would be computationally expensive) [6].","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.definition(raw\"Given a Riemannian matrix manifold ``\\mathcal{M}`` we define the **Riemannian gradient** of ``L:\\mathcal{M}\\to\\mathbb{R}`` at ``Y``, called ``\\mathrm{grad}_YL\\in{}T_Y\\mathcal{M}``, as the unique element of ``T_Y\\mathcal{M}`` such that for any other ``\\Delta\\in{}T_Y\\mathcal{M}`` we have\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\mathrm{Tr}((\\nabla{}L)^T\\Delta) = g_Y(\\mathrm{grad}_YL, \\Delta),\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where Tr indicates the usual matrix trace.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"For the Stiefel manifold the Riemannian gradient is given by: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" mathrmgrad_YL = nabla_YL - Y(nabla_YL)^TY = mathttrgrad(Y nabla_YL)","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"where nabla_YL refers to the Euclidean gradient, i.e. ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" nabla_YL_ij = fracpartialLpartialy_ij","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The Euclidean gradient nablaL can in practice be obtained with an AD routine. We then use the function rgrad to map nabla_YL from mathbbR^Ntimesn to T_YSt(nN). We can check that this mapping indeed maps to the Riemannian gradient","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"using GeometricMachineLearning\nusing LinearAlgebra: tr\n\nY = rand(StiefelManifold{Float32}, 5, 3)\n∇L = rand(Float32, 5, 3)\ngradL = rgrad(Y, ∇L)\nΔ = rgrad(Y, rand(Float32, 5, 3))\n\nmetric(Y, gradL, Δ) ≈ tr(∇L' * Δ)","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Grassmann-Manifold","page":"Homogeneous Spaces","title":"The Grassmann Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The Grassmann manifold is closely related to the Stiefel manifold, and an element of the Grassmann manifold can be represented through an element of the Stiefel manifold (but not vice-versa). An element of the Grassmann manifold G(nN) is a vector subspace subsetmathbbR^N of dimension n. Each such subspace (i.e. 
element of the Grassmann manifold) can be represented by a full-rank matrix AinmathbbR^Ntimesn and we identify two elements with the following equivalence relation: ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" A_1 sim A_2 iff existsCinmathbbR^ntimesntext st A_1C = A_2","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"The resulting manifold is of dimension n(N-n). One can find a parametrization of the manifold the following way: Because the matrix Y has full rank, there have to be n independent columns in it: i_1 ldots i_n. For simplicity assume that i_1 = 1 i_2=2 ldots i_n=n and call the matrix made up of these columns C. Then the mapping to the coordinate chart is: YC^-1 and the last N-n columns are the coordinates.","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"We can also define the Grassmann manifold based on the Stiefel manifold since elements of the Stiefel manifold are already full-rank matrices. In this case we have the following equivalence relation (for Y_1 Y_2inSt(nN)): ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":" Y_1 sim Y_2 iff existsCinSO(n)text st Y_1C = Y_2","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"In GeometricMachineLearning elements of the Grassmann manifold are drawn the same way as elements of the Stiefel manifold:","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"using GeometricMachineLearning\n\nrand(GrassmannManifold{Float32}, 5, 3)","category":"page"},{"location":"manifolds/homogeneous_spaces/#The-Riemannian-Gradient-of-the-Grassmann-Manifold","page":"Homogeneous Spaces","title":"The Riemannian Gradient of the Grassmann Manifold","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Obtaining the Riemannian Gradient for the Grassmann manifold is slightly more difficult than it is in the case of the Stiefel manifold [6]. Since the Grassmann manifold can be obtained from the Stiefel manifold through an equivalence relation, we can however use this as a starting point. ","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.theorem(raw\"The Riemannian gradient of a function ``L`` defined on the Grassmann manifold can be written as\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\mathrm{grad}_\\mathcal{Y}^{Gr}L \\simeq \\nabla_Y{}L - YY^T\\nabla_YL,\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where ``\\nabla_Y{}L`` again is again the Euclidean gradient.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"Main.proof(raw\"In a first step we identify charts on the Grassmann manifold to make dealing with it easier. For this consider the following open cover of the Grassmann manifold. 
\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\{\\mathcal{U}_W\\}_{W\\in{}St(n, N)} \\quad\\text{where}\\quad \\mathcal{U}_W = \\{\\mathrm{span}(Y):\\mathrm{det}(W^TY)\\neq0\\}.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"We can find a canonical bijective mapping from the set ``\\mathcal{U}_W`` to the set ``\\mathcal{S}_W := \\{Y\\in\\mathbb{R}^{N\\times{}n}:W^TY=\\mathbb{I}_n\\}``:\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\sigma_W: \\mathcal{U}_W \\to \\mathcal{S}_W,\\, \\mathcal{Y}=\\mathrm{span}(Y)\\mapsto{}Y(W^TY)^{-1} =: \\hat{Y}.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"That ``\\sigma_W`` is well-defined is easy to see: Consider ``YC`` with ``C\\in\\mathbb{R}^{n\\times{}n}`` non-singular. Then ``YC(W^TYC)^{-1}=Y(W^TY)^{-1} = \\hat{Y}``. With this isomorphism we can also find a representation of elements of the tangent space:\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"T_\\mathcal{Y}\\sigma_W: T_\\mathcal{Y}Gr(n,N)\\to{}T_{\\hat{Y}}\\mathcal{S}_W.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"We give an explicit representation of this isomorphism; because the map ``\\sigma_W`` does not care about the representation of ``\\mathrm{span}(Y)`` we can perform the variations in ``St(n,N)``. We write the variations as ``Y(t)\\in{}St(n,N)`` for ``t\\in(-\\varepsilon,\\varepsilon)``. We also set ``Y(0) = Y`` and hence\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\frac{d}{dt}Y(t)(W^TY(t))^{-1} = (\\dot{Y}(0) - Y(W^TY)^{-1}W^T\\dot{Y}(0))(W^TY)^{-1},\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where ``\\dot{Y}(0)\\in{}T_YSt(n,N)``. Also note note that we have ``T_\\mathcal{Y}\\mathcal{U}_W = T_\\mathcal{Y}Gr(n,N)`` because ``\\mathcal{U}_W`` is an open subset of ``Gr(n,N)``. We thus can identify the tangent space ``T_\\mathcal{Y}Gr(n,N)`` with the following set:\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"T_{\\hat{Y}}\\mathcal{S}_W = \\{(\\Delta - YW^T\\Delta)(W^T\\Delta)^{-1}: Y\\in{}St(n,N)\\text{ s.t. }\\mathrm{span}(Y)=\\mathcal{Y}\\text{ and }\\Delta\\in{}T_YSt(n,N)\\}.\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"Further note that we can pick any element ``W`` to construct the charts for a neighborhood around the point ``\\mathcal{Y}\\in{}Gr(n,N)`` as long as we have ``\\mathrm{det}(W^TY)\\neq0`` for ``\\mathrm{span}(Y)=\\mathcal{Y}``. We hence take ``W=Y`` and get the identification: \n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"T_\\mathcal{Y}Gr(n,N) \\equiv \\{\\Delta - YY^T\\Delta: Y\\in{}St(n,N)\\text{ s.t. }\\mathrm{span}(Y)=\\mathcal{Y}\\text{ and }\\Delta\\in{}T_YSt(n,N)\\},\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"which is very easy to handle computationally (we simply store and change the matrix ``Y`` that represents an element of the Grassmann manifold). The Riemannian gradient is then \n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\mathrm{grad}_\\mathcal{Y}^{Gr}L = \\mathrm{grad}_Y^{St}L - YY^T\\mathrm{grad}_Y^{St}L = \\nabla_Y{}L - YY^T\\nabla_YL,\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where ``\\mathrm{grad}^{St}_YL`` is the Riemannian gradient of the Stiefel manifold at ``Y``. 
We proved our assertion.\")","category":"page"},{"location":"manifolds/homogeneous_spaces/#Library-Functions","page":"Homogeneous Spaces","title":"Library Functions","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"StiefelManifold\nGrassmannManifold\nrand(manifold_type::Type{MT}, ::Integer, ::Integer) where MT <: Manifold\nGeometricMachineLearning.rgrad(::StiefelManifold, ::AbstractMatrix)\nGeometricMachineLearning.rgrad(::GrassmannManifold, ::AbstractMatrix)\nGeometricMachineLearning.metric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix)\nGeometricMachineLearning.metric(::GrassmannManifold, ::AbstractMatrix, ::AbstractMatrix)\nGeometricMachineLearning.Ω(::StiefelManifold{T}, ::AbstractMatrix{T}) where T\nGeometricMachineLearning.Ω(::GrassmannManifold{T}, ::AbstractMatrix{T}) where T","category":"page"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.StiefelManifold-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.StiefelManifold","text":"An implementation of the Stiefel manifold [7]. The Stiefel manifold is the collection of all matrices YinmathbbR^Ntimesn whose columns are orthonormal, i.e. \n\n St(n N) = Y Y^TY = mathbbI_n \n\nThe Stiefel manifold can be shown to have manifold structure (as the name suggests) and this is heavily used in GeometricMachineLearning. It is further a compact space. More information can be found in the docstrings for rgrad(::StiefelManifold, ::AbstractMatrix)andmetric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix)`.\n\n\n\n\n\n","category":"type"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.GrassmannManifold-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.GrassmannManifold","text":"The GrassmannManifold is based on the StiefelManifold.\n\n\n\n\n\n","category":"type"},{"location":"manifolds/homogeneous_spaces/#Base.rand-Union{Tuple{MT}, Tuple{Type{MT}, Integer, Integer}} where MT<:Manifold-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"Base.rand","text":"rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold\n\nDraw random elements from the Stiefel and the Grassmann manifold. \n\nBecause both of these manifolds are compact spaces we can sample them uniformly [8].\n\nExamples\n\nWhen we call ...\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: _round # hide\nimport Random\nRandom.seed!(123)\n\nN, n = 5, 3\nY = rand(StiefelManifold{Float32}, N, n)\n_round(Y; digits = 5) # hide\n\n# output\n\n5×3 StiefelManifold{Float32, Matrix{Float32}}:\n -0.27575 0.32991 0.77275\n -0.62485 -0.33224 -0.0686\n -0.69333 0.36724 -0.18988\n -0.09295 -0.73145 0.46064\n 0.2102 0.33301 0.38717\n\n... the sampling is done by first allocating a random matrix of size Ntimesn via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output are then the first n columns of the Q matrix. 
\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.rgrad-Tuple{StiefelManifold, AbstractMatrix}-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.rgrad","text":"rgrad(Y::StiefelManifold, e_grad::AbstractMatrix)\n\nCompute the Riemannian gradient for the Stiefel manifold at YinSt(Nn) based on nablaLinmathbbR^Ntimesn (the Euclidean gradient). \n\nThe function computes the Riemannian gradient with respect to the canonical metric.\n\nThe precise form of the mapping is: \n\nmathttrgrad(Y nablaL) mapsto nablaL - Y(nablaL)^TY\n\nNote the property Y^Tmathrmrgrad(Y nablaL)inmathcalS_mathrmskew(n)\n\nExamples\n\nusing GeometricMachineLearning\n\nY = StiefelManifold([1 0 ; 0 1 ; 0 0; 0 0])\nΔ = [1 2; 3 4; 5 6; 7 8]\nrgrad(Y, Δ)\n\n# output\n\n4×2 Matrix{Int64}:\n 0 -1\n 1 0\n 5 6\n 7 8\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.rgrad-Tuple{GrassmannManifold, AbstractMatrix}-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.rgrad","text":"rgrad(Y::GrassmannManifold, e_grad::AbstractMatrix)\n\nCompute the Riemannian gradient at YinGr(n N). \n\nThese gradient have the property that they are orthogonal to the space spanned by Y.\n\nThe precise form of the mapping is: \n\nmathttrgrad(Y nablaL) mapsto nablaL - YY^TnablaL\n\nNote the property Y^Tmathrmrgrad(Y nablaL) = mathbbO\n\nAlso see rgrad(::StiefelManifold, ::AbstractMatrix).\n\nExamples\n\nusing GeometricMachineLearning\n\nY = GrassmannManifold([1 0 ; 0 1 ; 0 0; 0 0])\nΔ = [1 2; 3 4; 5 6; 7 8]\nrgrad(Y, Δ)\n\n# output\n\n4×2 Matrix{Int64}:\n 0 0\n 0 0\n 5 6\n 7 8\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.metric-Tuple{StiefelManifold, AbstractMatrix, AbstractMatrix}-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.metric","text":"Implements the canonical Riemannian metric for the Stiefel manifold:\n\ng_Y (Delta_1 Delta_2) mapsto mathrmtr(Delta_1^T(mathbbI - frac12YY^T)Delta_2)\n\nIt is called with: \n\nY::StiefelManifold\nΔ₁::AbstractMatrix\nΔ₂::AbstractMatrix\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.metric-Tuple{GrassmannManifold, AbstractMatrix, AbstractMatrix}-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.metric","text":"metric(Y::GrassmannManifold, Δ₁::AbstractMatrix, Δ₂::AbstractMatrix)\n\nCompute the metric for vectors Δ₁ and Δ₂ at Y. \n\nThe representation of the Grassmann manifold is realized as a quotient space of the Stiefel manifold. \n\nThe metric for the Grassmann manifold is:\n\ng^Gr_Y(Delta_1 Delta_2) = g^St_Y(Delta_1 Delta_2) = mathrmTr(Delta_1^T (mathbbI - Y Y^T) Delta_2) = mathrmTr(Delta_1^T Delta_2)\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.Ω-Union{Tuple{T}, Tuple{StiefelManifold{T, AT} where AT<:AbstractMatrix{T}, AbstractMatrix{T}}} where T-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.Ω","text":"Ω(Y::StiefelManifold{T}, Δ::AbstractMatrix{T}) where T\n\nPerform canonical horizontal lift for the Stiefel manifold:\n\n Delta mapsto (mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)\n\nInternally this performs \n\nSkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')\n\nto save memory. 
\n\nExamples\n\nusing GeometricMachineLearning\nE = StiefelManifold(StiefelProjection(5, 2))\nΔ = [0. -1.; 1. 0.; 2. 3.; 4. 5.; 6. 7.]\nGeometricMachineLearning.Ω(E, Δ)\n\n# output\n\n5×5 SkewSymMatrix{Float64, Vector{Float64}}:\n 0.0 -1.0 -2.0 -4.0 -6.0\n 1.0 0.0 -3.0 -5.0 -7.0\n 2.0 3.0 0.0 -0.0 -0.0\n 4.0 5.0 0.0 0.0 -0.0\n 6.0 7.0 0.0 0.0 0.0\n\nNote that the output of Ω is a skew-symmetric matrix, i.e. an element of mathfrakg.\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#GeometricMachineLearning.Ω-Union{Tuple{T}, Tuple{GrassmannManifold{T, AT} where AT<:AbstractMatrix{T}, AbstractMatrix{T}}} where T-manifolds-homogeneous_spaces","page":"Homogeneous Spaces","title":"GeometricMachineLearning.Ω","text":"Ω(Y::GrassmannManifold{T}, Δ::AbstractMatrix{T}) where T\n\nPerform the canonical horizontal lift for the Grassmann manifold:\n\n Delta mapsto Omega^St(Y Δ)\n\nwhere Omega^St is the canonical horizontal lift for the Stiefel manifold.\n\nusing GeometricMachineLearning\nE = GrassmannManifold(StiefelProjection(5, 2))\nΔ = [0. 0.; 0. 0.; 2. 3.; 4. 5.; 6. 7.]\nGeometricMachineLearning.Ω(E, Δ)\n\n# output\n\n5×5 SkewSymMatrix{Float64, Vector{Float64}}:\n 0.0 -0.0 -2.0 -4.0 -6.0\n 0.0 0.0 -3.0 -5.0 -7.0\n 2.0 3.0 0.0 -0.0 -0.0\n 4.0 5.0 0.0 0.0 -0.0\n 6.0 7.0 0.0 0.0 0.0\n\n\n\n\n\n","category":"method"},{"location":"manifolds/homogeneous_spaces/#References","page":"Homogeneous Spaces","title":"References","text":"","category":"section"},{"location":"manifolds/homogeneous_spaces/","page":"Homogeneous Spaces","title":"Homogeneous Spaces","text":"P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).\n\n\n\nT. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).\n\n\n\nT. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).\n\n\n\n","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/#Kolmogorov-n-width","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"","category":"section"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"The Kolmogorov n-width measures how well some set mathcalM (typically the solution manifold) can be approximated with a linear subspace:","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"d_n(mathcalM) = mathrminf_V_nsubsetVmathrmdimV_n=nmathrmsup(uinmathcalM)mathrminf_v_ninV_n u - v_n _V","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"with mathcalMsubsetV and V is a (typically infinite-dimensional) Banach space. For advection-dominated problems (among others) the decay of the Kolmogorov n-width is very slow, i.e. one has to pick n very high in order to obtain useful approximations (see [36] and [37]).","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"In order to overcome this, techniques based on neural networks (see e.g. [28]) and optimal transport (see e.g. [37]) have been used. 
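To make this more tangible, here is a small self-contained Julia sketch (all names are made up for illustration) that computes only the innermost part of the definition: the relative error of approximating a set of snapshots of a traveling profile with the best n-dimensional linear subspace obtained from an SVD. The Kolmogorov n-width additionally takes the supremum over all elements of the set and the infimum over all n-dimensional subspaces, so this illustrates the slow error decay rather than computing d_n itself:

using LinearAlgebra: svd, norm

# snapshots of a traveling Gaussian bump (an advection-type solution set), stored as columns
x = range(0, 1; length = 128)
snapshots = hcat([exp.(-100 .* (x .- μ) .^ 2) for μ in range(0.2, 0.8; length = 50)]...)

# relative error of projecting the snapshots onto the best n-dimensional subspace (from the SVD)
function subspace_error(S, n)
    U = svd(S).U[:, 1:n]
    norm(S - U * (U' * S)) / norm(S)
end

[subspace_error(snapshots, n) for n in (1, 5, 10, 20)]  # decays only slowly with n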
","category":"page"},{"location":"reduced_order_modeling/kolmogorov_n_width/#References","page":"Kolmogorov n-width","title":"References","text":"","category":"section"},{"location":"reduced_order_modeling/kolmogorov_n_width/","page":"Kolmogorov n-width","title":"Kolmogorov n-width","text":"T. Blickhan. A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).\n\n\n\nC. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).\n\n\n\nK. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).\n\n\n\n","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/#The-Horizontal-Lift","page":"Horizontal Lift","title":"The Horizontal Lift","text":"","category":"section"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"For each element YinmathcalM we can perform a splitting mathfrakg = mathfrakg^mathrmhor Yoplusmathfrakg^mathrmver Y, where the two subspaces are the horizontal and the vertical component of mathfrakg at Y respectively. For homogeneous spaces: T_YmathcalM = mathfrakgcdotY, i.e. every tangent space to mathcalM can be expressed through the application of the Lie algebra to the relevant element. The vertical component consists of those elements of mathfrakg which are mapped to the zero element of T_YmathcalM, i.e. ","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"mathfrakg^mathrmver Y = mathrmker(mathfrakgtoT_YmathcalM)","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"The orthogonal complement[1] of mathfrakg^mathrmver Y is the horizontal component and is referred to by mathfrakg^mathrmhor Y. This is naturally isomorphic to T_YmathcalM. For the Stiefel manifold the horizontal lift has the simple form: ","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"Omega(Y V) = left(mathbbI - frac12right)VY^T - YV^T(mathbbI - frac12YY^T)","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"If the element Y is the distinct element E, then the elements of mathfrakg^mathrmhorE take a particularly simple form, see Global Tangent Space for a description of this. ","category":"page"},{"location":"optimizers/manifold_related/horizontal_lift/","page":"Horizontal Lift","title":"Horizontal Lift","text":"[1]: The orthogonal complement is taken with respect to a metric defined on mathfrakg. For the case of G=SO(N) and mathfrakg=mathfrakso(N) = AA+A^T =0 this metric can be chosen as (A_1A_2)mapstofrac12A_1^TA_2.","category":"page"},{"location":"Optimizer/#Optimizer","page":"Optimizers","title":"Optimizer","text":"","category":"section"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"In order to generalize neural network optimizers to homogeneous spaces, a class of manifolds we often encounter in machine learning, we have to find a global tangent space representation which we call mathfrakg^mathrmhor here. 
","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"Starting from an element of the tangent space T_YmathcalM[1], we need to perform two mappings to arrive at mathfrakg^mathrmhor, which we refer to by Omega and a red horizontal arrow:","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"[1]: In practice this is obtained by first using an AD routine on a loss function L, and then computing the Riemannian gradient based on this. See the section of the Stiefel manifold for an example of this.","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"Main.include_graphics(\"tikz/general_optimization_with_boundary\") # hide","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"Here the mapping Omega is a horizontal lift from the tangent space onto the horizontal component of the Lie algebra at Y. ","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"The red line maps the horizontal component at Y, i.e. mathfrakg^mathrmhorY, to the horizontal component at mathfrakg^mathrmhor.","category":"page"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"The mathrmcache stores information about previous optimization steps and is dependent on the optimizer. The elements of the mathrmcache are also in mathfrakg^mathrmhor. Based on this the optimer (Adam in this case) computes a final velocity, which is the input of a retraction. Because this update is done for mathfrakg^mathrmhorequivT_YmathcalM, we still need to perform a mapping, called apply_section here, that then finally updates the network parameters. The two red lines are described in global sections.","category":"page"},{"location":"Optimizer/#References","page":"Optimizers","title":"References","text":"","category":"section"},{"location":"Optimizer/","page":"Optimizers","title":"Optimizers","text":"B. Brantner. 
Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).\n\n\n\n","category":"page"},{"location":"","page":"Home","title":"Home","text":"CurrentModule = GeometricMachineLearning","category":"page"},{"location":"#Geometric-Machine-Learning","page":"Home","title":"Geometric Machine Learning","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"GeometricMachineLearning.jl implements various scientific machine learning models that aim at learning dynamical systems with geometric structure, such as Hamiltonian (symplectic) or Lagrangian (variational) systems.","category":"page"},{"location":"#Installation","page":"Home","title":"Installation","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"GeometricMachineLearning.jl and all of its dependencies can be installed via the Julia REPL by typing ","category":"page"},{"location":"","page":"Home","title":"Home","text":"]add GeometricMachineLearning","category":"page"},{"location":"#Architectures","page":"Home","title":"Architectures","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"There are several architectures tailored towards problems in scientific machine learning implemented in GeometricMachineLearning.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"architectures/sympnet.md\",\n]","category":"page"},{"location":"#Manifolds","page":"Home","title":"Manifolds","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"GeometricMachineLearning supports putting neural network weights on manifolds. These include:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"manifolds/grassmann_manifold.md\",\n \"manifolds/stiefel_manifold.md\",\n]","category":"page"},{"location":"#Special-Neural-Network-Layer","page":"Home","title":"Special Neural Network Layer","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Many layers have been adapted in order to be used for problems in scientific machine learning. Including:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"layers/attention_layer.md\",\n]","category":"page"},{"location":"#Tutorials","page":"Home","title":"Tutorials","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Tutorials for using GeometricMachineLearning are: ","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"tutorials/sympnet_tutorial.md\",\n \"tutorials/mnist_tutorial.md\",\n]","category":"page"},{"location":"#Reduced-Order-Modeling","page":"Home","title":"Reduced Order Modeling","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"A short description of the key concepts in reduced order modeling (where GeometricMachineLearning can be used) are in:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Pages = [\n \"reduced_order_modeling/autoencoder.md\",\n \"reduced_order_modeling/symplectic_autoencoder.md\",\n \"reduced_order_modeling/kolmogorov_n_width.md\",\n]","category":"page"},{"location":"references/#References","page":"References","title":"References","text":"","category":"section"},{"location":"references/","page":"References","title":"References","text":"S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).\n\n\n\nS. Lang. Fundamentals of differential geometry. Vol. 
191 (Springer Science & Business Media, 2012).\n\n\n\nS. I. Richard L. Bishop. Tensor Analysis on Manifolds (Dover Publications, 1980).\n\n\n\nS. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).\n\n\n\nM. P. Do Carmo and J. Flaherty Francis. Riemannian geometry. Vol. 2 (Springer, 1992).\n\n\n\nP.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).\n\n\n\nE. Hairer, C. Lubich and G. Wanner. Geometric Numerical integration: structure-preserving algorithms for ordinary differential equations (Springer, 2006).\n\n\n\nF. Mezzadri. How to generate random matrices from the classical compact groups, arXiv preprint math-ph/0609050 (2006).\n\n\n\nD. D. Holm, T. Schmah and C. Stoica. Geometric mechanics and symmetry: from finite to infinite dimensions. Vol. 12 (Oxford University Press, Oxford, UK, 2009).\n\n\n\nP.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).\n\n\n\nT. Bendokat, R. Zimmermann and P.-A. Absil. A Grassmann manifold handbook: Basic geometry and computational aspects, arXiv preprint arXiv:2011.13699 (2020).\n\n\n\nW. S. Moses, V. Churavy, L. Paehler, J. Hückelheim, S. H. Narayanan, M. Schanen and J. Doerfert. Reverse-Mode Automatic Differentiation and Optimization of GPU Kernels via Enzyme. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC '21 (Association for Computing Machinery, New York, NY, USA, 2021).\n\n\n\nM. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).\n\n\n\nJ. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. Advances in Neural Information Processing Systems 33, 10809–10819 (2020).\n\n\n\nJ. N. Stephen J. Wright. Numerical optimization (Springer Science+Business Media, 2006).\n\n\n\nD. Bahdanau, K. Cho and Y. Bengio. Neural machine translation by jointly learning to align and translate, arXiv preprint arXiv:1409.0473 (2014).\n\n\n\nA. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser and I. Polosukhin. Attention is all you need. Advances in neural information processing systems 30 (2017).\n\n\n\nK. Jacobs. Discrete Stochastics (Birkhäuser Verlag, Basel, Switzerland, 1992).\n\n\n\nK. Feng. The step-transition operators for multi-step methods of ODE's. Journal of Computational Mathematics, 193–202 (1998).\n\n\n\nM.-T. Luong, H. Pham and C. D. Manning. Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015).\n\n\n\nK. Feng and M.-z. Qin. The symplectic methods for the computation of Hamiltonian equations. In: Numerical Methods for Partial Differential Equations: Proceedings of a Conference held in Shanghai, PR China, March 25–29, 1987 (Springer, 1987); pp. 1–37.\n\n\n\nZ. Ge and K. Feng. On the approximation of linear Hamiltonian systems. Journal of Computational Mathematics, 88–97 (1988).\n\n\n\nB. Brantner, G. de Romemont, M. Kraus and Z. Li. Volume-Preserving Transformers for Learning Time Series Data with Structure, arXiv preprint arXiv:2312:11166v2 (2024).\n\n\n\nB. Leimkuhler and S. Reich. Simulating hamiltonian dynamics. No. 14 (Cambridge university press, 2004).\n\n\n\nM. Kraus. 
GeometricIntegrators.jl: Geometric Numerical Integration in Julia, https://github.com/JuliaGNI/GeometricIntegrators.jl (2020).\n\n\n\nS. Hochreiter and J. Schmidhuber. Long short-term memory. Neural computation 9, 1735–1780 (1997).\n\n\n\nS. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).\n\n\n\nK. Lee and K. T. Carlberg. Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders. Journal of Computational Physics 404, 108973 (2020).\n\n\n\nA. Hemmasian and A. Barati Farimani. Reduced-order modeling of fluid flows with transformers. Physics of Fluids 35 (2023).\n\n\n\nA. Solera-Rico, C. S. Vila, M. Gómez, Y. Wang, A. Almashjary, S. Dawson and R. Vinuesa, beta-Variational autoencoders and transformers for reduced-order modelling of fluid flows, arXiv preprint arXiv:2304.03571 (2023).\n\n\n\nP. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).\n\n\n\nP. Jin, Z. Lin and B. Xiao. Optimal unit triangular factorization of symplectic matrices. Linear Algebra and its Applications (2022).\n\n\n\nN. Patwardhan, S. Marrone and C. Sansone. Transformers in the real world: A survey on nlp applications. Information 14, 242 (2023).\n\n\n\nP. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).\n\n\n\nL. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).\n\n\n\nC. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).\n\n\n\nT. Blickhan. A registration method for reduced basis problems using linear optimal transport, arXiv preprint arXiv:2304.14884 (2023).\n\n\n\nB. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).\n\n\n\nT. Lin and H. Zha. Riemannian manifold learning. IEEE transactions on pattern analysis and machine intelligence 30, 796–809 (2008).\n\n\n\nT. Blickhan. BrenierTwoFluids.jl, https://github.com/ToBlick/BrenierTwoFluids (2023).\n\n\n\nI. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).\n\n\n\nB. Brantner and M. Kraus. Symplectic autoencoders for Model Reduction of Hamiltonian Systems, arXiv preprint arXiv:2312.10004 (2023).\n\n\n\nT. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).\n\n\n\nT. Bendokat and R. Zimmermann. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications, arXiv preprint arXiv:2108.12447 (2021).\n\n\n\nB. Brantner, G. de Romemont, M. Kraus and Z. Li. Structure-Preserving Transformers for Learning Parametrized Hamiltonian Systems, arXiv preprint arXiv:2312:11166 (2023).\n\n\n\n","category":"page"},{"location":"manifolds/manifolds/#(Matrix)-Manifolds","page":"General Theory on Manifolds","title":"(Matrix) Manifolds","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Manifolds are topological spaces that locally look like vector spaces. 
In the following we restrict ourselves to finite-dimensional smooth[1] manifolds. In this section we routinely denote points on a manifold by lower case letters like x y and z if we speak about general properties and by upper case letters like A and B if we talk about specific examples of matrix manifolds. All manifolds that can be used to build neural networks in GeometricMachineLearning, such as the Stiefel manifold and the Grassmann manifold are matrix manifolds.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[1]: Smooth here means C^infty.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A **finite-dimensional smooth manifold** of dimension ``n`` is a second-countable Hausdorff space ``\\mathcal{M}`` for which ``\\forall{}x\\in\\mathcal{M}`` we can find a neighborhood ``U`` that contains ``x`` and a corresponding homeomorphism ``\\varphi_U:U\\cong{}W\\subset\\mathbb{R}^n`` where ``W`` is an open subset. The homeomorphisms ``\\varphi_U`` are referred to as *coordinate charts*. If two such coordinate charts overlap, i.e. if ``U_1\\cap{}U_2\\neq\\{\\}``, then the map ``\\varphi_{U_2}^{-1}\\circ\\varphi_{U_1}`` has to be ``C^\\infty``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"One example of a manifold that is also important for GeometricMachineLearning is the Lie group[2] of orthonormal matrices SO(N). Before we can proof that SO(N) is a manifold we first need the preimage theorem.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[2]: Lie groups are manifolds that also have a group structure, i.e. there is an operation mathcalMtimesmathcalMtomathcalM(ab)mapstoab s.t. (ab)c = a(bc) and there exists a neutral elementemathcalM s.t. ae = a forallainmathcalM as well as an (for every a) inverse element a^-1 s.t. a(a^-1) = e. The neutral element e we refer to as mathbbI when dealing with matrix manifolds.","category":"page"},{"location":"manifolds/manifolds/#The-Preimage-Theorem","page":"General Theory on Manifolds","title":"The Preimage Theorem","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Before we can state the preimage theorem we need another definition[3]:","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[3]: In this definition we use the notation T_xg. This will be explained below. For we will interpret T_xg simply as (varphi_Ucircgcircpsi_V^-1) where varphi_U is a coordinate chart around y = g(x) and psi_V is a coordinate chart around x.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"Consider a smooth mapping ``g: \\mathcal{M}\\to\\mathcal{N}`` from one manifold to another. 
A point ``y\\in\\mathcal{N}`` is called a **regular value** of ``g`` if ``\\forall{}x\\in{}g^{-1}\\{y\\}`` the map ``T_xg:T_A\\mathcal{M}\\to{}T_{y}\\mathcal{N}`` is surjective.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"We now state the preimage theorem:","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.theorem(raw\"Consider a smooth map ``g:\\mathcal{M}\\to\\mathcal{N}`` from one manifold to another (we assume the dimensions of the two manifolds to be ``m+n`` and ``m`` respectively). Then the preimage of a regular point ``y`` of ``\\mathcal{N}`` is a submanifold of ``\\mathcal{M}``. Furthermore the codimension of ``g^{-1}\\{y\\}`` is equal to the dimension of ``\\mathcal{N}`` and the tangent space ``T_x(g^{-1}\\{y\\})`` is equal to the kernel of ``T_xg``.\"; name = \"Preimage Theorem\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.proof(raw\"Because ``\\mathcal{N}`` has manifold structure we can find a chart ``\\varphi_U:U\\to\\mathbb{R}^m`` for some neighborhood ``U`` that contains ``y``. We further consider a point ``A\\in{}g^{-1}\\{y\\}`` and a chart around it ``\\psi_V:V\\to\\mathbb{R}^{m+n}``. By the implicit function theorem we can then find a mapping ``h`` that turns ``\\varphi_U\\circ{}g\\circ\\psi_V^{-1}`` into a projection ``(x_1, \\ldots, x_{n+m}) \\mapsto (x_{n+1}, \\ldots, x_{n+m})``. We now consider the neighborhood ``V_1\\times\\{0\\} = \\psi(V \\cup f^{-1}\\{y\\})`` for ``\\psi(V) = V_1\\times{}V_2`` with the coordinate chart ``(x_1, \\ldots, x_n) \\mapsto \\psi(x_1, \\ldots, x_n, 0, \\ldots, 0).`` As this map is also smooth by the implicit function theorem this proofs our assertion.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.example(raw\"The group ``SO(N)`` is a Lie group (i.e. has manifold structure).\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.proof(raw\"The vector space ``\\mathbb{R}^{N\\times{}N}`` clearly has manifold structure. The group ``SO(N)`` is equivalent to one of the level sets of the mapping: ``g:\\mathbb{R}^{N\\times{}N}\\to\\mathcal{S}(N), A\\mapsto{}A^TA - \\mathbb{I}``, i.e. it is the component of ``f^{-1}\\{\\mathbb{I}\\}`` that contains ``\\mathbb{I}``. We still need to proof that ``\\mathbb{I}`` is a regular point of ``g``, i.e. that for ``A\\in{}SO(N)`` the mapping ``T_Ag`` is surjective. This means that ``\\forall{}B\\in\\mathcal{S}(N), A\\in\\mathbb{R}^{N\\times{}N}`` ``\\exists{}C\\in\\mathbb{R}^{N\\times{}N}`` s.t. ``C^TA + A^TC = B``. 
The element ``C=\\frac{1}{2}AB\\in\\mathcal{R}^{N\\times{}N}`` satisfies this property.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Similarly we can also proof: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.example(raw\"The sphere ``S^n:=\\{x\\in\\mathbb{R}^{n+1}: x^Tx = 1\\}`` is a manifold of dimension ``n``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.proof(raw\"Take ``g(x) = x^x - 1`` and proceed as in the case of ``SO(N)``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Note that both these manifolds, SO(N) and S^n are matrix manifolds, i.e. an element of mathcalM can be written as an element of mathbbR^NtimesN in the first case and mathbbR^ntimes1 in the second case. The additional conditions we impose on these manifolds are A^TA = mathbbI in the first case and x^Tx = 1 in the second case. Both of these manifolds belong to the category of Stiefel manifolds.","category":"page"},{"location":"manifolds/manifolds/#Tangent-Spaces","page":"General Theory on Manifolds","title":"Tangent Spaces","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"A tangent space can be seen as the collection of all possible velocities a curve can take at a point on a manifold. For this consider a manifold mathcalM and a point x on it and the collection of C^infty curves through x: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A mapping ``\\gamma:(-\\epsilon, \\epsilon)\\to\\mathcal{M}`` that is ``C^\\infty`` and for which we have ``\\gamma(0) = x`` is called a **``C^\\infty`` curve through ``x``**.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"The tangent space of mathcalM at x is the collection of the first derivatives of all gamma: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"The **tangent space** of ``\\mathcal{M}`` at ``x`` is the collection of all ``C^\\infty`` curves at ``x`` modulo the equivalence class ``\\gamma_1 \\sim \\gamma_2 \\iff \\gamma_1'(0) = \\gamma_2'(0)``. It is denoted by ``T_x\\mathcal{M}``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"As is customary we write gamma for the equivalence class of gamma and this is by definition equivalent to gamma(0). The tangent space T_xmathcalM can be shown to be homeomorphic[4] to mathbbR^n where n is the dimension of the manifold mathcalM. If the homeomorphism is constructed through the coordinate chart (varphi U) we call it varphi(x) or simply[5] varphi. If we are given a map gmathcalMtomathcalN we further define T_xg = (varphi)^-1circ(varphicircgpsi^-1)circpsi, i.e. 
a smooth map between two manifolds mathcalM and mathcalN induces a smooth map between the tangent spaces T_xmathcalM and T_g(x)mathcalN.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[4]: Note that we have not formally defined addition for T_xmathcalM. This can be done through the definition gamma + beta = alpha where alpha is any C^infty curve through x that satisfies alpha(0) = beta(0) + gamma(0). Note that we can always find such an alpha by the existence and uniqueness theorem.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[5]: We will further discuss this when we introduce the tangent bundle.","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"We want to demonstrate this principle of constructing the tangent space from curves through the example of S^2. We consider the following curves: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"gamma_1(t) = beginpmatrix 0 sin(t) cos(t) endpmatrix\ngamma_2(t) = beginpmatrix sin(t) 0 cos(t) endpmatrix\ngamma_3(t) = beginpmatrix exp(-t ^ 2 2) t sin(t) exp(-t ^ 2 2) t cos(t) sqrt1 - (t ^ 2) exp(-t^2) endpmatrix","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"We now plot the manifold S^2, the three curves described above and the associated tangent vectors (visualized as arrows). Note that the tangent vectors induced by gamma_1 and gamma_3 are the same; for these curves we have gamma_1 sim gamma_3 and the tangent vectors of those two curves coincide: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"using CairoMakie\nusing ForwardDiff\nusing LaTeXStrings\n\nfunction plot_curve!(ax, gamma::Function; epsilon_range::T = 1.4, epsilon_spacing::T = .01, kwargs...) where T\n curve_domain = -epsilon_range : epsilon_spacing : epsilon_range\n curve = zeros(T, 3, length(curve_domain))\n for (i, t) in zip(axes(curve_domain, 1), curve_domain)\n curve[:, i] .= gamma(t)\n end\n lines!(ax, curve[1, :], curve[2, :], curve[3, :]; kwargs...)\nend\n\nfunction plot_arrow!(ax, gamma::Function; kwargs...)\n arrow_val = ForwardDiff.derivative(gamma, 0.)\n\n gamma_vec = ([gamma(0)[1]], [gamma(0)[2]], [gamma(0)[3]])\n gamma_deriv_vec = ([arrow_val[1]], [arrow_val[2]], [arrow_val[3]])\n\n arrows!(ax, gamma_vec..., gamma_deriv_vec...; kwargs...)\nend\n\nfunction tangent_space(; n = 100)\n xs = LinRange(-1.2, 1.2, n)\n ys = LinRange(-1.2, 1.2, n)\n zs = [one(x) * one(y) for x in xs, y in ys]\n xs, ys, zs\nend\n\ngamma_1(t) = [zero(t), sin(t), cos(t)]\ngamma_2(t) = [sin(t), zero(t), cos(t)]\ngamma_3(t) = [exp(-t ^ 2 / 2) * (t ^ 1) * sin(t), exp(-t ^ 2 / 2) * (t ^ 1) * cos(t), sqrt(1 - (t ^ 2) * exp(-t^2))]\n\ncurves = (gamma_1, gamma_2, gamma_3)\n\nmorange = RGBf(255 / 256, 127 / 256, 14 / 256)\nmblue = RGBf(31 / 256, 119 / 256, 180 / 256)\nmred = RGBf(214 / 256, 39 / 256, 40 / 256)\nmpurple = RGBf(148 / 256, 103 / 256, 189 / 256)\nmgreen = RGBf(44 / 256, 160 / 256, 44 / 256)\n\ncolors = (morange, mblue, mred)\n\nfunction make_plot(; theme = :light)\n text_color = theme == :light ? 
:black : :white\n\n fig = Figure(; backgroundcolor = :transparent)\n\n ax = Axis3(fig[1, 1]; \n backgroundcolor = :transparent, \n aspect = (1., 1., 0.8), \n azimuth = π / 6, \n elevation = π / 8, \n xlabel = rich(\"x\", subscript(\"1\"), font = :italic, color = text_color),\n ylabel = rich(\"x\", subscript(\"2\"), font = :italic, color = text_color),\n zlabel = rich(\"x\", subscript(\"3\"), font = :italic, color = text_color),\n )\n\n surface!(Main.sphere(1., [0., 0., 0.])...; alpha = .6)\n\n for (i, curve, color) in zip(1:length(curves), curves, colors)\n plot_curve!(ax, curve; label = rich(\"γ\", subscript(string(i)); color = text_color, font = :italic), linewidth = 2, color = color)\n end\n\n surface!(ax, tangent_space()...; alpha = .2)\n text!(.9, -.9, 1.; text = L\"T_x\\mathcal{M}\", color = text_color)\n\n for (i, curve, color) in zip(1:length(curves), curves, colors)\n plot_arrow!(ax, curve; linewidth = .03, color = color)\n end\n\n axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color)\n\n fig, ax\nend\n\nif Main.output_type == :html\n save(\"tangent_space.png\", make_plot(; theme = :light)[1]; px_per_unit = 1.5)\n save(\"tangent_space_dark.png\", make_plot(; theme = :dark )[1]; px_per_unit = 1.5)\nelseif Main.output_type == :latex\n save(\"tangent_space.png\", make_plot(; theme = :light)[1]; px_per_unit = 2.0)\nend\n\nnothing","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.include_graphics(\"tangent_space\"; caption = raw\"Visualization of how the tangent space is constructed.\", width = .8) # hide","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"The tangent space T_xmathcalM for","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"x = beginpmatrix0 0 1 endpmatrix\n","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"is also shown. ","category":"page"},{"location":"manifolds/manifolds/#Vector-Fields","page":"General Theory on Manifolds","title":"Vector Fields","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"A time-independent vector field[6] is an object that specifies a velocity for every point on a domain. 
We first give the definition of a vector field on the vector space mathbbR^n and limit ourselves here to C^infty vector fields:","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"[6]: Also called ordinary differential equation (ODE).","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A **vector field** on ``\\mathbb{R}^n`` is a smooth map ``X:\\mathbb{R}^n\\to\\mathbb{R}^n``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"The definition of a vector field on a manifold is not much more complicated: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Main.definition(raw\"A **vector field** on ``\\mathcal{M}`` is a map ``X`` defined on ``\\mathcal{M}`` such that ``X(x)\\in{}T_x\\mathcal{M}`` and ``\\varphi'\\circ{}X\\circ(\\varphi)^{-1}`` is smooth for any coordinate chart ``(\\varphi, U)`` that contains ``x``.\")","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"In the section on the existence-and-uniqueness theorem we show that every vector field has a unique solution given an initial condition; i.e. given a point xinmathcalM and a vector field X we can find a curve gamma such that gamma(0) = x and gamma(t) = X(gamma(t)) for all t in some interval (-epsilon epsilon).","category":"page"},{"location":"manifolds/manifolds/#The-Tangent-Bundle","page":"General Theory on Manifolds","title":"The Tangent Bundle","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"To each manifold mathcalM we can associate another manifold which we call the tangent bundle and denote by TmathcalM. The points on this manifold are: ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"TmathcalM = (x v_x) xinmathcalM v_xinT_xmathcalM ","category":"page"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Coordinate charts on this manifold can be constructed in a straightforward manner; for every coordinate chart varphi_U the map varphi_U(x) gives a homeomorphism between T_xmathcalM and mathbbR^n for any xinU. We can then find a neighborhood of any point (x v_x) by taking pi^-1(U) = (x v_x) xinU v_xinT_xmathcalM and this neighborhood is isomorphic to mathbbR^2n via (x v_x) mapsto (varphi_U(x) varphi(x)v_x). The geodesic spray is an important vector field defined on TmathcalM.","category":"page"},{"location":"manifolds/manifolds/#Library-Functions","page":"General Theory on Manifolds","title":"Library Functions","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"Manifold","category":"page"},{"location":"manifolds/manifolds/#GeometricMachineLearning.Manifold-manifolds-manifolds","page":"General Theory on Manifolds","title":"GeometricMachineLearning.Manifold","text":"A manifold in GeometricMachineLearning is a subtype of AbstractMatrix. All manifolds are matrix manifolds and therefore stored as matrices. 
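Since Manifold is a subtype of AbstractMatrix, an element of a manifold can be treated like an ordinary matrix. The following is a minimal sketch of what this means in practice, assuming the standard AbstractMatrix interface (size, indexing, multiplication) is available for these wrapper types:

using GeometricMachineLearning
using LinearAlgebra: I
import Random
Random.seed!(123)

Y = rand(StiefelManifold{Float32}, 5, 3)

size(Y)        # (5, 3): the element is stored as a 5×3 matrix
Y[1, 1]        # entries are indexed like those of any other matrix
Y' * Y ≈ I(3)  # true: the defining property of the Stiefel manifold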
More details can be found in the docstrings for the StiefelManifold and the GrassmannManifold.\n\n\n\n\n\n","category":"type"},{"location":"manifolds/manifolds/#References","page":"General Theory on Manifolds","title":"References","text":"","category":"section"},{"location":"manifolds/manifolds/","page":"General Theory on Manifolds","title":"General Theory on Manifolds","text":"P.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).\n\n\n\n","category":"page"},{"location":"architectures/volume_preserving_feedforward/#Volume-Preserving-Feedforward-Neural-Network","page":"Volume-Preserving FeedForward","title":"Volume-Preserving Feedforward Neural Network","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/#Neural-network-architecture","page":"Volume-Preserving FeedForward","title":"Neural network architecture","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"The constructor produces the following architecture[1]:","category":"page"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"[1]: Based on the input arguments n_linear and n_blocks. In this example init_upper is set to false, which means that the first layer is of type lower followed by a layer of type upper. ","category":"page"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"Main.include_graphics(\"../tikz/vp_feedforward\") # hide","category":"page"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"Here LinearLowerLayer performs x mapsto x + Lx and NonLinearLowerLayer performs x mapsto x + sigma(Lx + b). The activation function sigma is the fourth input argument to the constructor and tanh by default. ","category":"page"},{"location":"architectures/volume_preserving_feedforward/#Note-on-Sympnets","page":"Volume-Preserving FeedForward","title":"Note on Sympnets","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"As SympNets are symplectic maps, they also conserve phase space volume and therefore form a subcategory of volume-preserving feedforward layers. ","category":"page"},{"location":"architectures/volume_preserving_feedforward/#Library-Functions","page":"Volume-Preserving FeedForward","title":"Library Functions","text":"","category":"section"},{"location":"architectures/volume_preserving_feedforward/","page":"Volume-Preserving FeedForward","title":"Volume-Preserving FeedForward","text":"VolumePreservingFeedForward","category":"page"},{"location":"architectures/volume_preserving_feedforward/#GeometricMachineLearning.VolumePreservingFeedForward-architectures-volume_preserving_feedforward","page":"Volume-Preserving FeedForward","title":"GeometricMachineLearning.VolumePreservingFeedForward","text":"Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer. \n\nConstructor\n\nThe constructor is called with the following arguments: \n\nsys_dim::Int: The system dimension. 
\nn_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nactivation: The activation function for the nonlinear layers in a block. \ninit_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper. \n\n\n\n\n\n","category":"type"},{"location":"library/","page":"Library","title":"Library","text":"CurrentModule = GeometricMachineLearning","category":"page"},{"location":"library/#GeometricMachineLearning-Library-Functions","page":"Library","title":"GeometricMachineLearning Library Functions","text":"","category":"section"},{"location":"library/","page":"Library","title":"Library","text":"Modules = [GeometricMachineLearning]","category":"page"},{"location":"library/#AbstractNeuralNetworks.Chain-Tuple{GSympNet}","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Chain can also be called with a neural network as input.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, false, false}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is false and init_upper_act is false.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, false, true}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is false and init_upper_act is true.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, true, false}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is true and init_upper_act is false.\n\n\n\n\n\n","category":"method"},{"location":"library/#AbstractNeuralNetworks.Chain-Union{Tuple{LASympNet{AT, true, true}}, Tuple{AT}} where AT","page":"Library","title":"AbstractNeuralNetworks.Chain","text":"Build a chain for an LASympnet for which init_upper_linear is true and init_upper_act is true.\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.Matrix-Tuple{GlobalSection}","page":"Library","title":"Base.Matrix","text":"Matrix(λY::GlobalSection)\n\nPut λY into matrix form. \n\nThis is not recommended if speed is important!\n\nUse apply_section and global_rep instead!\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.AbstractCache","page":"Library","title":"GeometricMachineLearning.AbstractCache","text":"AbstractCache has subtypes: \n\nAdamCache\nMomentumCache\nGradientCache\nBFGSCache\n\nAll of them can be initialized with providing an array (also supporting manifold types).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AbstractLieAlgHorMatrix","page":"Library","title":"GeometricMachineLearning.AbstractLieAlgHorMatrix","text":"AbstractLieAlgHorMatrix is a supertype for various horizontal components of Lie algebras. We usually call this mathfrakg^mathrmhor.\n\nSee StiefelLieAlgHorMatrix and GrassmannLieAlgHorMatrix.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AbstractRetraction","page":"Library","title":"GeometricMachineLearning.AbstractRetraction","text":"AbstractRetraction is a type that comprises all retraction methods for manifolds. 
For every manifold layer one has to specify a retraction method that takes the layer and elements of the (global) tangent space.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AbstractTriangular","page":"Library","title":"GeometricMachineLearning.AbstractTriangular","text":"See UpperTriangular and LowerTriangular.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ActivationLayer","page":"Library","title":"GeometricMachineLearning.ActivationLayer","text":"ActivationLayer is the struct corresponding to the constructors ActivationLayerQ and ActivationLayerP. See those for more information.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ActivationLayerP-Tuple{Any, Any}","page":"Library","title":"GeometricMachineLearning.ActivationLayerP","text":"Performs:\n\nbeginpmatrix\n q p\nendpmatrix mapsto \nbeginpmatrix\n q p + mathrmdiag(a)sigma(q)\nendpmatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.ActivationLayerQ-Tuple{Any, Any}","page":"Library","title":"GeometricMachineLearning.ActivationLayerQ","text":"Performs:\n\nbeginpmatrix\n q p\nendpmatrix mapsto \nbeginpmatrix\n q + mathrmdiag(a)sigma(p) p\nendpmatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.AdamOptimizer","page":"Library","title":"GeometricMachineLearning.AdamOptimizer","text":"Defines the Adam Optimizer. Algorithm and suggested defaults are taken from [41] (page 301).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AdamOptimizerWithDecay","page":"Library","title":"GeometricMachineLearning.AdamOptimizerWithDecay","text":"Defines the Adam Optimizer with weight decay.\n\nConstructors\n\nThe default constructor takes as input: \n\nn_epochs::Int\nη₁: the learning rate at the start \nη₂: the learning rate at the end \nρ₁: the decay parameter for the first moment \nρ₂: the decay parameter for the second moment\nδ: the safety parameter \nT (keyword argument): the type. \n\nThe second constructor is called with: \n\nn_epochs::Int\nT\n\n... the rest are keyword arguments\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AutoEncoder","page":"Library","title":"GeometricMachineLearning.AutoEncoder","text":"The autoencoder architecture\n\nAn autoencoder [41] is a neural network consisting of an encoder Psi^e and a decoder Psi^d. In the simplest case they are trained on some data set mathcalD to reduce the following error: \n\nPsi^dcircPsi^e(mathcalD) - mathcalD\n\nwhich we call the reconstruction error or autoencoder error (see the docs for AutoEncoderLoss) and cdot is some norm.\n\nImplementation details.\n\nAbstract AutoEncoder type. If a custom <:AutoEncoder architecture is implemented it should have the fields full_dim, reduced_dim, n_encoder_blocks and n_decoder_blocks. Further the routines encoder, decoder, encoder_parameters and decoder_parameters should be extended.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.AutoEncoderLoss","page":"Library","title":"GeometricMachineLearning.AutoEncoderLoss","text":"This loss should always be used together with a neural network of type AutoEncoder (and it is also the default for training such a network). 
\n\nIt simply computes: \n\nmathttAutoEncoderLoss(nnmathttLoss input) = nn(input) - input\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BFGSCache","page":"Library","title":"GeometricMachineLearning.BFGSCache","text":"The cache for the BFGS optimizer.\n\nIt stores an array for the previous time step B and the inverse of the Hessian matrix H.\n\nIt is important to note that setting up this cache already requires a derivative! This is not the case for the other optimizers.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BFGSDummyCache","page":"Library","title":"GeometricMachineLearning.BFGSDummyCache","text":"In order to initialize BGGSCache we first need gradient information. This is why we initially have this BFGSDummyCache until gradient information is available.\n\nNOTE: we may not need this. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BFGSOptimizer","page":"Library","title":"GeometricMachineLearning.BFGSOptimizer","text":"This is an implementation of the Broyden-Fletcher-Goldfarb-Shanno (BFGS) optimizer. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Batch","page":"Library","title":"GeometricMachineLearning.Batch","text":"Batch is a struct whose functor acts on an instance of DataLoader to produce a sequence of training samples for training for one epoch. \n\nThe Constructor\n\nThe constructor for Batch is called with: \n\nbatch_size::Int\nseq_length::Int (optional)\nprediction_window::Int (optional)\n\nThe first one of these arguments is required; it indicates the number of training samples in a batch. If we deal with time series data then we can additionaly supply a sequence length and a prediction window as input arguments to Batch. These indicate the number of input vectors and the number of output vectors.\n\nThe functor\n\nAn instance of Batch can be called on an instance of DataLoader to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. The output of applying batch:Batch to dl::DataLoader is a tuple of vectors of integers. Each of these vectors contains two integers: the first is the time index and the second one is the parameter index.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.BiasLayer","page":"Library","title":"GeometricMachineLearning.BiasLayer","text":"A bias layer that does nothing more than add a vector to the input. This is needed for LA-SympNets.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Classification","page":"Library","title":"GeometricMachineLearning.Classification","text":"Classification Layer that takes a matrix as an input and returns a vector that is used for MNIST classification. \n\nIt has the following arguments: \n\nM: input dimension \nN: output dimension \nactivation: the activation function \n\nAnd the following optional argument: \n\naverage: If this is set to true, then the output is computed as frac1Nsum_i=1^Ninput_bulleti. If set to false (the default) it picks the last column of the input. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ClassificationTransformer","page":"Library","title":"GeometricMachineLearning.ClassificationTransformer","text":"This is a transformer neural network for classification purposes. 
At the moment this is only used for training on MNIST, but can in theory be used for any classification problem.\n\nIt has to be called with a DataLoader that stores an input and an output tensor. The optional arguments are: \n\nn_heads: The number of heads in the MultiHeadAttention (mha) layers. Default: 7.\nn_layers: The number of transformer layers. Default: 16.\nactivation: The activation function. Default: softmax.\nStiefel: Wheter the matrices in the mha layers are on the Stiefel manifold. \nadd_connection: Whether the input is appended to the output of the mha layer. (skip connection)\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.DataLoader","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. \n\nConstructor\n\nThe data loader can be called with various inputs:\n\nA single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).\nA single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps. \nA single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.\nA tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are n_p matrices (first input argument) and n_p integers (second input argument).\nA NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors. \nAn EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.\n\nWhen we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.\n\nFields of DataLoader\n\nThe fields of the DataLoader struct are the following: \n\ninput: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters.\noutput: The tensor that contains the output (supervised learning) - this may be of type Nothing if the constructor is only called with one tensor (unsupervised learning).\ninput_dim: The dimension of the system, i.e. what is taken as input by a regular neural network.\ninput_time_steps: The length of the entire time series (length of the second axis).\nn_params: The number of parameters that are present in the data set (length of third axis)\noutput_dim: The dimension of the output tensor (first axis). If output is of type Nothing, then this is also of type Nothing.\noutput_time_steps: The size of the second axis of the output tensor. 
If output is of type Nothing, then this is also of type Nothing.\n\nThe input and output fields of DataLoader\n\nEven though the arguments to the Constructor may be vectors or matrices, internally DataLoader always stores tensors.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.DataLoader-Union{Tuple{@NamedTuple{q::AT, p::AT}}, Tuple{AT}, Tuple{T}} where {T, AT<:AbstractMatrix{T}}","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. \n\nConstructor\n\nThe data loader can be called with various inputs:\n\nA single vector: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).\nA single matrix: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps. \nA single tensor: If the data loader is called with a single tensor, then this is interpreted as an integration problem with the second axis indicating the time step and the third one indicating the parameters.\nA tensor and a vector: This is a special case (MNIST classification problem). For the MNIST problem for example the input are n_p matrices (first input argument) and n_p integers (second input argument).\nA NamedTuple with fields q and p: The NamedTuple contains (i) two matrices or (ii) two tensors. \nAn EnsembleSolution: The EnsembleSolution typically comes from GeometricProblems.\n\nWhen we supply a single vector or a single matrix as input to DataLoader and further set autoencoder = false (keyword argument), then the data are stored as an integration problem and the second axis is assumed to indicate time steps.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.DataLoader-Union{Tuple{GeometricSolutions.EnsembleSolution{T, T1, Vector{ST}}}, Tuple{ST}, Tuple{DT}, Tuple{T1}, Tuple{T}} where {T, T1, DT, ST<:(GeometricSolutions.GeometricSolution{T, T1, @NamedTuple{q::DT}})}","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Constructor for EnsembleSolution from package GeometricSolutions with field q.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.DataLoader-Union{Tuple{GeometricSolutions.EnsembleSolution{T, T1, Vector{ST}}}, Tuple{ST}, Tuple{DT}, Tuple{T1}, Tuple{T}} where {T, T1, DT<:(GeometricSolutions.DataSeries{T, AT} where AT<:Union{AbstractArray{T}, T}), ST<:(GeometricSolutions.GeometricSolution{T, T1, @NamedTuple{q::DT, p::DT}})}","page":"Library","title":"GeometricMachineLearning.DataLoader","text":"Constructor for EnsembleSolution form package GeometricSolutions with fields q and p.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Decoder","page":"Library","title":"GeometricMachineLearning.Decoder","text":"Abstract Decoder type. 
If a custom <:Decoder architecture is implemented it should have the fields full_dim, reduced_dim and n_decoder_blocks.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Encoder","page":"Library","title":"GeometricMachineLearning.Encoder","text":"Abstract Encoder type. If a custom <:Encoder architecture is implemented it should have the fields full_dim, reduced_dim and n_encoder_blocks.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GSympNet","page":"Library","title":"GeometricMachineLearning.GSympNet","text":"GSympNet is called with a single input argument, the system dimension, or with an instance of DataLoader. Optional input arguments are: \n\nupscaling_dimension::Int: The upscaling dimension of the gradient layer. See the documentation for GradientLayerQ and GradientLayerP for further explanation. The default is 2*dim.\nn_layers::Int: The number of layers (i.e. the total number of GradientLayerQ and GradientLayerP). The default is 2.\nactivation: The activation function that is applied. By default this is tanh.\ninit_upper::Bool: Initialize the gradient layer so that it first modifies the q-component. The default is true.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GlobalSection","page":"Library","title":"GeometricMachineLearning.GlobalSection","text":"GlobalSection(Y::AbstractMatrix)\n\nConstruct a global section for Y. \n\nA global section lambda is a mapping from a homogeneous space mathcalM to the corresponding Lie group G such that \n\nlambda(Y)E = Y\n\nAlso see apply_section and global_rep.\n\nImplementation\n\nFor an implementation of GlobalSection for a custom array (especially manifolds), the function global_section has to be generalized.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientLayer","page":"Library","title":"GeometricMachineLearning.GradientLayer","text":"GradientLayer is the struct corresponding to the constructors GradientLayerQ and GradientLayerP. See those for more information.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientLayerP","page":"Library","title":"GeometricMachineLearning.GradientLayerP","text":"The gradient layer that changes the p component. It is of the form: \n\nbeginbmatrix\n mathbbI mathbbO nablaV mathbbI \nendbmatrix\n\nwith V(q) = sum_i=1^Ma_iSigma(sum_jk_ijq_j+b_i), where Sigma is the antiderivative of the activation function sigma (one-layer neural network). We refer to M as the upscaling dimension. Such layers are by construction symplectic.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientLayerQ","page":"Library","title":"GeometricMachineLearning.GradientLayerQ","text":"The gradient layer that changes the q component. It is of the form: \n\nbeginbmatrix\n mathbbI nablaV mathbbO mathbbI \nendbmatrix\n\nwith V(p) = sum_i=1^Ma_iSigma(sum_jk_ijp_j+b_i), where Sigma is the antiderivative of the activation function sigma (one-layer neural network). We refer to M as the upscaling dimension. Such layers are by construction symplectic.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GradientOptimizer","page":"Library","title":"GeometricMachineLearning.GradientOptimizer","text":"Define the Gradient optimizer, i.e. 
W ← W - η*∇f(W) Or the riemannian manifold equivalent, if applicable.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GrassmannLayer","page":"Library","title":"GeometricMachineLearning.GrassmannLayer","text":"Defines a layer that performs simple multiplication with an element of the Grassmann manifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GrassmannLieAlgHorMatrix","page":"Library","title":"GeometricMachineLearning.GrassmannLieAlgHorMatrix","text":"GrassmannLieAlgHorMatrix(B::AbstractMatrix{T}, N::Integer, n::Integer) where T\n\nBuild an instance of GrassmannLieAlgHorMatrix based on an arbitrary matrix B of size (N-n)timesn.\n\nGrassmannLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is: piS to SEsim where \n\nE = beginpmatrix mathbbI_n mathbbO_(N-n)timesn endpmatrix\n\nand the equivalence relation is \n\nV_1 sim V_2 iff exists AinmathcalS_mathrmskew(n) textsuch that V_2 = V_1 + beginpmatrix A mathbbO endpmatrix\n\nAn element of GrassmannLieAlgMatrix takes the form: \n\nbeginpmatrix\nbarmathbbO B^T B mathbbO\nendpmatrix\n\nwhere barmathbbOinmathbbR^ntimesn and mathbbOinmathbbR^(N - n)timesn\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.GrassmannLieAlgHorMatrix-Tuple{AbstractMatrix, Int64}","page":"Library","title":"GeometricMachineLearning.GrassmannLieAlgHorMatrix","text":"GrassmannLieAlgHorMatrix(D::AbstractMatrix, n::Integer)\n\nTake a big matrix as input and build an instance of GrassmannLieAlgHorMatrix belonging to the GrassmannManifold Gr(n N) where N is the number of rows of D.\n\nIf the constructor is called with a big NtimesN matrix, then the projection is performed the following way: \n\nbeginpmatrix\nA B_1 \nB_2 D\nendpmatrix mapsto \nbeginpmatrix\nbarmathbbO -B_2^T \nB_2 mathbbO\nendpmatrix\n\nThis can also be seen as the operation:\n\nD mapsto Omega(E DE - EE^TDE)\n\nwhere Omega is the horizontal lift GeometricMachineLearning.Ω.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.GrassmannManifold","page":"Library","title":"GeometricMachineLearning.GrassmannManifold","text":"The GrassmannManifold is based on the StiefelManifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.HRedSys","page":"Library","title":"GeometricMachineLearning.HRedSys","text":"HRedSys computes the reconstructed dynamics in the full system based on the reduced one. Optionally it can be compared to the FOM solution.\n\nIt can be called using the following constructor: HRedSys(N, n; encoder, decoder, v_full, f_full, v_reduced, f_reduced, parameters, tspan, tstep, ics, projection_error) where \n\nencoder: a function mathbbR^2NmapstomathbbR^2n\ndecoder: a (differentiable) function mathbbR^2nmapstomathbbR^2N\nv_full: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nf_full: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nv_reduced: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nf_reduced: a (differentiable) mapping defined the same way as in GeometricIntegrators.\nparameters: a NamedTuple that parametrizes the vector fields (the same for fullvectorfield and reducedvectorfield)\ntspan: a tuple (t₀, tₗ) that specifies start and end point of the time interval over which integration is performed. 
\ntstep: the time step \nics: the initial condition for the big system.\nprojection_error: the error M - mathcalRcircmathcalP(M) where M is the snapshot matrix; mathcalP and mathcalR are the reduction and reconstruction respectively.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LASympNet","page":"Library","title":"GeometricMachineLearning.LASympNet","text":"LASympNet is called with a single input argument, the system dimension, or with an instance of DataLoader. Optional input arguments are: \n\ndepth::Int: The number of linear layers that are applied. The default is 5.\nnhidden::Int: The number of hidden layers (i.e. layers that are not input or output layers). The default is 2.\nactivation: The activation function that is applied. By default this is tanh.\ninit_upper_linear::Bool: Initialize the linear layer so that it first modifies the q-component. The default is true.\ninit_upper_act::Bool: Initialize the activation layer so that it first modifies the q-component. The default is true.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LayerWithManifold","page":"Library","title":"GeometricMachineLearning.LayerWithManifold","text":"LayerWithManifold is a subtype of AbstractExplicitLayer that contains manifolds as weights.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LayerWithOptionalManifold","page":"Library","title":"GeometricMachineLearning.LayerWithOptionalManifold","text":"LayerWithOptionalManifold is a subtype of AbstractExplicitLayer that can contain manifolds as weights.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearLayer","page":"Library","title":"GeometricMachineLearning.LinearLayer","text":"LinearLayer is the struct corresponding to the constructors LinearLayerQ and LinearLayerP. See those for more information.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearLayerP-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.LinearLayerP","text":"Equivalent to a left multiplication by the matrix:\n\nbeginpmatrix\nmathbbI mathbbO \nB mathbbI\nendpmatrix \n\nwhere B is a symmetric matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.LinearLayerQ-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.LinearLayerQ","text":"Equivalent to a left multiplication by the matrix:\n\nbeginpmatrix\nmathbbI B \nmathbbO mathbbI\nendpmatrix \n\nwhere B is a symmetric matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.LinearSymplecticAttention","page":"Library","title":"GeometricMachineLearning.LinearSymplecticAttention","text":"Implements the linear symplectic attention layers. Analogous to GradientLayer it performs mappings that only change the Q or the P part. For more information see LinearSymplecticAttentionQ and LinearSymplecticAttentionP.\n\nConstructor\n\nFor the constructors simply call \n\nLinearSymplecticAttentionQ(sys_dim, seq_length)\n\nor \n\nLinearSymplecticAttentionP(sys_dim, seq_length)\n\nwhere sys_dim is the system dimension and seq_length is the sequence length.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearSymplecticAttentionP","page":"Library","title":"GeometricMachineLearning.LinearSymplecticAttentionP","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). 
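As a rough sketch of how the SympNet architectures listed above are set up (the dimension and keyword values here are arbitrary, chosen only to illustrate the constructors): 

using GeometricMachineLearning

# a G-SympNet for a system with one q and one p component (dim = 2)
gsympnet = GSympNet(2; n_layers = 4, upscaling_dimension = 10, activation = tanh)

# an LA-SympNet for the same system
lasympnet = LASympNet(2; depth = 5, nhidden = 2)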
\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearSymplecticAttentionQ","page":"Library","title":"GeometricMachineLearning.LinearSymplecticAttentionQ","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LinearSymplecticTransformer","page":"Library","title":"GeometricMachineLearning.LinearSymplecticTransformer","text":"Realizes the linear Symplectic Transformer.\n\nConstructor:\n\nThe constructor is called with the following arguments\n\ndim::Int: System dimension \nseq_length::Int: Number of time steps that the transformer considers. \n\nOptional keyword arguments:\n\nn_sympnet::Int=2: The number of sympnet layers in the transformer.\nupscaling_dimension::Int=2*dim: The upscaling that is done by the gradient layer. \nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function for the SympNet layers. \ninit_upper::Bool=true: Specifies if the first layer is a Q-type layer (init_upper=true) or if it is a P-type layer (init_upper=false).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LowerTriangular","page":"Library","title":"GeometricMachineLearning.LowerTriangular","text":"LowerTriangular(S::AbstractVector, n::Int)\n\nBuild a lower-triangular matrix from a vector.\n\nA lower-triangular matrix is an ntimesn matrix that has ones on the diagonal and zeros on the upper triangular.\n\nThe data are stored in a vector S similarly to other matrices. See UpperTriangular, SkewSymMatrix and SymmetricMatrix.\n\nThe struct two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension n for AinmathbbR^ntimesn.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6]\nLowerTriangular(S, 4)\n\n# output\n\n4×4 LowerTriangular{Int64, Vector{Int64}}:\n 0 0 0 0\n 1 0 0 0\n 2 3 0 0\n 4 5 6 0\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.LowerTriangular-Union{Tuple{AbstractMatrix{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.LowerTriangular","text":"LowerTriangular(A::AbstractMatrix)\n\nBuild a lower-triangular matrix from a matrix.\n\nThis is done by taking the lower left of that matrix.\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nLowerTriangular(M)\n\n# output\n\n4×4 LowerTriangular{Int64, Vector{Int64}}:\n 0 0 0 0\n 5 0 0 0\n 9 10 0 0\n 13 14 15 0\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Manifold","page":"Library","title":"GeometricMachineLearning.Manifold","text":"A manifold in GeometricMachineLearning is a sutype of AbstractMatrix. All manifolds are matrix manifolds and therefore stored as matrices. More details can be found in the docstrings for the StiefelManifold and the GrassmannManifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ManifoldLayer","page":"Library","title":"GeometricMachineLearning.ManifoldLayer","text":"This defines a manifold layer that only has one matrix-valued manifold A associated with it does xmapstoAx. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.MomentumOptimizer","page":"Library","title":"GeometricMachineLearning.MomentumOptimizer","text":"Define the Momentum optimizer, i.e. 
V ← αV - ∇f(W) W ← W + ηV Or the Riemannian manifold equivalent, if applicable.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.MultiHeadAttention","page":"Library","title":"GeometricMachineLearning.MultiHeadAttention","text":"MultiHeadAttention (MHA) serves as a preprocessing step in the transformer. It reweights the input vectors based on correlations within those data. \n\nConstructor\n\nTakes input arguments: \n\ndim::Int: The system dimension \nn_heads::Int: The number of heads. \nStiefel::Bool=true (keyword argument): whether the weights should be put on the Stiefel manifold. \nretraction::AbstractRetraction (keyword argument): what kind of retraction should be used. By default this is the geodesic retraction. \nadd_connection::Bool=true (keyword argument): determines if the input should be added to the output for the final result. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.NeuralNetworkIntegrator","page":"Library","title":"GeometricMachineLearning.NeuralNetworkIntegrator","text":"This is a super type of various neural network architectures such as SympNet and ResNet whose purpose is to approximate the flow of an ordinary differential equation (ODE).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Optimizer","page":"Library","title":"GeometricMachineLearning.Optimizer","text":"Optimizer struct that stores the 'method' (i.e. Adam with corresponding hyperparameters), the cache and the optimization step.\n\nIt takes as input an optimization method and the parameters of a network. \n\nFor technical reasons we first specify an OptimizerMethod that stores all the hyperparameters of the optimizer. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.Optimizer-Tuple{NeuralNetwork, DataLoader, Batch, Int64, GeometricMachineLearning.NetworkLoss}","page":"Library","title":"GeometricMachineLearning.Optimizer","text":"A functor for Optimizer. It is called with: - nn::NeuralNetwork - dl::DataLoader - batch::Batch - n_epochs::Int - loss\n\nThe last argument is a function through which Zygote differentiates. This argument is optional; if it is not supplied GeometricMachineLearning defaults to an appropriate loss for the DataLoader.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Optimizer-Tuple{OptimizerMethod, NeuralNetwork}","page":"Library","title":"GeometricMachineLearning.Optimizer","text":"Typically the Optimizer is not initialized with the network parameters, but instead with a NeuralNetwork struct.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.PSDArch","page":"Library","title":"GeometricMachineLearning.PSDArch","text":"The architecture\n\nProper symplectic decomposition (PSD) can be seen as a SymplecticAutoencoder for which the decoder and the encoder are both PSD-like matrices (see the docs for PSDLayer). \n\nTraining\n\nFor optimizing the parameters in this architecture no neural network training is necessary (see the docs for solve!).\n\nThe constructor\n\nThe constructor only takes two arguments as input:\n\nfull_dim::Integer\nreduced_dim::Integer\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.PSDLayer","page":"Library","title":"GeometricMachineLearning.PSDLayer","text":"This is a PSD-like layer used for symplectic autoencoders. 
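A hedged sketch of how an Optimizer is typically combined with an OptimizerMethod and a network, as described above; the NeuralNetwork constructor is assumed to come from AbstractNeuralNetworks and the default Adam hyperparameters are assumed to exist: 

using GeometricMachineLearning

arch = GSympNet(2)                        # any architecture works here
nn = NeuralNetwork(arch, CPU(), Float64)  # assumed constructor signature from AbstractNeuralNetworks
opt = Optimizer(AdamOptimizer(), nn)      # an OptimizerMethod together with a NeuralNetwork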
One layer has the following shape:\n\nA = beginbmatrix Phi mathbbO mathbbO Phi endbmatrix\n\nwhere Phi is an element of the Stiefel manifold St(n N).\n\nThe constructor of PSDLayer is called by PSDLayer(M, N; retraction=retraction): \n\nM is the input dimension.\nN is the output dimension. \nretraction is an instance of a struct with supertype AbstractRetraction. The only options at the moment are Geodesic() and Cayley().\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ResNet","page":"Library","title":"GeometricMachineLearning.ResNet","text":"A ResNet is a neural network that realizes a mapping of the form: x = mathcalNN(x) + x, so the input is again added to the output (a so-called add connection). In GeometricMachineLearning the specific ResNet that we use consists of a series of simple ResNetLayers.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.ResNetLayer","page":"Library","title":"GeometricMachineLearning.ResNetLayer","text":"The ResNetLayer is a simple feedforward neural network to which we add the input after applying it, i.e. it realizes x mapsto x + sigma(Ax + b).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SkewSymMatrix","page":"Library","title":"GeometricMachineLearning.SkewSymMatrix","text":"SkewSymMatrix(S::AbstractVector, n::Integer)\n\nInstantiate a skew-symmetric matrix with information stored in vector S.\n\nA skew-symmetric matrix A is a matrix A^T = -A.\n\nInternally the struct saves a vector S of size n(n-1)div2. The conversion is done the following way: \n\nA_ij = begincases 0 textif i=j \n S( (i-2) (i-1) ) div 2 + j textif ij \n S( (j-2) (j-1) ) div 2 + i textelse endcases\n\nAlso see SymmetricMatrix, LowerTriangular and UpperTriangular.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6]\nSkewSymMatrix(S, 4)\n\n# output\n\n4×4 SkewSymMatrix{Int64, Vector{Int64}}:\n 0 -1 -2 -4\n 1 0 -3 -5\n 2 3 0 -6\n 4 5 6 0\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SkewSymMatrix-Union{Tuple{AbstractMatrix{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.SkewSymMatrix","text":"SkewSymMatrix(A::AbstractMatrix)\n\nPerform 0.5 * (A - A') and store the matrix in an efficient way (as a vector with n(n-1)2 entries).\n\nIf the constructor is called with a matrix as input it returns a skew-symmetric matrix via the projection:\n\nA mapsto frac12(A - A^T)\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nSkewSymMatrix(M)\n\n# output\n\n4×4 SkewSymMatrix{Float64, Vector{Float64}}:\n 0.0 -1.5 -3.0 -4.5\n 1.5 0.0 -1.5 -3.0\n 3.0 1.5 0.0 -1.5\n 4.5 3.0 1.5 0.0\n\nExtend help\n\nNote that the constructor is designed in such a way that it always returns matrices of type SkewSymMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.\n\nIf the user wishes to allocate a matrix SkewSymMatrix{<:Integer} the constructor SkewSymMatrix(::AbstractVector, n::Integer) has to be called.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.StandardTransformerIntegrator","page":"Library","title":"GeometricMachineLearning.StandardTransformerIntegrator","text":"The regular transformer used as an integrator (multi-step method). 
\n\nThe constructor is called with one argument: \n\nsys_dim::Int\n\nThe following are keyword arguments:\n\ntransformer_dim::Int: the default is transformer_dim = sys_dim.\nn_blocks::Int: The default is 1.\nn_heads::Int: the number of heads in the multihead attentio layer (default is n_heads = sys_dim)\nL::Int the number of transformer blocks (default is L = 2).\nupscaling_activation: by default identity\nresnet_activation: by default tanh\nadd_connection:Bool=true: if the input should be added to the output.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelLayer","page":"Library","title":"GeometricMachineLearning.StiefelLayer","text":"Defines a layer that performs simple multiplication with an element of the Stiefel manifold.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelLieAlgHorMatrix","page":"Library","title":"GeometricMachineLearning.StiefelLieAlgHorMatrix","text":"StiefelLieAlgHorMatrix(A::SkewSymMatrix{T}, B::AbstractMatrix{T}, N::Integer, n::Integer) where T\n\nBuild an instance of StiefelLieAlgHorMatrix based on a skew-symmetric matrix A and an arbitrary matrix B.\n\nStiefelLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). The projection here is: piS to SE where \n\nE = beginpmatrix mathbbI_n mathbbO_(N-n)timesn endpmatrix\n\nThe matrix (E) is implemented under StiefelProjection in GeometricMachineLearning.\n\nAn element of StiefelLieAlgMatrix takes the form: \n\nbeginpmatrix\nA B^T B mathbbO\nendpmatrix\n\nwhere A is skew-symmetric (this is SkewSymMatrix in GeometricMachineLearning).\n\nAlso see GrassmannLieAlgHorMatrix.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelLieAlgHorMatrix-Tuple{AbstractMatrix, Integer}","page":"Library","title":"GeometricMachineLearning.StiefelLieAlgHorMatrix","text":"StiefelLieAlgHorMatrix(D::AbstractMatrix, n::Integer)\n\nTake a big matrix as input and build an instance of StiefelLieAlgHorMatrix belonging to the StiefelManifold St(n N) where N is the number of rows of D.\n\nIf the constructor is called with a big NtimesN matrix, then the projection is performed the following way: \n\nbeginpmatrix\nA B_1 \nB_2 D\nendpmatrix mapsto \nbeginpmatrix\nmathrmskew(A) -B_2^T \nB_2 mathbbO\nendpmatrix\n\nThe operation mathrmskewmathbbR^ntimesntomathcalS_mathrmskew(n) is the skew-symmetrization operation. This is equivalent to calling of SkewSymMatrix with an ntimesn matrix.\n\nThis can also be seen as the operation:\n\nD mapsto Omega(E DE) = mathrmskewleft(2 left(mathbbI - frac12 E E^T right) DE E^Tright)\n\nAlso see GeometricMachineLearning.Ω.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.StiefelManifold","page":"Library","title":"GeometricMachineLearning.StiefelManifold","text":"An implementation of the Stiefel manifold [7]. The Stiefel manifold is the collection of all matrices YinmathbbR^Ntimesn whose columns are orthonormal, i.e. \n\n St(n N) = Y Y^TY = mathbbI_n \n\nThe Stiefel manifold can be shown to have manifold structure (as the name suggests) and this is heavily used in GeometricMachineLearning. It is further a compact space. 
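A small sketch of building a StiefelLieAlgHorMatrix from its two blocks, following the signature given above (N and n are picked arbitrarily): 

using GeometricMachineLearning

N, n = 5, 2
A = SkewSymMatrix(rand(n * (n - 1) ÷ 2), n)  # skew-symmetric n×n block
B = rand(N - n, n)                           # arbitrary (N-n)×n block
StiefelLieAlgHorMatrix(A, B, N, n)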
More information can be found in the docstrings for rgrad(::StiefelManifold, ::AbstractMatrix) and metric(::StiefelManifold, ::AbstractMatrix, ::AbstractMatrix).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelProjection","page":"Library","title":"GeometricMachineLearning.StiefelProjection","text":"Outer constructor for StiefelProjection. This works with two integers as input and optionally the type.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelProjection-2","page":"Library","title":"GeometricMachineLearning.StiefelProjection","text":"StiefelProjection(backend, T, N, n)\n\nMake a matrix of the form beginbmatrix mathbbI mathbbO endbmatrix^T for a specific backend and data type.\n\nAn array that essentially does vcat(I(n), zeros(N-n, n)) with GPU support. \n\nExtend help\n\nTechnically this should be a subtype of StiefelManifold. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.StiefelProjection-Union{Tuple{AbstractMatrix{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.StiefelProjection","text":"StiefelProjection(A::AbstractMatrix)\n\nExtract necessary information from A and build an instance of StiefelProjection. \n\nNecessary information here refers to the backend, the data type and the size of the matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.StiefelProjection-Union{Tuple{StiefelLieAlgHorMatrix{T, AT, ST} where {AT<:(SkewSymMatrix{T, AT} where AT<:AbstractVector{T}), ST<:AbstractMatrix{T}}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.StiefelProjection","text":"StiefelProjection(B::StiefelLieAlgHorMatrix)\n\nExtract necessary information from B and build an instance of StiefelProjection. \n\nNecessary information here refers to the backend, the data type and the size of the matrix.\n\nThe size is queried through B.N and B.n.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.SymmetricMatrix","page":"Library","title":"GeometricMachineLearning.SymmetricMatrix","text":"SymmetricMatrix(S::AbstractVector, n::Integer)\n\nInstantiate a symmetric matrix with information stored in vector S.\n\nA SymmetricMatrix A is a matrix A^T = A.\n\nInternally the struct saves a vector S of size n(n+1)div2. 
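As stated above, the outer constructor for StiefelProjection can also be called with just two integers (and optionally a type); a minimal sketch: 

using GeometricMachineLearning

E = StiefelProjection(5, 3)  # essentially vcat(I(3), zeros(2, 3))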
The conversion is done the following way: \n\nA_ij = begincases S( (i-1) i ) div 2 + j textif igeqj \n S( (j-1) j ) div 2 + i textelse endcases\n\nSo S stores a string of vectors taken from A: S = tildea_1 tildea_2 ldots tildea_n with tildea_i = A_i1A_i2ldotsA_ii.\n\nAlso see SkewSymMatrix, LowerTriangular and UpperTriangular.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\nSymmetricMatrix(S, 4)\n\n# output\n\n4×4 SymmetricMatrix{Int64, Vector{Int64}}:\n 1 2 4 7\n 2 3 5 8\n 4 5 6 9\n 7 8 9 10\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SymmetricMatrix-Union{Tuple{AbstractMatrix{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.SymmetricMatrix","text":"SymmetricMatrix(A::AbstractMatrix)\n\nPerform 0.5 * (A + A') and store the matrix in an efficient way (as a vector with n(n+1)2 entries).\n\nIf the constructor is called with a matrix as input it returns a symmetric matrix via the projection:\n\nA mapsto frac12(A + A^T)\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nSymmetricMatrix(M)\n\n# output\n\n4×4 SymmetricMatrix{Float64, Vector{Float64}}:\n 1.0 3.5 6.0 8.5\n 3.5 6.0 8.5 11.0\n 6.0 8.5 11.0 13.5\n 8.5 11.0 13.5 16.0\n\nExtend help\n\nNote that the constructor is designed in such a way that it always returns matrices of type SymmetricMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.\n\nIf the user wishes to allocate a matrix SymmetricMatrix{<:Integer} the constructor SymmetricMatrix(::AbstractVector, n::Integer) has to be called.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.SympNet","page":"Library","title":"GeometricMachineLearning.SympNet","text":"The SympNet type encompasses GSympNets and LASympNets. SympNets are universal approximators of symplectic flows, i.e. maps varphimathbbR^2ntomathbbR^2n for which (nablavarphi)^TmathbbJnablavarphi = mathbbJ holds.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SympNetLayer","page":"Library","title":"GeometricMachineLearning.SympNetLayer","text":"Implements the various layers from the SympNet paper [31]. This is a super type of GradientLayer, ActivationLayer and LinearLayer.\n\nFor the linear layer, the activation and the bias are left out, and for the activation layer K and b are left out!\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SympNetLayer-Tuple{AbstractArray, NamedTuple}","page":"Library","title":"GeometricMachineLearning.SympNetLayer","text":"This is called when a SympnetLayer is applied to a NamedTuple. It calls apply_layer_to_nt_and_return_array.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.SymplecticAutoencoder","page":"Library","title":"GeometricMachineLearning.SymplecticAutoencoder","text":"The architecture\n\nThe symplectic autoencoder architecture was introduced in [42]. Like any other autoencoder it consists of an encoder Psi^emathbbR^2NtomathbbR^2n and a decoder Psi^dmathbbR^2ntomathbbR^2N with nllN. 
These satisfy the following properties: \n\nnabla_zPsi^emathbbJ_2N(nabla_zPsi^emathbbJ_2N)^T = mathbbJ_2n text and (nabla_xiPsi^d)^TmathbbJ_2Nnabla_xiPsi^d = mathbbJ_2n\n\nBecause the decoder has this particular property, the reduced system can be described by the Hamiltonian HcircPsi^d: \n\nmathbbJ_2nnabla_xi(HcircPsi^d) = mathbbJ_2n(nabla_xiPsi^d)^Tnabla_Psi^d(xi)H = mathbbJ_2n(nabla_xiPsi^d)^TmathbbJ_2N^TmathbbJ_2Nnabla_Psi^d(xi)H = (nabla_xiPsi^d)^+X_H(Psi^d(xi))\n\nwhere (nabla_xiPsi^d)^+ is the pseudoinverse of nabla_xiPsi^d (for more details see the docs on the AutoEncoder type).\n\nThe constructor\n\nThe constructor is called with\n\nfull_dim::Integer \nreduced_dim::Integer \nn_encoder_layers::Integer = 4 (keyword argument)\nn_encoder_blocks::Integer = 2 (keyword argument)\nn_decoder_layers::Integer = 1 (keyword argument)\nn_decoder_blocks::Integer = 3 (keyword argument)\nsympnet_upscale::Integer = 5 (keyword argument)\nactivation = tanh (keyword argument)\nencoder_init_q::Bool = true (keyword argument)\ndecoder_init_q::Bool = true (keyword argument)\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.SymplecticPotential","page":"Library","title":"GeometricMachineLearning.SymplecticPotential","text":"SymplecticPotential(n)\n\nReturns a symplectic matrix of size 2n x 2n\n\nbeginpmatrix\nmathbbO mathbbI \n-mathbbI mathbbO \nendpmatrix\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.TrainingData","page":"Library","title":"GeometricMachineLearning.TrainingData","text":"TrainingData stores: \n\n - problem \n\n - shape \n\n - get \n\n - symbols \n\n - dim \n\n - noisemaker\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.TransformerIntegrator","page":"Library","title":"GeometricMachineLearning.TransformerIntegrator","text":"Encompasses various transformer architectures, such as the structure-preserving transformer and the linear symplectic transformer. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.TransformerLoss","page":"Library","title":"GeometricMachineLearning.TransformerLoss","text":"The loss for a transformer network (especially a transformer integrator). The constructor is called with:\n\nseq_length::Int\nprediction_window::Int (default is 1).\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.UpperTriangular","page":"Library","title":"GeometricMachineLearning.UpperTriangular","text":"UpperTriangular(S::AbstractVector, n::Int)\n\nBuild an upper-triangular matrix from a vector.\n\nAn upper-triangular matrix is an ntimesn matrix that has zeros on the diagonal and in the lower triangular part.\n\nThe data are stored in a vector S similarly to other matrices. See LowerTriangular, SkewSymMatrix and SymmetricMatrix.\n\nThe struct has two fields: S and n. 
The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension n for AinmathbbR^ntimesn.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6]\nUpperTriangular(S, 4)\n\n# output\n\n4×4 UpperTriangular{Int64, Vector{Int64}}:\n 0 1 2 4\n 0 0 3 5\n 0 0 0 6\n 0 0 0 0\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.UpperTriangular-Union{Tuple{AbstractMatrix{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.UpperTriangular","text":"UpperTriangular(A::AbstractMatrix)\n\nBuild an upper-triangular matrix from a matrix.\n\nThis is done by taking the upper right of that matrix.\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nUpperTriangular(M)\n\n# output\n\n4×4 UpperTriangular{Int64, Vector{Int64}}:\n 0 2 3 4\n 0 0 7 8\n 0 0 0 12\n 0 0 0 0\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.VolumePreservingAttention","page":"Library","title":"GeometricMachineLearning.VolumePreservingAttention","text":"Volume-preserving attention (single head attention)\n\nDrawbacks: \n\nthe super fast activation is only implemented for sequence lengths of 2, 3, 4 and 5.\nother sequence lengths only work on CPU for now (lu decomposition has to be implemented to work for tensors in parallel).\n\nConstructor\n\nThe constructor is called with: \n\ndim::Int: The system dimension \nseq_length::Int: The sequence length to be considered. The default is zero, i.e. arbitrary sequence lengths; this works for all sequence lengths but doesn't apply the super-fast activation. \nskew_sym::Bool (keyword argument): specifies if the weight matrix is skew-symmetric or arbitrary (default is false).\n\nFunctor\n\nApplying a layer of type VolumePreservingAttention does the following: \n\nFirst we perform the operation X mapsto X^T A X = C, where XinmathbbR^Ntimesmathttseq_length is a matrix containing time series data and A is the skew symmetric matrix associated with the layer. \nIn a second step we compute the Cayley transform of C; Lambda = mathrmCayley(C).\nThe output of the layer is then XLambda.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingFeedForward","page":"Library","title":"GeometricMachineLearning.VolumePreservingFeedForward","text":"Realizes a volume-preserving neural network as a combination of VolumePreservingLowerLayer and VolumePreservingUpperLayer. \n\nConstructor\n\nThe constructor is called with the following arguments: \n\nsys_dim::Int: The system dimension. \nn_blocks::Int: The number of blocks in the neural network (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nactivation: The activation function for the nonlinear layers in a block. \ninit_upper::Bool=false (keyword argument): Specifies if the first layer is lower or upper. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingFeedForwardLayer","page":"Library","title":"GeometricMachineLearning.VolumePreservingFeedForwardLayer","text":"Super-type of VolumePreservingLowerLayer and VolumePreservingUpperLayer. The layers do the following: \n\nx mapsto begincases sigma(Lx + b) textwhere L is mathttLowerTriangular sigma(Ux + b) textwhere U is mathttUpperTriangular endcases\n\nThe functor can be applied to a vector, a matrix or a tensor. 
\n\nConstructor\n\nThe constructors are called with:\n\nsys_dim::Int: the system dimension. \nactivation=tanh: the activation function. \ninclude_bias::Bool=true (keyword argument): specifies whether a bias should be used. \n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingLowerLayer","page":"Library","title":"GeometricMachineLearning.VolumePreservingLowerLayer","text":"See the documentation for VolumePreservingFeedForwardLayer.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingTransformer","page":"Library","title":"GeometricMachineLearning.VolumePreservingTransformer","text":"The volume-preserving transformer with the Cayley activation function and built-in upscaling.\n\nConstructor\n\nThe arguments for the constructor are: \n\nsys_dim::Int\nseq_length::Int: The sequence length of the data fed into the transformer.\n\nThe following are keyword argumetns:\n\nn_blocks::Int=1: The number of blocks in one transformer unit (containing linear layers and nonlinear layers). Default is 1.\nn_linear::Int=1: The number of linear VolumePreservingLowerLayers and VolumePreservingUpperLayers in one block. Default is 1.\nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function.\ninit_upper::Bool=false: Specifies if the network first acts on the q component. \nskew_sym::Bool=false: specifies if we the weight matrix is skew symmetric or arbitrary.\n\n\n\n\n\n","category":"type"},{"location":"library/#GeometricMachineLearning.VolumePreservingUpperLayer","page":"Library","title":"GeometricMachineLearning.VolumePreservingUpperLayer","text":"See the documentation for VolumePreservingFeedForwardLayer.\n\n\n\n\n\n","category":"type"},{"location":"library/#AbstractNeuralNetworks.update!-Union{Tuple{CT}, Tuple{T}, Tuple{Optimizer{<:BFGSOptimizer}, CT, AbstractArray{T}}} where {T, CT<:(BFGSCache{T, AT} where AT<:(AbstractArray{T}))}","page":"Library","title":"AbstractNeuralNetworks.update!","text":"Optimization for an entire neural networks with BFGS. What is different in this case is that we still have to initialize the cache.\n\nIf o.step == 1, then we initialize the cache\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.iterate-Union{Tuple{AT}, Tuple{T}, Tuple{NeuralNetwork{<:TransformerIntegrator}, @NamedTuple{q::AT, p::AT}}} where {T, AT<:AbstractMatrix{T}}","page":"Library","title":"Base.iterate","text":"This function computes a trajectory for a Transformer that has already been trained for valuation purposes.\n\nIt takes as input: \n\nnn: a NeuralNetwork (that has been trained).\nics: initial conditions (a matrix in mathbbR^2ntimesmathttseq_length or NamedTuple of two matrices in mathbbR^ntimesmathttseq_length)\nn_points::Int=100 (keyword argument): The number of steps for which we run the prediction. \nprediction_window::Int=size(ics.q, 2): The prediction window (i.e. the number of steps we predict into the future) is equal to the sequence length (i.e. the number of input time steps) by default. 
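A hedged sketch of constructing the volume-preserving transformer documented above; the system dimension, sequence length and keyword values are arbitrary and only illustrate the listed arguments: 

using GeometricMachineLearning

# system dimension 4, sequence length 3
arch = VolumePreservingTransformer(4, 3; n_blocks = 2, skew_sym = true)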
\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.iterate-Union{Tuple{BT}, Tuple{AT}, Tuple{T}, Tuple{NeuralNetwork{<:NeuralNetworkIntegrator}, BT}} where {T, AT<:AbstractVector{T}, BT<:@NamedTuple{q::AT, p::AT}}","page":"Library","title":"Base.iterate","text":"This function computes a trajectory for a SympNet that has already been trained for valuation purposes.\n\nIt takes as input: \n\nnn: a NeuralNetwork (that has been trained).\nics: initial conditions (a NamedTuple of two vectors)\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.rand-Union{Tuple{MT}, Tuple{KernelAbstractions.Backend, Type{MT}, Integer, Integer}} where MT<:Manifold","page":"Library","title":"Base.rand","text":"rand(backend::KernelAbstractions.Backend, manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold)\n\nDraw random elements for a specific device.\n\nExamples\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: _round # hide\nimport Random\nRandom.seed!(123)\n\nN, n = 5, 3\nY = rand(CPU(), StiefelManifold{Float32}, N, n)\n_round(Y; digits = 5) # hide\n\n# output\n\n5×3 StiefelManifold{Float32, Matrix{Float32}}:\n -0.27575 0.32991 0.77275\n -0.62485 -0.33224 -0.0686\n -0.69333 0.36724 -0.18988\n -0.09295 -0.73145 0.46064\n 0.2102 0.33301 0.38717\n\nRandom elements of the manifold can also be allocated on GPU, via e.g. ...\n\nrand(CUDABackend(), StiefelManifold{Float32}, N, n)\n\n... for drawing elements on a CUDA device.\n\n\n\n\n\n","category":"method"},{"location":"library/#Base.rand-Union{Tuple{MT}, Tuple{Type{MT}, Integer, Integer}} where MT<:Manifold","page":"Library","title":"Base.rand","text":"rand(manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold\n\nDraw random elements from the Stiefel and the Grassmann manifold. \n\nBecause both of these manifolds are compact spaces we can sample them uniformly [8].\n\nExamples\n\nWhen we call ...\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: _round # hide\nimport Random\nRandom.seed!(123)\n\nN, n = 5, 3\nY = rand(StiefelManifold{Float32}, N, n)\n_round(Y; digits = 5) # hide\n\n# output\n\n5×3 StiefelManifold{Float32, Matrix{Float32}}:\n -0.27575 0.32991 0.77275\n -0.62485 -0.33224 -0.0686\n -0.69333 0.36724 -0.18988\n -0.09295 -0.73145 0.46064\n 0.2102 0.33301 0.38717\n\n... the sampling is done by first allocating a random matrix of size Ntimesn via Y = randn(Float32, N, n). We then perform a QR decomposition Q, R = qr(Y) with the qr function from the LinearAlgebra package (this is using Householder reflections internally). The final output are then the first n columns of the Q matrix. \n\n\n\n\n\n","category":"method"},{"location":"library/#Base.vec-Tuple{GeometricMachineLearning.AbstractTriangular}","page":"Library","title":"Base.vec","text":"If vec is applied onto Triangular, then the output is the associated vector. \n\n\n\n\n\n","category":"method"},{"location":"library/#Base.vec-Tuple{SkewSymMatrix}","page":"Library","title":"Base.vec","text":"If vec is applied onto SkewSymMatrix, then the output is the associated vector. 
\n\n\n\n\n\n","category":"method"},{"location":"library/#ChainRulesCore.rrule-Union{Tuple{T}, Tuple{typeof(GeometricMachineLearning.tensor_transpose_mat_mul), AbstractArray{T, 3}, AbstractMatrix{T}}} where T","page":"Library","title":"ChainRulesCore.rrule","text":"This implements the custom pullback for tensortransposemat_mul\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Gradient","page":"Library","title":"GeometricMachineLearning.Gradient","text":"This is an old constructor and will be depricated. For change_q=true it is equivalent to GradientLayerQ; for change_q=false it is equivalent to GradientLayerP.\n\nIf full_grad=false then ActivationLayer is called\n\n\n\n\n\n","category":"function"},{"location":"library/#GeometricMachineLearning.Transformer-Tuple{Integer, Integer, Integer}","page":"Library","title":"GeometricMachineLearning.Transformer","text":"The architecture for a \"transformer encoder\" is essentially taken from arXiv:2010.11929, but with the difference that no layer normalization is employed. This is because we still need to find a generalization of layer normalization to manifolds. \n\nThe transformer is called with the following inputs: \n\ndim: the dimension of the transformer \nn_heads: the number of heads \nL: the number of transformer blocks\n\nIn addition we have the following optional arguments: \n\nactivation: the activation function used for the ResNet (tanh by default)\nStiefel::Bool: if the matrices P^V, P^Q and P^K should live on a manifold (false by default)\nretraction: which retraction should be used (Geodesic() by default)\nadd_connection::Bool: if the input should by added to the ouput after the MultiHeadAttention layer is used (true by default)\nuse_bias::Bool: If the ResNet should use a bias (true by default)\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.accuracy-Union{Tuple{BT}, Tuple{AT}, Tuple{T1}, Tuple{T}, Tuple{Chain, Tuple, DataLoader{T, AT, BT}}} where {T, T1<:Integer, AT<:(AbstractArray{T}), BT<:(AbstractArray{T1})}","page":"Library","title":"GeometricMachineLearning.accuracy","text":"Computes the accuracy (as opposed to the loss) of a neural network classifier. 
\n\nIt takes as input:\n\nmodel::Chain\nps: parameters of the network\ndl::DataLoader\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.apply_layer_to_nt_and_return_array-Tuple{AbstractArray, AbstractNeuralNetworks.AbstractExplicitLayer, NamedTuple}","page":"Library","title":"GeometricMachineLearning.apply_layer_to_nt_and_return_array","text":"This function is used in the wrappers where the input to the SympNet layers is not a NamedTuple (as it should be) but an AbstractArray.\n\nIt converts the Array to a NamedTuple (via assign_q_and_p), then calls the SympNet routine(s) and converts back to an AbstractArray (with vcat).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.apply_section!-Union{Tuple{AT}, Tuple{T}, Tuple{AT, GlobalSection{T, AT}, AT}} where {T, AT<:(StiefelManifold{T, AT} where AT<:AbstractMatrix{T})}","page":"Library","title":"GeometricMachineLearning.apply_section!","text":"apply_section!(Y::AT, λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT<:StiefelManifold{T}}\n\nApply λY to Y₂ and store the result in Y.\n\nThe inplace version of apply_section.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.apply_section-Union{Tuple{AT}, Tuple{T}, Tuple{GlobalSection{T, AT}, AT}} where {T, AT<:(StiefelManifold{T, AT} where AT<:AbstractMatrix{T})}","page":"Library","title":"GeometricMachineLearning.apply_section","text":"apply_section(λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT <: StiefelManifold{T}}\n\nApply λY to Y₂.\n\nMathematically this is the group action of the element lambdaYinG on the element Y_2 of the homogeneous space mathcalM.\n\nInternally it calls the inplace version apply_section!.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_batch_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.assign_batch_kernel!","text":"Takes as input a batch tensor (to which the data are assigned), the whole data tensor and two vectors params and time_steps that include the specific parameters and time steps we want to assign. \n\nNote that this assigns sequential data! For e.g. being processed by a transformer.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_output_estimate-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, Int64}} where T","page":"Library","title":"GeometricMachineLearning.assign_output_estimate","text":"The function assign_output_estimate is closely related to the transformer. It takes the last prediction_window columns of the output and uses them for the final prediction. i.e.\n\nmathbbR^NtimesmathttpwtomathbbR^Ntimesmathttpw \nbeginbmatrix \n z^(1)_1 cdots z^(T)_1 \n cdots cdots cdots \n z^(1)_n cdots z^(T)_n\n endbmatrix mapsto \n beginbmatrix \n z^(T - mathttpw)_1 cdots z^(T)_1 \n cdots cdots cdots \n z^(T - mathttpw)_n cdots z^(T)_nendbmatrix \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_output_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.assign_output_kernel!","text":"This should be used together with assign_batch_kernel!. It assigns the corresponding output (i.e. target).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.assign_q_and_p-Tuple{AbstractVector, Int64}","page":"Library","title":"GeometricMachineLearning.assign_q_and_p","text":"Allocates two new arrays q and p whose first dimension is half of that of the input x. 
This should also be supplied through the second argument N.\n\nThe output is a Tuple containing q and p.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.augment_zeros_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.augment_zeros_kernel!","text":"Used for differentiating assignoutputestimate (this appears in the loss). \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.build_v_reduced-Tuple{Any, Any, NeuralNetwork{<:GeometricMachineLearning.SymplecticDecoder}}","page":"Library","title":"GeometricMachineLearning.build_v_reduced","text":"Builds the reduced vector field based on the full vector field for a Hamiltonian system. We derive the reduced vector field via the reduced Hamiltonian: tildeH = HcircPsi^mathrmdec. We then get \n\nmathbbJ_2nnabla_xitildeH = mathbbJ_2n(nablaPsi^mathrmdec)^TmathbbJ_2N^TmathbbJ_2Nnabla_zH = mathbbJ_2n(nablaPsi^mathrmdec)^TmathbbJ_2N^T beginpmatrix v(z) f(z) endpmatrix = beginpmatrix - (nabla_pPsi_q)^Tf(z) + (nabla_pPsi_p)^Tv(z) (nabla_qPsi_q)^Tf(z) - (nabla_qPsi_p)^Tv(z) endpmatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.compute_iterations-Tuple{Integer, Integer, Integer}","page":"Library","title":"GeometricMachineLearning.compute_iterations","text":"This function gives iterations from the full dimension to the reduced dimension (i.e. the intermediate steps). The iterations are given in ascending order. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.compute_iterations_for_symplectic_system-Tuple{Integer, Integer, Integer}","page":"Library","title":"GeometricMachineLearning.compute_iterations_for_symplectic_system","text":"This function gives iterations from the full dimension to the reduced dimension (i.e. the intermediate steps). The iterations are given in ascending order. Only even steps are allowed here.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.compute_output_of_mha-Union{Tuple{T}, Tuple{M}, Tuple{MultiHeadAttention{M, M}, AbstractMatrix{T}, NamedTuple}} where {M, T}","page":"Library","title":"GeometricMachineLearning.compute_output_of_mha","text":"Applies MHA to an abstract matrix. This is the same independent of whether the input is added to the output or not. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.convert_input_and_batch_indices_to_array-Union{Tuple{BT}, Tuple{AT}, Tuple{T}, Tuple{DataLoader{T, BT}, Batch, Vector{Tuple{Int64, Int64}}}} where {T, AT<:AbstractArray{T, 3}, BT<:@NamedTuple{q::AT, p::AT}}","page":"Library","title":"GeometricMachineLearning.convert_input_and_batch_indices_to_array","text":"Takes the output of the batch functor and uses it to create the corresponding array (NamedTuples). \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.convert_input_and_batch_indices_to_array-Union{Tuple{BT}, Tuple{T}, Tuple{DataLoader{T, BT}, Batch, Vector{Tuple{Int64, Int64}}}} where {T, BT<:AbstractArray{T, 3}}","page":"Library","title":"GeometricMachineLearning.convert_input_and_batch_indices_to_array","text":"Takes the output of the batch functor and uses it to create the corresponding array. 
\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.crop_array_for_transformer_loss-Union{Tuple{BT}, Tuple{AT}, Tuple{T2}, Tuple{T}, Tuple{AT, BT}} where {T, T2, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T2, 3}}","page":"Library","title":"GeometricMachineLearning.crop_array_for_transformer_loss","text":"This crops the output array of the neural network so that it conforms with the output it should be compared to. This is needed for the transformer loss. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.custom_mat_mul-Tuple{AbstractMatrix, AbstractVecOrMat}","page":"Library","title":"GeometricMachineLearning.custom_mat_mul","text":"Multiplies a matrix with a vector, a matrix or a tensor.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.decoder_layers_from_iteration-Tuple{GeometricMachineLearning.AutoEncoder, AbstractVector{<:Integer}}","page":"Library","title":"GeometricMachineLearning.decoder_layers_from_iteration","text":"Takes as input the autoencoder architecture and a vector of integers specifying the layer dimensions in the decoder. Has to return a tuple of AbstractExplicitLayers.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.draw_batch!-Union{Tuple{T}, Tuple{AbstractMatrix{T}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.draw_batch!","text":"This assigns the batch if the data are in form of a matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.encoder_layers_from_iteration-Tuple{GeometricMachineLearning.AutoEncoder, AbstractVector{<:Integer}}","page":"Library","title":"GeometricMachineLearning.encoder_layers_from_iteration","text":"Takes as input the autoencoder architecture and a vector of integers specifying the layer dimensions in the encoder. 
Has to return a tuple of AbstractExplicitLayers.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.geodesic-Union{Tuple{T}, Tuple{Manifold{T}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.geodesic","text":"geodesic(Y::Manifold, Δ)\n\nTake as input an element of a manifold Y and a tangent vector in Δ in the corresponding tangent space and compute the geodesic (exponential map).\n\nIn different notation: take as input an element x of mathcalM and an element of T_xmathcalM and return mathttgeodesic(x v_x) = exp(v_x) For example: \n\nY = rand(StiefelManifold{Float64}, N, n)\nΔ = rgrad(Y, rand(N, n))\ngeodesic(Y, Δ)\n\nSee the docstring for rgrad for details on this function.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.global_rep-Union{Tuple{AT}, Tuple{T}, Tuple{GlobalSection{T, AT}, AbstractMatrix{T}}} where {T, AT<:(GrassmannManifold{T, AT} where AT<:AbstractMatrix{T})}","page":"Library","title":"GeometricMachineLearning.global_rep","text":"global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:GrassmannManifold{T}}\n\nExpress Δ (an the tangent space of Y) as an instance of GrassmannLieAlgHorMatrix.\n\nThe method global_rep for GrassmannManifold is similar to that for StiefelManifold.\n\nExamples\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: _round\nimport Random \n\nRandom.seed!(123)\n\nY = rand(GrassmannManifold, 6, 3)\nΔ = rgrad(Y, randn(6, 3))\nλY = GlobalSection(Y)\n\n_round(global_rep(λY, Δ); digits = 3)\n\n# output\n\n6×6 GrassmannLieAlgHorMatrix{Float64, Matrix{Float64}}:\n 0.0 0.0 0.0 0.981 -2.058 0.4\n 0.0 0.0 0.0 -0.424 0.733 -0.919\n 0.0 0.0 0.0 -1.815 1.409 1.085\n -0.981 0.424 1.815 0.0 0.0 0.0\n 2.058 -0.733 -1.409 0.0 0.0 0.0\n -0.4 0.919 -1.085 0.0 0.0 0.0\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.global_rep-Union{Tuple{AT}, Tuple{T}, Tuple{GlobalSection{T, AT}, AbstractMatrix{T}}} where {T, AT<:(StiefelManifold{T, AT} where AT<:AbstractMatrix{T})}","page":"Library","title":"GeometricMachineLearning.global_rep","text":"global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:StiefelManifold{T}}\n\nExpress Δ (an the tangent space of Y) as an instance of StiefelLieAlgHorMatrix.\n\nThis maps an element from T_YmathcalM to an element of mathfrakg^mathrmhor. 
\n\nThese two spaces are isomorphic, and the isomorphism is established through lambda(Y)inG via:\n\nT_YmathcalM to mathfrakg^mathrmhor Delta mapsto lambda(Y)^-1Omega(Y Delta)lambda(Y)\n\nAlso see GeometricMachineLearning.Ω.\n\nExamples\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: _round\nimport Random \n\nRandom.seed!(123)\n\nY = rand(StiefelManifold, 6, 3)\nΔ = rgrad(Y, randn(6, 3))\nλY = GlobalSection(Y)\n\n_round(global_rep(λY, Δ); digits = 3)\n\n# output\n\n6×6 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, Matrix{Float64}}:\n 0.0 0.679 1.925 0.981 -2.058 0.4\n -0.679 0.0 0.298 -0.424 0.733 -0.919\n -1.925 -0.298 0.0 -1.815 1.409 1.085\n -0.981 0.424 1.815 0.0 0.0 0.0\n 2.058 -0.733 -1.409 0.0 0.0 0.0\n -0.4 0.919 -1.085 0.0 0.0 0.0\n\nImplementation\n\nThe function global_rep does not in fact perform the entire map lambda(Y)^-1Omega(Y Delta)lambda(Y) but only\n\nDelta mapsto mathrmskew(Y^TDelta)\n\nto get the small skew-symmetric matrix and \n\nDelta mapsto (lambda(Y)_1N nN^T Delta)_1(N-n) 1n\n\nfor the arbitrary matrix.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.global_section-Union{Tuple{GrassmannManifold{T, AT} where AT<:AbstractMatrix{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.global_section","text":"global_section(Y::GrassmannManifold)\n\nCompute a matrix of size Ntimes(N-n) whose columns are orthogonal to the columns in Y.\n\nThe method global_section for the Grassmann manifold is equivalent to that for the StiefelManifold (we represent the Grassmann manifold as an embedding in the Stiefel manifold). \n\nSee the documentation for global_section(Y::StiefelManifold{T}) where T. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.global_section-Union{Tuple{StiefelManifold{T, AT} where AT<:AbstractMatrix{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.global_section","text":"global_section(Y::StiefelManifold)\n\nCompute a matrix of size Ntimes(N-n) whose columns are orthogonal to the columns in Y.\n\nThis matrix is also called Y_perp [6, 10, 11].\n\nExamples\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: global_section\nimport Random\n\nRandom.seed!(123)\n\nY = StiefelManifold([1. 0.; 0. 1.; 0. 0.; 0. 0.])\n\nround.(Matrix(global_section(Y)); digits = 3)\n\n# output\n\n4×2 Matrix{Float64}:\n 0.0 -0.0\n 0.0 0.0\n 0.936 -0.353\n 0.353 0.936\n\nFurther note that we convert the QRCompactWYQ object to a Matrix before we display it.\n\nImplementation\n\nThe implementation is done with a QR decomposition (LinearAlgebra.qr!). Internally we do: \n\nA = randn(N, N - n) # or the gpu equivalent\nA = A - Y.A * (Y.A' * A)\nqr!(A).Q\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.init_optimizer_cache-Tuple{GradientOptimizer, Any}","page":"Library","title":"GeometricMachineLearning.init_optimizer_cache","text":"Wrapper for the functions setup_adam_cache, setup_momentum_cache, setup_gradient_cache, setup_bfgs_cache. These appear outside of optimizer_caches.jl because the OptimizerMethods first have to be defined.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.initialize_hessian_inverse-Union{Tuple{AbstractArray{T}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.initialize_hessian_inverse","text":"This initializes the inverse of the Hessian for various arrays. 
This requires an implementation of a vectorization operation vec. This is important for custom arrays.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.map_index_for_symplectic_potential-Tuple{Int64, Int64}","page":"Library","title":"GeometricMachineLearning.map_index_for_symplectic_potential","text":"This assigns the right index for the symplectic potential. To be used with assign_ones_for_symplectic_potential_kernel!.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul!-Tuple{Any, Any, Any}","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul!","text":"mat_tensor_mul!(C, A, B)\n\nMultiply the matrix A onto the tensor B from the left and store the result in C.\n\nAlso checks the bounds of the input arrays.\n\nThe function mat_tensor_mul calls mat_tensor_mul! internally.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul!-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, LowerTriangular{T, AT} where AT<:AbstractVector{T}, AbstractArray{T, 3}}} where T","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul!","text":"mat_tensor_mul!(C::AbstractArray{T, 3}, A::LowerTriangular{T}, B::AbstractArray{T, 3}) where T\n\nMultiply the lower-triangular matrix A onto the tensor B from the left and store the result in C.\n\nAlso checks the bounds of the input arrays. \n\nThis performs an efficient multiplication based on the special structure of the lower-triangular matrix A.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul!-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, SkewSymMatrix{T, AT} where AT<:AbstractVector{T}, AbstractArray{T, 3}}} where T","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul!","text":"mat_tensor_mul!(C::AbstractArray{T, 3}, A::SkewSymMatrix{T}, B::AbstractArray{T, 3}) where T\n\nMultiply skew-symmetric the matrix A onto the tensor B from the left and store the result in C.\n\nAlso checks the bounds of the input arrays. \n\nThis performs an efficient multiplication based on the special structure of the skew-symmetric matrix A.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul!-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, SymmetricMatrix{T, AT} where AT<:AbstractVector{T}, AbstractArray{T, 3}}} where T","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul!","text":"mat_tensor_mul!(C::AbstractArray{T, 3}, A::SymmetricMatrix{T}, B::AbstractArray{T, 3}) where T\n\nMultiply the symmetric matrix A onto the tensor B from the left and store the result in C.\n\nAlso checks the bounds of the input arrays. \n\nThis performs an efficient multiplication based on the special structure of the symmetric matrix A.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul!-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, UpperTriangular{T, AT} where AT<:AbstractVector{T}, AbstractArray{T, 3}}} where T","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul!","text":"mat_tensor_mul!(C::AbstractArray{T, 3}, A::UpperTriangular{T}, B::AbstractArray{T, 3}) where T\n\nMultiply the upper-triangular matrix A onto the tensor B from the left and store the result in C.\n\nAlso checks the bounds of the input arrays. 
\n\nThis performs an efficient multiplication based on the special structure of the upper-triangular matrix A.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul-Union{Tuple{AT}, Tuple{ST}, Tuple{BT}, Tuple{T}, Tuple{AT, AbstractArray{T, 3}}} where {T, BT<:(AbstractArray{T}), ST<:StiefelManifold{T, BT}, AT<:LinearAlgebra.Adjoint{T, ST}}","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul","text":"Extend mat_tensor_mul to a multiplication by the adjoint of an element of StiefelManifold. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul-Union{Tuple{T}, Tuple{AbstractMatrix{T}, AbstractArray{T, 3}}} where T","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul","text":"mat_tensor_mul(A::AbstractMatrix{T}, B::AbstractArray{T, 3}) where T\n\nMultipliy the matrix A onto the tensor B from the left. \n\nInternally this calls the inplace version mat_tensor_mul!.\n\nExamples\n\nusing GeometricMachineLearning: mat_tensor_mul\n\nB = [1 1 1; 1 1 1; 1 1 1;;; 2 2 2; 2 2 2; 2 2 2]\nA = [3 0 0; 0 2 0; 0 0 1]\n\nmat_tensor_mul(A, B)\n\n# output\n\n3×3×2 Array{Int64, 3}:\n[:, :, 1] =\n 3 3 3\n 2 2 2\n 1 1 1\n\n[:, :, 2] =\n 6 6 6\n 4 4 4\n 2 2 2\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.mat_tensor_mul-Union{Tuple{T}, Tuple{StiefelManifold, AbstractArray{T, 3}}} where T","page":"Library","title":"GeometricMachineLearning.mat_tensor_mul","text":"Extend mat_tensor_mul to a multiplication by an element of StiefelManifold. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.metric-Tuple{GrassmannManifold, AbstractMatrix, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.metric","text":"metric(Y::GrassmannManifold, Δ₁::AbstractMatrix, Δ₂::AbstractMatrix)\n\nCompute the metric for vectors Δ₁ and Δ₂ at Y. \n\nThe representation of the Grassmann manifold is realized as a quotient space of the Stiefel manifold. \n\nThe metric for the Grassmann manifold is:\n\ng^Gr_Y(Delta_1 Delta_2) = g^St_Y(Delta_1 Delta_2) = mathrmTr(Delta_1^T (mathbbI - Y Y^T) Delta_2) = mathrmTr(Delta_1^T Delta_2)\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.metric-Tuple{StiefelManifold, AbstractMatrix, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.metric","text":"Implements the canonical Riemannian metric for the Stiefel manifold:\n\ng_Y (Delta_1 Delta_2) mapsto mathrmtr(Delta_1^T(mathbbI - frac12YY^T)Delta_2)\n\nIt is called with: \n\nY::StiefelManifold\nΔ₁::AbstractMatrix\nΔ₂::AbstractMatrix\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.number_of_batches-Union{Tuple{OT}, Tuple{AT}, Tuple{BT}, Tuple{T}, Tuple{DataLoader{T, AT, OT, :TimeSeries}, Batch}} where {T, BT<:AbstractArray{T, 3}, AT<:Union{@NamedTuple{q::BT, p::BT}, BT}, OT}","page":"Library","title":"GeometricMachineLearning.number_of_batches","text":"Gives the number of batches. Inputs are of type DataLoader and Batch.\n\nHere the big distinction is between data that are time-series like and data that are autoencoder like.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.onehotbatch-Union{Tuple{AbstractVector{T}}, Tuple{T}} where T<:Integer","page":"Library","title":"GeometricMachineLearning.onehotbatch","text":"One-hot-batch encoding of a vector of integers: inputin01ldots9^ell. The output is a tensor of shape 10times1timesell. 
\n\n0 mapsto beginbmatrix 1 0 ldots 0 endbmatrix\n\nIn more abstract terms: i mapsto e_i.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.optimization_step!-Tuple{Optimizer, Chain, Tuple, Tuple}","page":"Library","title":"GeometricMachineLearning.optimization_step!","text":"Optimization for an entire neural network, the way this function should be called. \n\ninputs: \n\no::Optimizer\nmodel::Chain\nps::Tuple\ndx::Tuple\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.optimization_step!-Tuple{Optimizer, Union{AbstractNeuralNetworks.AbstractExplicitCell, AbstractNeuralNetworks.AbstractExplicitLayer}, NamedTuple, NamedTuple, NamedTuple}","page":"Library","title":"GeometricMachineLearning.optimization_step!","text":"Optimization for a single layer. \n\ninputs: \n\no::Optimizer\nd::Union{AbstractExplicitLayer, AbstractExplicitCell}\nps::NamedTuple: the parameters \nC::NamedTuple: NamedTuple of the caches \ndx::NamedTuple: NamedTuple of the derivatives (output of AD routine)\n\nps, C and dx must have the same keys. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.optimize_for_one_epoch!-Union{Tuple{T}, Tuple{Optimizer, Any, Union{Tuple, NamedTuple}, DataLoader{T, AT} where AT<:Union{AbstractArray{T}, NamedTuple}, Batch, Union{typeof(GeometricMachineLearning.loss), GeometricMachineLearning.NetworkLoss}}} where T","page":"Library","title":"GeometricMachineLearning.optimize_for_one_epoch!","text":"Optimize for an entire epoch. For this you have to supply: \n\nan instance of the optimizer.\nthe neural network model \nthe parameters of the model \nthe data (in form of DataLoader)\nin instance of Batch that contains batch_size (and optionally seq_length)\n\nWith the optional argument:\n\nthe loss, which takes the model, the parameters ps and an instance of DataLoader as input.\n\nThe output of optimize_for_one_epoch! is the average loss over all batches of the epoch:\n\noutput = frac1mathttsteps_per_epochsum_t=1^mathttsteps_per_epochloss(theta^(t-1))\n\nThis is done because any reverse differentiation routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of zygote: loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps) (if the loss only depends on the parameters).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.patch_index-Union{Tuple{T}, Tuple{T, T, T}, NTuple{4, T}} where T<:Integer","page":"Library","title":"GeometricMachineLearning.patch_index","text":"Based on coordinates i,j this returns the batch index (for MNIST data set for now).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.rgrad-Tuple{GrassmannManifold, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.rgrad","text":"rgrad(Y::GrassmannManifold, e_grad::AbstractMatrix)\n\nCompute the Riemannian gradient at YinGr(n N). 
\n\nThese gradient have the property that they are orthogonal to the space spanned by Y.\n\nThe precise form of the mapping is: \n\nmathttrgrad(Y nablaL) mapsto nablaL - YY^TnablaL\n\nNote the property Y^Tmathrmrgrad(Y nablaL) = mathbbO\n\nAlso see rgrad(::StiefelManifold, ::AbstractMatrix).\n\nExamples\n\nusing GeometricMachineLearning\n\nY = GrassmannManifold([1 0 ; 0 1 ; 0 0; 0 0])\nΔ = [1 2; 3 4; 5 6; 7 8]\nrgrad(Y, Δ)\n\n# output\n\n4×2 Matrix{Int64}:\n 0 0\n 0 0\n 5 6\n 7 8\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.rgrad-Tuple{StiefelManifold, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.rgrad","text":"rgrad(Y::StiefelManifold, e_grad::AbstractMatrix)\n\nCompute the Riemannian gradient for the Stiefel manifold at YinSt(Nn) based on nablaLinmathbbR^Ntimesn (the Euclidean gradient). \n\nThe function computes the Riemannian gradient with respect to the canonical metric.\n\nThe precise form of the mapping is: \n\nmathttrgrad(Y nablaL) mapsto nablaL - Y(nablaL)^TY\n\nNote the property Y^Tmathrmrgrad(Y nablaL)inmathcalS_mathrmskew(n)\n\nExamples\n\nusing GeometricMachineLearning\n\nY = StiefelManifold([1 0 ; 0 1 ; 0 0; 0 0])\nΔ = [1 2; 3 4; 5 6; 7 8]\nrgrad(Y, Δ)\n\n# output\n\n4×2 Matrix{Int64}:\n 0 -1\n 1 0\n 5 6\n 7 8\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.solve!-Tuple{NeuralNetwork{<:PSDArch}, AbstractMatrix}","page":"Library","title":"GeometricMachineLearning.solve!","text":"PSDArch does not require neural network training since it is a strictly linear operation that can be solved with singular value decomposition (SVD).\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.split_and_flatten-Union{Tuple{AbstractArray{T, 3}}, Tuple{T}} where T","page":"Library","title":"GeometricMachineLearning.split_and_flatten","text":"split_and_flatten takes a tensor as input and produces another one as output (essentially rearranges the input data in an intricate way) so that it can easily be processed with a transformer.\n\nThe optional arguments are: \n\npatch_length: by default this is 7. \nnumber_of_patches: by default this is 16.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.tensor_mat_mul!-Tuple{Any, Any, Any}","page":"Library","title":"GeometricMachineLearning.tensor_mat_mul!","text":"tensor_mat_mul!(C, A, B)\n\nMultiply the matrix B onto the tensor A from the right and store the result in C.\n\nAlso checks the bounds of the input arrays.\n\nThe function tensor_mat_mul calls tensor_mat_mul! 
internally.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.tensor_mat_mul!-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, AbstractArray{T, 3}, SymmetricMatrix{T, AT} where AT<:AbstractVector{T}}} where T","page":"Library","title":"GeometricMachineLearning.tensor_mat_mul!","text":"mat_tensor_mul!(C::AbstractArray{T, 3}, B::AbstractArray{T, 3}, A::SymmetricMatrix{T}) where T\n\nMultiply the symmetric matrix A onto the tensor B from the right and store the result in C.\n\nAlso checks the bounds of the input arrays.\n\nThis performs an efficient multiplication based on the special structure of the symmetric matrix A.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.tensor_mat_mul-Union{Tuple{T}, Tuple{AbstractArray{T, 3}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.tensor_mat_mul","text":"tensor_mat_mul(A::AbstractArray{T, 3}, B::AbstractArray{T}) where T\n\nMultipliy the matrix B onto the tensor A from the right. \n\nInternally this calls the inplace version tensor_mat_mul!.\n\nExamples\n\nusing GeometricMachineLearning: tensor_mat_mul\n\nA = [1 1 1; 1 1 1; 1 1 1;;; 2 2 2; 2 2 2; 2 2 2]\nB = [3 0 0; 0 2 0; 0 0 1]\n\ntensor_mat_mul(A, B)\n\n# output\n\n3×3×2 Array{Int64, 3}:\n[:, :, 1] =\n 3 2 1\n 3 2 1\n 3 2 1\n\n[:, :, 2] =\n 6 4 2\n 6 4 2\n 6 4 2\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.tensor_mat_skew_sym_assign-Union{Tuple{AT}, Tuple{T}, Tuple{AT, AbstractMatrix{T}}} where {T, AT<:AbstractArray{T, 3}}","page":"Library","title":"GeometricMachineLearning.tensor_mat_skew_sym_assign","text":"Takes as input: \n\nZ::AbstractArray{T, 3}: A tensor that stores a bunch of time series. \nA::AbstractMatrix: A matrix that is used to perform various scalar products. \n\nFor one of these time series the function performs the following computation: \n\n (z^(i) z^(j)) mapsto (z^(i))^TAz^(j) text for i j\n\nThe result of this are n(n-2)div2 scalar products. These scalar products are written into a lower-triangular matrix and the final output of the function is a tensor of these lower-triangular matrices. \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.tensor_mat_skew_sym_assign_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.tensor_mat_skew_sym_assign_kernel!","text":"A kernel that computes the weighted scalar products of all combinations of vectors in the matrix Z except where the two vectors are the same and writes the result into a tensor of skew symmetric matrices C. 
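To make the computation described for tensor_mat_skew_sym_assign concrete, the following is a minimal plain-Julia sketch of that operation. The name skew_sym_products_reference is made up for illustration and the library implements this with a specialized kernel, so this is not its actual implementation; in particular the placement of the products in the strictly lower-triangular part is an assumption based on the description above.

using LinearAlgebra

function skew_sym_products_reference(Z::AbstractArray{T, 3}, A::AbstractMatrix{T}) where T
    sys_dim, seq_length, batch_size = size(Z)
    # one lower-triangular matrix of weighted scalar products per time series
    C = zeros(T, seq_length, seq_length, batch_size)
    for k in 1:batch_size, i in 2:seq_length, j in 1:(i - 1)
        # weighted scalar product (z⁽ⁱ⁾)ᵀ A z⁽ʲ⁾ for i > j
        C[i, j, k] = dot(Z[:, i, k], A * Z[:, j, k])
    end
    C
end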
\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.train!","page":"Library","title":"GeometricMachineLearning.train!","text":"train!(...)\n\nPerform the training of a neural network on data using a given training method.\n\nDifferent ways of use:\n\ntrain!(neuralnetwork, data, optimizer = GradientOptimizer(1e-2), training_method; nruns = 1000, batch_size = default(data, type), showprogress = false )\n\nArguments\n\nneuralnetwork::LuxNeuralNetwork : the neural network using LuxBackend\ndata : the data (see TrainingData)\noptimizer = GradientOptimizer: the optimization method (see Optimizer)\ntraining_method : specifies the loss function used \nnruns : number of iterations through the process (default value: 1000) \nbatch_size : size of the batch of data used for each step\n\n\n\n\n\n","category":"function"},{"location":"library/#GeometricMachineLearning.train!-Tuple{AbstractNeuralNetworks.AbstractNeuralNetwork{<:AbstractNeuralNetworks.Architecture}, AbstractTrainingData, TrainingParameters}","page":"Library","title":"GeometricMachineLearning.train!","text":"train!(neuralnetwork, data, optimizer, training_method; nruns = 1000, batch_size, showprogress = false )\n\nArguments\n\nneuralnetwork::LuxNeuralNetwork : the neural network using LuxBackend\ndata::AbstractTrainingData : the data\n``\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.within_patch_index-Union{Tuple{T}, Tuple{T, T, T}} where T<:Integer","page":"Library","title":"GeometricMachineLearning.within_patch_index","text":"Based on coordinates i,j this returns the index within the batch.\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.write_ones_kernel!-Tuple{Any}","page":"Library","title":"GeometricMachineLearning.write_ones_kernel!","text":"Kernel that is needed for functions relating to SymmetricMatrix and SkewSymMatrix \n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Ω-Union{Tuple{T}, Tuple{GrassmannManifold{T, AT} where AT<:AbstractMatrix{T}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.Ω","text":"Ω(Y::GrassmannManifold{T}, Δ::AbstractMatrix{T}) where T\n\nPerform the canonical horizontal lift for the Grassmann manifold:\n\n Delta mapsto Omega^St(Y Δ)\n\nwhere Omega^St is the canonical horizontal lift for the Stiefel manifold.\n\nusing GeometricMachineLearning\nE = GrassmannManifold(StiefelProjection(5, 2))\nΔ = [0. 0.; 0. 0.; 2. 3.; 4. 5.; 6. 7.]\nGeometricMachineLearning.Ω(E, Δ)\n\n# output\n\n5×5 SkewSymMatrix{Float64, Vector{Float64}}:\n 0.0 -0.0 -2.0 -4.0 -6.0\n 0.0 0.0 -3.0 -5.0 -7.0\n 2.0 3.0 0.0 -0.0 -0.0\n 4.0 5.0 0.0 0.0 -0.0\n 6.0 7.0 0.0 0.0 0.0\n\n\n\n\n\n","category":"method"},{"location":"library/#GeometricMachineLearning.Ω-Union{Tuple{T}, Tuple{StiefelManifold{T, AT} where AT<:AbstractMatrix{T}, AbstractMatrix{T}}} where T","page":"Library","title":"GeometricMachineLearning.Ω","text":"Ω(Y::StiefelManifold{T}, Δ::AbstractMatrix{T}) where T\n\nPerform the canonical horizontal lift for the Stiefel manifold:\n\n Delta mapsto (mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)\n\nInternally this performs \n\nSkewSymMatrix(2 * (I(n) - .5 * Y * Y') * Δ * Y')\n\nto save memory. \n\nExamples\n\nusing GeometricMachineLearning\nE = StiefelManifold(StiefelProjection(5, 2))\nΔ = [0. -1.; 1. 0.; 2. 3.; 4. 5.; 6. 
7.]\nGeometricMachineLearning.Ω(E, Δ)\n\n# output\n\n5×5 SkewSymMatrix{Float64, Vector{Float64}}:\n 0.0 -1.0 -2.0 -4.0 -6.0\n 1.0 0.0 -3.0 -5.0 -7.0\n 2.0 3.0 0.0 -0.0 -0.0\n 4.0 5.0 0.0 0.0 -0.0\n 6.0 7.0 0.0 0.0 0.0\n\nNote that the output of Ω is a skew-symmetric matrix, i.e. an element of mathfrakg.\n\n\n\n\n\n","category":"method"},{"location":"optimizers/adam_optimizer/#The-Adam-Optimizer","page":"Adam Optimizer","title":"The Adam Optimizer","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"The Adam Optimizer is one of the most widely used (if not the most widely used) neural network optimizers. Like most modern neural network optimizers it contains a cache that is updated based on first-order gradient information and then, in a second step, the cache is used to compute a velocity estimate for updating the neural network weights. ","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"Here we first describe the Adam algorithm for the case where all the weights are on a vector space and then show how to generalize this to the case where the weights are on a manifold. ","category":"page"},{"location":"optimizers/adam_optimizer/#All-weights-on-a-vector-space","page":"Adam Optimizer","title":"All weights on a vector space","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"The cache of the Adam optimizer consists of first and second moments. The first moments B_1 store linear information about the current and previous gradients, and the second moments B_2 store quadratic information about current and previous gradients (all computed from a first-order gradient). ","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"If all the weights are on a vector space, then we directly compute updates for B_1 and B_2:","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"B_1 gets ((rho_1 - rho_1^t)(1 - rho_1^t))cdotB_1 + (1 - rho_1)(1 - rho_1^t)cdotnablaL\nB_2 gets ((rho_2 - rho_2^t)(1 - rho_2^t))cdotB_2 + (1 - rho_2)(1 - rho_2^t)cdotnablaLodotnablaL\nwhere odotmathbbR^ntimesmathbbR^ntomathbbR^n is the Hadamard product: aodotb_i = a_ib_i. rho_1 and rho_2 are hyperparameters. Their defaults, rho_1=09 and rho_2=099, are taken from [goodfellow2016deep; page 301]. After having updated the cache (i.e. B_1 and B_2) we compute a velocity (step 3) with which the parameters Y_t are then updated (step 4).\nW_tgets -etaB_1sqrtB_2 + delta\nY_t+1 gets Y_t + W_t","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"Here eta (with default 0.01) is the learning rate and delta (with default 3cdot10^-7) is a small constant that is added for stability. The division, square root and addition in step 3 are performed element-wise. 
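To make steps 1 to 4 concrete, here is a minimal sketch of a single Adam update for weights on a vector space. The function name adam_step_sketch is made up for illustration and does not reflect how the optimizer is implemented in the library.

function adam_step_sketch(Y, ∇L, B₁, B₂, t; ρ₁ = 0.9, ρ₂ = 0.99, η = 0.01, δ = 3e-7)
    # steps 1 and 2: update the first and second moments
    B₁ = ((ρ₁ - ρ₁^t) / (1 - ρ₁^t)) * B₁ + ((1 - ρ₁) / (1 - ρ₁^t)) * ∇L
    B₂ = ((ρ₂ - ρ₂^t) / (1 - ρ₂^t)) * B₂ + ((1 - ρ₂) / (1 - ρ₂^t)) * (∇L .* ∇L)
    # step 3: element-wise velocity; step 4: update of the weights
    W = -η * B₁ ./ (sqrt.(B₂) .+ δ)
    Y + W, B₁, B₂
end

Note that for t = 1 the moment updates reduce to B₁ = ∇L and B₂ = ∇L ⊙ ∇L, which can serve as a quick sanity check of the formulas above.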
","category":"page"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"Main.include_graphics(\"../tikz/adam_optimizer\") # hide","category":"page"},{"location":"optimizers/adam_optimizer/#Weights-on-manifolds","page":"Adam Optimizer","title":"Weights on manifolds","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"The problem with generalizing Adam to manifolds is that the Hadamard product odot as well as the other element-wise operations (, sqrt and + in step 3 above) lack a clear geometric interpretation. In GeometricMachineLearning we get around this issue by utilizing a so-called global tangent space representation. ","category":"page"},{"location":"optimizers/adam_optimizer/#References","page":"Adam Optimizer","title":"References","text":"","category":"section"},{"location":"optimizers/adam_optimizer/","page":"Adam Optimizer","title":"Adam Optimizer","text":"I. Goodfellow, Y. Bengio and A. Courville. Deep learning (MIT press, Cambridge, MA, 2016).\n\n\n\n","category":"page"},{"location":"data_loader/TODO/#DATA-Loader-TODO","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"","category":"section"},{"location":"data_loader/TODO/","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"[x] Implement @views instead of allocating a new array in every step. \n[x] Implement sampling without replacement.\n[x] Store information on the epoch and the current loss. \n[x] Usually the training loss is computed over the entire data set, we are probably going to do this for one epoch via ","category":"page"},{"location":"data_loader/TODO/","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"loss_e = frac1batchessum_batchinbatchesloss(batch)","category":"page"},{"location":"data_loader/TODO/","page":"DATA Loader TODO","title":"DATA Loader TODO","text":"Point 4 makes sense because the output of an AD routine is the value of the loss function as well as the pullback. ","category":"page"},{"location":"data_loader/data_loader/#Data-Loader","page":"Routines","title":"Data Loader","text":"","category":"section"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning, Markdown\nMarkdown.parse(description(Val(:DataLoader)))","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"The data loader can be called with various types of arrays as input, for example a snapshot matrix:","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSnapshotMatrix = rand(Float32, 10, 100)\n\ndl = DataLoader(SnapshotMatrix)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"or a snapshot tensor: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSnapshotTensor = rand(Float32, 10, 100, 5)\n\ndl = DataLoader(SnapshotTensor)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Here the DataLoader has different properties :RegularData and :TimeSeries. This indicates that in the first case we treat all columns in the input tensor independently (this is mostly used for autoencoder problems), whereas in the second case we have time series-like data, which are mostly used for integration problems. 
We can also treat a problem with a matrix as input as a time series-like problem by providing an additional keyword argument: autoencoder=false:","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSnapshotMatrix = rand(Float32, 10, 100)\n\ndl = DataLoader(SnapshotMatrix; autoencoder=false)\ndl.input_time_steps","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning, Markdown\nMarkdown.parse(description(Val(:data_loader_for_named_tuple)))","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nSymplecticSnapshotTensor = (q = rand(Float32, 10, 100, 5), p = rand(Float32, 10, 100, 5))\n\ndl = DataLoader(SymplecticSnapshotTensor)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"dl.input_dim","category":"page"},{"location":"data_loader/data_loader/#The-Batch-struct","page":"Routines","title":"The Batch struct","text":"","category":"section"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning, Markdown\nMarkdown.parse(description(Val(:Batch)))","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nmatrix_data = rand(Float32, 2, 10)\ndl = DataLoader(matrix_data; autoencoder = true)\n\nbatch = Batch(3)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"This also works if the data are in qp form: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nqp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))\ndl = DataLoader(qp_data; autoencoder = true)\n\nbatch = Batch(3)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"In those two examples the autoencoder keyword was set to true (the default). This is why the first index was always 1. This changes if we set autoencoder = false: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nqp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))\ndl = DataLoader(qp_data; autoencoder = false) # false is default \n\nbatch = Batch(3)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Specifically the routines do the following: ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"mathttn_indicesleftarrow mathttn_paramslormathttinput_time_steps \nmathttindices leftarrow mathttshuffle(mathtt1mathttn_indices)\nmathcalI_i leftarrow mathttindices(i - 1) cdot mathttbatch_size + 1 mathtt i cdot mathttbatch_sizetext for i=1 ldots (mathrmlast -1)\nmathcalI_mathttlast leftarrow mathttindices(mathttn_batches - 1) cdot mathttbatch_size + 1mathttend","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Note that the routines are implemented in such a way that no two indices appear double. 
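As a quick illustration of the index-splitting steps listed above, a minimal plain-Julia sketch could look as follows. The function name batch_indices_sketch is made up and is not part of the library's API.

import Random

function batch_indices_sketch(n_indices::Int, batch_size::Int)
    indices = Random.shuffle(1:n_indices)
    n_batches = ceil(Int, n_indices / batch_size)
    # every index ends up in exactly one batch; the last batch may be smaller
    [indices[(i - 1) * batch_size + 1 : min(i * batch_size, n_indices)] for i in 1:n_batches]
end

batch_indices_sketch(10, 3)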
","category":"page"},{"location":"data_loader/data_loader/#Sampling-from-a-tensor","page":"Routines","title":"Sampling from a tensor","text":"","category":"section"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"We can also sample tensor data.","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"using GeometricMachineLearning # hide\n\nqp_data = (q = rand(Float32, 2, 20, 3), p = rand(Float32, 2, 20, 3))\ndl = DataLoader(qp_data)\n\n# also specify sequence length here\nbatch = Batch(4, 5)\nbatch(dl)","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Sampling from a tensor is done the following way (mathcalI_i again denotes the batch indices for the i-th batch): ","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"mathtttime_indices leftarrow mathttshuffle(mathtt1(mathttinput_time_steps - mathttseq_length - mathttprediction_window)\nmathttparameter_indices leftarrow mathttshuffle(mathtt1n_params)\nmathttcomplete_indices leftarrow mathttproduct(mathtttime_indices mathttparameter_indices)\nmathcalI_i leftarrow mathttcomplete_indices(i - 1) cdot mathttbatch_size + 1 i cdot mathttbatch_sizetext for i=1 ldots (mathrmlast -1)\nmathcalI_mathrmlast leftarrow mathttcomplete_indices(mathrmlast - 1) cdot mathttbatch_size + 1mathttend","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"This algorithm can be visualized the following way (here batch_size = 4):","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Main.include_graphics(\"../tikz/tensor_sampling\") # hide","category":"page"},{"location":"data_loader/data_loader/","page":"Routines","title":"Routines","text":"Here the sampling is performed over the second axis (the time step dimension) and the third axis (the parameter dimension). Whereas each block has thickness 1 in the x direction (i.e. pertains to a single parameter), its length in the y direction is seq_length. In total we sample as many such blocks as the batch size is big. By construction those blocks are never the same throughout a training epoch but may intersect each other!","category":"page"},{"location":"manifolds/basic_topology/#Basic-Concepts-from-General-Topology","page":"Concepts from General Topology","title":"Basic Concepts from General Topology","text":"","category":"section"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"On this page we discuss basic notions of topology that are necessary to define manifolds and work with them. Here we largely omit concrete examples and only define concepts that are necessary for defining a manifold[1], namely the properties of being Hausdorff and second countable. For a detailed discussion of the theory and for a wide range of examples that illustrate the theory see e.g. [1]. The here-presented concepts are also (rudimentarily) covered in most differential geometry books such as [2, 3]. ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"[1]: Some authors (see e.g. [2]) do not require these properties. But since they constitute very weak restrictions and are always satisfied by the manifolds relevant for our purposes we require them here. 
","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now start by giving all the definitions, theorem and corresponding proofs that are needed to define manifolds. Every manifold is a topological space which is why we give this definition first: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A **topological space** is a set ``\\mathcal{M}`` for which we define a collection of subsets of ``\\mathcal{M}``, which we denote by ``\\mathcal{T}`` and call the *open subsets*. ``\\mathcal{T}`` further has to satisfy the following three conditions:\n\" *\nMain.indentation * raw\"1. The empty set and ``\\mathcal{M}`` belong to ``\\mathcal{T}``.\n\" *\nMain.indentation * raw\"2. Any union of an arbitrary number of elements of ``\\mathcal{T}`` again belongs to ``\\mathcal{T}``.\n\" *\nMain.indentation * raw\"3. Any intersection of a finite number of elements of ``\\mathcal{T}`` again belongs to ``\\mathcal{T}``.\n\" *\nMain.indentation * \"So an arbitrary union of open sets is again open and a finite intersection of open sets is again open.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Based on this definition of a topological space we can now define what it means to be Hausdorff: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A topological space ``\\mathcal{M}`` is said to be **Hausdorff** if for any two points ``x,y\\in\\mathcal{M}`` we can find two open sets ``U_x,U_y\\in\\mathcal{T}`` s.t. ``x\\in{}U_x, y\\in{}U_y`` and ``U_x\\cap{}U_y=\\{\\}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now give the second definition that we need for defining manifolds, that of second countability:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A topological space ``\\mathcal{M}`` is said to be **second-countable** if we can find a countable subcollection of ``\\mathcal{T}`` called ``\\mathcal{U}`` s.t. ``\\forall{}U\\in\\mathcal{T}`` and ``x\\in{}U`` we can find an element ``V\\in\\mathcal{U}`` for which ``x\\in{}V\\subset{}U``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now give a few definitions and results that are needed for the inverse function theorem which is essential for practical applications of manifold theory. We start with the definition of continuity: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A mapping ``f`` between topological spaces ``\\mathcal{M}`` and ``\\mathcal{N}`` is called **continuous** if the preimage of every open set is again an open set, i.e. 
if ``f^{-1}\\{U\\}\\in\\mathcal{T}`` for ``U`` open in ``\\mathcal{N}`` and ``\\mathcal{T}`` the topology on ``\\mathcal{M}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Continuity can also be formulated in terms of closed sets instead of doing it with open sets. The definition of closed sets is given below:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A **closed set** of a topological space ``\\mathcal{M}`` is one whose complement is an open set, i.e. ``F`` is closed if ``F^c\\in\\mathcal{T}``, where the superscript ``{}^c`` indicates the complement. For closed sets we thus have the following three properties:\n\" *\nMain.indentation * raw\"1. The empty set and ``\\mathcal{M}`` are closed sets.\n\" *\nMain.indentation * raw\"2. Any union of a finite number of closed sets is again closed.\n\" *\nMain.indentation * raw\"3. Any intersection of an arbitrary number of closed sets is again closed.\n\" *\nMain.indentation * \"So a finite union of closed sets is again closed and an arbitrary intersection of closed sets is again closed.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We now give an equivalent definition of continuity: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"The definition of continuity is equivalent to the following, second definition: ``f:\\mathcal{M}\\to\\mathcal{N}`` is continuous if ``f^{-1}\\{F\\}\\subset\\mathcal{M}`` is a closed set for each closed set ``F\\subset\\mathcal{N}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"First assume that ``f`` is continuous according to the first definition and not to the second. Then ``f^{-1}\\{F\\}`` is not closed but ``f^{-1}\\{F^c\\}`` is open. But ``f^{-1}\\{F^c\\} = \\{x\\in\\mathcal{M}:f(x)\\not\\in\\mathcal{N}\\} = (f^{-1}\\{F\\})^c`` cannot be open, else ``f^{-1}\\{F\\}`` would be closed. The implication of the first definition under assumption of the second can be shown analogously.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"The next theorem makes the rather abstract definition of closed sets more concrete; this definition is especially important for many practical proofs:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"The property of a set ``F`` being closed is equivalent to the following statement: If a point ``y`` is such that for every open set ``U`` containing it we have ``U\\cap{}F\\ne\\{\\}`` then this point is contained in ``F``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"We first proof that if a set is closed then the statement holds. Consider a closed set ``F`` and a point ``y\\not\\in{}F`` s.t. every open set containing ``y`` has nonempty intersection with ``F``. 
But the complement ``F^c`` also is such a set, which is a clear contradiction. Now assume the above statement for a set ``F`` and further assume ``F`` is not closed. Its complement ``F^c`` is thus not open. Now consider the *interior* of this set: ``\\mathrm{int}(F^c):=\\cup\\{U:U\\subset{}F^c\\text{ and $U$ open}\\}``, i.e. the biggest open set contained within ``F^c``. Hence there must be a point ``y`` which is in ``F^c`` but is not in its interior, else ``F^c`` would be equal to its interior, i.e. would be open. We further must be able to find an open set ``U`` that contains ``y`` but is also contained in ``F^c``, else ``y`` would be an element of ``F``. A contradiction.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Next we define open covers, a concept that is very important in developing a theory of manifolds: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"An **open cover** of a topological space ``\\mathcal{M}`` is a (not necessarily countable) collection of open sets ``\\{U_i\\}_{i\\mathcal{I}}`` s.t. their union contains ``\\mathcal{M}``. A **finite open cover** is a finite collection of open sets that cover ``\\mathcal{M}``. We say that an open cover is **reducible** to a finite cover if we can find a finite number of elements in the open cover whose union still contains ``\\mathcal{M}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"And connected to this definition we state what it means for a topological space to be compact. This is a rather strong property that some of the manifolds treated in here have, for example the Stiefel manifold.","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.definition(raw\"A topological space ``\\mathcal{M}`` is called **compact** if every open cover is reducible to a finite cover.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"A very important result from general topology is that continuous functions preserve compactness[2]: ","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"[2]: We also say that compactness is a topological property [1].","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"Consider a continuous function ``f:\\mathcal{M}\\to\\mathcal{N}`` and a compact set ``K\\in\\mathcal{M}``. Then ``f(K)`` is also compact.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"Consider an open cover of ``f(K)``: ``\\{U_i\\}_{i\\in\\mathcal{I}}``. Then ``\\{f^{-1}\\{U_i\\}\\}_{i\\in\\mathcal{I}}`` is an open cover of ``K`` and hence reducible to a finite cover ``\\{f^{-1}\\{U_i\\}\\}_{i\\in\\{i_1,\\ldots,i_n\\}}``. 
But then ``\\{U_i\\}_{i\\in\\{i_1,\\ldots,i_n\\}}`` also covers ``f(K)``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Moreover compactness is a property that is inherited by closed subspaces:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"A closed subset of a compact space is compact.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"Call the closed set ``F`` and consider an open cover of this set: ``\\{U_i\\}_{i\\in\\mathcal{I}}``. Then this open cover combined with ``F^c`` is an open cover for the entire compact space, hence reducible to a finite cover.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"A compact subset of a Hausdorff space is closed.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"Consider a compact subset ``K``. If ``K`` is not closed, then there has to be a point ``y\\not\\in{}K`` s.t. every open set containing ``y`` intersects ``K``. Because the surrounding space is Hausdorff we can now find the following two collections of open sets: ``\\{(U_z, U_{z,y}: U_z\\cap{}U_{z,y}=\\{\\})\\}_{z\\in{}K}``. The open cover ``\\{U_z\\}_{z\\in{}K}`` is then reducible to a finite cover ``\\{U_z\\}_{z\\in\\{z_1, \\ldots, z_n\\}}``. The intersection ``\\cap_{z\\in\\{z_1, \\ldots, z_n\\}}U_{z,y}`` is then an open set that contains ``y`` but has no intersection with ``K``. A contradiction.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"We will use this last theorem in proving the inverse function theorem:","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.theorem(raw\"If ``\\mathcal{M}`` is compact and ``\\mathcal{N}`` is Hausdorff, then the inverse of a continuous function ``f:\\mathcal{M}\\to\\mathcal{N}`` is again continuous, i.e. ``f(V)`` is an open set in ``\\mathcal{N}`` for ``V\\in\\mathcal{T}``.\")","category":"page"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"Main.proof(raw\"We can equivalently show that every closed set is mapped to a closed set. First consider the set ``K\\in\\mathcal{M}``. Its image is again compact and hence closed because ``\\mathcal{N}`` is Hausdorff.\")","category":"page"},{"location":"manifolds/basic_topology/#References","page":"Concepts from General Topology","title":"References","text":"","category":"section"},{"location":"manifolds/basic_topology/","page":"Concepts from General Topology","title":"Concepts from General Topology","text":"S. Lipschutz. General Topology (McGraw-Hill Book Company, 1965).\n\n\n\nS. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).\n\n\n\nR. L. Bishop and S. I. Goldberg. 
Tensor Analysis on Manifolds (Dover Publications, 1980).\n\n\n\n","category":"page"},{"location":"layers/linear_symplectic_attention/#Linear-Symplectic-Attention","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"","category":"section"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"The attention layer introduced here is an extension of the Sympnet gradient layer to the setting where we deal with time series data. We first have to define a notion of symplecticity for multi-step methods. ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"This definition is essentially taken from [21, 22] and is similar to the definition of volume-preservation in [23]. ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"Main.definition(raw\"\"\"\nA multi-step method ``\\times_T\\mathbb{R}^{2n}\\to\\times_T\\mathbb{R}^{2n}`` is called **symplectic** if it preserves the symplectic product structure.\n\"\"\")","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"The symplectic product structure is the following skew-symmetric non-degenerate bilinear form: ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"mathbbJ(z^(1) ldots z^(T) tildez^(1) ldots tildez^(T)) = sum_i=1^T (z^(i))^Ttildez^(i)","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"In order to construct a symplectic attention mechanism we extend the principle of the SympNet gradient layer, i.e. we construct scalar functions that only depend on q^(1) ldots q^(T) or p^(1) ldots p^(T). The specific choice we make here is the following: ","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"F(q^(1) q^(T)) = frac12mathrmTr(QAQ^T)","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"where Q = q^(1) ldots q^(T). We therefore have for the gradient:","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"nabla_Qf = frac12Q(A + A^T) = QbarA","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"where ``\\bar{A} := \\frac{1}{2}(A + A^T)`` is symmetric. 
So the map performs:","category":"page"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"q^(1) ldots q^(T) mapsto left sum_i=1^Ta_1iq^(i) ldots sum_i=1^Ta_Tiq^(i) right","category":"page"},{"location":"layers/linear_symplectic_attention/#Library-Functions","page":"Linear Symplectic Attention","title":"Library Functions","text":"","category":"section"},{"location":"layers/linear_symplectic_attention/","page":"Linear Symplectic Attention","title":"Linear Symplectic Attention","text":"LinearSymplecticAttention\nLinearSymplecticAttentionQ\nLinearSymplecticAttentionP","category":"page"},{"location":"layers/linear_symplectic_attention/#GeometricMachineLearning.LinearSymplecticAttention-layers-linear_symplectic_attention","page":"Linear Symplectic Attention","title":"GeometricMachineLearning.LinearSymplecticAttention","text":"Implements the linear symplectic attention layers. Analogous to GradientLayer it performs mappings that only change the Q or the P part. For more information see LinearSymplecticAttentionQ and LinearSymplecticAttentionP.\n\nConstructor\n\nFor the constructors simply call \n\nLinearSymplecticAttentionQ(sys_dim, seq_length)\n\nor \n\nLinearSymplecticAttentionP(sys_dim, seq_length)\n\nwhere sys_dim is the system dimension and seq_length is the sequence length.\n\n\n\n\n\n","category":"type"},{"location":"layers/linear_symplectic_attention/#GeometricMachineLearning.LinearSymplecticAttentionQ-layers-linear_symplectic_attention","page":"Linear Symplectic Attention","title":"GeometricMachineLearning.LinearSymplecticAttentionQ","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). \n\n\n\n\n\n","category":"type"},{"location":"layers/linear_symplectic_attention/#GeometricMachineLearning.LinearSymplecticAttentionP-layers-linear_symplectic_attention","page":"Linear Symplectic Attention","title":"GeometricMachineLearning.LinearSymplecticAttentionP","text":"Performs: \n\nbeginpmatrix Q P endpmatrix mapsto beginpmatrix Q + nabla_PF P endpmatrix\n\nwhere Q PinmathbbR^ntimesT and F(P) = frac12mathrmTr(P A P^T). \n\n\n\n\n\n","category":"type"},{"location":"manifolds/riemannian_manifolds/#Riemannian-Manifolds","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"A Riemannian manifold is a manifold mathcalM that we endow with a mapping g that smoothly[1] assigns a metric g_x to each tangent space T_xmathcalM. By a slight abuse of notation we will also refer to this g as a metric.","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"[1]: Smooth here refers to the fact that gmathcalMtotext(Space of Metrics) has to be a smooth map. But in order to discuss this in detail we would have to define a topology on the space of metrics. 
A more detailed discussion can be found in [2, 3, 5].","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"After having defined a metric g we can associate a length to each curve gamma0 t to mathcalM through: ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"L(gamma) = int_0^t sqrtg_gamma(s)(gamma(s) gamma(s))ds","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"This L turns mathcalM into a metric space:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Main.definition(raw\"The **metric on a Riemannian manifold** ``\\mathcal{M}`` is \n\" * \nMain.indentation * raw\"```math\n\" *\nMain.indentation * raw\"d(x, y) = \\mathrm{inf}_{\\text{$\\gamma(0) = x$ and $\\gamma(t) = y$}}L(\\gamma),\n\" * \nMain.indentation * raw\"```\n\" *\nMain.indentation * raw\"where ``t`` can be chosen arbitrarily.\")","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"If a curve is minimal with respect to the function L we call it the shortest curve or a geodesic. So we say that a curve gamma0 ttomathcalM is a geodesic if there is no shorter curve that can connect two points in gamma(0 t), i.e. ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"d(gamma(t_i) gamma(t_f)) = int_t_i^t_fsqrtg_gamma(s)(gamma(s) gamma(s))ds","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"for any t_i t_fin0 t.","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"An important result of Riemannian geometry states that there exists a vector field X on TmathcalM, called the geodesic spray, whose integral curves are derivatives of geodesics.","category":"page"},{"location":"manifolds/riemannian_manifolds/#Geodesic-Sprays-and-the-Exponential-Map","page":"Riemannian Manifolds","title":"Geodesic Sprays and the Exponential Map","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"To every Riemannian manifold we can naturally associate a vector field called the geodesic spray or geodesic equation. For our purposes it is enough to state that this vector field is unique and well-defined [5].","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"The important property of the geodesic spray is","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Main.theorem(raw\"Given an initial point ``x`` and an initial velocity ``v_x``, an integral curve for the geodesic spray is of the form ``t \\mapsto (\\gamma_{v_x}(t), \\gamma_{v_x}'(t))`` where ``\\gamma_{v_x}`` is a geodesic. 
We further have the property that the integral curve for the geodesic spray for an initial point ``x`` and an initial velocity ``\\eta\\cdot{}v_x`` (where ``\\eta`` is a scalar) is of the form ``t \\mapsto (\\gamma_{\\eta\\cdot{}v_x}(t), \\gamma_{\\eta\\cdot{}v_x}'(t)) = (\\gamma_{v_x}(\\eta\\cdot{}t), \\eta\\cdot\\gamma_{v_x}'(\\eta\\cdot{}t)).``\")","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"It is therefore customary to introduce the exponential map expT_xmathcalMtomathcalM as","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"exp(v_x) = gamma_v_x(1)","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"and we see that gamma_v_x(t) = exp(tcdotv_x). In GeometricMachineLearning we denote the exponential map by geodesic to avoid confusion with the matrix exponential map[2]:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"[2]: The Riemannian exponential map and the matrix exponential map coincide for many matrix Lie groups.","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":" mathttgeodesic(x v_x) equiv exp(v_x)","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"We give an example here:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"using GeometricMachineLearning\nusing CairoMakie # hide\nimport Random # hide\nRandom.seed!(123) # hide\n\nY = rand(StiefelManifold, 3, 1)\n\nv = 5 * rand(3, 1)\nΔ = v - Y * (v' * Y)\n\nfig = Figure(; backgroundcolor = :transparent) # hide\ntext_color = Main.output_type == :html ? 
:white : :black # hide\nax = Axis3(fig[1, 1]; # hide\n backgroundcolor = :transparent, # hide\n aspect = (1., 1., 1.), # hide\n azimuth = π / 6, # hide\n elevation = π / 8, # hide\n xlabel = rich(\"x\", subscript(\"1\"), font = :italic, color = text_color), # hide\n ylabel = rich(\"x\", subscript(\"2\"), font = :italic, color = text_color), # hide\n zlabel = rich(\"x\", subscript(\"3\"), font = :italic, color = text_color), # hide\n ) # hide\n\n# plot a sphere with radius one and origin 0\nsurface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .6)\n\nmorange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide\npoint_vec = ([Y[1]], [Y[2]], [Y[3]])\nscatter!(ax, point_vec...; color = morange, marker = :star5)\n\nmred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide\narrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]])\narrows!(ax, point_vec..., arrow_vec...; color = mred, linewidth = .02)\n\nfig","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"We now solve the geodesic spray for etacdotDelta for eta = 01 02 03 ldots 25 and plot the corresponding points:","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Δ_increments = [Δ * η for η in 0.1 : 0.1 : 2.5]\n\nY_increments = [geodesic(Y, Δ_increment) for Δ_increment in Δ_increments]\n\nfor Y_increment in Y_increments\n scatter!(ax, [Y_increment[1]], [Y_increment[2]], [Y_increment[3]]; \n color = mred, markersize = 5)\nend\n\nfig","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"So a geodesic can be seen as the equivalent of a straight line on a manifold. Also note that we drew a random element form StiefelManifold here and not from S^2. This is because Stiefel manifolds are more general spaces than S^n and also comprise them. ","category":"page"},{"location":"manifolds/riemannian_manifolds/#The-Riemannian-Gradient","page":"Riemannian Manifolds","title":"The Riemannian Gradient","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"The Riemannian gradient of a function LmathcalMtomathbbR is a vector field[3] mathrmgrad^gL (or simply mathrmgradL) for which we have","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"[3]: We also write mathrmgrad^gL(x) = mathrmgrad^g_xL","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":" g_x(mathrmgrad_x^gL v_x) = (nabla_varphi_U(x)(Lcircvarphi_U^-1))^T varphi_U(v_x) ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"where ","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":" nabla_xf = beginpmatrix fracpartialfpartialx_1 cdots fracpartialfpartialx_n endpmatrix","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"is the Euclidean gradient. By the non-degeneracy of g the Riemannian gradient always exists [3]. We will give specific examples of this when discussing the Stiefel manifold and the Grassmann manifold. 
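Since the Stiefel manifold already appeared in the example above, we can sketch how the Riemannian gradient is used in practice. The following snippet relies on the library function rgrad (referenced in the docstring of geodesic below); the dimensions are chosen arbitrarily for illustration:

```julia
using GeometricMachineLearning
import Random; Random.seed!(123)

Y = rand(StiefelManifold{Float64}, 5, 2)    # a point on the Stiefel manifold St(2, 5)
euclidean_gradient = rand(5, 2)             # stand-in for the output of an AD routine
Δ = rgrad(Y, euclidean_gradient)            # Riemannian gradient, an element of T_Y St(2, 5)
Y₂ = geodesic(Y, Δ)                         # map the tangent vector back onto the manifold
```

Here rgrad turns a Euclidean gradient (e.g. the output of an AD routine) into an element of the tangent space, and geodesic maps that tangent vector back onto the manifold.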
","category":"page"},{"location":"manifolds/riemannian_manifolds/#Gradient-Flows-and-Riemannian-Optimization","page":"Riemannian Manifolds","title":"Gradient Flows and Riemannian Optimization","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"In GeometricMachineLearning we can include weights in neural networks that are part of a manifold. Training such neural networks amounts to Riemannian optimization and hence solving the gradient flow equation. The gradient flow equation is given by","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"X(x) = - mathrmgrad_xL","category":"page"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"Solving this gradient flow equation will then lead us to a local minimum on mathcalM. This will be elaborated on when talking about optimizers. In practice we cannot solve the gradient flow equation directly and have to rely on approximations. The most straightforward approximation (and one that serves as a basis for all the optimization algorithms in GeometricMachineLearning) is to take the point (x X(x)) as an initial condition for the geodesic spray and then solve the ODE for a small time step. We will call this ","category":"page"},{"location":"manifolds/riemannian_manifolds/#Library-Functions","page":"Riemannian Manifolds","title":"Library Functions","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"geodesic(::Manifold{T}, ::AbstractMatrix{T}) where T","category":"page"},{"location":"manifolds/riemannian_manifolds/#GeometricMachineLearning.geodesic-Union{Tuple{T}, Tuple{Manifold{T}, AbstractMatrix{T}}} where T-manifolds-riemannian_manifolds","page":"Riemannian Manifolds","title":"GeometricMachineLearning.geodesic","text":"geodesic(Y::Manifold, Δ)\n\nTake as input an element of a manifold Y and a tangent vector in Δ in the corresponding tangent space and compute the geodesic (exponential map).\n\nIn different notation: take as input an element x of mathcalM and an element of T_xmathcalM and return mathttgeodesic(x v_x) = exp(v_x) For example: \n\nY = rand(StiefelManifold{Float64}, N, n)\nΔ = rgrad(Y, rand(N, n))\ngeodesic(Y, Δ)\n\nSee the docstring for rgrad for details on this function.\n\n\n\n\n\n","category":"method"},{"location":"manifolds/riemannian_manifolds/#References","page":"Riemannian Manifolds","title":"References","text":"","category":"section"},{"location":"manifolds/riemannian_manifolds/","page":"Riemannian Manifolds","title":"Riemannian Manifolds","text":"S. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).\n\n\n\nM. P. Do Carmo and J. Flaherty Francis. Riemannian geometry. Vol. 2 (Springer, 1992).\n\n\n\n","category":"page"},{"location":"architectures/transformer/#Standard-Transformer","page":"Standard Transformer","title":"Standard Transformer","text":"","category":"section"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"The transformer is a relatively modern neural network architecture [17] that has come to dominate the field of natural language processing (NLP, [33]) and replaced the previously dominant long-short term memory cells (LSTM, [26]). 
Its success is due to a variety of factors: ","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"unlike LSTMs it consists of very simple building blocks and hence is easier to interpret mathematically,\nit is very flexible in its application and the data it is fed with do not have to conform to a rigid pattern, \ntransformers utilize modern hardware (especially GPUs) very effectively. ","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"The transformer architecture is sketched below: ","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"Main.include_graphics(\"../tikz/transformer_encoder\") # hide","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"It is nothing more than a combination of a multihead attention layer and a residual neural network[1] (ResNet).","category":"page"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"[1]: A ResNet is nothing more than a neural network to whose output we again add the input, i.e. every ResNet is of the form mathrmResNet(x) = x + mathcalNN(x).","category":"page"},{"location":"architectures/transformer/#Library-Functions","page":"Standard Transformer","title":"Library Functions","text":"","category":"section"},{"location":"architectures/transformer/","page":"Standard Transformer","title":"Standard Transformer","text":"StandardTransformerIntegrator","category":"page"},{"location":"architectures/transformer/#GeometricMachineLearning.StandardTransformerIntegrator-architectures-transformer","page":"Standard Transformer","title":"GeometricMachineLearning.StandardTransformerIntegrator","text":"The regular transformer used as an integrator (multi-step method). \n\nThe constructor is called with one argument: \n\nsys_dim::Int\n\nThe following are keyword arguments:\n\ntransformer_dim::Int: the default is transformer_dim = sys_dim.\nn_blocks::Int: the default is 1.\nn_heads::Int: the number of heads in the multihead attention layer (default is n_heads = sys_dim)\nL::Int: the number of transformer blocks (default is L = 2).\nupscaling_activation: by default identity\nresnet_activation: by default tanh\nadd_connection::Bool=true: if the input should be added to the output.\n\n\n\n\n\n","category":"type"},{"location":"optimizers/manifold_related/global_sections/#Global-Sections","page":"Global Sections","title":"Global Sections","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"Global sections are needed for the generalization of Adam and other optimizers to homogeneous spaces. They are necessary to perform the two mappings represented by horizontal and vertical red lines in the section on the general optimizer framework.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#Computing-the-global-section","page":"Global Sections","title":"Computing the global section","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"In differential geometry a section is always associated to some bundle, in our case this bundle is piGtomathcalMAmapstoAE. 
A section is a mapping mathcalMtoG for which pi is a left inverse, i.e. picirclambda = mathrmid. ","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"For the Stiefel manifold St(n N)subsetmathbbR^Ntimesn we compute the global section the following way: ","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"Start with an element YinSt(nN),\nDraw a random matrix AinmathbbR^Ntimes(N-n),\nRemove the subspace spanned by Y from the range of A: AgetsA-YY^TA\nCompute a QR decomposition of A and take as section lambda(Y) = Y Q_1N 1(N-n) = Y barlambda.","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"It is easy to check that lambda(Y)inG=SO(N).","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"In GeometricMachineLearning, GlobalSection takes an element of YinSt(nN)equivStiefelManifold{T} and returns an instance of GlobalSection{T, StiefelManifold{T}}. The application O(N)timesSt(nN)toSt(nN) is done with the functions apply_section! and apply_section.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#Computing-the-global-tangent-space-representation-based-on-a-global-section","page":"Global Sections","title":"Computing the global tangent space representation based on a global section","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"The output of the horizontal lift Omega is an element of mathfrakg^mathrmhorY. For this mapping Omega(Y BY) = B if Binmathfrakg^mathrmhorY, i.e. there is no information loss and no projection is performed. We can map the Binmathfrakg^mathrmhorY to mathfrakg^mathrmhor with Bmapstolambda(Y)^-1Blambda(Y).","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"The function global_rep performs both mappings at once[1], i.e. 
it takes an instance of GlobalSection and an element of T_YSt(nN), and then returns an element of frakg^mathrmhorequivStiefelLieAlgHorMatrix.","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"[1]: For computational reasons.","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"In practice we use the following: ","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"beginaligned\nlambda(Y)^TOmega(YDelta)lambda(Y) = lambda(Y)^T(mathbbI - frac12YY^T)DeltaY^T - YDelta^T(mathbbI - frac12YY^T)lambda(Y) \n = lambda(Y)^T(mathbbI - frac12YY^T)DeltaE^T - YDelta^T(lambda(Y) - frac12YE^T) \n = lambda(Y)^TDeltaE^T - frac12EY^TDeltaE^T - EDelta^Tlambda(Y) + frac12EDelta^TYE^T \n = beginbmatrix Y^TDeltaE^T barlambdaDeltaE^T endbmatrix - frac12EY^TDeltaE - beginbmatrix EDelta^TY EDelta^Tbarlambda endbmatrix + frac12EDelta^TYE^T \n = beginbmatrix Y^TDeltaE^T barlambdaDeltaE^T endbmatrix + EDelta^TYE^T - beginbmatrixEDelta^TY EDelta^Tbarlambda endbmatrix \n = EY^TDeltaE^T + EDelta^TYE^T - EDelta^TYE^T + beginbmatrix mathbbO barlambdaDeltaE^T endbmatrix - beginbmatrix mathbbO EDelta^Tbarlambda endbmatrix \n = EY^TDeltaE^T + beginbmatrix mathbbO barlambdaDeltaE^T endbmatrix - beginbmatrix mathbbO EDelta^Tbarlambda endbmatrix\nendaligned","category":"page"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"meaning that for an element of the horizontal component of the Lie algebra mathfrakg^mathrmhor we store A=Y^TDelta and B=barlambda^TDelta.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#Optimization","page":"Global Sections","title":"Optimization","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"The output of global_rep is then used for all the optimization steps.","category":"page"},{"location":"optimizers/manifold_related/global_sections/#References","page":"Global Sections","title":"References","text":"","category":"section"},{"location":"optimizers/manifold_related/global_sections/","page":"Global Sections","title":"Global Sections","text":"T. Frankel. The geometry of physics: an introduction (Cambridge university press, Cambridge, UK, 2011).\n\n\n\n","category":"page"},{"location":"optimizers/bfgs_optimizer/#The-BFGS-Algorithm","page":"BFGS Optimizer","title":"The BFGS Algorithm","text":"","category":"section"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The presentation shown here is largely taken from chapters 3 and 6 of reference [15] with a derivation based on an online comment. The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second order optimizer that can be also be used to train a neural network.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"It is a version of a quasi-Newton method and is therefore especially suited for convex problems. 
As is the case with any other (quasi-)Newton method the BFGS algorithm approximates the objective with a quadratic function in each optimization step:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"m_k(x) = f(x_k) + (nabla_x_kf)^T(x - x_k) + frac12(x - x_k)^TB_k(x - x_k)","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where B_k is referred to as the approximate Hessian. We further require B_k to be symmetric and positive definite. Differentiating the above expression and setting the derivative to zero gives us: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"nabla_xm_k = nabla_x_kf + B_k(x - x_k) = 0","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"or written differently: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"x - x_k = -B_k^-1nabla_x_kf","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"This value we will from now on call p_k = x - x_k and refer to as the search direction. The new iterate then is: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"x_k+1 = x_k + alpha_kp_k","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where alpha_k is the step length. Techniques that describe how to pick an appropriate alpha_k are called line-search methods and are discussed below. First we discuss what requirements we impose on B_k. A first reasonable condition would be to require the gradient of m_k to be equal to that of f at the points x_k-1 and x_k: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nnabla_x_km_k = nabla_x_kf + B_k(x_k - x_k) overset= nabla_x_kf text and \nnabla_x_k-1m_k = nablax_kf + B_k(x_k-1 - x_k) overset= nabla_x_k-1f\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The first one of these conditions is of course automatically satisfied. The second one can be rewritten as: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"B_k(x_k - x_k-1) = overset= nabla_x_kf - nabla_x_k-1f ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The following notations are often used: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"s_k-1 = alpha_k-1p_k-1 = x_k - x_k-1 text and y_k-1 = nabla_x_kf - nabla_x_k-1f ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The conditions mentioned above then becomes: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"B_ks_k-1 overset= y_k-1","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"and we call it the secant equation. 
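To make the secant equation more tangible, consider the following minimal sketch (plain Julia, independent of GeometricMachineLearning): for a quadratic objective the secant equation is satisfied exactly by the true Hessian.

```julia
using LinearAlgebra

A = [4.0 1.0; 1.0 3.0]        # Hessian of the quadratic objective f(x) = x' * A * x / 2
∇f(x) = A * x                 # gradient of f

x_prev = [1.0, 1.0]           # x_{k-1}
x_curr = [0.5, -0.2]          # x_k

s = x_curr - x_prev           # s_{k-1}
y = ∇f(x_curr) - ∇f(x_prev)   # y_{k-1}

A * s ≈ y                     # the secant equation B_k s_{k-1} = y_{k-1} holds with B_k = A
```

For non-quadratic objectives this cannot hold exactly for one fixed matrix, which is why B_k is updated in every iteration.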
A second condition we impose on B_k is that it has to be positive-definite at point s_k-1:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"s_k-1^Ty_k-1 0","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"This is referred to as the curvature condition. If we impose the Wolfe conditions, the curvature condition holds automatically. The Wolfe conditions are stated with respect to the parameter alpha_k.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"The Wolfe conditions are:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"f(x_k+alphap_k)leqf(x_k) + c_1alpha(nabla_x_kf)^Tp_k for c_1in(01).\n(nabla_(x_k + alpha_kp_k)f)^Tp_k geq c_2(nabla_x_kf)^Tp_k for c_2in(c_11).","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"A possible choice for c_1 and c_2 is 10^-4 and 09 (see [15]). The two Wolfe conditions above are called the sufficient decrease condition and the curvature condition respectively. Note that the second Wolfe condition (also called curvature condition) is stronger than the one mentioned before under the assumption that the first Wolfe condition is true:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"(nabla_x_kf)^Tp_k-1 - c_2(nabla_x_k-1f)^Tp_k-1 = y_k-1^Tp_k-1 + (1 - c_2)(nabla_x_k-1f)^Tp_k-1 geq 0","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"and the second term in this expression is (1 - c_2)(nabla_x_k-1f)^Tp_k-1geqfrac1-c_2c_1alpha_k-1(f(x_k) - f(x_k-1)), which is negative. ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"In order to pick the ideal B_k we solve the following problem: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nmin_B B - B_k-1_W \ntextst B = B^Ttext and Bs_k-1=y_k-1\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where the first condition is symmetry and the second one is the secant equation. 
For the norm cdot_W we pick the weighted Frobenius norm:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"A_W = W^12AW^12_F","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"where cdot_F is the usual Frobenius norm[1] and the matrix W=tildeB_k-1 is the inverse of the average Hessian:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB_k-1 = int_0^1 nabla^2f(x_k-1 + taualpha_k-1p_k-1)dtau","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"[1]: The Frobenius norm is A_F^2 = sum_ija_ij^2.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"In order to find the ideal B_k under the conditions described above, we introduce some notation: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB_k-1 = W^12B_k-1W^12,\ntildeB = W^12BW^12, \ntildey_k-1 = W^12y_k-1, \ntildes_k-1 = W^-12s_k-1.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"With this notation we can rewrite the problem of finding B_k as: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nmin_tildeB tildeB - tildeB_k-1_F \ntextst tildeB = tildeB^Ttext and tildeBtildes_k-1=tildey_k-1\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"We further have Wy_k-1 = s_k-1 (by the mean value theorem ?) and therefore tildey_k-1 = tildes_k-1.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"Now we rewrite B and B_k-1 in a new basis U = uu_perp, where u = tildes_k-1tildes_k-1 and u_perp is an orthogonal complement[2] of u:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"[2]: So we must have u^Tu_perp=0 and further u_perp^Tu_perp=mathbbI.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"beginaligned\nU^TtildeB_k-1U - U^TtildeBU = beginbmatrix u^T u_perp^T endbmatrix(tildeB_k-1 - tildeB)beginbmatrix u u_perp endbmatrix = \nbeginbmatrix\n u^TtildeB_k-1u - 1 u^TtildeB_k-1u \n u_perp^TtildeB_k-1u u_perp^T(tildeB_k-1-tildeB_k)u_perp\nendbmatrix\nendaligned","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"By a property of the Frobenius norm: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB_k-1 - tildeB^2_F = (u^TtildeB_k-1 -1)^2 + u^TtildeB_k-1u_perp_F^2 + u_perp^TtildeB_k-1u_F^2 + u_perp^T(tildeB_k-1 - tildeB)u_perp_F^2","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"We see that tildeB only appears in the last term, which should therefore be made zero. 
This then gives: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"tildeB = Ubeginbmatrix 1 0 0 u^T_perptildeB_k-1u_perp endbmatrix = uu^T + (mathbbI-uu^T)tildeB_k-1(mathbbI-uu^T)","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"If we now map back to the original coordinate system, the ideal solution for B_k is: ","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"B_k = (mathbbI - frac1y_k-1^Ts_k-1y_k-1s_k-1^T)B_k-1(mathbbI - frac1y_k-1^Ts_k-1s_k-1y_k-1^T) + frac1y_k-1^Ts_k-1y_k-1y_k-1^T","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"What we need in practice however is not B_k, but its inverse H_k. This is because we need to find s_k-1 based on y_k-1. To get H_k based on the expression for B_k above we can use the Sherman-Morrison-Woodbury formula[3] to obtain:","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"[3]: The Sherman-Morrison-Woodbury formula states (A + UCV)^-1 = A^-1 - A^-1U(C^-1 + VA^-1U)^-1VA^-1.","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"H_k = H_k-1 - fracH_k-1y_k-1y_k-1^TH_k-1y_k-1^TH_k-1y_k-1 + fracs_k-1s_k-1^Ty_k-1^Ts_k-1","category":"page"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"TODO: Example where this works well!","category":"page"},{"location":"optimizers/bfgs_optimizer/#References","page":"BFGS Optimizer","title":"References","text":"","category":"section"},{"location":"optimizers/bfgs_optimizer/","page":"BFGS Optimizer","title":"BFGS Optimizer","text":"J. Nocedal and S. J. Wright. Numerical optimization (Springer Science+Business Media, 2006).\n\n\n\n","category":"page"},{"location":"manifolds/inverse_function_theorem/#Foundational-Theorem-for-Differential-Manifolds","page":"Foundations of Differential Manifolds","title":"Foundational Theorem for Differential Manifolds","text":"","category":"section"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Here we state and prove all the theorems necessary to define differential manifolds. All these theorems (including proofs) can be found in e.g. [2].","category":"page"},{"location":"manifolds/inverse_function_theorem/#The-Fixed-Point-Theorem","page":"Foundations of Differential Manifolds","title":"The Fixed-Point Theorem","text":"","category":"section"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"The fixed-point theorem will be used in the proof of the inverse function theorem below and the existence-and-uniqueness theorem. ","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.theorem(raw\"A function ``f:U \\to U`` defined on an open subset ``U`` of a complete metric vector space ``\\mathcal{V} \\supset U`` that is contractive, i.e. ``|f(z) - f(y)| \\leq q|z - y|`` with ``q < 1``, has a unique fixed point ``y^*`` such that ``f(y^*) = y^*``. 
Further ``y^*`` can be found by taking any ``y\\in{}U`` through ``y^* = \\lim_{m\\to\\infty}f^m(y)``.\"; name = \"Banach Fixed-Point Theorem\")","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.proof(raw\"Fix a point ``y\\in{}U``. We proof that the sequence ``(f^m(y))_{m\\in\\mathbb{N}}`` is Cauchy and because ``\\mathcal{V}`` is a complete metric space, the limit of this sequence exists. Take ``\\tilde{m} > m`` and we have\n\" *\nMain.indentation * raw\"```math\n\" *\nMain.indentation * raw\"\\begin{aligned}\n\" *\nMain.indentation * raw\"|f^{\\tilde{m}}(y) - f^m(y)| & \\leq \\sum_{i = m}^{\\tilde{m} - 1}|f^{i+1}(y) - f^{i}(y)| \\\\\n\" *\nMain.indentation * raw\" & \\leq \\sum_{i = m}^{\\tilde{m} - 1}q^i|f(y) - y| \\\\ \n\" *\nMain.indentation * raw\" & \\leq \\sum_{i = m}^\\infty{}q^i|f(y) - y| = (f(y) - y)\\left( \\frac{q}{1 - q} - \\sum_{i = 1}^{m-1}q^i \\right)\\\\\n\" *\nMain.indentation * raw\" & = (f(y) - y)\\left( \\frac{q}{1 - q} - \\frac{q - q^m}{q - 1} \\right) = (f(y) - y)\\frac{q^{m+1}}{1 - q}.\n\" *\nMain.indentation * raw\"\\end{aligned} \n\" *\nMain.indentation * raw\"```\n\" *\nMain.indentation * raw\"And the sequence is clearly Cauchy.\")","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Note that we stated the fixed-point theorem for arbitrary complete metric spaces here, not just for mathbbR^n. For the section on manifolds we only need the theorem for mathbbR^n, but for the existence-and-uniqueness theorem we need the statement for more general spaces. ","category":"page"},{"location":"manifolds/inverse_function_theorem/#The-Inverse-Function-Theorem","page":"Foundations of Differential Manifolds","title":"The Inverse Function Theorem","text":"","category":"section"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"The inverse function theorem gives a sufficient condition on a vector-valued function to be invertible in a neighborhood of a specific point. This theorem serves as a basis for the implicit function theorem and further for the preimage theorem and is critical in developing a theory of manifolds. Here we first state the theorem and then give a proof.","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.theorem(raw\"Consider a vector-valued differentiable function ``F:\\mathbb{R}^N\\to\\mathbb{R}^N`` and assume its Jacobian is non-degenerate at a point ``x\\in\\mathbb{R}^N``. Then there exists a neighborhood ``U`` that contains ``F(x)`` and on which ``F`` is invertible, i.e. ``\\exists{}H:U\\to\\mathbb{R}^N`` s.t. ``\\forall{}y\\in{}U,\\,F\\circ{}H(y) = y`` and ``H`` is differentiable.\"; name = \"Inverse function theorem\")","category":"page"},{"location":"manifolds/inverse_function_theorem/","page":"Foundations of Differential Manifolds","title":"Foundations of Differential Manifolds","text":"Main.proof(raw\"\"\"Consider a mapping ``F:\\mathbb{R}^N\\to\\mathbb{R}^N`` and assume its Jacobian has full rank at point ``x``, i.e. ``\\det{}F'(x)\\neq0``. We further assume that ``F(x) = 0``, ``F'(x) = \\mathbb{I}`` and ``x = 0``. 
Now consider a ball around ``x`` whose radius ``r`` we do not yet fix and two points ``y`` and ``z`` in that ball: ``y,z\\in{}B(r)``. We further introduce the function ``G(y):=y-F(y)``. By the *mean value theorem* we have ``|G(y)| = |G(y) - x| = |G(y) - G(x)|\\leq|y-x|\\sup_{05, width is set to 5). See the theory section for more details; there depth was called n.\nnhidden : the number of pairs of linear and activation layers with default value set to 1 (i.e the LA-SympNet is a composition of a linear layer, an activation layer and then again a single layer). \nactivation : the activation function for all the activations layers with default set to tanh,\ninitupperlinear : a boolean that indicates whether the first linear layer changes q first. By default this is true.\ninitupperact : a boolean that indicates whether the first activation layer changes q first. By default this is true.","category":"page"},{"location":"tutorials/sympnet_tutorial/#G-SympNet","page":"Sympnets","title":"G-SympNet","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"To call a G-SympNet, one needs to write","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"gsympnet = GSympNet(dim; upscaling_dimension=2*dim, n_layers=2, activation=tanh, init_upper=true) ","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"GSympNet takes one obligatory argument:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"dim : the dimension of the phase space (i.e. an integer) or optionally an instance of DataLoader. This latter option will be used below.","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"and severals keywords argument :","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"upscaling_dimension: The first dimension of the matrix with which the input is multiplied. In the theory section this matrix is called K and the upscaling dimension is called m.\nn_layers: the number of gradient layers with default value set to 2.\nactivation : the activation function for all the activations layers with default set to tanh.\ninit_upper : a boolean that indicates whether the first gradient layer changes q first. 
By default this is true.","category":"page"},{"location":"tutorials/sympnet_tutorial/#Loss-function","page":"Sympnets","title":"Loss function","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"The loss function described in the theory section is the default choice used in GeometricMachineLearning.jl for training SympNets.","category":"page"},{"location":"tutorials/sympnet_tutorial/#Data-Structures-in-GeometricMachineLearning.jl","page":"Sympnets","title":"Data Structures in GeometricMachineLearning.jl","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Main.include_graphics(\"../tikz/structs_visualization\") # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/#Examples","page":"Sympnets","title":"Examples","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Let us see how to use it on several examples.","category":"page"},{"location":"tutorials/sympnet_tutorial/#Example-of-a-pendulum-with-G-SympNet","page":"Sympnets","title":"Example of a pendulum with G-SympNet","text":"","category":"section"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Let us begin with a simple example, the pendulum system, the Hamiltonian of which is ","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"H(qp)inmathbbR^2 mapsto frac12p^2-cos(q) in mathbbR","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Here we generate pendulum data with the script GeometricMachineLearning/scripts/pendulum.jl:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"using GeometricMachineLearning # hide\nimport Random # hide\n\nRandom.seed!(1234)\n\n# load script\ninclude(\"../../../scripts/pendulum.jl\")\n# specify the data type\ntype = Float16 \n# get data \nqp_data = GeometricMachineLearning.apply_toNT(a -> type.(a), pendulum_data((q=[0.], p=[1.]); tspan=(0.,100.)))\n# call the DataLoader\ndl = DataLoader(qp_data)\n# this last line is a hack so as to not display the output # hide\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Next we specify the architectures. GeometricMachineLearning.jl provides useful defaults for all parameters although they can be specified manually (which is done in the following):","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"# layer dimension for gradient module \nconst upscaling_dimension = 2\n# hidden layers\nconst nhidden = 1\n# activation function\nconst activation = tanh\n\n# calling G-SympNet architecture \ngsympnet = GSympNet(dl, upscaling_dimension=upscaling_dimension, n_layers=4, activation=activation)\n\n# calling LA-SympNet architecture \nlasympnet = LASympNet(dl, nhidden=nhidden, activation=activation)\n\n# specify the backend\nconst backend = CPU()\n\n# initialize the networks\nla_nn = NeuralNetwork(lasympnet, backend, type) \ng_nn = NeuralNetwork(gsympnet, backend, type)\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"If we want to obtain information on the number of parameters in a neural network, we can do that very simply with the function parameterlength. 
For the LASympNet:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"parameterlength(la_nn.model)","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"And for the GSympNet:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"parameterlength(g_nn.model)","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Remark: We can also specify whether we would like to start with a layer that changes the q-component or one that changes the p-component. This can be done via the keywords init_upper for GSympNet, and init_upper_linear and init_upper_act for LASympNet.","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We have to define an optimizer which will be used in the training of the SympNet. For more details on optimizers, please see the corresponding documentation. In this example we use Adam:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"# set up optimizer; for this we first need to specify the optimization method\nopt_method = AdamOptimizer(type)\nla_opt = Optimizer(opt_method, la_nn)\ng_opt = Optimizer(opt_method, g_nn)\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We can now perform the training of the neural networks. The syntax is the following:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"# number of training epochs\nconst nepochs = 300\n# Batchsize used to compute the gradient of the loss function with respect to the parameters of the neural networks.\nconst batch_size = 100\n\nbatch = Batch(batch_size)\n\n# perform training (returns array that contains the total loss for each training step)\ng_loss_array = g_opt(g_nn, dl, batch, nepochs)\nla_loss_array = la_opt(la_nn, dl, batch, nepochs)\nnothing # hide","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We can also plot the training errors against the epoch (here the y-axis is in log-scale):","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"using Plots\np1 = plot(g_loss_array, xlabel=\"Epoch\", ylabel=\"Training error\", label=\"G-SympNet\", color=3, yaxis=:log)\nplot!(p1, la_loss_array, label=\"LA-SympNet\", color=2)","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"The training data data_q and data_p must be matrices in mathbbR^ntimes d where n is the length of the data and d is half the dimension of the system, i.e. data_q[i,j] is q_j(t_i) where (t_1t_n) are the corresponding times of the training data.","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"Now we can make a prediction. 
Let's compare the initial data with a prediction starting from the same phase space point using the function iterate:","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"ics = (q=qp_data.q[:,1], p=qp_data.p[:,1])\n\nsteps_to_plot = 200\n\n#predictions\nla_trajectory = iterate(la_nn, ics; n_points = steps_to_plot)\ng_trajectory = iterate(g_nn, ics; n_points = steps_to_plot)\n\nusing Plots\np2 = plot(qp_data.q'[1:steps_to_plot], qp_data.p'[1:steps_to_plot], label=\"training data\")\nplot!(p2, la_trajectory.q', la_trajectory.p', label=\"LA Sympnet\")\nplot!(p2, g_trajectory.q', g_trajectory.p', label=\"G Sympnet\")","category":"page"},{"location":"tutorials/sympnet_tutorial/","page":"Sympnets","title":"Sympnets","text":"We see that GSympNet outperforms the LASympNet on this problem.","category":"page"},{"location":"optimizers/general_optimization/#Optimization-for-Neural-Networks","page":"General Optimization","title":"Optimization for Neural Networks","text":"","category":"section"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the gradient flow equation:","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"dottheta = -nabla_thetaL","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"by means of an Euler time-stepping scheme: ","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"theta^t+1 = theta^t - hnabla_theta^tL","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"where h (the time step of the Euler scheme) is referred to as the learning rate.","category":"page"},{"location":"optimizers/general_optimization/","page":"General Optimization","title":"General Optimization","text":"This equation can easily be generalized to manifolds by replacing the Euclidean gradient nabla_theta^tL with the Riemannian gradient mathrmgrad_theta^tL, and the addition of -hnabla_theta^tL with a retraction applied to -hmathrmgrad_theta^tL.","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Symplectic-Autoencoders-and-the-Toda-Lattice","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders and the Toda Lattice","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"In this tutorial we use a SymplecticAutoencoder to approximate the Toda lattice with a lower-dimensional Hamiltonian model and compare it with standard proper symplectic decomposition (PSD).","category":"page"},{"location":"tutorials/symplectic_autoencoder/#The-system","page":"Symplectic Autoencoders","title":"The system","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"The Toda lattice is a prototypical example of a Hamiltonian PDE. 
It is described by ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":" H(q p) = sum_ninmathbbZleft( fracp_n^22 + alpha e^q_n - q_n+1 right)","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We further assume a finite number of particles N and impose periodic boundary conditions: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"beginaligned\n q_n+N equiv q_n \n p_n+N equiv p_n\nendaligned","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"In this tutorial we want to reduce the dimension of the big system by a significant factor with (i) proper symplectic decomposition (PSD) and (ii) symplectic autoencoders. The first approach is strictly linear whereas the second one allows for more general mappings. ","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Using-the-Toda-lattice-in-numerical-experiments","page":"Symplectic Autoencoders","title":"Using the Toda lattice in numerical experiments","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"In order to use the Toda lattice in numerical experiments we have to pick suitable initial conditions. For this, consider the third-degree spline: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"h(s) = begincases\n 1 - frac32s^2 + frac34s^3 textif 0 leq s leq 1 \n frac14(2 - s)^3 textif 1 s leq 2 \n 0 textelse \nendcases","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Plotted on the relevant domain it looks like this: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Main.include_graphics(\"../tikz/third_degree_spline\") # hide","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We end up with the following choice of parametrized initial conditions: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"u_0(mu)(omega) = h(s(omega mu)) quad s(omega mu) = 20 mu omega + fracmu2","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"For the purposes of this tutorial we will use the default value for mu provided in GeometricMachineLearning:","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"using GeometricProblems.TodaLattice: μ\n\nμ","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Get-the-data","page":"Symplectic Autoencoders","title":"Get the data","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"The training data can very easily be obtained by using the packages GeometricProblems and 
GeometricIntegrators:","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"using GeometricProblems.TodaLattice: hodeproblem\nusing GeometricIntegrators: integrate, ImplicitMidpoint\nusing GeometricMachineLearning \nusing Plots\nimport Random\n\npr = hodeproblem(; tspan = (0.0, 100.))\nsol = integrate(pr, ImplicitMidpoint())\ndl = DataLoader(sol; autoencoder = true)\n\ndl.input_dim","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Here we first integrate the system with implicit midpoint and then put the training data into the right format by calling DataLoader. We can get the dimension of the system by calling dl.input_dim. Also note that the keyword autoencoder was set to true.","category":"page"},{"location":"tutorials/symplectic_autoencoder/#Train-the-network","page":"Symplectic Autoencoders","title":"Train the network","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We now want to compare two different approaches: PSDArch and SymplecticAutoencoder. For this we first have to set up the networks: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"const reduced_dim = 2\n\npsd_arch = PSDArch(dl.input_dim, reduced_dim)\nsae_arch = SymplecticAutoencoder(dl.input_dim, reduced_dim; n_encoder_blocks = 4, n_decoder_blocks = 4, n_encoder_layers = 4, n_decoder_layers = 1)\n\nRandom.seed!(123)\npsd_nn = NeuralNetwork(psd_arch)\nsae_nn = NeuralNetwork(sae_arch)\n\nnothing # hide","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Training a neural network is usually done by calling an instance of Optimizer in GeometricMachineLearning. PSDArch however can be solved directly by using singular value decomposition and this is done by calling solve!. 
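To illustrate why no iterative training is needed for PSDArch: the best orthonormal basis in the least-squares sense can be read off directly from a singular value decomposition (Eckart–Young). The following is only a sketch of this underlying principle with plain LinearAlgebra and a random stand-in for the snapshot matrix, not the actual PSDArch implementation:

```julia
using LinearAlgebra

S = rand(100, 500)                   # stand-in for a snapshot matrix
U = svd(S).U[:, 1:2]                 # orthonormal basis of the best two-dimensional subspace
relative_error = norm(S - U * (U' * S)) / norm(S)
```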
The SymplecticAutoencoder we train with the AdamOptimizer however: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"const n_epochs = 8\nconst batch_size = 16\n\no = Optimizer(sae_nn, AdamOptimizer(Float64))\n\npsd_error = solve!(psd_nn, dl)\nsae_error = o(sae_nn, dl, Batch(batch_size), n_epochs)\n\nhline([psd_error]; color = 2, label = \"PSD error\")\nplot!(sae_error; color = 3, label = \"SAE error\", xlabel = \"epoch\", ylabel = \"training error\")","category":"page"},{"location":"tutorials/symplectic_autoencoder/#The-online-stage","page":"Symplectic Autoencoders","title":"The online stage","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"After having trained our neural network we can now evaluate it in the online stage of reduced complexity modeling: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"psd_rs = HRedSys(pr, encoder(psd_nn), decoder(psd_nn); integrator = ImplicitMidpoint())\nsae_rs = HRedSys(pr, encoder(sae_nn), decoder(sae_nn); integrator = ImplicitMidpoint())\n\nprojection_error(psd_rs)","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"projection_error(sae_rs)","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"Next we plot a comparison between the PSD prediction and the symplectic autoencoder prediction: ","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"sol_full = integrate_full_system(psd_rs)\nsol_psd_reduced = integrate_reduced_system(psd_rs)\nsol_sae_reduced = integrate_reduced_system(sae_rs)\n\nconst t_step = 100\nplot(sol_full.s.q[t_step], label = \"Implicit Midpoint\")\nplot!(psd_rs.decoder((q = sol_psd_reduced.s.q[t_step], p = sol_psd_reduced.s.p[t_step])).q, label = \"PSD\")\nplot!(sae_rs.decoder((q = sol_sae_reduced.s.q[t_step], p = sol_sae_reduced.s.p[t_step])).q, label = \"SAE\")","category":"page"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"We can see that the autoencoder approach has much more approximation capabilities than the psd approach. The jiggly lines are due to the fact that training was done for only 8 epochs. ","category":"page"},{"location":"tutorials/symplectic_autoencoder/#References","page":"Symplectic Autoencoders","title":"References","text":"","category":"section"},{"location":"tutorials/symplectic_autoencoder/","page":"Symplectic Autoencoders","title":"Symplectic Autoencoders","text":"P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).\n\n\n\nL. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).\n\n\n\nC. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. 
Applied Mathematics Letters 96, 216–222 (2019).\n\n\n\n","category":"page"},{"location":"optimizers/manifold_related/retractions/#Retractions","page":"Retractions","title":"Retractions","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/#Classical-Definition","page":"Retractions","title":"Classical Definition","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Classically, retractions are defined as smooth maps ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"R TmathcalMtomathcalM(xv)mapstoR_x(v)","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"such that each curve c(t) = R_x(tv) satisfies c(0) = x and c(0) = v.","category":"page"},{"location":"optimizers/manifold_related/retractions/#In-GeometricMachineLearning","page":"Retractions","title":"In GeometricMachineLearning","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Retractions are maps from the horizontal component of the Lie algebra mathfrakg^mathrmhor to the respective manifold.","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"For optimization in neural networks (almost always first order) we solve a gradient flow equation ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"dotW = -mathrmgrad_WL ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"where mathrmgrad_WL is the Riemannian gradient of the loss function L evaluated at position W.","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"If we deal with Euclidean spaces (vector spaces), then the Riemannian gradient is just the result of an AD routine and the solution of the equation above can be approximated with W^t+1 gets W^t - etanabla_W^tL, where eta is the learning rate. ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"For manifolds, after we have obtained the Riemannian gradient (see e.g. the section on the Stiefel manifold), we have to solve a geodesic equation. This is a canonical ODE associated with any Riemannian manifold. ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"The general theory of Riemannian manifolds is rather complicated, but for the neural networks treated in GeometricMachineLearning, we only rely on optimization of matrix Lie groups and homogeneous spaces, which is much simpler. ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"For Lie groups each tangent space is isomorphic to its Lie algebra mathfrakgequivT_mathbbIG. The geodesic map from mathfrakg to G, for matrix Lie groups with bi-invariant Riemannian metric like SO(N), is simply the application of the matrix exponential exp. 
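That exp maps skew-symmetric matrices into SO(N) is easy to verify numerically; the following sketch uses plain LinearAlgebra and is not a GeometricMachineLearning routine:

```julia
using LinearAlgebra

B = rand(4, 4)
B = B - B'        # a skew-symmetric matrix, i.e. an element of the Lie algebra of SO(4)
A = exp(B)        # matrix exponential

A' * A ≈ I        # true: A is orthogonal
det(A) ≈ 1.0      # true: A has determinant one and hence lies in SO(4)
```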
Alternatively this can be replaced by the Cayley transform (see (Absil et al, 2008).)","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Starting from this basic map expmathfrakgtoG we can build mappings for more complicated cases: ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"General tangent space to a Lie group T_AG: The geodesic map for an element VinT_AG is simply Aexp(A^-1V).\nSpecial tangent space to a homogeneous space T_EmathcalM: For V=BEinT_EmathcalM the exponential map is simply exp(B)E. \nGeneral tangent space to a homogeneous space T_YmathcalM with Y = AE: For Delta=ABEinT_YmathcalM the exponential map is simply Aexp(B)E. This is the general case which we deal with. ","category":"page"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function retraction in GeometricMachineLearning performs mathfrakg^mathrmhortomathcalM, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with apply_section and represented through the red vertical line in the diagram on the general optimizer framework.","category":"page"},{"location":"optimizers/manifold_related/retractions/#Word-of-caution","page":"Retractions","title":"Word of caution","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"The Lie group corresponding to the Stiefel manifold SO(N) has a bi-invariant Riemannian metric associated with it: (B_1B_2)mapsto mathrmTr(B_1^TB_2). For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult (see (Bendokat et al, 2021).)","category":"page"},{"location":"optimizers/manifold_related/retractions/#References","page":"Retractions","title":"References","text":"","category":"section"},{"location":"optimizers/manifold_related/retractions/","page":"Retractions","title":"Retractions","text":"Absil P A, Mahony R, Sepulchre R. Optimization algorithms on matrix manifolds[M]. Princeton University Press, 2008.\nBendokat T, Zimmermann R. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications[J]. arXiv preprint arXiv:2108.12447, 2021.\nO'Neill, Barrett. Semi-Riemannian geometry with applications to relativity. Academic press, 1983.","category":"page"},{"location":"arrays/skew_symmetric_matrix/#Symmetric-Skew-Symmetric-and-Triangular-Matrices.","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric Skew-Symmetric and Triangular Matrices.","text":"","category":"section"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"Among the special arrays implemented in GeometricMachineLearning SymmetricMatrix, SkewSymMatrix, UpperTriangular and LowerTriangular are the most common ones and these can also be found in other libraries; LinearAlgebra.jl has an implementation of a symmetric matrix called Symmetric for example. The versions of these matrices in GeometricMachineLearning are however more memory efficient as they only store as many parameters as are necessary, i.e. n(n+1)2 for the symmetric matrix and n(n-1)2 for the other three. 
In addition operations such as matrix and tensor multiplication are implemented for these matrices to work in parallel on GPU. ","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"We now show the various matrices. First UpperTriangular:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"U = beginpmatrix\n 0 a_12 cdots a_1n \n 0 ddots a_2n \n vdots ddots ddots vdots \n 0 cdots 0 0 \nendpmatrix","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"The matrix LowerTriangular:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"L = beginpmatrix\n 0 0 cdots 0 \n a_21 ddots vdots \n vdots ddots ddots vdots \n a_n1 cdots a_n(n-1) 0 \nendpmatrix","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"An instance of SkewSymMatrix can be written as A = L - L^T or A = U - U^T:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"A = beginpmatrix\n 0 - a_21 cdots - a_n1 \n a_21 ddots vdots \n vdots ddots ddots vdots \n a_n1 cdots a_n(n-1) 0 \nendpmatrix","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"And lastly a SymmetricMatrix:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"L = beginpmatrix\n a_11 a_21 cdots a_n1 \n a_21 ddots vdots \n vdots ddots ddots vdots \n a_n1 cdots a_n(n-1) a_nn\nendpmatrix","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"Note that any matrix MinmathbbR^ntimesn can be written","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"M = frac12(M - M^T) + frac12(M + M^T)","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"where the first part of this matrix is skew-symmetric and the second part is symmetric. 
This is also how the constructors for SkewSymMatrix and SymmetricMatrix are designed:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"using GeometricMachineLearning\n\nM = rand(3, 3) ","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"A = SkewSymMatrix(M)","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"B = SymmetricMatrix(M)","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"@assert M ≈ A + B # hide\nM ≈ A + B","category":"page"},{"location":"arrays/skew_symmetric_matrix/#How-are-Special-Matrices-Stored?","page":"Symmetric and Skew-Symmetric Matrices","title":"How are Special Matrices Stored?","text":"","category":"section"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"The following image demonstrates how special matrices are stored in GeometricMachineLearning:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"Main.include_graphics(\"../tikz/skew_sym_visualization\"; caption = \"The elements of a skew-symmetric matrix (and other special matrices) are stored as a vector. The elements of the big vector are the entries on the lower left of the matrix, stored row-wise.\") # hide","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"So what is stored internally is a vector of size n(n-1)2 for the skew-symmetric matrix and the triangular matrices and a vector of size n(n+1)2 for the symmetric matrix. 
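As a quick check of these storage sizes (a small sketch that relies only on the constructors and the field `S` shown in this section):

```julia
using GeometricMachineLearning

n = 5
A = SkewSymMatrix(rand(n, n))      # stores n(n-1)/2 entries internally
B = SymmetricMatrix(rand(n, n))    # stores n(n+1)/2 entries internally

(length(A.S), length(B.S)) == (n * (n - 1) ÷ 2, n * (n + 1) ÷ 2)   # true
```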
We can sample a random skew-symmetric matrix: ","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"using GeometricMachineLearning \nimport Random \nRandom.seed!(123)\n\nA = rand(SkewSymMatrix, 5)","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"and then access the vector:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"A.S ","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"This is equivalent to sampling a vector and then assigning a matrix:","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"using GeometricMachineLearning\nimport Random\nRandom.seed!(123)\n\nS = rand(5 * (5 - 1) ÷ 2)\nSkewSymMatrix(S, 5)","category":"page"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"These special matrices are important for SympNets, volume-preserving transformers and linear symplectic transformers.","category":"page"},{"location":"arrays/skew_symmetric_matrix/#Parallel-Computation","page":"Symmetric and Skew-Symmetric Matrices","title":"Parallel Computation","text":"","category":"section"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"The functions GeometricMachineLearning.mat_tensor_mul and GeometricMachineLearning.tensor_mat_mul are also implemented for these matrices for efficient parallel computations. This is elaborated on when we introduce pullbacks.","category":"page"},{"location":"arrays/skew_symmetric_matrix/#Library-Functions","page":"Symmetric and Skew-Symmetric Matrices","title":"Library Functions","text":"","category":"section"},{"location":"arrays/skew_symmetric_matrix/","page":"Symmetric and Skew-Symmetric Matrices","title":"Symmetric and Skew-Symmetric Matrices","text":"UpperTriangular\nUpperTriangular(::AbstractMatrix)\nLowerTriangular\nLowerTriangular(::AbstractMatrix)\nSymmetricMatrix\nSymmetricMatrix(::AbstractMatrix)\nSkewSymMatrix\nSkewSymMatrix(::AbstractMatrix)","category":"page"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.UpperTriangular-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.UpperTriangular","text":"LowerTriangular(S::AbstractVector, n::Int)\n\nBuild a lower-triangular matrix from a vector.\n\nA lower-triangular matrix is an ntimesn matrix that has ones on the diagonal and zeros on the upper triangular.\n\nThe data are stored in a vector S similarly to other matrices. See LowerTriangular, SkewSymMatrix and SymmetricMatrix.\n\nThe struct two fields: S and n. 
The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension n for AinmathbbR^ntimesn.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6]\nUpperTriangular(S, 4)\n\n# output\n\n4×4 UpperTriangular{Int64, Vector{Int64}}:\n 0 1 2 4\n 0 0 3 5\n 0 0 0 6\n 0 0 0 0\n\n\n\n\n\n","category":"type"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.UpperTriangular-Tuple{AbstractMatrix}-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.UpperTriangular","text":"UpperTriangular(A::AbstractMatrix)\n\nBuild a lower-triangular matrix from a matrix.\n\nThis is done by taking the lower left of that matrix.\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nUpperTriangular(M)\n\n# output\n\n4×4 UpperTriangular{Int64, Vector{Int64}}:\n 0 2 3 4\n 0 0 7 8\n 0 0 0 12\n 0 0 0 0\n\n\n\n\n\n","category":"method"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.LowerTriangular-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.LowerTriangular","text":"LowerTriangular(S::AbstractVector, n::Int)\n\nBuild a lower-triangular matrix from a vector.\n\nA lower-triangular matrix is an ntimesn matrix that has ones on the diagonal and zeros on the upper triangular.\n\nThe data are stored in a vector S similarly to other matrices. See UpperTriangular, SkewSymMatrix and SymmetricMatrix.\n\nThe struct two fields: S and n. The first stores all the entries of the matrix in a sparse fashion (in a vector) and the second is the dimension n for AinmathbbR^ntimesn.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6]\nLowerTriangular(S, 4)\n\n# output\n\n4×4 LowerTriangular{Int64, Vector{Int64}}:\n 0 0 0 0\n 1 0 0 0\n 2 3 0 0\n 4 5 6 0\n\n\n\n\n\n","category":"type"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.LowerTriangular-Tuple{AbstractMatrix}-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.LowerTriangular","text":"LowerTriangular(A::AbstractMatrix)\n\nBuild a lower-triangular matrix from a matrix.\n\nThis is done by taking the lower left of that matrix.\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nLowerTriangular(M)\n\n# output\n\n4×4 LowerTriangular{Int64, Vector{Int64}}:\n 0 0 0 0\n 5 0 0 0\n 9 10 0 0\n 13 14 15 0\n\n\n\n\n\n","category":"method"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.SymmetricMatrix-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.SymmetricMatrix","text":"SymmetricMatrix(S::AbstractVector, n::Integer)\n\nInstantiate a symmetric matrix with information stored in vector S.\n\nA SymmetricMatrix A is a matrix A^T = A.\n\nInternally the struct saves a vector S of size n(n+1)div2. 
The conversion is done the following way: \n\nA_ij = begincases S( (i-1) i ) div 2 + j textif igeqj \n S( (j-1) j ) div 2 + i textelse endcases\n\nSo S stores a string of vectors taken from A: S = tildea_1 tildea_2 ldots tildea_n with tildea_i = A_i1A_i2ldotsA_ii.\n\nAlso see SkewSymMatrix, LowerTriangular and UpperTriangular.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\nSymmetricMatrix(S, 4)\n\n# output\n\n4×4 SymmetricMatrix{Int64, Vector{Int64}}:\n 1 2 4 7\n 2 3 5 8\n 4 5 6 9\n 7 8 9 10\n\n\n\n\n\n","category":"type"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.SymmetricMatrix-Tuple{AbstractMatrix}-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.SymmetricMatrix","text":"SymmetricMatrix(A::AbstractMatrix)\n\nPerform 0.5 * (A + A') and store the matrix in an efficient way (as a vector with n(n+1)2 entries).\n\nIf the constructor is called with a matrix as input it returns a symmetric matrix via the projection:\n\nA mapsto frac12(A + A^T)\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nSymmetricMatrix(M)\n\n# output\n\n4×4 SymmetricMatrix{Float64, Vector{Float64}}:\n 1.0 3.5 6.0 8.5\n 3.5 6.0 8.5 11.0\n 6.0 8.5 11.0 13.5\n 8.5 11.0 13.5 16.0\n\nExtend help\n\nNote that the constructor is designed in such a way that it always returns matrices of type SymmetricMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.\n\nIf the user wishes to allocate a matrix SymmetricMatrix{<:Integer} the constructor SymmetricMatrix(::AbstractVector, n::Integer) has to be called.\n\n\n\n\n\n","category":"method"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.SkewSymMatrix-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.SkewSymMatrix","text":"SkewSymMatrix(S::AbstractVector, n::Integer)\n\nInstantiate a skew-symmetric matrix with information stored in vector S.\n\nA skew-symmetric matrix A is a matrix A^T = -A.\n\nInternally the struct saves a vector S of size n(n-1)div2. 
The conversion is done the following way: \n\nA_ij = begincases 0 textif i=j \n S( (i-2) (i-1) ) div 2 + j textif ij \n S( (j-2) (j-1) ) div 2 + i textelse endcases\n\nAlso see SymmetricMatrix, LowerTriangular and UpperTriangular.\n\nExamples\n\nusing GeometricMachineLearning\nS = [1, 2, 3, 4, 5, 6]\nSkewSymMatrix(S, 4)\n\n# output\n\n4×4 SkewSymMatrix{Int64, Vector{Int64}}:\n 0 -1 -2 -4\n 1 0 -3 -5\n 2 3 0 -6\n 4 5 6 0\n\n\n\n\n\n","category":"type"},{"location":"arrays/skew_symmetric_matrix/#GeometricMachineLearning.SkewSymMatrix-Tuple{AbstractMatrix}-arrays-skew_symmetric_matrix","page":"Symmetric and Skew-Symmetric Matrices","title":"GeometricMachineLearning.SkewSymMatrix","text":"SkewSymMatrix(A::AbstractMatrix)\n\nPerform 0.5 * (A - A') and store the matrix in an efficient way (as a vector with n(n-1)2 entries).\n\nIf the constructor is called with a matrix as input it returns a skew-symmetric matrix via the projection:\n\nA mapsto frac12(A - A^T)\n\nExamples\n\nusing GeometricMachineLearning\nM = [1 2 3 4; 5 6 7 8; 9 10 11 12; 13 14 15 16]\nSkewSymMatrix(M)\n\n# output\n\n4×4 SkewSymMatrix{Float64, Vector{Float64}}:\n 0.0 -1.5 -3.0 -4.5\n 1.5 0.0 -1.5 -3.0\n 3.0 1.5 0.0 -1.5\n 4.5 3.0 1.5 0.0\n\nExtend help\n\nNote that the constructor is designed in such a way that it always returns matrices of type SkewSymMatrix{<:AbstractFloat} when called with a matrix, even if this matrix is of type AbstractMatrix{<:Integer}.\n\nIf the user wishes to allocate a matrix SkewSymMatrix{<:Integer} the constructor SkewSymMatrix(::AbstractVector, n::Integer) has to be called.\n\n\n\n\n\n","category":"method"},{"location":"pullbacks/computation_of_pullbacks/#Pullbacks-and-Automatic-Differentiation","page":"Pullbacks","title":"Pullbacks and Automatic Differentiation","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"Automatic Differentiation is an important part of modern machine learning libraries. It is essentially a tool to compute the gradient of a loss function with respect to its input arguments. ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#How-to-Compute-Pullbacks","page":"Pullbacks","title":"How to Compute Pullbacks","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"GeometricMachineLearning has many pullbacks for custom array types and other operations implemented. The need for this essentially comes from the fact that we cannot trivially differentiate custom GPU kernels at the moment[1].","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"[1]: This will change once we switch to Enzyme (see [12]), but the package is still in its infancy. ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#What-is-a-pullback?","page":"Pullbacks","title":"What is a pullback?","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"Here we first explain the principle of a pullback with the example of a vector-valued function. The generalization to matrices and higher-order tensors is straight-forward. 
","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"The pullback of a vector-valued function fmathbbR^ntomathbbR^m can be interpreted as the sensitivities in the input space mathbbR^n with respect to variations in the output space mathbbR^m via the function f: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"leftmathrmpullback(f)ainmathbbR^n dbinmathbbR^mright_i = sum_j=1^mfracpartialf_jpartiala_idb_j","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"This principle can easily be generalized to matrices. For this consider the function gmathbbR^n_1timesn_2tomathbbR^m_1timesm_2. For this case we have: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"leftmathrmpullback(g)AinmathbbR^n_1timesn_2 dBinmathbbR^m_1timesm_2right_(i_1 i_2) = sum_j_1=1^m_1sum_j_2=1^m_2fracpartialf_(j_1 j_2)partiala_(i_1 i_2)db_(j_1 j_2)","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"The generalization to higher-order tensors is again straight-forward.","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#Illustrative-example","page":"Pullbacks","title":"Illustrative example","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"Consider the matrix inverse mathrminv mathbbR^ntimesntomathbbR^ntimesn as an example. This fits into the above framework where inv is a matrix-valued function from mathbbR^ntimesn to mathbbR^ntimesn. We here write B = A^-1 = mathrminv(A). 
We thus have to compute: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"leftmathrmpullback(mathrminv)AinmathbbR^ntimesn dBinmathbbR^ntimesnright_(i j) = sum_k=1^nsum_ell=1^nfracpartialb_k ellpartiala_i jdb_k ell","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"For a matrix A that depends on a parameter varepsilon we have that: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"fracpartialpartialvarepsilonB = -Bleft( fracpartialpartialvarepsilon right) B","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"This can easily be checked: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"mathbbO = fracpartialpartialvarepsilonmathbbI = fracpartialpartialvarepsilon(AB) = AfracpartialpartialvarepsilonB + left(fracpartialpartialvarepsilonAright)B","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"We can then write: ","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"beginaligned\nsum_kellleft( fracpartialpartiala_ij b_kell right) db_kell = sum_kellleft fracpartialpartiala_ij B right_kell db_kell \n = - sum_kellleftB left(fracpartialpartiala_ij Aright) B right_kell db_kell \n = - sum_kellmnb_km left(fracpartiala_mnpartiala_ijright) b_nell db_kell \n = - sum_kellmnb_km delta_imdelta_jn b_nell db_kell \n = - sum_kellb_ki b_jell db_kell \n equiv - B^TcdotdBcdotB^T \nendaligned","category":"page"},{"location":"pullbacks/computation_of_pullbacks/#Motivation-from-a-differential-geometric-perspective","page":"Pullbacks","title":"Motivation from a differential-geometric perspective","text":"","category":"section"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"The notions of a pullback in automatic differentiation and differential geometry are closely related (see e.g. [13] and [14]). In both cases we want to compute, based on a mapping fmathcalVtomathcalW a mapsto f(a) = b, a map of differentials db mapsto da. In the differential geometry case db and da are part of the associated cotangent spaces, i.e. dbinT^*_bmathcalW and dainT^*_amathcalV; in AD we (mostly) deal with spaces of arrays, i.e. vector spaces, which means that dbinmathcalW and dainmathcalV.","category":"page"},{"location":"pullbacks/computation_of_pullbacks/","page":"Pullbacks","title":"Pullbacks","text":"M. Betancourt. A geometric theory of higher-order automatic differentiation, arXiv preprint arXiv:1812.11592 (2018).\n\n\n\nJ. Bolte and E. Pauwels. A mathematical model for automatic differentiation in machine learning. 
Advances in Neural Information Processing Systems 33, 10809–10819 (2020).\n\n\n\n","category":"page"},{"location":"reduced_order_modeling/autoencoder/#Reduced-Order-modeling-and-Autoencoders","page":"POD and Autoencoders","title":"Reduced Order modeling and Autoencoders","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"Reduced order modeling is a data-driven technique that exploits the structure of parametric PDEs to make solving those PDEs easier.","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"Consider a parametric PDE written in the form: F(z(mu)mu)=0 where z(mu) evolves on a infinite-dimensional Hilbert space V. ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"In modeling any PDE we have to choose a discretization (particle discretization, finite element method, ...) of V, which will be denoted by V_h. ","category":"page"},{"location":"reduced_order_modeling/autoencoder/#Solution-manifold","page":"POD and Autoencoders","title":"Solution manifold","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"To any parametric PDE we associate a solution manifold: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"mathcalM = z(mu)F(z(mu)mu)=0 muinmathbbP","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"(Image: )","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"In the image above a 2-dimensional solution manifold is visualized as a sub-manifold in 3-dimensional space. In general the embedding space is an infinite-dimensional function space.","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"As an example of this consider the 1-dimensional wave equation: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"partial_tt^2q(tximu) = mu^2partial_xixi^2q(tximu)text on ItimesOmega","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"where I = (01) and Omega=(-1212). As initial condition for the first derivative we have partial_tq(0ximu) = -mupartial_xiq_0(ximu) and furthermore q(tximu)=0 on the boundary (i.e. xiin-1212).","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"The solution manifold is a 1-dimensional submanifold: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"mathcalM = (t xi)mapstoq(tximu)=q_0(xi-mutmu)muinmathbbPsubsetmathbbR","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"If we provide an initial condition u_0, a parameter instance mu and a time t, then ximapstoq(tximu) will be the momentary solution. 
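As a minimal illustration (the Gaussian bump used for q₀ here is an assumed initial condition chosen only for this sketch), we can sample such momentary solutions on a grid and collect them into a snapshot matrix:

```julia
# Hypothetical initial condition q₀; the exact solution is then q(t, ξ; μ) = q₀(ξ - μt).
q₀(ξ) = exp(-100 * ξ^2)

ξs = range(-0.5, 0.5; length = 128)   # spatial grid on Ω = (-1/2, 1/2)
ts = range(0.0, 1.0; length = 64)     # temporal grid on I = (0, 1)
μ  = 0.25                             # one parameter instance

snapshots = [q₀(ξ - μ * t) for ξ in ξs, t in ts]   # 128 × 64 snapshot matrix
size(snapshots)                                     # (128, 64)
```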
If we consider the time evolution of q(tximu), then it evolves on a two-dimensional submanifold barmathcalM = ximapstoq(tximu)tinImuinmathbbP.","category":"page"},{"location":"reduced_order_modeling/autoencoder/#General-workflow","page":"POD and Autoencoders","title":"General workflow","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"In reduced order modeling we aim to construct a mapping to a space that is close to this solution manifold. This is done through the following steps: ","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"Discretize the PDE.\nSolve the discretized PDE for a certain set of parameter instances muinmathbbP.\nBuild a reduced basis with the data obtained from having solved the discretized PDE. This step consists of finding two mappings: the reduction mathcalP and the reconstruction mathcalR.","category":"page"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"The third step can be done with various machine learning (ML) techniques. Traditionally the most popular of these has been Proper orthogonal decomposition (POD), but in recent years autoencoders have also become a popular alternative (see (Fresca et al, 2021)). ","category":"page"},{"location":"reduced_order_modeling/autoencoder/#References","page":"POD and Autoencoders","title":"References","text":"","category":"section"},{"location":"reduced_order_modeling/autoencoder/","page":"POD and Autoencoders","title":"POD and Autoencoders","text":"S. Fresca, L. Dede’ and A. Manzoni. A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs. Journal of Scientific Computing 87, 1–36 (2021).\n\n\n\n","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/#The-Existence-And-Uniqueness-Theorem","page":"Differential Equations and the EAU theorem","title":"The Existence-And-Uniqueness Theorem","text":"","category":"section"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"The existence-and-uniqueness theorem, also known as the Picard-Lindelöf theorem, Picard's existence theorem and the Cauchy-Lipschitz theorem gives a proof of the existence of solutions for ODEs. Here we state the existence-and-uniqueness theorem for manifolds as vector fields are just a special case of this. Its proof relies on the Banach fixed-point theorem[1].","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"[1]: It has to be noted that the proof given here is not entirely self-contained. The proof of the fundamental theorem of calculus, i.e. the proof of the existence of an antiderivative of a continuous function [4], is omitted for example. ","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"Main.theorem(raw\"Let ``X`` a vector field on the manifold ``\\mathcal{M}`` that is differentiable at ``x``. 
Then we can find an ``\\epsilon>0`` and a unique curve ``\\gamma:(-\\epsilon, \\epsilon)\\to\\mathcal{M}`` such that ``\\gamma'(t) = X(\\gamma(t))``.\"; name = \"Existence-And-Uniqueness Theorem\")","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"Main.proof(raw\"We consider a ball around a point ``x\\in\\mathcal{M}`` with radius ``r`` that we pick such that the ball ``B(x, r)`` fits into the ``U`` of some coordinate chart ``\\varphi_U``; we further use ``X`` and ``\\varphi'\\circ{}X\\circ\\varphi^{-1}`` interchangeably in this proof. We then define ``L := \\mathrm{sup}_{y,z\\in{}B(x,r)}|X(y) - X(z)|/|y - z|.`` Note that this ``L`` is always finite because ``X`` is bounded and differentiable. We now define the map ``\\Gamma: C^\\infty((-\\epsilon, \\epsilon), \\mathbb{R}^n)\\to{}C^\\infty((-\\epsilon, \\epsilon), \\mathbb{R}^n)`` (for some ``\\epsilon`` that we do not yet fix) as \n\" * \nMain.indentation * raw\"```math\n\" * \nMain.indentation * raw\"\\Gamma\\gamma(t) = x + \\int_0^tX(\\gamma(s))ds,\n\" * \nMain.indentation * raw\"```\n\" * \nMain.indentation * raw\"i.e. ``\\Gamma`` maps ``C^\\infty`` curves through ``x`` into ``C^\\infty`` curves through ``x``. We further have with the norm ``||\\gamma||_\\infty = \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}|\\gamma(t)|``:\n\" * \nMain.indentation * raw\"```math\n\" *\nMain.indentation * raw\"\\begin{aligned} \n\" * \nMain.indentation * raw\"||\\Gamma(\\gamma_1 - \\gamma_2)||_\\infty & = \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}\\left| \\int_0^t (X(\\gamma_1(s)) - X(\\gamma_2(s)))ds \\right| \\\\\n\" * \nMain.indentation * raw\"& \\leq \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}\\int_0^t | X(\\gamma_1(s)) - X(\\gamma_2(s)) | ds \\\\\n\" * \nMain.indentation * raw\"& \\leq \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}\\int_0^t L |\\gamma_1(s) - \\gamma_2(s)| ds \\\\\n\" * \nMain.indentation * raw\"& \\leq \\epsilon{}L \\cdot \\mathrm{sup}_{t \\in (-\\epsilon, \\epsilon)}|\\gamma_1(t) - \\gamma_2(t)|,\n\" * \nMain.indentation * raw\"\\end{aligned}\n\" * \nMain.indentation * raw\"```\n\" * \nMain.indentation * raw\"and we see that ``\\Gamma`` is a contractive mapping if we pick ``\\epsilon`` small enough and we can hence apply the fixed-point theorem. So there has to exist a ``C^\\infty`` curve through ``x`` that we call ``\\gamma^*`` such that \n\" * \nMain.indentation * raw\"```math\n\" * \nMain.indentation * raw\"\\gamma^*(t) = \\int_0^tX(\\gamma^*(s))ds,\n\" *\nMain.indentation * raw\"and this ``\\gamma^*`` is the curve we were looking for. Its uniqueness is guaranteed by the fixed-point theorem.\")","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"For all the problems we discuss here we can extend the integral curves of X from the finite interval (-epsilon epsilon) to all of mathbbR. 
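To make the contraction argument above concrete, here is a small self-contained sketch (independent of GeometricMachineLearning) of the Picard iteration for the scalar vector field X(x) = x, whose exact flow is γ(t) = x₀exp(t); the integral is approximated with the trapezoidal rule:

```julia
function picard_iterate(X, x₀, ts; iterations = 20)
    γ  = fill(x₀, length(ts))        # initial guess: the constant curve through x₀
    Δt = step(ts)
    for _ in 1:iterations            # Γ is contractive here, so the iteration converges
        vals = X.(γ)
        ints = cumsum((vals[1:end - 1] .+ vals[2:end]) ./ 2 .* Δt)
        γ    = vcat(x₀, x₀ .+ ints)  # Γγ(t) = x₀ + ∫₀ᵗ X(γ(s)) ds
    end
    return γ
end

ts = range(0.0, 0.5; length = 51)
γ  = picard_iterate(x -> x, 1.0, ts)
maximum(abs.(γ .- exp.(ts))) < 1e-4   # close to the exact flow γ(t) = exp(t)
```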
The solution gamma we call an integral curve or flow of the vector field (ODE).","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/#Time-Dependent-Vector-Fields","page":"Differential Equations and the EAU theorem","title":"Time-Dependent Vector Fields","text":"","category":"section"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"We proved the theorem above for a time-independent vector field X, but it also holds for time-dependent vector fields, i.e. for mapping of the form: ","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"X 0TtimesmathcalMtoTM","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"The proof for this case proceeds analogously to the case of the time-independent vector field; to apply the proof we simply have to extend the vector field to (here written for a specific coordinate chart varphi_U): ","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"barX 0 TtimesmathbbR^ntomathbbR^n+1 (t x_1 ldots x_n) mapsto (1 X(x_1 ldots x_n))","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"More details on this can be found in e.g. [2]. For GeometricMachineLearning time-dependent vector fields are important because many of the optimizers we are using (such as the Adam optimizer) can be seen as approximating the flow of a time-dependent vector field.","category":"page"},{"location":"manifolds/existence_and_uniqueness_theorem/#Reference","page":"Differential Equations and the EAU theorem","title":"Reference","text":"","category":"section"},{"location":"manifolds/existence_and_uniqueness_theorem/","page":"Differential Equations and the EAU theorem","title":"Differential Equations and the EAU theorem","text":"S. Lang. Real and functional analysis. Vol. 142 (Springer Science & Business Media, 2012).\n\n\n\nS. Lang. Fundamentals of differential geometry. Vol. 191 (Springer Science & Business Media, 2012).\n\n\n\n","category":"page"},{"location":"arrays/global_tangent_spaces/#Global-Tangent-Spaces","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"","category":"section"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"In GeometricMachineLearning standard neural network optimizers are generalized to homogeneous spaces by leveraging the special structure of the tangent spaces of this class of manifolds. 
When we introduced homogeneous spaces we already talked about that every tangent space to a homogeneous space T_YmathcalM is of the form: ","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":" T_YmathcalM = mathfrakg cdot Y = AY Ainmathfrakg","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We then have a decomposition of mathfrakg into a vertical part mathfrakg^mathrmver Y and a horizontal part mathfrakg^mathrmhor Y and the horizontal part is isomorphic to T_YmathcalM. ","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We now identify a special element E in mathcalM and designate the horizontal component mathfrakg^mathrmhor E as our global tangent space. We will refer to this global tangent space by mathfrakg^mathrmhor. We can now find a transformation from any mathfrakg^mathrmhor Y to mathfrakg^mathrmhor and vice-versa (these spaces are isomorphic).","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Main.theorem(raw\"Let ``A\\in{}G`` an element such that ``AE = Y``. Then we have\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"A^{-1}\\cdot\\mathfrak{g}^{\\mathrm{hor},Y}\\cdot{}A = \\mathfrak{g}^\\mathrm{hor},\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"i.e. for every element ``B\\in\\mathfrak{g}^\\mathrm{hor}`` we can find a ``B^Y \\in \\mathfrak{g}^{\\mathrm{hor},Y}`` s.t. ``B = A^{-1}B^YA`` (and vice-versa).\")","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Main.proof(raw\"We first show that for every ``B^Y\\in\\mathfrak{g}^{\\mathrm{hor},Y}`` the element ``A^{-1}B^YA`` is in ``\\mathfrak{g}^{\\mathrm{hor}}``. First not that ``A^{-1}B^YA\\in\\mathfrak{g}`` by a fundamental theorem of Lie group theory (closedness of the Lie algebra under adjoint action). Now assume that ``A^{-1}B^YA`` is not fully contained in ``\\mathfrak{g}^\\mathrm{hor}``, i.e. it also has a vertical component. So we would lose information when performing ``A^{-1}B^YA \\mapsto A^{-1}B^YAE = A^{-1}B^YY``, but this contradicts the fact that ``B^Y\\in\\mathfrak{g}^{\\mathrm{hor},Y}.`` We now have to proof that for every ``B\\in\\mathfrak{g}^\\mathrm{hor}`` we can find an element in ``\\mathfrak{g}^{\\mathrm{hor}, Y}`` such that this element is mapped to ``B``. By a argument similar to the one above we can show that ``ABA^{-1}\\in\\mathfrak{g}^\\mathrm{hor, Y}`` and this element maps to ``B``. Proofing that the map is injective is now trivial.\")","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We should note that we have written all Lie group and Lie algebra actions as simple matrix multiplications, like AE = Y. For some Lie groups and Lie algebras we should use different notations [9]. 
These Lie groups are however not relevant for what we use in GeometricMachineLearning and we will stick to regular matrix notation.","category":"page"},{"location":"arrays/global_tangent_spaces/#Global-Sections","page":"Global Tangent Spaces","title":"Global Sections","text":"","category":"section"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Note that the theorem above requires us to find an element AinG such that AE = Y. If we can find a mapping lambdamathcalMtoG we call such a mapping a global section. ","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Main.theorem(raw\"We call a mapping from ``\\lambda:\\mathcal{M} \\to G`` a homogeneous space to its associated Lie group a **global section** if it satisfies:\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"\\lambda(Y)E = Y,\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"where ``E`` is the distinct element of the homogeneous space.\")","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Note that in general global sections are not unique because the rank of G is in general greater than that of mathcalM. We give an example of how to construct such a global section for the Stiefel and the Grassmann manifolds below. ","category":"page"},{"location":"arrays/global_tangent_spaces/#The-Global-Tangent-Space-for-the-Stiefel-Manifold","page":"Global Tangent Spaces","title":"The Global Tangent Space for the Stiefel Manifold","text":"","category":"section"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We now discuss the specific form of the global tangent space for the Stiefel manifold. We choose the distinct element[1] E to have an especially simple form (this matrix can be build by calling StiefelProjection):","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"[1]: We already introduced this special matrix together with the Stiefel manifold.","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"E = beginbmatrix\nmathbbI_n \nmathbbO\nendbmatrixinSt(n N)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Based on this elements of the vector space mathfrakg^mathrmhor E = mathfrakg^mathrmhor are: ","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"beginpmatrix\nA B^T B mathbbO\nendpmatrix","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"where A is a skew-symmetric matrix of size ntimesn and B is an arbitrary matrix of size (N - n)timesn.","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Arrays of type mathfrakg^mathrmhor E are implemented in GeometricMachineLearning under the name StiefelLieAlgHorMatrix.","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We can call this with e.g. 
a skew-symmetric matrix A and an arbitrary matrix B:","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"using GeometricMachineLearning # hide\n\nN, n = 10, 4\n\nA = rand(SkewSymMatrix, n)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"B = rand(N - n, n)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"B1 = StiefelLieAlgHorMatrix(A, B, N, n)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We can also call it with a matrix of shape NtimesN:","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"B2 = Matrix(B1) # note that this does not have any special structure\n\nStiefelLieAlgHorMatrix(B2, n)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Or we can call it a matrix of shape Ntimesn:","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"E = StiefelProjection(N, n)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"B3 = B1 * E\n\nStiefelLieAlgHorMatrix(B3, n)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We now demonstrate how to map from an element of mathfrakg^mathrmhor Y to an element of mathfrakg^mathrmhor:","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"using GeometricMachineLearning # hide\n\nN, n = 10, 5\n\nY = rand(StiefelManifold, N, n)\nΔ = rgrad(Y, rand(N, n))\nΩΔ = GeometricMachineLearning.Ω(Y, Δ)\nλY = GlobalSection(Y) \n\nλY_mat = Matrix(λY)\n\nround.(λY_mat' * ΩΔ * λY_mat; digits = 3)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Performing this computation directly is computationally very inefficient however and the user is strongly discouraged to call Matrix on an instance of GlobalSection. The better option is calling global_rep:","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"using GeometricMachineLearning: _round # hide\n\n_round(global_rep(λY, Δ); digits = 3)","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Internally GlobalSection calls the function GeometricMachineLearning.global_section which does the following for the Stiefel manifold: ","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"A = randn(N, N - n) # or the gpu equivalent\nA = A - Y * (Y' * A)\nY⟂ = qr(A).Q[1:N, 1:(N - n)]","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"So we draw (N - n) new columns randomly, subtract the part that is spanned by the columns of Y and then perform a QR composition on the resulting matrix. 
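As a hedged check of this construction with plain `LinearAlgebra` (mirroring the algorithm above; the name `Yperp` is introduced only for this illustration):

```julia
using LinearAlgebra

N, n = 10, 4
Y = qr(randn(N, n)).Q * Matrix{Float64}(I, N, n)   # an orthonormal N × n frame

A = randn(N, N - n)
A = A - Y * (Y' * A)                               # remove the part spanned by the columns of Y
Yperp = qr(A).Q * Matrix{Float64}(I, N, N - n)     # first N - n columns of the Q factor

norm(Y' * Yperp)       # ≈ 0: Yperp is orthogonal to Y
Yperp' * Yperp ≈ I     # true: the columns of Yperp are orthonormal
```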
The Q part of the decomposition is a matrix of (N - n) columns that is orthogonal to Y and is typically referred to as Y_perp [6, 10, 11]. We can easily check that this Y_perp is indeed orthogonal to Y.","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Main.theorem(raw\"The matrix ``Y_\\perp`` constructed with the above algorithm satisfies\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"Y^TY_\\perp = \\mathbb{O},\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"and\n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"(Y_\\perp)^TY_\\perp = \\mathbb{I},\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"i.e. all the columns in the big matrix ``[Y, Y_\\perp]\\in\\mathbb{R}^{N\\times{}N}`` are mutually orthonormal and it therefore is an element of ``SO(N)``.\")","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"Main.proof(raw\"The second property is trivially satisfied because the ``Q`` component of a ``QR`` decomposition is an orthogonal matrix. For the first property note that ``Y^TQR = \\mathbb{O}`` is zero because we have subtracted the ``Y`` component from the matrix ``QR``. The matrix ``R\\in\\mathbb{R}^{N\\times{}(N-n)}`` further has the property ``[R]_{ij} = 0`` for ``i > j`` and we have that \n\" * Main.indentation * raw\"```math\n\" * Main.indentation * raw\"(Y^TQ)R = [r_{11}(Y^TQ)_{1\\bullet}, r_{12}(Y^TQ)_{1\\bullet} + r_{22}(Y^TQ)_{2\\bullet}, \\ldots, \\sum_{i=1}^{N-n}r_{i(N-n)}(Y^TQ)_{i\\bullet}].\n\" * Main.indentation * raw\"```\n\" * Main.indentation * raw\"Now all the coefficients ``r_{ii}`` are non-zero because the matrix we performed the ``QR`` decomposition on has full rank and we can see that if ``(Y^TQ)R`` is zero ``Y^TQ`` also has to be zero.\")","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"We now discuss the global tangent space for the Grassmann manifold. This is similar to the Stiefel case.","category":"page"},{"location":"arrays/global_tangent_spaces/#Global-Tangent-Space-for-the-Grassmann-Manifold","page":"Global Tangent Spaces","title":"Global Tangent Space for the Grassmann Manifold","text":"","category":"section"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"In the case of the Grassmann manifold we construct the global tangent space with respect to the distinct element mathcalE=mathrmspan(E)inGr(nN), where E is again the same matrix.","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"The tangent tangent space T_mathcalEGr(nN) can be represented through matrices: ","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"beginpmatrix\n 0 cdots 0 \n cdots cdots cdots \n 0 cdots 0 \n b_11 cdots b_1n \n cdots cdots cdots \n b_(N-n)1 cdots b_(N-n)n\nendpmatrix","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"This representation is based on the identification T_mathcalEGr(nN)toT_EmathcalS_E that was discussed in the section on the Grassmann manifold[2]. 
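In code, the corresponding element of the global tangent space can be built directly from an arbitrary (N - n)×n matrix (a short sketch using the constructor documented further below):

```julia
using GeometricMachineLearning

N, n = 10, 4
B = rand(N - n, n)

GrassmannLieAlgHorMatrix(B, N, n)   # zero n×n block on top, B in the lower left block
```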
We use the following notation:","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"[2]: We derived the following expression for the Riemannian gradient of the Grassmann manifold: mathrmgrad_mathcalY^GrL = nabla_YL - YY^Tnabla_YL. The tangent space to the element mathcalE can thus be written as barB - EE^TbarB where BinmathbbR^Ntimesn and the matrices in this tangent space have the desired form. ","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"mathfrakg^mathrmhor = mathfrakg^mathrmhormathcalE = leftbeginpmatrix 0 -B^T B 0 endpmatrix textB arbitraryright","category":"page"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"This is equivalent to the horizontal component of mathfrakg for the Stiefel manifold for the case when A is zero. This is a reflection of the rotational invariance of the Grassmann manifold: the skew-symmetric matrices A are connected to the group of rotations O(n) which is factored out in the Grassmann manifold Gr(nN)simeqSt(nN)O(n). In GeometricMachineLearning we thus treat the Grassmann manifold as being embedded in the Stiefel manifold. In [11] viewing the Grassmann manifold as a quotient space of the Stiefel manifold is important for \"feasibility\" in \"practical computations\". ","category":"page"},{"location":"arrays/global_tangent_spaces/#Library-Functions","page":"Global Tangent Spaces","title":"Library Functions","text":"","category":"section"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"GeometricMachineLearning.AbstractLieAlgHorMatrix\nStiefelLieAlgHorMatrix\nStiefelLieAlgHorMatrix(::AbstractMatrix, ::Int)\nGrassmannLieAlgHorMatrix\nGrassmannLieAlgHorMatrix(::AbstractMatrix, ::Int)\nGlobalSection\nGeometricMachineLearning.global_section\nglobal_rep","category":"page"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.AbstractLieAlgHorMatrix-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.AbstractLieAlgHorMatrix","text":"AbstractLieAlgHorMatrix is a supertype for various horizontal components of Lie algebras. We usually call this mathfrakg^mathrmhor.\n\nSee StiefelLieAlgHorMatrix and GrassmannLieAlgHorMatrix.\n\n\n\n\n\n","category":"type"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.StiefelLieAlgHorMatrix-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.StiefelLieAlgHorMatrix","text":"StiefelLieAlgHorMatrix(A::SkewSymMatrix{T}, B::AbstractMatrix{T}, N::Integer, n::Integer) where T\n\nBuild an instance of StiefelLieAlgHorMatrix based on a skew-symmetric matrix A and an arbitrary matrix B.\n\nStiefelLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). 
The projection here is: piS to SE where \n\nE = beginpmatrix mathbbI_n mathbbO_(N-n)timesn endpmatrix\n\nThe matrix (E) is implemented under StiefelProjection in GeometricMachineLearning.\n\nAn element of StiefelLieAlgMatrix takes the form: \n\nbeginpmatrix\nA B^T B mathbbO\nendpmatrix\n\nwhere A is skew-symmetric (this is SkewSymMatrix in GeometricMachineLearning).\n\nAlso see GrassmannLieAlgHorMatrix.\n\n\n\n\n\n","category":"type"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.StiefelLieAlgHorMatrix-Tuple{AbstractMatrix, Int64}-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.StiefelLieAlgHorMatrix","text":"StiefelLieAlgHorMatrix(D::AbstractMatrix, n::Integer)\n\nTake a big matrix as input and build an instance of StiefelLieAlgHorMatrix belonging to the StiefelManifold St(n N) where N is the number of rows of D.\n\nIf the constructor is called with a big NtimesN matrix, then the projection is performed the following way: \n\nbeginpmatrix\nA B_1 \nB_2 D\nendpmatrix mapsto \nbeginpmatrix\nmathrmskew(A) -B_2^T \nB_2 mathbbO\nendpmatrix\n\nThe operation mathrmskewmathbbR^ntimesntomathcalS_mathrmskew(n) is the skew-symmetrization operation. This is equivalent to calling of SkewSymMatrix with an ntimesn matrix.\n\nThis can also be seen as the operation:\n\nD mapsto Omega(E DE) = mathrmskewleft(2 left(mathbbI - frac12 E E^T right) DE E^Tright)\n\nAlso see GeometricMachineLearning.Ω.\n\n\n\n\n\n","category":"method"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.GrassmannLieAlgHorMatrix-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.GrassmannLieAlgHorMatrix","text":"GrassmannLieAlgHorMatrix(B::AbstractMatrix{T}, N::Integer, n::Integer) where T\n\nBuild an instance of GrassmannLieAlgHorMatrix based on an arbitrary matrix B of size (N-n)timesn.\n\nGrassmannLieAlgHorMatrix is the horizontal component of the Lie algebra of skew-symmetric matrices (with respect to the canonical metric). 
The projection here is: piS to SEsim where \n\nE = beginpmatrix mathbbI_n mathbbO_(N-n)timesn endpmatrix\n\nand the equivalence relation is \n\nV_1 sim V_2 iff exists AinmathcalS_mathrmskew(n) textsuch that V_2 = V_1 + beginpmatrix A mathbbO endpmatrix\n\nAn element of GrassmannLieAlgMatrix takes the form: \n\nbeginpmatrix\nbarmathbbO B^T B mathbbO\nendpmatrix\n\nwhere barmathbbOinmathbbR^ntimesn and mathbbOinmathbbR^(N - n)timesn\n\n\n\n\n\n","category":"type"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.GrassmannLieAlgHorMatrix-Tuple{AbstractMatrix, Int64}-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.GrassmannLieAlgHorMatrix","text":"GrassmannLieAlgHorMatrix(D::AbstractMatrix, n::Integer)\n\nTake a big matrix as input and build an instance of GrassmannLieAlgHorMatrix belonging to the GrassmannManifold Gr(n N) where N is the number of rows of D.\n\nIf the constructor is called with a big NtimesN matrix, then the projection is performed the following way: \n\nbeginpmatrix\nA B_1 \nB_2 D\nendpmatrix mapsto \nbeginpmatrix\nbarmathbbO -B_2^T \nB_2 mathbbO\nendpmatrix\n\nThis can also be seen as the operation:\n\nD mapsto Omega(E DE - EE^TDE)\n\nwhere Omega is the horizontal lift GeometricMachineLearning.Ω.\n\n\n\n\n\n","category":"method"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.GlobalSection-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.GlobalSection","text":"GlobalSection(Y::AbstractMatrix)\n\nConstruct a global section for Y. \n\nA global section lambda is a mapping from a homogeneous space mathcalM to the corresponding Lie group G such that \n\nlambda(Y)E = Y\n\nAlso see apply_section and global_rep.\n\nImplementation\n\nFor an implementation of GlobalSection for a custom array (especially manifolds), the function global_section has to be generalized.\n\n\n\n\n\n","category":"type"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.global_section-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.global_section","text":"global_section(Y::StiefelManifold)\n\nCompute a matrix of size Ntimes(N-n) whose columns are orthogonal to the columns in Y.\n\nThis matrix is also called Y_perp [6, 10, 11].\n\nExamples\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: global_section\nimport Random\n\nRandom.seed!(123)\n\nY = StiefelManifold([1. 0.; 0. 1.; 0. 0.; 0. 0.])\n\nround.(Matrix(global_section(Y)); digits = 3)\n\n# output\n\n4×2 Matrix{Float64}:\n 0.0 -0.0\n 0.0 0.0\n 0.936 -0.353\n 0.353 0.936\n\nFurther note that we convert the QRCompactWYQ object to a Matrix before we display it.\n\nImplementation\n\nThe implementation is done with a QR decomposition (LinearAlgebra.qr!). Internally we do: \n\nA = randn(N, N - n) # or the gpu equivalent\nA = A - Y.A * (Y.A' * A)\nqr!(A).Q\n\n\n\n\n\nglobal_section(Y::GrassmannManifold)\n\nCompute a matrix of size Ntimes(N-n) whose columns are orthogonal to the columns in Y.\n\nThe method global_section for the Grassmann manifold is equivalent to that for the StiefelManifold (we represent the Grassmann manifold as an embedding in the Stiefel manifold). \n\nSee the documentation for global_section(Y::StiefelManifold{T}) where T. 
\n\n\n\n\n\n","category":"function"},{"location":"arrays/global_tangent_spaces/#GeometricMachineLearning.global_rep-arrays-global_tangent_spaces","page":"Global Tangent Spaces","title":"GeometricMachineLearning.global_rep","text":"global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:StiefelManifold{T}}\n\nExpress Δ (an the tangent space of Y) as an instance of StiefelLieAlgHorMatrix.\n\nThis maps an element from T_YmathcalM to an element of mathfrakg^mathrmhor. \n\nThese two spaces are isomorphic where the isomorphism where the isomorphism is established through lambda(Y)inG via:\n\nT_YmathcalM to mathfrakg^mathrmhor Delta mapsto lambda(Y)^-1Omega(Y Delta)lambda(Y)\n\nAlso see GeometricMachineLearning.Ω.\n\nExamples\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: _round\nimport Random \n\nRandom.seed!(123)\n\nY = rand(StiefelManifold, 6, 3)\nΔ = rgrad(Y, randn(6, 3))\nλY = GlobalSection(Y)\n\n_round(global_rep(λY, Δ); digits = 3)\n\n# output\n\n6×6 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, Matrix{Float64}}:\n 0.0 0.679 1.925 0.981 -2.058 0.4\n -0.679 0.0 0.298 -0.424 0.733 -0.919\n -1.925 -0.298 0.0 -1.815 1.409 1.085\n -0.981 0.424 1.815 0.0 0.0 0.0\n 2.058 -0.733 -1.409 0.0 0.0 0.0\n -0.4 0.919 -1.085 0.0 0.0 0.0\n\nImplementation\n\nThe function global_rep does in fact not perform the entire map lambda(Y)^-1Omega(Y Delta)lambda(Y) but only\n\nDelta mapsto mathrmskew(Y^TDelta)\n\nto get the small skew-symmetric matrix and \n\nDelta mapsto (lambda(Y)_1N nN^T Delta)_1(N-n) 1n\n\nfor the arbitrary matrix.\n\n\n\n\n\nglobal_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:GrassmannManifold{T}}\n\nExpress Δ (an the tangent space of Y) as an instance of GrassmannLieAlgHorMatrix.\n\nThe method global_rep for GrassmannManifold is similar to that for StiefelManifold.\n\nExamples\n\nusing GeometricMachineLearning\nusing GeometricMachineLearning: _round\nimport Random \n\nRandom.seed!(123)\n\nY = rand(GrassmannManifold, 6, 3)\nΔ = rgrad(Y, randn(6, 3))\nλY = GlobalSection(Y)\n\n_round(global_rep(λY, Δ); digits = 3)\n\n# output\n\n6×6 GrassmannLieAlgHorMatrix{Float64, Matrix{Float64}}:\n 0.0 0.0 0.0 0.981 -2.058 0.4\n 0.0 0.0 0.0 -0.424 0.733 -0.919\n 0.0 0.0 0.0 -1.815 1.409 1.085\n -0.981 0.424 1.815 0.0 0.0 0.0\n 2.058 -0.733 -1.409 0.0 0.0 0.0\n -0.4 0.919 -1.085 0.0 0.0 0.0\n\n\n\n\n\n","category":"function"},{"location":"arrays/global_tangent_spaces/#References","page":"Global Tangent Spaces","title":"References","text":"","category":"section"},{"location":"arrays/global_tangent_spaces/","page":"Global Tangent Spaces","title":"Global Tangent Spaces","text":"P.-A. Absil, R. Mahony and R. Sepulchre. Riemannian geometry of Grassmann manifolds with a view on algorithmic computation. Acta Applicandae Mathematica 80, 199–220 (2004).\n\n\n\nP.-A. Absil, R. Mahony and R. Sepulchre. Optimization algorithms on matrix manifolds (Princeton University Press, Princeton, New Jersey, 2008).\n\n\n\nT. Bendokat, R. Zimmermann and P.-A. Absil. A Grassmann manifold handbook: Basic geometry and computational aspects, arXiv preprint arXiv:2011.13699 (2020).\n\n\n\nB. Brantner. 
Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).\n\n\n\n","category":"page"},{"location":"tutorials/grassmann_layer/#Example-of-a-Neural-Network-with-a-Grassmann-Layer","page":"Grassmann manifold","title":"Example of a Neural Network with a Grassmann Layer","text":"","category":"section"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"Here we show how to implement a neural network that contains a layer whose weight is an element of the Grassmann manifold and where this might be useful. ","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"To answer where we would need this consider the following scenario","category":"page"},{"location":"tutorials/grassmann_layer/#Problem-statement","page":"Grassmann manifold","title":"Problem statement","text":"","category":"section"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"We are given data in a big space mathcalD=d_i_iinmathcalIsubsetmathbbR^N and know these data live on an n-dimensional[1] submanifold[2] in mathbbR^N. Based on these data we would now like to generate new samples from the distributions that produced our original data. This is where the Grassmann manifold is useful: each element V of the Grassmann manifold is an n-dimensional subspace of mathbbR^N from which we can easily sample. We can then construct a (bijective) mapping from this space V onto a space that contains our data points mathcalD. ","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"[1]: We may know n exactly or approximately. ","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"[2]: Problems and solutions related to this scenario are commonly summarized under the term manifold learning (see [39]).","category":"page"},{"location":"tutorials/grassmann_layer/#Example","page":"Grassmann manifold","title":"Example","text":"","category":"section"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"Consider the following toy example: We want to sample from the graph of the (scaled) Rosenbrock function f(xy) = ((1 - x)^2 + 100(y - x^2)^2)1000 while pretending we do not know the function. ","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"using Plots # hide\n# hide\nrosenbrock(x::Vector) = ((1.0 - x[1]) ^ 2 + 100.0 * (x[2] - x[1] ^ 2) ^ 2) / 1000\nx, y = -1.5:0.1:1.5, -1.5:0.1:1.5\nz = Surface((x,y)->rosenbrock([x,y]), x, y)\np = surface(x,y,z; camera=(30,20), alpha=.6, colorbar=false, xlims=(-1.5, 1.5), ylims=(-1.5, 1.5), zlims=(0.0, rosenbrock([-1.5, -1.5])))","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"We now build a neural network whose task it is to map a product of two Gaussians mathcalN(01)timesmathcalN(01) onto the graph of the Rosenbrock function where the range for x and for y is -1515.","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"For computing the loss between the two distributions, i.e. 
Psi(mathcalN(01)timesmathcalN(01)) and f(-1515 -1515) we use the Wasserstein distance[3].","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"[3]: The implementation of the Wasserstein distance is taken from [40].","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"using GeometricMachineLearning, Zygote, BrenierTwoFluid\nusing LinearAlgebra: norm # hide\nimport Random # hide \nRandom.seed!(123)\n\nmodel = Chain(GrassmannLayer(2,3), Dense(3, 8, tanh), Dense(8, 3, identity))\n\nnn = NeuralNetwork(model, CPU(), Float64)\n\n# this computes the cost that is associated to the Wasserstein distance\nc = (x,y) -> .5 * norm(x - y)^2\n∇c = (x,y) -> x - y\n\nconst ε = 0.1 # entropic regularization. √ε is a length. # hide\nconst q = 1.0 # annealing parameter # hide\nconst Δ = 1.0 # characteristic domain size # hide\nconst s = ε # current scale: no annealing -> equals ε # hide\nconst tol = 1e-6 # marginal condition tolerance # hide \nconst crit_it = 20 # acceleration inference # hide\nconst p_η = 2\n\nfunction compute_wasserstein_gradient(ensemble1::AT, ensemble2::AT) where AT<:AbstractArray\n number_of_particles1 = size(ensemble1, 2)\n number_of_particles2 = size(ensemble2, 2)\n V = SinkhornVariable(copy(ensemble1'), ones(number_of_particles1) / number_of_particles1)\n W = SinkhornVariable(copy(ensemble2'), ones(number_of_particles2) / number_of_particles2)\n params = SinkhornParameters(; ε=ε,q=1.0,Δ=1.0,s=s,tol=tol,crit_it=crit_it,p_η=p_η,sym=false,acc=true) # hide\n S = SinkhornDivergence(V, W, c, params; islog = true)\n initialize_potentials!(S)\n compute!(S)\n value(S), x_gradient!(S, ∇c)'\nend\n\nxyz_points = hcat([[x,y,rosenbrock([x,y])] for x in x for y in y]...)\n\nfunction compute_gradient(ps::Tuple)\n samples = randn(2, size(xyz_points, 2))\n\n estimate, nn_pullback = Zygote.pullback(ps -> model(samples, ps), ps)\n\n valS, wasserstein_gradient = compute_wasserstein_gradient(estimate, xyz_points)\n valS, nn_pullback(wasserstein_gradient)[1]\nend\n\n# note the very high value for the learning rate\noptimizer = Optimizer(nn, AdamOptimizer(1e-1))\n\n# note the small number of training steps\nconst training_steps = 40\nloss_array = zeros(training_steps)\nfor i in 1:training_steps\n val, dp = compute_gradient(nn.params)\n loss_array[i] = val\n optimization_step!(optimizer, model, nn.params, dp)\nend\nplot(loss_array, xlabel=\"training step\", label=\"loss\")","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"Now we plot a few points to check how well they match the graph:","category":"page"},{"location":"tutorials/grassmann_layer/","page":"Grassmann manifold","title":"Grassmann manifold","text":"const number_of_points = 35\n\ncoordinates = nn(randn(2, number_of_points))\nscatter3d!(p, [coordinates[1, :]], [coordinates[2, :]], [coordinates[3, :]], alpha=.5, color=4, label=\"mapped points\")","category":"page"},{"location":"architectures/linear_symplectic_transformer/#Linear-Symplectic-Transformer","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"","category":"section"},{"location":"architectures/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"The linear symplectic transformer consists of a combination of linear symplectic attention and gradient layers and is visualized 
below: ","category":"page"},{"location":"architectures/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"Main.include_graphics(\"../tikz/linear_symplectic_transformer\"; caption = raw\"Visualization of the linear symplectic transformer architecutre. ``\\mathtt{n\\_sympnet}`` refers to the number of SympNet layers (``\\mathtt{n\\_sympnet}=2`` in this figure) and ``\\mathtt{L}`` refers to the number of transformer blocks (``\\mathtt{L=1}`` in this figure).\", width = .3) # hide","category":"page"},{"location":"architectures/linear_symplectic_transformer/#Library-Functions","page":"Linear Symplectic Transformer","title":"Library Functions","text":"","category":"section"},{"location":"architectures/linear_symplectic_transformer/","page":"Linear Symplectic Transformer","title":"Linear Symplectic Transformer","text":"LinearSymplecticTransformer","category":"page"},{"location":"architectures/linear_symplectic_transformer/#GeometricMachineLearning.LinearSymplecticTransformer-architectures-linear_symplectic_transformer","page":"Linear Symplectic Transformer","title":"GeometricMachineLearning.LinearSymplecticTransformer","text":"Realizes the linear Symplectic Transformer.\n\nConstructor:\n\nThe constructor is called with the following arguments\n\ndim::Int: System dimension \nseq_length::Int: Number of time steps that the transformer considers. \n\nOptional keyword arguments:\n\nn_sympnet::Int=2: The number of sympnet layers in the transformer.\nupscaling_dimension::Int=2*dim: The upscaling that is done by the gradient layer. \nL::Int=1: The number of transformer units. \nactivation=tanh: The activation function for the SympNet layers. \ninit_upper::Bool=true: Specifies if the first layer is a Q-type layer (init_upper=true) or if it is a P-type layer (init_upper=false).\n\n\n\n\n\n","category":"type"},{"location":"layers/volume_preserving_feedforward/#Volume-Preserving-Feedforward-Layer","page":"Volume-Preserving Layers","title":"Volume-Preserving Feedforward Layer","text":"","category":"section"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"Volume preserving feedforward layers are a special type of ResNet layer for which we restrict the weight matrices to be of a particular form. I.e. each layer computes: ","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"x mapsto x + sigma(Ax + b)","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"where sigma is a nonlinearity, A is the weight and b is the bias. The matrix A is either a lower-triangular matrix L or an upper-triangular matrix U[1]. 
The lower triangular matrix is of the form (the upper-triangular layer is simply the transpose of the lower triangular): ","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"[1]: Implemented as LowerTriangular and UpperTriangular in GeometricMachineLearning.","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"L = beginpmatrix\n 0 0 cdots 0 \n a_21 ddots vdots \n vdots ddots ddots vdots \n a_n1 cdots a_n(n-1) 0 \nendpmatrix","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"The Jacobian of a layer of the above form then is of the form","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"J = beginpmatrix\n 1 0 cdots 0 \n b_21 ddots vdots \n vdots ddots ddots vdots \n b_n1 cdots b_n(n-1) 1 \nendpmatrix","category":"page"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"and the determinant of J is 1, i.e. the map is volume-preserving. ","category":"page"},{"location":"layers/volume_preserving_feedforward/#Library-Functions","page":"Volume-Preserving Layers","title":"Library Functions","text":"","category":"section"},{"location":"layers/volume_preserving_feedforward/","page":"Volume-Preserving Layers","title":"Volume-Preserving Layers","text":"VolumePreservingFeedForwardLayer","category":"page"},{"location":"layers/volume_preserving_feedforward/#GeometricMachineLearning.VolumePreservingFeedForwardLayer-layers-volume_preserving_feedforward","page":"Volume-Preserving Layers","title":"GeometricMachineLearning.VolumePreservingFeedForwardLayer","text":"Super-type of VolumePreservingLowerLayer and VolumePreservingUpperLayer. The layers do the following: \n\nx mapsto begincases sigma(Lx + b) textwhere L is mathttLowerTriangular sigma(Ux + b) textwhere U is mathttUpperTriangular endcases\n\nThe functor can be applied to a vecotr, a matrix or a tensor. \n\nConstructor\n\nThe constructors are called with:\n\nsys_dim::Int: the system dimension. \nactivation=tanh: the activation function. \ninclude_bias::Bool=true (keyword argument): specifies whether a bias should be used. \n\n\n\n\n\n","category":"type"},{"location":"architectures/sympnet/#SympNet-Architecture","page":"SympNet","title":"SympNet Architecture","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"This document discusses the SympNet architecture and its implementation in GeometricMachineLearning.jl.","category":"page"},{"location":"architectures/sympnet/#Quick-overview-of-the-theory-of-SympNets","page":"SympNet","title":"Quick overview of the theory of SympNets","text":"","category":"section"},{"location":"architectures/sympnet/#Principle","page":"SympNet","title":"Principle","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"SympNets (see [31] for the eponymous paper) are a type of neural network that can model the trajectory of a Hamiltonian system in phase space. 
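As a quick numerical check of the volume-preserving property derived above for the map x ↦ x + σ(Lx + b), consider the following stand-alone sketch; it does not use the library layer, it relies on Zygote for the Jacobian, and all names (L, b, layer) are made up for illustration:

using Zygote, LinearAlgebra
import Random; Random.seed!(123)

n = 4
L = tril(randn(n, n), -1)          # strictly lower-triangular weight (zeros on the diagonal)
b = randn(n)
layer(x) = x + tanh.(L * x + b)    # the map x ↦ x + σ(Lx + b) from above

J = Zygote.jacobian(layer, randn(n))[1]
det(J)                             # ≈ 1.0, i.e. the layer is volume-preserving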
Take (q^Tp^T)^T=(q_1ldotsq_dp_1ldotsp_d)^Tin mathbbR^2d as the coordinates in phase space, where q=(q_1 ldots q_d)^Tin mathbbR^d is refered to as the position and p=(p_1 ldots p_d)^Tin mathbbR^d the momentum. Given a point (q^Tp^T)^T in mathbbR^2d the SympNet aims to compute the next position ((q)^T(p)^T)^T and thus predicts the trajectory while preserving the symplectic structure of the system. SympNets are enforcing symplecticity strongly, meaning that this property is hard-coded into the network architecture. The layers are reminiscent of traditional neural network feedforward layers, but have a strong restriction imposed on them in order to be symplectic.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"SympNets can be viewed as a \"symplectic integrator\" (see [7] and [24]). Their goal is to predict, based on an initial condition ((q^(0))^T(p^(0))^T)^T, a sequence of points in phase space that fit the training data as well as possible:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"beginpmatrix q^(0) p^(0) endpmatrix cdots beginpmatrix tildeq^(1) tildep^(1) endpmatrix cdots beginpmatrix tildeq^(n) tildep^(n) endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The tilde in the above equation indicates predicted data. The time step between predictions is not a parameter we can choose but is related to the temporal frequency of the training data. This means that if data is recorded in an interval of e.g. 0.1 seconds, then this will be the time step of our integrator.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" Main.include_graphics(\"../tikz/sympnet_architecture\"; # hide\n label = \"fig:SympNetArchitecture\", # hide\n caption = \"Visualization of the SympNet architecture\" # hide\n ) # hide","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"There are two types of SympNet architectures: LA-SympNets and G-SympNets. ","category":"page"},{"location":"architectures/sympnet/#LA-SympNet","page":"SympNet","title":"LA-SympNet","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The first type of SympNets, LA-SympNets, are obtained from composing two types of layers: symplectic linear layers and symplectic activation layers. For a given integer n, a symplectic linear layer is defined by","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"mathcalL^nq\nbeginpmatrix\n q \n p \nendpmatrix\n = \nbeginpmatrix \n I S^n0 \n 0S^n I \nendpmatrix\n cdots \nbeginpmatrix \n I 0 \n S^2 I \nendpmatrix\nbeginpmatrix \n I S^1 \n 0 I \nendpmatrix\nbeginpmatrix\n q \n p \nendpmatrix\n+ b ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"or ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"mathcalL^np\nbeginpmatrix q \n p endpmatrix = \n beginpmatrix \n I 0S^n \n S^n0 I\n endpmatrix cdots \n beginpmatrix \n I S^2 \n 0 I\n endpmatrix\n beginpmatrix \n I 0 \n S^1 I\n endpmatrix\n beginpmatrix q \n p endpmatrix\n + b ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The superscripts q and p indicate whether the q or the p part is changed. 
The learnable parameters are the symmetric matrices S^iinmathbbR^dtimes d and the bias binmathbbR^2d. The integer n is the width of the symplectic linear layer. It can be shown that five of these layers, i.e. ngeq5, can represent any linear symplectic map (see [32]), so n need not be larger than five. We denote the set of symplectic linear layers by mathcalM^L.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The second type of layer needed for LA-SympNets are so-called activation layers:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalA^q beginpmatrix q \n p endpmatrix = \n beginbmatrix \n Ihatsigma^a \n 0I\n endbmatrix beginpmatrix q \n p endpmatrix =\n beginpmatrix \n mathrmdiag(a)sigma(p)+q \n p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"and","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalA^p beginpmatrix q \n p endpmatrix = \n beginbmatrix \n I0 \n hatsigma^aI\n endbmatrix beginpmatrix q \n p endpmatrix\n =\n beginpmatrix \n q \n mathrmdiag(a)sigma(q)+p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The activation function sigma can be any nonlinearity (on which minor restrictions are imposed below). Here the scaling vector ainmathbbR^d constitutes the learnable weights. We denote the set of symplectic activation layers by mathcalM^A. ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"An LA-SympNet is a function of the form Psi=l_k circ a_k circ l_k-1 circ cdots circ a_1 circ l_0 where (l_i)_0leq ileq k subset (mathcalM^L)^k+1 and (a_i)_1leq ileq k subset (mathcalM^A)^k. We will refer to k as the number of hidden layers of the SympNet[1] and the number n above as the depth of the linear layer.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"[1]: Note that if k=1 then the LA-SympNet consists of only one linear layer.","category":"page"},{"location":"architectures/sympnet/#G-SympNets","page":"SympNet","title":"G-SympNets","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"G-SympNets are an alternative to LA-SympNets. They are built with only one kind of layer, called gradient layer. 
For a given activation function sigma and an integer ngeq d, a gradient layers is a symplectic map from mathbbR^2d to mathbbR^2d defined by","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalG^up beginpmatrix q \n p endpmatrix = \n beginbmatrix \n Ihatsigma^Kab \n 0I\n endbmatrix beginpmatrix q \n p endpmatrix =\n beginpmatrix \n K^T mathrmdiag(a)sigma(Kp+b)+q \n p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"or","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":" mathcalG^low beginpmatrix q \n p endpmatrix = \n beginbmatrix \n I0 \n hatsigma^KabI\n endbmatrix beginpmatrix q \n p endpmatrix\n =\n beginpmatrix \n q \n K^T mathrmdiag(a)sigma(Kq+b)+p\n endpmatrix","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The parameters of this layer are the scaling matrix KinmathbbR^mtimes d, the bias binmathbbR^m and the scaling vector ainmathbbR^m. The name \"gradient layer\" has its origin in the fact that the expression K^Tmathrmdiag(a)sigma(Kq+b)_i = sum_jk_jia_jsigma(sum_ellk_jellq_ell+b_j) is the gradient of a function sum_ja_jtildesigma(sum_ellk_jellq_ell+b_j), where tildesigma is the antiderivative of sigma. The first dimension of K we refer to as the upscaling dimension.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"If we denote by mathcalM^G the set of gradient layers, a G-SympNet is a function of the form Psi=g_k circ g_k-1 circ cdots circ g_0 where (g_i)_0leq ileq k subset (mathcalM^G)^k. The index k is again the number of hidden layers.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Further note here the different roles played by round and square brackets: the latter indicates a nonlinear operation as opposed to a regular vector or matrix. ","category":"page"},{"location":"architectures/sympnet/#Universal-approximation-theorems","page":"SympNet","title":"Universal approximation theorems","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"In order to state the universal approximation theorem for both architectures we first need a few definitions:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Let U be an open set of mathbbR^2d, and let us denote by mathcalSP^r(U) the set of C^r smooth symplectic maps on U. We now define a topology on C^r(K mathbbR^n), the set of C^r-smooth maps from a compact set KsubsetmathbbR^n to mathbbR^n through the norm","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"f_C^r(KmathbbR^n) = undersetalphaleq rsum underset1leq i leq nmaxundersetxin Ksup D^alpha f_i(x)","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"where the differential operator D^alpha is defined by ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"D^alpha f = fracpartial^alpha fpartial x_1^alpha_1x_n^alpha_n","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"with alpha = alpha_1 ++ alpha_n. 
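Returning briefly to the gradient layer defined above: the following stand-alone sketch (plain Julia rather than the library implementation; the names K, a, b, gradient_layer are illustrative) builds one such layer and verifies numerically that it is symplectic:

using Zygote, LinearAlgebra
import Random; Random.seed!(123)

d, m = 2, 5                                   # half-dimension of phase space and upscaling dimension
K, a, b = randn(m, d), randn(m), randn(m)

# q ↦ q + K' * (a .* σ(Kp + b)), p ↦ p, with z = (q, p) stacked into one vector
gradient_layer(z) = vcat(z[1:d] + K' * (a .* tanh.(K * z[d+1:2d] + b)), z[d+1:2d])

jac = Zygote.jacobian(gradient_layer, randn(2d))[1]
Id = Matrix{Float64}(I, d, d)
J = [zeros(d, d) Id; -Id zeros(d, d)]         # canonical Poisson matrix
jac' * J * jac ≈ J                            # should evaluate to true: the layer is symplectic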
","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Definition sigma is r-finite if sigmain C^r(mathbbRmathbbR) and int D^rsigma(x)dx +infty.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Definition Let mnrin mathbbN with mn0 be given, U an open set of mathbbR^m, and IJsubset C^r(UmathbbR^n). We say J is r-uniformly dense on compacta in I if J subset I and for any fin I, epsilon0, and any compact Ksubset U, there exists gin J such that f-g_C^r(KmathbbR^n) epsilon.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"We can now state the universal approximation theorems:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Theorem (Approximation theorem for LA-SympNet) For any positive integer r0 and open set Uin mathbbR^2d, the set of LA-SympNet is r-uniformly dense on compacta in SP^r(U) if the activation function sigma is r-finite.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Theorem (Approximation theorem for G-SympNet) For any positive integer r0 and open set Uin mathbbR^2d, the set of G-SympNet is r-uniformly dense on compacta in SP^r(U) if the activation function sigma is r-finite.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"There are many r-finite activation functions commonly used in neural networks, for example:","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"sigmoid sigma(x)=frac11+e^-x for any positive integer r, \ntanh tanh(x)=frace^x-e^-xe^x+e^-x for any positive integer r. ","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on mathbbR^2d. But this does not tell us anything about how to optimize the network. This is can be done with any common neural network optimizer and these neural network optimizers always rely on a corresponding loss function. ","category":"page"},{"location":"architectures/sympnet/#Loss-function","page":"SympNet","title":"Loss function","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"To train the SympNet, one need data along a trajectory such that the model is trained to perform an integration. These data are (QP) where Qij (respectively Pij) is the real number q_j(t_i) (respectively pij) which is the j-th coordinates of the generalized position (respectively momentum) at the i-th time step. One also need a loss function defined as :","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"Loss(QP) = undersetisum d(Phi(Qi-Pi-) Qi- Pi-^T)","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"where d is a distance on mathbbR^d.","category":"page"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"See the tutorial section for an introduction into using SympNets with GeometricMachineLearning.jl.","category":"page"},{"location":"architectures/sympnet/#References","page":"SympNet","title":"References","text":"","category":"section"},{"location":"architectures/sympnet/","page":"SympNet","title":"SympNet","text":"P. Jin, Z. Zhang, A. Zhu, Y. Tang and G. E. 
Karniadakis. SympNets: Intrinsic structure-preserving symplectic networks for identifying Hamiltonian systems. Neural Networks 132, 166–179 (2020).\n\n\n\n","category":"page"}] }
diff --git a/latest/tutorials/grassmann_layer/feb29668.svg b/latest/tutorials/grassmann_layer/1790e027.svg
similarity index 65%
rename from latest/tutorials/grassmann_layer/feb29668.svg
rename to latest/tutorials/grassmann_layer/1790e027.svg
index 61d55a235..38a677dea 100644
[regenerated plot SVG; path data omitted]
diff --git a/latest/tutorials/grassmann_layer/6f2cf159.svg b/latest/tutorials/grassmann_layer/a08bbe8e.svg
similarity index 70%
rename from latest/tutorials/grassmann_layer/6f2cf159.svg
rename to latest/tutorials/grassmann_layer/a08bbe8e.svg
index 1480eb186..41579a0f8 100644
[regenerated plot SVG; path data omitted]
diff --git a/latest/tutorials/grassmann_layer/a35e350a.svg b/latest/tutorials/grassmann_layer/fed7261b.svg
similarity index 65%
rename from latest/tutorials/grassmann_layer/a35e350a.svg
rename to latest/tutorials/grassmann_layer/fed7261b.svg
index 5fa1e0d7a..758e4b9cf 100644
[regenerated plot SVG; path data omitted]
diff --git a/latest/tutorials/grassmann_layer/index.html b/latest/tutorials/grassmann_layer/index.html
index 7a2fdc6a4..3ea2efc2d 100644
--- a/latest/tutorials/grassmann_layer/index.html
+++ b/latest/tutorials/grassmann_layer/index.html
@@ -1,8 +1,8 @@
-Grassmann manifold · GeometricMachineLearning.jl

      Example of a Neural Network with a Grassmann Layer

      Here we show how to implement a neural network that contains a layer whose weight is an element of the Grassmann manifold and where this might be useful.

      To answer where we would need this consider the following scenario

      Problem statement

      We are given data in a big space $\mathcal{D}=[d_i]_{i\in\mathcal{I}}\subset\mathbb{R}^N$ and know these data live on an $n$-dimensional[1] submanifold[2] in $\mathbb{R}^N$. Based on these data we would now like to generate new samples from the distributions that produced our original data. This is where the Grassmann manifold is useful: each element $V$ of the Grassmann manifold is an $n$-dimensional subspace of $\mathbb{R}^N$ from which we can easily sample. We can then construct a (bijective) mapping from this space $V$ onto a space that contains our data points $\mathcal{D}$.

      Example

      Consider the following toy example: We want to sample from the graph of the (scaled) Rosenbrock function $f(x,y) = ((1 - x)^2 + 100(y - x^2)^2)/1000$ while pretending we do not know the function.

      rosenbrock(x::Vector) = ((1.0 - x[1]) ^ 2 + 100.0 * (x[2] - x[1] ^ 2) ^ 2) / 1000
      +Grassmann manifold · GeometricMachineLearning.jl

      Example of a Neural Network with a Grassmann Layer

Here we show how to implement a neural network that contains a layer whose weight is an element of the Grassmann manifold, and we explain where such a layer might be useful.

To see where we would need such a layer, consider the following scenario.

      Problem statement

      We are given data in a big space $\mathcal{D}=[d_i]_{i\in\mathcal{I}}\subset\mathbb{R}^N$ and know these data live on an $n$-dimensional[1] submanifold[2] in $\mathbb{R}^N$. Based on these data we would now like to generate new samples from the distributions that produced our original data. This is where the Grassmann manifold is useful: each element $V$ of the Grassmann manifold is an $n$-dimensional subspace of $\mathbb{R}^N$ from which we can easily sample. We can then construct a (bijective) mapping from this space $V$ onto a space that contains our data points $\mathcal{D}$.
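As a small illustration of this sampling idea (this sketch is not part of the tutorial; it assumes that an element of GrassmannManifold can be densified with Matrix to obtain a basis of the subspace, and the dimensions are chosen only for illustration):

using GeometricMachineLearning
import Random; Random.seed!(123)

V = rand(GrassmannManifold, 3, 2)   # a random two-dimensional subspace of ℝ³
z = randn(2, 100)                   # draws from 𝒩(0,1)×𝒩(0,1)
samples = Matrix(V) * z             # 100 points lying in the subspace represented by V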

      Example

      Consider the following toy example: We want to sample from the graph of the (scaled) Rosenbrock function $f(x,y) = ((1 - x)^2 + 100(y - x^2)^2)/1000$ while pretending we do not know the function.

      rosenbrock(x::Vector) = ((1.0 - x[1]) ^ 2 + 100.0 * (x[2] - x[1] ^ 2) ^ 2) / 1000
       x, y = -1.5:0.1:1.5, -1.5:0.1:1.5
       z = Surface((x,y)->rosenbrock([x,y]), x, y)
      -p = surface(x,y,z; camera=(30,20), alpha=.6, colorbar=false, xlims=(-1.5, 1.5), ylims=(-1.5, 1.5), zlims=(0.0, rosenbrock([-1.5, -1.5])))
      Example block output

      We now build a neural network whose task it is to map a product of two Gaussians $\mathcal{N}(0,1)\times\mathcal{N}(0,1)$ onto the graph of the Rosenbrock function where the range for $x$ and for $y$ is $[-1.5,1.5]$.

      For computing the loss between the two distributions, i.e. $\Psi(\mathcal{N}(0,1)\times\mathcal{N}(0,1))$ and $f([-1.5,1.5], [-1.5,1.5])$ we use the Wasserstein distance[3].

      using GeometricMachineLearning, Zygote, BrenierTwoFluid
      +p = surface(x,y,z; camera=(30,20), alpha=.6, colorbar=false, xlims=(-1.5, 1.5), ylims=(-1.5, 1.5), zlims=(0.0, rosenbrock([-1.5, -1.5])))
      Example block output

      We now build a neural network whose task it is to map a product of two Gaussians $\mathcal{N}(0,1)\times\mathcal{N}(0,1)$ onto the graph of the Rosenbrock function where the range for $x$ and for $y$ is $[-1.5,1.5]$.

      For computing the loss between the two distributions, i.e. $\Psi(\mathcal{N}(0,1)\times\mathcal{N}(0,1))$ and $f([-1.5,1.5], [-1.5,1.5])$ we use the Wasserstein distance[3].

      using GeometricMachineLearning, Zygote, BrenierTwoFluid
       import Random # hide
       Random.seed!(123)
       
      @@ -50,7 +50,7 @@
           loss_array[i] = val
           optimization_step!(optimizer, model, nn.params, dp)
       end
      -plot(loss_array, xlabel="training step", label="loss")
      Example block output

      Now we plot a few points to check how well they match the graph:

      const number_of_points = 35
      +plot(loss_array, xlabel="training step", label="loss")
      Example block output

      Now we plot a few points to check how well they match the graph:

      const number_of_points = 35
       
       coordinates = nn(randn(2, number_of_points))
      -scatter3d!(p, [coordinates[1, :]], [coordinates[2, :]], [coordinates[3, :]], alpha=.5, color=4, label="mapped points")
      Example block output
      • 1We may know $n$ exactly or approximately.
      • 2Problems and solutions related to this scenario are commonly summarized under the term manifold learning (see [37]).
      • 3The implementation of the Wasserstein distance is taken from [38].
      +scatter3d!(p, [coordinates[1, :]], [coordinates[2, :]], [coordinates[3, :]], alpha=.5, color=4, label="mapped points")
      Example block output
      • 1We may know $n$ exactly or approximately.
      • 2Problems and solutions related to this scenario are commonly summarized under the term manifold learning (see [39]).
      • 3The implementation of the Wasserstein distance is taken from [40].
diff --git a/latest/tutorials/linear_symplectic_transformer/59dd74ca.svg b/latest/tutorials/linear_symplectic_transformer/8b9246ed.svg
similarity index 85%
rename from latest/tutorials/linear_symplectic_transformer/59dd74ca.svg
rename to latest/tutorials/linear_symplectic_transformer/8b9246ed.svg
index 090102fce..b46ced072 100644
[regenerated plot SVG; path data omitted]
diff --git a/latest/tutorials/linear_symplectic_transformer/a58e3d30.svg b/latest/tutorials/linear_symplectic_transformer/9fb33f4f.svg
similarity index 95%
rename from latest/tutorials/linear_symplectic_transformer/a58e3d30.svg
rename to latest/tutorials/linear_symplectic_transformer/9fb33f4f.svg
index 3939c9878..428d44f0b 100644
[regenerated plot SVG; path data omitted]
diff --git a/latest/tutorials/linear_symplectic_transformer/index.html b/latest/tutorials/linear_symplectic_transformer/index.html
index 463225286..579a6ee2f 100644
--- a/latest/tutorials/linear_symplectic_transformer/index.html
+++ b/latest/tutorials/linear_symplectic_transformer/index.html
@@ -1,5 +1,5 @@
-Linear Symplectic Transformer · GeometricMachineLearning.jl

      Linear Symplectic Transformer

      In this tutorial we compare the linear symplectic transformer to the standard transformer.

      using GeometricProblems.CoupledHarmonicOscillator: hodeensemble, default_parameters
      +Linear Symplectic Transformer · GeometricMachineLearning.jl

      Linear Symplectic Transformer

      In this tutorial we compare the linear symplectic transformer to the standard transformer.

      using GeometricProblems.CoupledHarmonicOscillator: hodeensemble, default_parameters
       using GeometricIntegrators: ImplicitMidpoint, integrate
       using LaTeXStrings
       using Plots
      @@ -45,7 +45,7 @@
       plot!(p_train, loss_array_symplectic; color = 4, label = "LST")
       plot!(p_train, loss_array_sympnet; color = 3, label = "SympNet")
       
      -p_train
      Example block output

      We further evaluate a trajectory with the trained networks:

      const index = 1
      +p_train
      Example block output

      We further evaluate a trajectory with the trained networks:

      const index = 1
       init_con = dl.input[:, 1:seq_length, index]
       
       const n_steps = 30
      @@ -63,4 +63,4 @@
           p_validate
       end
       
      -make_validation_plot(; linewidth = 2)
      Example block output

      We can see that the standard transformer is not able to stay close to the trajectory coming from implicit midpoint very well. The linear symplectic transformer outperforms the standard transformer as well as the SympNet while needed much fewer parameters than the standard transformer:

      parameterlength(nn_standard), parameterlength(nn_symplectic), parameterlength(nn_sympnet)
      (108, 84, 64)

      It is also interesting to note that the training error for the SympNet gets lower than the one for the linear symplectic transformer, but it does not manage to outperform it when looking at the validation.

      +make_validation_plot(; linewidth = 2)
      Example block output

We can see that the standard transformer is not able to stay very close to the trajectory coming from implicit midpoint. The linear symplectic transformer outperforms both the standard transformer and the SympNet while needing far fewer parameters than the standard transformer:

      parameterlength(nn_standard), parameterlength(nn_symplectic), parameterlength(nn_sympnet)
      (108, 84, 64)

      It is also interesting to note that the training error for the SympNet gets lower than the one for the linear symplectic transformer, but it does not manage to outperform it when looking at the validation.
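For reference, a linear symplectic transformer can also be set up directly from the constructor documented in the library. The following sketch is not part of the tutorial: the values of dim and seq_length are assumptions chosen only for illustration, and passing CPU() and Float64 to NeuralNetwork mirrors the other tutorials rather than the hidden setup used here.

using GeometricMachineLearning

dim, seq_length = 4, 4
arch = LinearSymplecticTransformer(dim, seq_length; n_sympnet = 2, L = 1, activation = tanh)
nn_lst = NeuralNetwork(arch, CPU(), Float64)
parameterlength(nn_lst)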

      diff --git a/latest/tutorials/mnist_tutorial/index.html b/latest/tutorials/mnist_tutorial/index.html index 9f1698782..28ae0c8fd 100644 --- a/latest/tutorials/mnist_tutorial/index.html +++ b/latest/tutorials/mnist_tutorial/index.html @@ -1,5 +1,5 @@ -MNIST · GeometricMachineLearning.jl

      MNIST tutorial

      This is a short tutorial that shows how we can use GeometricMachineLearning to build a vision transformer and apply it for MNIST, while also putting some of the weights on a manifold. This is also the result presented in [36].

      First, we need to import the relevant packages:

      using GeometricMachineLearning, CUDA, Plots
      +MNIST · GeometricMachineLearning.jl

      MNIST tutorial

This is a short tutorial that shows how we can use GeometricMachineLearning to build a vision transformer and apply it to MNIST, while also putting some of the weights on a manifold. This is also the result presented in [38].

      First, we need to import the relevant packages:

      using GeometricMachineLearning, CUDA, Plots
       import Zygote, MLDatasets, KernelAbstractions

For the AD routine we use the GeometricMachineLearning default, and we get the dataset from MLDatasets. First we need to load the data set and put it on GPU (if you have one):

      train_x, train_y = MLDatasets.MNIST(split=:train)[:]
       test_x, test_y = MLDatasets.MNIST(split=:test)[:]
       train_x = train_x |> cu 
      @@ -7,7 +7,7 @@
       train_y = train_y |> cu 
       test_y = test_y |> cu

      GeometricMachineLearning has built-in data loaders that make it particularly easy to handle data:

      patch_length = 7
       dl = DataLoader(train_x, train_y, patch_length=patch_length)
      -dl_test = DataLoader(train_x, train_y, patch_length=patch_length)

      Here patch_length indicates the size one patch has. One image in MNIST is of dimension $28\times28$, this means that we decompose this into 16 $(7\times7)$ images (also see [36]).

      We next define the model with which we want to train:

      model = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=true)

      Here we have chosen a ClassificationTransformer, i.e. a composition of a specific number of transformer layers composed with a classification layer. We also set the Stiefel option to true, i.e. we are optimizing on the Stiefel manifold.

      We now have to initialize the neural network weights. This is done with the constructor for NeuralNetwork:

      backend = KernelAbstractions.get_backend(dl)
+dl_test = DataLoader(test_x, test_y, patch_length=patch_length)

Here patch_length indicates the size of one patch. One image in MNIST is of dimension $28\times28$; this means that we decompose each image into 16 patches of size $7\times7$ (also see [38]).
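A quick sanity check of the patch arithmetic (not part of the tutorial):

patch_length = 7
n_patches = (28 ÷ patch_length) ^ 2   # a 28×28 image split into 7×7 patches gives 16 patches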

      We next define the model with which we want to train:

      model = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=true)

Here we have chosen a ClassificationTransformer, i.e. a stack of a specified number of transformer layers followed by a classification layer. We also set the Stiefel option to true, i.e. we are optimizing on the Stiefel manifold.

      We now have to initialize the neural network weights. This is done with the constructor for NeuralNetwork:

      backend = KernelAbstractions.get_backend(dl)
       T = eltype(dl)
       nn = NeuralNetwork(model, backend, T)

      And with this we can finally perform the training:

      # an instance of batch is needed for the optimizer
       batch = Batch(batch_size)
      @@ -19,4 +19,4 @@
       
       loss_array = optimizer_instance(nn, dl, batch, n_epochs)
       
      -println("final test accuracy: ", accuracy(Ψᵉ, ps, dl_test), "\n")

      It is instructive to play with n_layers, n_epochs and the Stiefel property.

      [36]
      B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).
      +println("final test accuracy: ", accuracy(Ψᵉ, ps, dl_test), "\n")

      It is instructive to play with n_layers, n_epochs and the Stiefel property.
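For example, a non-manifold baseline could be set up by toggling the keyword. This is a sketch that reuses the names defined above (dl, backend, T, batch, n_epochs); calling AdamOptimizer() with its default settings is an assumption.

model_euclidean = ClassificationTransformer(dl, n_heads=n_heads, n_layers=n_layers, Stiefel=false)
nn_euclidean = NeuralNetwork(model_euclidean, backend, T)

opt_euclidean = Optimizer(nn_euclidean, AdamOptimizer())
loss_array_euclidean = opt_euclidean(nn_euclidean, dl, batch, n_epochs)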

      [38]
      B. Brantner. Generalizing Adam To Manifolds For Efficiently Training Transformers, arXiv preprint arXiv:2305.16901 (2023).
diff --git a/latest/tutorials/symplectic_autoencoder/a7fb2654.svg b/latest/tutorials/symplectic_autoencoder/25132818.svg
similarity index 87%
rename from latest/tutorials/symplectic_autoencoder/a7fb2654.svg
rename to latest/tutorials/symplectic_autoencoder/25132818.svg
index 1e3cae9d5..d5440f354 100644
[regenerated plot SVG; path data omitted]
diff --git a/latest/tutorials/symplectic_autoencoder/34c1e399.svg b/latest/tutorials/symplectic_autoencoder/e80b6398.svg
similarity index 85%
rename from latest/tutorials/symplectic_autoencoder/34c1e399.svg
rename to latest/tutorials/symplectic_autoencoder/e80b6398.svg
index f0746af1b..e30ab3796 100644
[regenerated plot SVG; path data omitted]
diff --git a/latest/tutorials/symplectic_autoencoder/index.html b/latest/tutorials/symplectic_autoencoder/index.html
index a2dd8110b..6a5602dd4 100644
--- a/latest/tutorials/symplectic_autoencoder/index.html
+++ b/latest/tutorials/symplectic_autoencoder/index.html
@@ -1,5 +1,5 @@
-Symplectic Autoencoders · GeometricMachineLearning.jl

      Symplectic Autoencoders and the Toda Lattice

      In this tutorial we use a SymplecticAutoencoder to approximate the linear wave equation with a lower-dimensional Hamiltonian model and compare it with standard proper symplectic decomposition (PSD).

      The system

      The Toda lattice is a prototypical example of a Hamiltonian PDE. It is described by

      \[ H(q, p) = \sum_{n\in\mathbb{Z}}\left( \frac{p_n^2}{2} + \alpha e^{q_n - q_{n+1}} \right).\]

      We further assume a finite number of particles $N$ and impose periodic boundary conditions:

      \[\begin{aligned} +Symplectic Autoencoders · GeometricMachineLearning.jl

      Symplectic Autoencoders and the Toda Lattice

In this tutorial we use a SymplecticAutoencoder to approximate the Toda lattice with a lower-dimensional Hamiltonian model and compare it with standard proper symplectic decomposition (PSD).

      The system

      The Toda lattice is a prototypical example of a Hamiltonian PDE. It is described by

      \[ H(q, p) = \sum_{n\in\mathbb{Z}}\left( \frac{p_n^2}{2} + \alpha e^{q_n - q_{n+1}} \right).\]

      We further assume a finite number of particles $N$ and impose periodic boundary conditions:

      \[\begin{aligned} q_{n+N} & \equiv q_n \\ p_{n+N} & \equiv p_n. \end{aligned}\]

      In this tutorial we want to reduce the dimension of the big system by a significant factor with (i) proper symplectic decomposition (PSD) and (ii) symplectic autoencoders. The first approach is strictly linear whereas the second one allows for more general mappings.
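To make the Hamiltonian and the periodic boundary conditions concrete, here is a small stand-alone sketch (not part of the tutorial; the function name and the choice α = 1 are made up for illustration):

function toda_hamiltonian(q::AbstractVector, p::AbstractVector; α = 1.0)
    N = length(q)
    # mod1 wraps the index n + 1 back to 1, which implements the periodic boundary q_{n+N} ≡ q_n
    sum(p[n] ^ 2 / 2 + α * exp(q[n] - q[mod1(n + 1, N)]) for n in 1:N)
end

toda_hamiltonian(randn(10), randn(10))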

      Using the Toda lattice in numerical experiments

      In order to use the Toda lattice in numerical experiments we have to pick suitable initial conditions. For this, consider the third-degree spline:

      \[h(s) = \begin{cases} @@ -34,7 +34,7 @@ sae_error = o(sae_nn, dl, Batch(batch_size), n_epochs) hline([psd_error]; color = 2, label = "PSD error") -plot!(sae_error; color = 3, label = "SAE error", xlabel = "epoch", ylabel = "training error")Example block output

      The online stage

      After having trained our neural network we can now evaluate it in the online stage of reduced complexity modeling:

      psd_rs = HRedSys(pr, encoder(psd_nn), decoder(psd_nn); integrator = ImplicitMidpoint())
      +plot!(sae_error; color = 3, label = "SAE error", xlabel = "epoch", ylabel = "training error")
      Example block output

      The online stage

      After having trained our neural network we can now evaluate it in the online stage of reduced complexity modeling:

      psd_rs = HRedSys(pr, encoder(psd_nn), decoder(psd_nn); integrator = ImplicitMidpoint())
       sae_rs = HRedSys(pr, encoder(sae_nn), decoder(sae_nn); integrator = ImplicitMidpoint())
       
       projection_error(psd_rs)
      0.6172687774821377
      projection_error(sae_rs)
      0.11220703644622755
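Roughly speaking, the projection error measures how well the snapshot matrix $M$ is reproduced after encoding and decoding; up to the norm and weighting used by the package it has the form

\[e_\mathrm{proj} = \frac{\lVert M - \mathcal{R}(\mathcal{P}(M)) \rVert}{\lVert M \rVert},\]

where $\mathcal{P}$ is the encoder and $\mathcal{R}$ the decoder. The two numbers above therefore indicate that the symplectic autoencoder reconstructs the snapshots considerably more accurately than PSD, before any time integration is performed.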

      Next we plot a comparison between the PSD prediction and the symplectic autoencoder prediction:

      sol_full = integrate_full_system(psd_rs)
      @@ -44,4 +44,4 @@
       const t_step = 100
       plot(sol_full.s.q[t_step], label = "Implicit Midpoint")
       plot!(psd_rs.decoder((q = sol_psd_reduced.s.q[t_step], p = sol_psd_reduced.s.p[t_step])).q, label = "PSD")
+plot!(sae_rs.decoder((q = sol_sae_reduced.s.q[t_step], p = sol_sae_reduced.s.p[t_step])).q, label = "SAE")
Example block output

We can see that the autoencoder approach has much better approximation capabilities than the PSD approach. The jagged lines are due to the fact that training was done for only 8 epochs.

      References

      [34]
      P. Buchfink, S. Glas and B. Haasdonk. Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder. SIAM Journal on Scientific Computing 45, A289–A311 (2023).
      [35]
      L. Peng and K. Mohseni. Symplectic model reduction of Hamiltonian systems. SIAM Journal on Scientific Computing 38, A1–A27 (2016).
      [36]
      C. Greif and K. Urban. Decay of the Kolmogorov N-width for wave problems. Applied Mathematics Letters 96, 216–222 (2019).
diff --git a/latest/tutorials/sympnet_tutorial/8e47bd72.svg b/latest/tutorials/sympnet_tutorial/362d7549.svg
similarity index 88%
rename from latest/tutorials/sympnet_tutorial/8e47bd72.svg
rename to latest/tutorials/sympnet_tutorial/362d7549.svg
(SVG plot contents omitted)
diff --git a/latest/tutorials/sympnet_tutorial/105b2366.svg b/latest/tutorials/sympnet_tutorial/bd088bbf.svg
similarity index 86%
rename from latest/tutorials/sympnet_tutorial/105b2366.svg
rename to latest/tutorials/sympnet_tutorial/bd088bbf.svg
(SVG plot contents omitted)
diff --git a/latest/tutorials/sympnet_tutorial/index.html b/latest/tutorials/sympnet_tutorial/index.html
index 38218c4ad..d42e88689 100644
--- a/latest/tutorials/sympnet_tutorial/index.html
+++ b/latest/tutorials/sympnet_tutorial/index.html
@@ -1,5 +1,5 @@


      SympNets with GeometricMachineLearning.jl

This page serves as a short introduction to using SympNets with GeometricMachineLearning.jl. For the general theory see the theory section.

With GeometricMachineLearning.jl one can easily implement SympNets. The steps are the following (a compact end-to-end sketch is given after the list):

      • Specify the architecture with the functions GSympNet and LASympNet,
      • Specify the type and the backend with NeuralNetwork,
      • Pick an optimizer for training the network,
      • Train the neural networks!
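As a rough end-to-end sketch of these four steps (illustrative only: the backend and precision arguments of NeuralNetwork, the origin of CPU(), and the Optimizer constructor with AdamOptimizer shown here are assumptions and may deviate from the exact API):

using GeometricMachineLearning

# 1. specify the architecture: a G-SympNet for a 2-dimensional phase space
arch = GSympNet(2; n_layers = 2, upscaling_dimension = 10)

# 2. specify type and backend (assumed signature; CPU() may need to be
#    imported from KernelAbstractions depending on the package version)
nn = NeuralNetwork(arch, CPU(), Float64)

# 3. pick an optimizer (assumed constructor)
opt = Optimizer(AdamOptimizer(), nn)

# 4. train on data wrapped in a DataLoader; dl, batch and nepochs are set up
#    as later on this page
# loss_array = opt(nn, dl, batch, nepochs)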

We discuss these points in some detail:

      Specifying the architecture

      To call an $LA$-SympNet, one needs to write

      lasympnet = LASympNet(dim; depth=5, nhidden=1, activation=tanh, init_upper_linear=true, init_upper_act=true) 

      LASympNet takes one obligatory argument:

      • dim : the dimension of the phase space (i.e. an integer) or optionally an instance of DataLoader. This latter option will be used below.

and several keyword arguments:

• depth : the depth for all the linear layers. The default value is set to 5 (if a depth greater than 5 is provided, it is set to 5). See the theory section for more details; there depth was called $n$.
• nhidden : the number of pairs of linear and activation layers with default value set to 1 (i.e. the $LA$-SympNet is a composition of a linear layer, an activation layer and then again a linear layer).
• activation : the activation function for all the activation layers, with the default set to tanh.
• init_upper_linear : a boolean that indicates whether the first linear layer changes $q$ first. By default this is true (the corresponding layer map is sketched after this list).
• init_upper_act : a boolean that indicates whether the first activation layer changes $q$ first. By default this is true.
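For orientation, the linear layers referred to above act, up to bias terms, as

\[\begin{pmatrix} q \\ p \end{pmatrix} \mapsto \begin{pmatrix} \mathbb{I} & A \\ \mathbb{O} & \mathbb{I} \end{pmatrix}\begin{pmatrix} q \\ p \end{pmatrix} = \begin{pmatrix} q + Ap \\ p \end{pmatrix}, \qquad A = A^T,\]

for an "upper" layer (it changes $q$ first), while a "lower" layer updates $p$ from $q$ in the same way; the symmetric matrices $A$ are the learnable parameters. This is only a schematic summary of the linear modules described in the theory section, not a literal excerpt from the implementation.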

      G-SympNet

      To call a G-SympNet, one needs to write

      gsympnet = GSympNet(dim; upscaling_dimension=2*dim, n_layers=2, activation=tanh, init_upper=true) 

      GSympNet takes one obligatory argument:

      • dim : the dimension of the phase space (i.e. an integer) or optionally an instance of DataLoader. This latter option will be used below.

and several keyword arguments:

• upscaling_dimension: The first dimension of the matrix with which the input is multiplied. In the theory section this matrix is called $K$ and the upscaling dimension is called $m$ (the gradient-layer map these keywords parametrize is sketched after this list).
• n_layers: the number of gradient layers with default value set to 2.
• activation : the activation function for all the activation layers, with the default set to tanh.
• init_upper : a boolean that indicates whether the first gradient layer changes $q$ first. By default this is true.
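Schematically, a gradient layer that changes $q$ first acts as

\[\begin{pmatrix} q \\ p \end{pmatrix} \mapsto \begin{pmatrix} q + K^T \mathrm{diag}(a)\,\sigma(Kp + b) \\ p \end{pmatrix}, \qquad K \in \mathbb{R}^{m \times n},\]

where $m$ is the upscaling_dimension, $\sigma$ the activation and $a, b \in \mathbb{R}^m$ are learnable parameters; the other kind of gradient layer updates $p$ from $q$ analogously. The map is symplectic because the increment is the gradient of a scalar function of $p$ alone. This is a schematic summary of the gradient modules in the theory section, not a literal excerpt from the implementation.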

      Loss function

      The loss function described in the theory section is the default choice used in GeometricMachineLearning.jl for training SympNets.
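The default loss compares network predictions with target values; purely as an illustration, a generic relative-error function of the general shape described there (not the package's internal implementation) would be

using LinearAlgebra: norm

# relative error between a network prediction and the target value
relative_loss(prediction, target) = norm(prediction - target) / norm(target)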

      Data Structures in GeometricMachineLearning.jl

      Examples

Let us see how to use these SympNets on a concrete example.

      Example of a pendulum with G-SympNet

      Let us begin with a simple example, the pendulum system, the Hamiltonian of which is

\[H:(q,p)\in\mathbb{R}^2 \mapsto \frac{1}{2}p^2-\cos(q) \in \mathbb{R}.\]
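The associated equations of motion are $\dot{q} = p$ and $\dot{p} = -\sin(q)$. The script referenced below integrates this system to produce training data; purely as an illustration (this is not the content of pendulum.jl), such a trajectory could be generated with a symplectic Euler step:

# illustration only – not the pendulum.jl script shipped with the package
function pendulum_trajectory(q0, p0; dt = 0.1, nsteps = 1000)
    q, p = q0, p0
    qs, ps = [q], [p]
    for _ in 1:nsteps
        p -= dt * sin(q)   # ṗ = -sin(q), evaluated at the current q
        q += dt * p        # q̇ = p, using the updated p (symplectic Euler)
        push!(qs, q); push!(ps, p)
    end
    return qs, ps
end

qs, ps = pendulum_trajectory(0.0, 1.0)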

      Here we generate pendulum data with the script GeometricMachineLearning/scripts/pendulum.jl:

      Random.seed!(1234)
       
       # load script
       include("../../../scripts/pendulum.jl")
      @@ -38,39 +38,39 @@
       
       # perform training (returns array that contains the total loss for each training step)
       g_loss_array = g_opt(g_nn, dl, batch, nepochs)
 la_loss_array = la_opt(la_nn, dl, batch, nepochs)

Progress:   1%|▎                                        |  ETA: 0:17:10
  TrainingLoss:  2.066915272918649
  ⋮
Progress: 100%|█████████████████████████████████████████| Time: 0:00:18
  TrainingLoss:  0.004720388712892675

Progress:   1%|▎                                        |  ETA: 0:08:00
  TrainingLoss:  4.764918703517992
  ⋮
Progress:  74%|██████████████████████████████▌          |  ETA: 0:00:05
  TrainingLoss:  0.012397634378263437
      +
      
      Progress:  75%|██████████████████████████████▉          |  ETA: 0:00:05
      +  TrainingLoss:  0.012378383055596947
      +
      
      Progress:  76%|███████████████████████████████▎         |  ETA: 0:00:04
         TrainingLoss:  0.01220216581074358
      -
      
      Progress:  77%|███████████████████████████████▋         |  ETA: 0:00:05
      -  TrainingLoss:  0.012178592619319006
      -
      
      Progress:  78%|███████████████████████████████▉         |  ETA: 0:00:05
      -  TrainingLoss:  0.012279446295909983
      -
      
      Progress:  78%|████████████████████████████████▏        |  ETA: 0:00:05
      +
      
      Progress:  77%|███████████████████████████████▊         |  ETA: 0:00:04
      +  TrainingLoss:  0.012190484871973266
      +
      
      Progress:  78%|████████████████████████████████▏        |  ETA: 0:00:04
         TrainingLoss:  0.011901936946685957
      -
      
      Progress:  79%|████████████████████████████████▍        |  ETA: 0:00:04
      -  TrainingLoss:  0.011948628542612772
      -
      
      Progress:  80%|████████████████████████████████▋        |  ETA: 0:00:04
      -  TrainingLoss:  0.011888060707142304
      +
      
      Progress:  79%|████████████████████████████████▌        |  ETA: 0:00:04
      +  TrainingLoss:  0.011866279626222087
       
      
      Progress:  80%|████████████████████████████████▉        |  ETA: 0:00:04
         TrainingLoss:  0.011670627248929052
      -
      
      Progress:  81%|█████████████████████████████████▎       |  ETA: 0:00:04
      -  TrainingLoss:  0.011708414523225135
      -
      
      Progress:  82%|█████████████████████████████████▌       |  ETA: 0:00:04
      -  TrainingLoss:  0.011576706535144293
      -
      
      Progress:  82%|█████████████████████████████████▊       |  ETA: 0:00:04
      +
      
      Progress:  81%|█████████████████████████████████▍       |  ETA: 0:00:03
      +  TrainingLoss:  0.01156576601130681
      +
      
      Progress:  82%|█████████████████████████████████▊       |  ETA: 0:00:03
         TrainingLoss:  0.011497897410598791
      -
      
      Progress:  83%|██████████████████████████████████       |  ETA: 0:00:04
      -  TrainingLoss:  0.011615574585236061
      -
      
      Progress:  84%|██████████████████████████████████▎      |  ETA: 0:00:03
      -  TrainingLoss:  0.011473407162736916
      +
      
      Progress:  83%|██████████████████████████████████▏      |  ETA: 0:00:03
      +  TrainingLoss:  0.011470162739434122
       
      
      Progress:  84%|██████████████████████████████████▋      |  ETA: 0:00:03
         TrainingLoss:  0.011136392188671016
      -
      
      Progress:  85%|██████████████████████████████████▉      |  ETA: 0:00:03
      -  TrainingLoss:  0.011109448447577584
      -
      
      Progress:  86%|███████████████████████████████████▏     |  ETA: 0:00:03
      -  TrainingLoss:  0.011445986941645527
      -
      
      Progress:  86%|███████████████████████████████████▍     |  ETA: 0:00:03
      +
      
      Progress:  85%|███████████████████████████████████      |  ETA: 0:00:03
      +  TrainingLoss:  0.011284952296176428
      +
      
      Progress:  86%|███████████████████████████████████▍     |  ETA: 0:00:02
         TrainingLoss:  0.011094228369856466
      -
      
      Progress:  87%|███████████████████████████████████▋     |  ETA: 0:00:03
      -  TrainingLoss:  0.010900326773211079
      -
      
      Progress:  88%|████████████████████████████████████     |  ETA: 0:00:03
      -  TrainingLoss:  0.011024074475794838
      +
      
      Progress:  87%|███████████████████████████████████▊     |  ETA: 0:00:02
      +  TrainingLoss:  0.010988629393725908
       
      
      Progress:  88%|████████████████████████████████████▎    |  ETA: 0:00:02
         TrainingLoss:  0.010806754002840268
      -
      
      Progress:  89%|████████████████████████████████████▌    |  ETA: 0:00:02
      -  TrainingLoss:  0.010735338280986977
      -
      
      Progress:  90%|████████████████████████████████████▊    |  ETA: 0:00:02
      -  TrainingLoss:  0.010631276555144977
      +
      
      Progress:  89%|████████████████████████████████████▋    |  ETA: 0:00:02
      +  TrainingLoss:  0.010637258001345181
       
      
      Progress:  90%|█████████████████████████████████████    |  ETA: 0:00:02
         TrainingLoss:  0.010645285945758457
      -
      
      Progress:  91%|█████████████████████████████████████▎   |  ETA: 0:00:02
      -  TrainingLoss:  0.010623480322480347
      -
      
      Progress:  92%|█████████████████████████████████████▋   |  ETA: 0:00:02
      -  TrainingLoss:  0.010543306031568313
      -
      
      Progress:  92%|█████████████████████████████████████▉   |  ETA: 0:00:02
      +
      
      Progress:  91%|█████████████████████████████████████▌   |  ETA: 0:00:02
      +  TrainingLoss:  0.010564227068238764
      +
      
      Progress:  92%|█████████████████████████████████████▉   |  ETA: 0:00:01
         TrainingLoss:  0.010645563056914515
      -
      
      Progress:  93%|██████████████████████████████████████▏  |  ETA: 0:00:01
      -  TrainingLoss:  0.010402342294266882
      -
      
      Progress:  94%|██████████████████████████████████████▍  |  ETA: 0:00:01
      -  TrainingLoss:  0.010457428707745119
      +
      
      Progress:  93%|██████████████████████████████████████▎  |  ETA: 0:00:01
      +  TrainingLoss:  0.010686137240518213
       
      
      Progress:  94%|██████████████████████████████████████▋  |  ETA: 0:00:01
         TrainingLoss:  0.01022368429748466
       
      
      Progress:  95%|███████████████████████████████████████  |  ETA: 0:00:01
         TrainingLoss:  0.01015864129945173
      -
      
      Progress:  96%|███████████████████████████████████████▎ |  ETA: 0:00:01
      -  TrainingLoss:  0.010169760805362799
      -
      
      Progress:  96%|███████████████████████████████████████▌ |  ETA: 0:00:01
      -  TrainingLoss:  0.01018649299379593
      +
      
      Progress:  96%|███████████████████████████████████████▍ |  ETA: 0:00:01
      +  TrainingLoss:  0.010170321620988503
       
      
      Progress:  97%|███████████████████████████████████████▊ |  ETA: 0:00:01
         TrainingLoss:  0.010513572956393793
      -
      
      Progress:  98%|████████████████████████████████████████ |  ETA: 0:00:00
      -  TrainingLoss:  0.010331344665641295
      -
      
      Progress:  98%|████████████████████████████████████████▍|  ETA: 0:00:00
      -  TrainingLoss:  0.010337522947509805
      +
      
      Progress:  98%|████████████████████████████████████████▏|  ETA: 0:00:00
      +  TrainingLoss:  0.010256546225489788
       
      
      Progress:  99%|████████████████████████████████████████▋|  ETA: 0:00:00
         TrainingLoss:  0.01007392026570153
      -
      
      Progress:  99%|████████████████████████████████████████▉|  ETA: 0:00:00
      -  TrainingLoss:  0.009934259884979441
      -
      
      Progress: 100%|█████████████████████████████████████████| Time: 0:00:20
      +
      
      Progress: 100%|█████████████████████████████████████████| Time: 0:00:17
         TrainingLoss:  0.01009880380133582

      We can also plot the training errors against the epoch (here the $y$-axis is in log-scale):

      using Plots
       p1 = plot(g_loss_array, xlabel="Epoch", ylabel="Training error", label="G-SympNet", color=3, yaxis=:log)
      -plot!(p1, la_loss_array, label="LA-SympNet", color=2)
      Example block output

The training data data_q and data_p must be matrices in $\mathbb{R}^{n\times d}$, where $n$ is the number of time steps and $d$ is half the dimension of the system, i.e. data_q[i,j] is $q_j(t_i)$, where $(t_1,...,t_n)$ are the times corresponding to the training data.

      Now we can make a prediction. Let's compare the initial data with a prediction starting from the same phase space point using the function iterate:

      ics = (q=qp_data.q[:,1], p=qp_data.p[:,1])
      +plot!(p1, la_loss_array, label="LA-SympNet", color=2)
      Example block output

The training data data_q and data_p must be matrices in $\mathbb{R}^{n\times d}$, where $n$ is the number of time steps and $d$ is half the dimension of the system, i.e. data_q[i,j] is $q_j(t_i)$, where $(t_1,...,t_n)$ are the times corresponding to the training data.
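As a minimal sketch of this layout (the sine/cosine trajectory and the names t, n and d below are hypothetical and only illustrate the $n\times d$ shape):

t = 0.0:0.1:10.0            # n = 101 sampling times t_1, ..., t_n
n, d = length(t), 1         # d is half the dimension of the system

data_q = zeros(n, d)        # row i, column j holds q_j(t_i)
data_p = zeros(n, d)        # row i, column j holds p_j(t_i)
data_q[:, 1] .= sin.(t)
data_p[:, 1] .= cos.(t)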

      Now we can make a prediction. Let's compare the initial data with a prediction starting from the same phase space point using the function iterate:

      ics = (q=qp_data.q[:,1], p=qp_data.p[:,1])
       
       steps_to_plot = 200
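# A hedged sketch of the iterate calls that produce the trajectories plotted
# below (the names la_nn and g_nn and the n_points keyword are assumptions,
# not taken from this document):
la_trajectory = iterate(la_nn, ics; n_points = steps_to_plot)
g_trajectory = iterate(g_nn, ics; n_points = steps_to_plot)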
       
      @@ -552,4 +458,4 @@
       using Plots
       p2 = plot(qp_data.q'[1:steps_to_plot], qp_data.p'[1:steps_to_plot], label="training data")
       plot!(p2, la_trajectory.q', la_trajectory.p', label="LA Sympnet")
      -plot!(p2, g_trajectory.q', g_trajectory.p', label="G Sympnet")
      Example block output

      We see that GSympNet outperforms the LASympNet on this problem.

      +plot!(p2, g_trajectory.q', g_trajectory.p', label="G Sympnet")
      Example block output

      We see that GSympNet outperforms the LASympNet on this problem.

rename latest/tutorials/volume_preserving_attention/{338983b3.svg => 0e515873.svg} (similarity 85%)
rename latest/tutorials/volume_preserving_attention/{b5294845.svg => 21efc475.svg} (similarity 92%)
rename latest/tutorials/volume_preserving_attention/{5d3be767.svg => 23654bc1.svg} (similarity 90%)
rename latest/tutorials/volume_preserving_attention/{c1333eea.svg => 244e9e6f.svg} (similarity 85%)
rename latest/tutorials/volume_preserving_attention/{8bc8bd61.svg => 6cc831d5.svg} (similarity 88%)
rename latest/tutorials/volume_preserving_attention/{ccbb9215.svg => 76f565ff.svg} (similarity 94%)
rename latest/tutorials/volume_preserving_attention/{b1e4a67b.svg => 92210c6c.svg} (similarity 94%)
rename latest/tutorials/volume_preserving_attention/{30157b05.svg => b94db844.svg} (similarity 85%)
diff --git a/latest/tutorials/volume_preserving_attention/index.html b/latest/tutorials/volume_preserving_attention/index.html
index 52d7cd139..a51798e5e 100644
--- a/latest/tutorials/volume_preserving_attention/index.html
+++ b/latest/tutorials/volume_preserving_attention/index.html
@@ -1,5 +1,5 @@
-Volume-Preserving Attention · GeometricMachineLearning.jl

      Comparison of different VolumePreservingAttention

In the section on volume-preserving attention we mentioned two ways of computing volume-preserving attention: one where the correlations are computed with a skew-symmetric matrix and one where they are computed with an arbitrary matrix. Here we compare the two approaches. When calling the VolumePreservingAttention layer we can specify whether we want the skew-symmetric or the arbitrary weighting by setting the keyword skew_sym = true or skew_sym = false, respectively.

Here we demonstrate the differences between the two approaches for computing correlations. For this we first generate a training set consisting of two collections of curves: (i) sine curves and (ii) cosine curves.

      import Random # hide
      +Volume-Preserving Attention · GeometricMachineLearning.jl

      Comparison of different VolumePreservingAttention

In the section on volume-preserving attention we mentioned two ways of computing volume-preserving attention: one where the correlations are computed with a skew-symmetric matrix and one where they are computed with an arbitrary matrix. Here we compare the two approaches. When calling the VolumePreservingAttention layer we can specify whether we want the skew-symmetric or the arbitrary weighting by setting the keyword skew_sym = true or skew_sym = false, respectively.
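As a minimal sketch of the two variants (passing the feature dimension and the sequence length as positional constructor arguments is an assumption here; only the skew_sym keyword is taken from the text above):

dim, seq_len = 2, 3                                                         # hypothetical sizes
attention_skew = VolumePreservingAttention(dim, seq_len; skew_sym = true)   # skew-symmetric weighting
attention_arb = VolumePreservingAttention(dim, seq_len; skew_sym = false)   # arbitrary weighting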

Here we demonstrate the differences between the two approaches for computing correlations. For this we first generate a training set consisting of two collections of curves: (i) sine curves and (ii) cosine curves.

      import Random # hide
       
       sine_cosine = zeros(1, 1000, 2)
       sine_cosine[1, :, 1] .= sin.(0.:.1:99.9)
      @@ -7,7 +7,7 @@
       
       
       const dl = DataLoader(Float16.(sine_cosine))
      DataLoader{Float16, Array{Float16, 3}, Nothing, :TimeSeries}(Float16[0.0 0.09985 … -0.6675 -0.59;;; 1.0 0.995 … 0.7446 0.8076], nothing, 1, 1000, 2, nothing, nothing)

      The third axis (i.e. the parameter axis) has length two, meaning we have two different kinds of curves:

      plot(dl.input[1, :, 1], label = "sine")
      -plot!(dl.input[1, :, 2], label = "cosine")
      Example block output

      We want to train a single neural network on both these curves. We compare three networks which are of the following form:

      \[\mathtt{network} = \mathcal{NN}_d\circ\Psi\circ\mathcal{NN}_u,\]

      where $\mathcal{NN}_u$ refers to a neural network that scales up and $\mathcal{NN}_d$ refers to a neural network that scales down. The up and down scaling is done with simple dense layers:

      \[\mathcal{NN}_u(x) = \mathrm{tanh}(a_ux + b_u) \text{ and } \mathcal{NN}_d(x) = a_d^Tx + b_d,\]

      where $a_u, b_u, a_d\in\mathbb{R}^\mathrm{ud}$ and $b_d$ is a scalar. ud refers to upscaling dimension. For $\Psi$ we consider three different choices:

      1. a volume-preserving attention with skew-symmetric weighting,
      2. a volume-preserving attention with arbitrary weighting,
      3. an identity layer.

We further choose a sequence length of 3 (i.e. the network always sees the last 3 time steps) and always predict one step into the future (i.e. the prediction window is set to 1):

      const seq_length = 3
      +plot!(dl.input[1, :, 2], label = "cosine")
      Example block output

      We want to train a single neural network on both these curves. We compare three networks which are of the following form:

      \[\mathtt{network} = \mathcal{NN}_d\circ\Psi\circ\mathcal{NN}_u,\]

      where $\mathcal{NN}_u$ refers to a neural network that scales up and $\mathcal{NN}_d$ refers to a neural network that scales down. The up and down scaling is done with simple dense layers:

      \[\mathcal{NN}_u(x) = \mathrm{tanh}(a_ux + b_u) \text{ and } \mathcal{NN}_d(x) = a_d^Tx + b_d,\]

      where $a_u, b_u, a_d\in\mathbb{R}^\mathrm{ud}$ and $b_d$ is a scalar. ud refers to upscaling dimension. For $\Psi$ we consider three different choices:

      1. a volume-preserving attention with skew-symmetric weighting,
      2. a volume-preserving attention with arbitrary weighting,
      3. an identity layer.
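As a rough sketch of how such a network $\mathtt{network} = \mathcal{NN}_d\circ\Psi\circ\mathcal{NN}_u$ could be assembled (the use of Chain and Dense and the VolumePreservingAttention constructor arguments are assumptions, not the exact construction used below):

ud, seq_len = 2, 3                                                           # hypothetical upscaling dimension and sequence length
model_skew = Chain(Dense(1, ud, tanh),                                       # NN_u(x) = tanh(a_u x + b_u)
                   VolumePreservingAttention(ud, seq_len; skew_sym = true),  # Ψ with skew-symmetric weighting
                   Dense(ud, 1, identity))                                   # NN_d(x) = a_d^T x + b_d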

We further choose a sequence length of 3 (i.e. the network always sees the last 3 time steps) and always predict one step into the future (i.e. the prediction window is set to 1):

      const seq_length = 3
       const prediction_window = 1
       
       const upscale_dimension_1 = 2
      @@ -61,7 +61,7 @@
           p
       end
       
      -plot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)
      Example block output

      Looking at the training errors, we can see that the network with the skew-symmetric weighting is stuck at a relatively high error rate, whereas the loss for the network with the arbitrary weighting is decreasing to a significantly lower level. The feedforward network without the attention mechanism is not able to learn anything useful (as was expected).

      The following demonstrates the predictions of our approaches[1]:

      initial_condition = dl.input[:, 1:seq_length, 2]
      +plot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)
      Example block output

      Looking at the training errors, we can see that the network with the skew-symmetric weighting is stuck at a relatively high error rate, whereas the loss for the network with the arbitrary weighting is decreasing to a significantly lower level. The feedforward network without the attention mechanism is not able to learn anything useful (as was expected).

      The following demonstrates the predictions of our approaches[1]:

      initial_condition = dl.input[:, 1:seq_length, 2]
       
       function make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)
           nn_skew = NeuralNetwork(GeometricMachineLearning.DummyTransformer(seq_length), nn_skew.model, nn_skew.params, CPU())
      @@ -88,9 +88,9 @@
           p2
       end
       
      -p2 = produce_validation_plot(40)
      Example block output

In the above plot we can see that the network with the arbitrary weighting performs much better; even though the green line does not fit the blue line very well either, it manages to at least qualitatively reflect the training data. We can also plot the predictions for longer time intervals:

      p3 = produce_validation_plot(400)
      Example block output

      We can also plot the comparison with the sine function:

      initial_condition = dl.input[:, 1:seq_length, 1]
      +p2 = produce_validation_plot(40)
      Example block output

In the above plot we can see that the network with the arbitrary weighting performs much better; even though the green line does not fit the blue line very well either, it manages to at least qualitatively reflect the training data. We can also plot the predictions for longer time intervals:

      p3 = produce_validation_plot(400)
      Example block output

      We can also plot the comparison with the sine function:

      initial_condition = dl.input[:, 1:seq_length, 1]
       
      -p2 = produce_validation_plot(40, initial_condition = initial_condition, type = :sin)
      Example block output

This advantage of the volume-preserving attention with arbitrary weighting may, however, be due to the fact that the skew-symmetric attention only has 3 learnable parameters, as opposed to 9 for the arbitrary weighting (an $n\times n$ skew-symmetric matrix has $n(n-1)/2$ independent entries versus $n^2$ for an arbitrary matrix; the counts here correspond to $n = 3$). If we increase the upscaling dimension the result changes:

      const upscale_dimension_2 = 10
      +p2 = produce_validation_plot(40, initial_condition = initial_condition, type = :sin)
      Example block output

This advantage of the volume-preserving attention with arbitrary weighting may, however, be due to the fact that the skew-symmetric attention only has 3 learnable parameters, as opposed to 9 for the arbitrary weighting (an $n\times n$ skew-symmetric matrix has $n(n-1)/2$ independent entries versus $n^2$ for an arbitrary matrix; the counts here correspond to $n = 3$). If we increase the upscaling dimension the result changes:

      const upscale_dimension_2 = 10
       
       nn_skew, nn_arb, nn_comp = set_up_networks(upscale_dimension_2)
       
      @@ -98,8 +98,8 @@
       
       loss_array_skew, loss_array_arb, loss_array_comp = train_networks!(nn_skew, nn_arb, nn_comp)
       
      -plot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)
      Example block output
      initial_condition = dl.input[:, 1:seq_length, 2]
      +plot_training_losses(loss_array_skew, loss_array_arb, loss_array_comp)
      Example block output
      initial_condition = dl.input[:, 1:seq_length, 2]
       
       nn_skew, nn_arb, nn_comp = make_networks_neural_network_integrators(nn_skew, nn_arb, nn_comp)
       
      -p2 = produce_validation_plot(40, nn_skew, nn_arb, nn_comp)
      Example block output

      And for a longer time interval:

      p3 = produce_validation_plot(200, nn_skew, nn_arb, nn_comp)
      Example block output
• 1Here we have to use the architectures DummyTransformer and DummyNNIntegrator to reformulate the three neural networks defined here as NeuralNetworkIntegrators. Normally the user should use the predefined architectures in GeometricMachineLearning; that way they never have to use DummyTransformer and DummyNNIntegrator directly.
      +p2 = produce_validation_plot(40, nn_skew, nn_arb, nn_comp)
      Example block output

      And for a longer time interval:

      p3 = produce_validation_plot(200, nn_skew, nn_arb, nn_comp)
      Example block output
• 1Here we have to use the architectures DummyTransformer and DummyNNIntegrator to reformulate the three neural networks defined here as NeuralNetworkIntegrators. Normally the user should use the predefined architectures in GeometricMachineLearning; that way they never have to use DummyTransformer and DummyNNIntegrator directly.